// tesseract 3.03
00001 /********************************************************************** 00002 * File: cube_utils.h 00003 * Description: Declaration of the Cube Utilities Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2008 00006 * 00007 *(C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0(the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 // The CubeUtils class provides miscellaneous utility and helper functions 00021 // to the rest of the Cube Engine 00022 00023 #ifndef CUBE_UTILS_H 00024 #define CUBE_UTILS_H 00025 00026 #include <vector> 00027 #include <string> 00028 00029 #include "allheaders.h" 00030 #include "const.h" 00031 #include "char_set.h" 00032 #include "char_samp.h" 00033 00034 namespace tesseract { 00035 class CubeUtils { 00036 public: 00037 CubeUtils(); 00038 ~CubeUtils(); 00039 00040 // Converts a probability value to a cost by getting the -log() of the 00041 // probability value to a known base 00042 static int Prob2Cost(double prob_val); 00043 // Converts a cost to probability by getting the exp(-normalized cost) 00044 static double Cost2Prob(int cost); 00045 // Computes the length of a 32-bit char buffer 00046 static int StrLen(const char_32 *str); 00047 // Compares two 32-bit char buffers 00048 static int StrCmp(const char_32 *str1, const char_32 *str2); 00049 // Duplicates a 32-bit char buffer 00050 static char_32 *StrDup(const 
char_32 *str); 00051 // Creates a CharSamp from an Pix and a bounding box 00052 static CharSamp *CharSampleFromPix(Pix *pix, 00053 int left, int top, int wid, int hgt); 00054 // Creates a Pix from a CharSamp 00055 static Pix *PixFromCharSample(CharSamp *char_samp); 00056 // read the contents of a file to a string 00057 static bool ReadFileToString(const string &file_name, string *str); 00058 // split a string into vectors using any of the specified delimiters 00059 static void SplitStringUsing(const string &str, const string &delims, 00060 vector<string> *str_vec); 00061 // UTF-8 to UTF-32 convesion functions 00062 static void UTF8ToUTF32(const char *utf8_str, string_32 *str32); 00063 static void UTF32ToUTF8(const char_32 *utf32_str, string *str); 00064 // Returns true if input word has either 1) all-one-case, or 2) 00065 // first character upper-case, and remaining characters lower-case. 00066 // If char_set is not NULL, uses tesseract's unicharset functions 00067 // to determine case properties. Otherwise, uses C-locale-dependent 00068 // functions, which may be unreliable on non-ASCII characters. 00069 static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set); 00070 // Returns char_32 pointer to the lower-case-transformed version of 00071 // the input string or NULL on error. If char_set is NULL returns NULL. 00072 // Return array must be freed by caller. 00073 static char_32 *ToLower(const char_32 *str32, CharSet *char_set); 00074 // Returns char_32 pointer to the upper-case-transformed version of 00075 // the input string or NULL on error. If char_set is NULL returns NULL. 00076 // Return array must be freed by caller. 00077 static char_32 *ToUpper(const char_32 *str32, CharSet *char_set); 00078 private: 00079 static unsigned char *GetImageData(Pix *pix, 00080 int left, int top, int wid, int hgt); 00081 }; 00082 } // namespace tesseract 00083 #endif // CUBE_UTILS_H