tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/cube/cube_utils.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        cube_utils.h
00003  * Description: Declaration of the Cube Utilities Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2008
00006  *
00007  *(C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0(the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 // The CubeUtils class provides miscellaneous utility and helper functions
00021 // to the rest of the Cube Engine
00022 
00023 #ifndef CUBE_UTILS_H
00024 #define CUBE_UTILS_H
00025 
00026 #include <vector>
00027 #include <string>
00028 
00029 #include "allheaders.h"
00030 #include "const.h"
00031 #include "char_set.h"
00032 #include "char_samp.h"
00033 
00034 namespace tesseract {
00035 class CubeUtils {
00036  public:
00037   CubeUtils();
00038   ~CubeUtils();
00039 
00040   // Converts a probability value to a cost by getting the -log() of the
00041   // probability value to a known base
00042   static int Prob2Cost(double prob_val);
00043   // Converts a cost to a probability by getting the exp(-normalized cost)
00044   static double Cost2Prob(int cost);
00045   // Computes the length of a 32-bit char buffer
00046   static int StrLen(const char_32 *str);
00047   // Compares two 32-bit char buffers
00048   static int StrCmp(const char_32 *str1, const char_32 *str2);
00049   // Duplicates a 32-bit char buffer
00050   static char_32 *StrDup(const char_32 *str);
00051   // Creates a CharSamp from a Pix and a bounding box
00052   static CharSamp *CharSampleFromPix(Pix *pix,
00053                                      int left, int top, int wid, int hgt);
00054   // Creates a Pix from a CharSamp
00055   static Pix *PixFromCharSample(CharSamp *char_samp);
00056   // Reads the contents of a file into a string
00057   static bool ReadFileToString(const string &file_name, string *str);
00058   // Splits a string into a vector of strings using any of the specified delimiters
00059   static void SplitStringUsing(const string &str, const string &delims,
00060                                vector<string> *str_vec);
00061   // UTF-8 <-> UTF-32 conversion functions (one for each direction)
00062   static void UTF8ToUTF32(const char *utf8_str, string_32 *str32);
00063   static void UTF32ToUTF8(const char_32 *utf32_str, string *str);
00064   // Returns true if input word has either 1) all-one-case, or 2)
00065   // first character upper-case, and remaining characters lower-case.
00066   // If char_set is not NULL, uses tesseract's unicharset functions
00067   // to determine case properties. Otherwise, uses C-locale-dependent
00068   // functions, which may be unreliable on non-ASCII characters.
00069   static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set);
00070   // Returns char_32 pointer to the lower-case-transformed version of
00071   // the input string or NULL on error. If char_set is NULL returns NULL.
00072   // Return array must be freed by caller.
00073   static char_32 *ToLower(const char_32 *str32, CharSet *char_set);
00074   // Returns char_32 pointer to the upper-case-transformed version of
00075   // the input string or NULL on error. If char_set is NULL returns NULL.
00076   // Return array must be freed by caller.
00077   static char_32 *ToUpper(const char_32 *str32, CharSet *char_set);
00078  private:  // NOTE(review): GetImageData presumably returns raw pixel data for the given box -- confirm against its definition
00079   static unsigned char *GetImageData(Pix *pix,
00080                                      int left, int top, int wid, int hgt);
00081 };
00082 }  // namespace tesseract
00083 #endif  // CUBE_UTILS_H
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines