tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/cube/cube_utils.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        cube_utils.cpp
00003  * Description: Implementation of the Cube Utilities Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2008
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include <math.h>
00021 #include <string>
00022 #include <vector>
00023 #include "cube_utils.h"
00024 #include "char_set.h"
00025 #include "unichar.h"
00026 
00027 namespace tesseract {
00028 CubeUtils::CubeUtils() {
00029 }
00030 
00031 CubeUtils::~CubeUtils() {
00032 }
00033 
00034 // convert a prob to a cost (-ve log prob)
00035 int CubeUtils::Prob2Cost(double prob_val) {
00036   if (prob_val < MIN_PROB)   {
00037     return MIN_PROB_COST;
00038   }
00039   return static_cast<int>(-log(prob_val) * PROB2COST_SCALE);
00040 }
00041 
00042 // converts a cost to probability
00043 double CubeUtils::Cost2Prob(int cost) {
00044   return exp(-cost / PROB2COST_SCALE);
00045 }
00046 
00047 // computes the length of a NULL terminated char_32 string
00048 int CubeUtils::StrLen(const char_32 *char_32_ptr) {
00049   if (char_32_ptr == NULL) {
00050     return 0;
00051   }
00052   int len = -1;
00053   while (char_32_ptr[++len]);
00054   return len;
00055 }
00056 
00057 // compares two char_32 strings
00058 int CubeUtils::StrCmp(const char_32 *str1, const char_32 *str2) {
00059   const char_32 *pch1 = str1;
00060   const char_32 *pch2 = str2;
00061 
00062   for (; (*pch1) != 0 && (*pch2) != 0; pch1++, pch2++) {
00063     if ((*pch1) != (*pch2)) {
00064       return (*pch1) - (*pch2);
00065     }
00066   }
00067 
00068   if ((*pch1) == 0) {
00069     if ((*pch2) == 0) {
00070       return 0;
00071     } else {
00072       return -1;
00073     }
00074   } else {
00075     return 1;
00076   }
00077 }
00078 
00079 // Duplicates a 32-bit char buffer
00080 char_32 *CubeUtils::StrDup(const char_32 *str32) {
00081   int len = StrLen(str32);
00082   char_32 *new_str = new char_32[len + 1];
00083   if (new_str == NULL) {
00084     return NULL;
00085   }
00086   memcpy(new_str, str32, len * sizeof(*str32));
00087   new_str[len] = 0;
00088   return new_str;
00089 }
00090 
00091 // creates a char samp from a specified portion of the image
00092 CharSamp *CubeUtils::CharSampleFromPix(Pix *pix, int left, int top,
00093                                        int wid, int hgt) {
00094   // get the raw img data from the image
00095   unsigned char *temp_buff = GetImageData(pix, left, top, wid, hgt);
00096   if (temp_buff == NULL) {
00097     return NULL;
00098   }
00099 
00100   // create a char samp from temp buffer
00101   CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff);
00102 
00103   // clean up temp buffer
00104   delete []temp_buff;
00105   return char_samp;
00106 }
00107 
00108 // create a B/W image from a char_sample
00109 Pix *CubeUtils::PixFromCharSample(CharSamp *char_samp) {
00110   // parameter check
00111   if (char_samp == NULL) {
00112     return NULL;
00113   }
00114 
00115   // get the raw data
00116   int stride = char_samp->Stride();
00117   int wid = char_samp->Width();
00118   int hgt = char_samp->Height();
00119 
00120   Pix *pix = pixCreate(wid, hgt, 1);
00121   if (pix == NULL) {
00122     return NULL;
00123   }
00124 
00125   // copy the contents
00126   unsigned char *line = char_samp->RawData();
00127   for (int y = 0; y < hgt ; y++, line += stride) {
00128     for (int x = 0; x < wid; x++) {
00129       if (line[x] != 0) {
00130         pixSetPixel(pix, x, y, 0);
00131       } else {
00132         pixSetPixel(pix, x, y, 255);
00133       }
00134     }
00135   }
00136 
00137   return pix;
00138 }
00139 
00140 // creates a raw buffer from the specified location of the pix
00141 unsigned char *CubeUtils::GetImageData(Pix *pix, int left, int top,
00142                                        int wid, int hgt) {
00143   // skip invalid dimensions
00144   if (left < 0 || top < 0 || wid < 0 || hgt < 0 ||
00145       (left + wid) > pix->w || (top + hgt) > pix->h ||
00146       pix->d != 1) {
00147     return NULL;
00148   }
00149 
00150   // copy the char img to a temp buffer
00151   unsigned char *temp_buff = new unsigned char[wid * hgt];
00152   if (temp_buff == NULL) {
00153     return NULL;
00154   }
00155   l_int32 w;
00156   l_int32 h;
00157   l_int32 d;
00158   l_int32 wpl;
00159   l_uint32 *line;
00160   l_uint32 *data;
00161 
00162   pixGetDimensions(pix, &w, &h, &d);
00163   wpl = pixGetWpl(pix);
00164   data = pixGetData(pix);
00165   line = data + (top * wpl);
00166 
00167   for (int y = 0, off = 0; y < hgt ; y++) {
00168     for (int x = 0; x < wid; x++, off++) {
00169       temp_buff[off] = GET_DATA_BIT(line, x + left) ? 0 : 255;
00170     }
00171     line += wpl;
00172   }
00173   return temp_buff;
00174 }
00175 
00176 // read file contents to a string
00177 bool CubeUtils::ReadFileToString(const string &file_name, string *str) {
00178   str->clear();
00179   FILE *fp = fopen(file_name.c_str(), "rb");
00180   if (fp == NULL) {
00181     return false;
00182   }
00183 
00184   // get the size of the size
00185   fseek(fp, 0, SEEK_END);
00186   int file_size = ftell(fp);
00187   if (file_size < 1) {
00188     fclose(fp);
00189     return false;
00190   }
00191   // adjust string size
00192   str->reserve(file_size);
00193   // read the contents
00194   rewind(fp);
00195   char *buff = new char[file_size];
00196   if (buff == NULL) {
00197     fclose(fp);
00198     return false;
00199   }
00200   int read_bytes = fread(buff, 1, static_cast<int>(file_size), fp);
00201   if (read_bytes == file_size) {
00202     str->append(buff, file_size);
00203   }
00204   delete []buff;
00205   fclose(fp);
00206   return (read_bytes == file_size);
00207 }
00208 
00209 // splits a string into vectors based on specified delimiters
00210 void CubeUtils::SplitStringUsing(const string &str,
00211                                  const string &delims,
00212                                  vector<string> *str_vec) {
00213   // Optimize the common case where delims is a single character.
00214   if (delims[0] != '\0' && delims[1] == '\0') {
00215     char c = delims[0];
00216     const char* p = str.data();
00217     const char* end = p + str.size();
00218     while (p != end) {
00219       if (*p == c) {
00220         ++p;
00221       } else {
00222         const char* start = p;
00223         while (++p != end && *p != c);
00224         str_vec->push_back(string(start, p - start));
00225       }
00226     }
00227     return;
00228   }
00229 
00230   string::size_type begin_index, end_index;
00231   begin_index = str.find_first_not_of(delims);
00232   while (begin_index != string::npos) {
00233     end_index = str.find_first_of(delims, begin_index);
00234     if (end_index == string::npos) {
00235       str_vec->push_back(str.substr(begin_index));
00236       return;
00237     }
00238     str_vec->push_back(str.substr(begin_index, (end_index - begin_index)));
00239     begin_index = str.find_first_not_of(delims, end_index);
00240   }
00241 }
00242 
00243 // UTF-8 to UTF-32 convesion functions
00244 void CubeUtils::UTF8ToUTF32(const char *utf8_str, string_32 *str32) {
00245   str32->clear();
00246   int len = strlen(utf8_str);
00247   int step = 0;
00248   for (int ch = 0; ch < len; ch += step) {
00249     step = UNICHAR::utf8_step(utf8_str + ch);
00250     if (step > 0) {
00251       UNICHAR uni_ch(utf8_str + ch, step);
00252       (*str32) += uni_ch.first_uni();
00253     }
00254   }
00255 }
00256 
00257 // UTF-8 to UTF-32 convesion functions
00258 void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) {
00259   str->clear();
00260   for (const char_32 *ch_32 = utf32_str; (*ch_32) != 0; ch_32++)  {
00261     UNICHAR uni_ch((*ch_32));
00262     char *utf8 = uni_ch.utf8_str();
00263     if (utf8 != NULL) {
00264       (*str) += utf8;
00265       delete []utf8;
00266     }
00267   }
00268 }
00269 
00270 bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) {
00271   bool all_one_case = true;
00272   bool capitalized;
00273   bool prev_upper;
00274   bool prev_lower;
00275   bool first_upper;
00276   bool first_lower;
00277   bool cur_upper;
00278   bool cur_lower;
00279 
00280   string str8;
00281   if (!char_set) {
00282     // If cube char_set is missing, use C-locale-dependent functions
00283     // on UTF8 characters to determine case properties.
00284     first_upper = isupper(str32[0]);
00285     first_lower = islower(str32[0]);
00286     if (first_upper)
00287       capitalized = true;
00288     prev_upper = first_upper;
00289     prev_lower = islower(str32[0]);
00290     for (int c = 1; str32[c] != 0; ++c) {
00291       cur_upper = isupper(str32[c]);
00292       cur_lower = islower(str32[c]);
00293       if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
00294         all_one_case = false;
00295       if (cur_upper)
00296         capitalized = false;
00297       prev_upper = cur_upper;
00298       prev_lower = cur_lower;
00299     }
00300   } else {
00301     UNICHARSET *unicharset = char_set->InternalUnicharset();
00302     // Use UNICHARSET functions to determine case properties
00303     first_upper = unicharset->get_isupper(char_set->ClassID(str32[0]));
00304     first_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
00305     if (first_upper)
00306       capitalized = true;
00307     prev_upper = first_upper;
00308     prev_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
00309 
00310     for (int c = 1; c < StrLen(str32); ++c) {
00311       cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c]));
00312       cur_lower = unicharset->get_islower(char_set->ClassID(str32[c]));
00313       if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
00314         all_one_case = false;
00315       if (cur_upper)
00316         capitalized = false;
00317       prev_upper = cur_upper;
00318       prev_lower = cur_lower;
00319     }
00320   }
00321   return all_one_case || capitalized;
00322 }
00323 
00324 char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) {
00325   if (!char_set) {
00326     return NULL;
00327   }
00328   UNICHARSET *unicharset = char_set->InternalUnicharset();
00329   int len = StrLen(str32);
00330   char_32 *lower = new char_32[len + 1];
00331   if (!lower)
00332     return NULL;
00333   for (int i = 0; i < len; ++i) {
00334     char_32 ch = str32[i];
00335     if (ch == INVALID_UNICHAR_ID) {
00336       delete [] lower;
00337       return NULL;
00338     }
00339     // convert upper-case characters to lower-case
00340     if (unicharset->get_isupper(char_set->ClassID(ch))) {
00341       UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch));
00342       const char_32 *str32_lower = char_set->ClassString(uid_lower);
00343       // expect lower-case version of character to be a single character
00344       if (!str32_lower || StrLen(str32_lower) != 1) {
00345         delete [] lower;
00346         return NULL;
00347       }
00348       lower[i] = str32_lower[0];
00349     } else {
00350       lower[i] = ch;
00351     }
00352   }
00353   lower[len] = 0;
00354   return lower;
00355 }
00356 
00357 char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) {
00358   if (!char_set) {
00359     return NULL;
00360   }
00361   UNICHARSET *unicharset = char_set->InternalUnicharset();
00362   int len = StrLen(str32);
00363   char_32 *upper = new char_32[len + 1];
00364   if (!upper)
00365     return NULL;
00366   for (int i = 0; i < len; ++i) {
00367     char_32 ch = str32[i];
00368     if (ch == INVALID_UNICHAR_ID) {
00369       delete [] upper;
00370       return NULL;
00371     }
00372     // convert lower-case characters to upper-case
00373     if (unicharset->get_islower(char_set->ClassID(ch))) {
00374       UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch));
00375       const char_32 *str32_upper = char_set->ClassString(uid_upper);
00376       // expect upper-case version of character to be a single character
00377       if (!str32_upper || StrLen(str32_upper) != 1) {
00378         delete [] upper;
00379         return NULL;
00380       }
00381       upper[i] = str32_upper[0];
00382     } else {
00383       upper[i] = ch;
00384     }
00385   }
00386   upper[len] = 0;
00387   return upper;
00388 }
00389 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines