tesseract
3.03
|
00001 /********************************************************************** 00002 * File: cube_utils.cpp 00003 * Description: Implementation of the Cube Utilities Class 00004 * Author: Ahmad Abdulkader 00005 * Created: 2008 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include <math.h> 00021 #include <string> 00022 #include <vector> 00023 #include "cube_utils.h" 00024 #include "char_set.h" 00025 #include "unichar.h" 00026 00027 namespace tesseract { 00028 CubeUtils::CubeUtils() { 00029 } 00030 00031 CubeUtils::~CubeUtils() { 00032 } 00033 00034 // convert a prob to a cost (-ve log prob) 00035 int CubeUtils::Prob2Cost(double prob_val) { 00036 if (prob_val < MIN_PROB) { 00037 return MIN_PROB_COST; 00038 } 00039 return static_cast<int>(-log(prob_val) * PROB2COST_SCALE); 00040 } 00041 00042 // converts a cost to probability 00043 double CubeUtils::Cost2Prob(int cost) { 00044 return exp(-cost / PROB2COST_SCALE); 00045 } 00046 00047 // computes the length of a NULL terminated char_32 string 00048 int CubeUtils::StrLen(const char_32 *char_32_ptr) { 00049 if (char_32_ptr == NULL) { 00050 return 0; 00051 } 00052 int len = -1; 00053 while (char_32_ptr[++len]); 00054 return len; 00055 } 00056 00057 // compares two char_32 strings 00058 int CubeUtils::StrCmp(const char_32 *str1, const char_32 *str2) { 00059 const char_32 *pch1 = str1; 00060 const char_32 *pch2 = str2; 00061 00062 for (; (*pch1) != 0 && (*pch2) != 0; pch1++, pch2++) { 00063 if ((*pch1) != (*pch2)) { 00064 return (*pch1) - (*pch2); 00065 } 00066 } 00067 00068 if ((*pch1) == 0) { 00069 if ((*pch2) == 0) { 00070 return 0; 00071 } else { 00072 return -1; 00073 } 00074 } else { 00075 return 1; 00076 } 00077 } 00078 00079 // Duplicates a 32-bit char buffer 00080 char_32 *CubeUtils::StrDup(const char_32 *str32) { 00081 int len = StrLen(str32); 00082 char_32 *new_str = new char_32[len + 1]; 00083 if (new_str == NULL) { 00084 return NULL; 00085 } 00086 memcpy(new_str, str32, len * sizeof(*str32)); 00087 new_str[len] = 0; 00088 return new_str; 00089 } 00090 00091 // creates a char samp from a specified portion of the image 00092 CharSamp *CubeUtils::CharSampleFromPix(Pix *pix, int left, int top, 00093 int wid, int hgt) { 00094 // get the raw img data from the image 00095 unsigned char *temp_buff = GetImageData(pix, left, top, wid, hgt); 00096 if (temp_buff == NULL) { 00097 return NULL; 00098 } 00099 00100 // create a char samp from temp buffer 00101 CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff); 00102 00103 // clean up temp buffer 00104 delete []temp_buff; 00105 return char_samp; 00106 } 00107 00108 // create a B/W image from a char_sample 00109 Pix *CubeUtils::PixFromCharSample(CharSamp *char_samp) { 00110 // parameter check 00111 if (char_samp == NULL) { 00112 return NULL; 00113 } 00114 00115 // get the raw data 00116 int stride = char_samp->Stride(); 00117 int wid = char_samp->Width(); 00118 int hgt = char_samp->Height(); 00119 00120 Pix *pix = pixCreate(wid, hgt, 1); 00121 if (pix == NULL) { 00122 return NULL; 00123 } 00124 00125 // copy the contents 00126 unsigned char *line = char_samp->RawData(); 00127 for (int y = 0; y < hgt ; y++, line += stride) { 00128 for (int x = 0; x < wid; x++) { 00129 if (line[x] != 0) { 00130 pixSetPixel(pix, x, y, 0); 00131 } else { 00132 pixSetPixel(pix, x, y, 255); 00133 } 00134 } 00135 } 00136 00137 return pix; 00138 } 00139 00140 // creates a raw buffer from the specified location of the pix 00141 unsigned char *CubeUtils::GetImageData(Pix *pix, int left, int top, 00142 int wid, int hgt) { 00143 // skip invalid dimensions 00144 if (left < 0 || top < 0 || wid < 0 || hgt < 0 || 00145 (left + wid) > pix->w || (top + hgt) > pix->h || 00146 pix->d != 1) { 00147 return NULL; 00148 } 00149 00150 // copy the char img to a temp buffer 00151 unsigned char *temp_buff = new unsigned char[wid * hgt]; 00152 if (temp_buff == NULL) { 00153 return NULL; 00154 } 00155 l_int32 w; 00156 l_int32 h; 00157 l_int32 d; 00158 l_int32 wpl; 00159 l_uint32 *line; 00160 l_uint32 *data; 00161 00162 pixGetDimensions(pix, &w, &h, &d); 00163 wpl = pixGetWpl(pix); 00164 data = pixGetData(pix); 00165 line = data + (top * wpl); 00166 00167 for (int y = 0, off = 0; y < hgt ; y++) { 00168 for (int x = 0; x < wid; x++, off++) { 00169 temp_buff[off] = GET_DATA_BIT(line, x + left) ? 0 : 255; 00170 } 00171 line += wpl; 00172 } 00173 return temp_buff; 00174 } 00175 00176 // read file contents to a string 00177 bool CubeUtils::ReadFileToString(const string &file_name, string *str) { 00178 str->clear(); 00179 FILE *fp = fopen(file_name.c_str(), "rb"); 00180 if (fp == NULL) { 00181 return false; 00182 } 00183 00184 // get the size of the size 00185 fseek(fp, 0, SEEK_END); 00186 int file_size = ftell(fp); 00187 if (file_size < 1) { 00188 fclose(fp); 00189 return false; 00190 } 00191 // adjust string size 00192 str->reserve(file_size); 00193 // read the contents 00194 rewind(fp); 00195 char *buff = new char[file_size]; 00196 if (buff == NULL) { 00197 fclose(fp); 00198 return false; 00199 } 00200 int read_bytes = fread(buff, 1, static_cast<int>(file_size), fp); 00201 if (read_bytes == file_size) { 00202 str->append(buff, file_size); 00203 } 00204 delete []buff; 00205 fclose(fp); 00206 return (read_bytes == file_size); 00207 } 00208 00209 // splits a string into vectors based on specified delimiters 00210 void CubeUtils::SplitStringUsing(const string &str, 00211 const string &delims, 00212 vector<string> *str_vec) { 00213 // Optimize the common case where delims is a single character. 00214 if (delims[0] != '\0' && delims[1] == '\0') { 00215 char c = delims[0]; 00216 const char* p = str.data(); 00217 const char* end = p + str.size(); 00218 while (p != end) { 00219 if (*p == c) { 00220 ++p; 00221 } else { 00222 const char* start = p; 00223 while (++p != end && *p != c); 00224 str_vec->push_back(string(start, p - start)); 00225 } 00226 } 00227 return; 00228 } 00229 00230 string::size_type begin_index, end_index; 00231 begin_index = str.find_first_not_of(delims); 00232 while (begin_index != string::npos) { 00233 end_index = str.find_first_of(delims, begin_index); 00234 if (end_index == string::npos) { 00235 str_vec->push_back(str.substr(begin_index)); 00236 return; 00237 } 00238 str_vec->push_back(str.substr(begin_index, (end_index - begin_index))); 00239 begin_index = str.find_first_not_of(delims, end_index); 00240 } 00241 } 00242 00243 // UTF-8 to UTF-32 convesion functions 00244 void CubeUtils::UTF8ToUTF32(const char *utf8_str, string_32 *str32) { 00245 str32->clear(); 00246 int len = strlen(utf8_str); 00247 int step = 0; 00248 for (int ch = 0; ch < len; ch += step) { 00249 step = UNICHAR::utf8_step(utf8_str + ch); 00250 if (step > 0) { 00251 UNICHAR uni_ch(utf8_str + ch, step); 00252 (*str32) += uni_ch.first_uni(); 00253 } 00254 } 00255 } 00256 00257 // UTF-8 to UTF-32 convesion functions 00258 void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) { 00259 str->clear(); 00260 for (const char_32 *ch_32 = utf32_str; (*ch_32) != 0; ch_32++) { 00261 UNICHAR uni_ch((*ch_32)); 00262 char *utf8 = uni_ch.utf8_str(); 00263 if (utf8 != NULL) { 00264 (*str) += utf8; 00265 delete []utf8; 00266 } 00267 } 00268 } 00269 00270 bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) { 00271 bool all_one_case = true; 00272 bool capitalized; 00273 bool prev_upper; 00274 bool prev_lower; 00275 bool first_upper; 00276 bool first_lower; 00277 bool cur_upper; 00278 bool cur_lower; 00279 00280 string str8; 00281 if (!char_set) { 00282 // If cube char_set is missing, use C-locale-dependent functions 00283 // on UTF8 characters to determine case properties. 00284 first_upper = isupper(str32[0]); 00285 first_lower = islower(str32[0]); 00286 if (first_upper) 00287 capitalized = true; 00288 prev_upper = first_upper; 00289 prev_lower = islower(str32[0]); 00290 for (int c = 1; str32[c] != 0; ++c) { 00291 cur_upper = isupper(str32[c]); 00292 cur_lower = islower(str32[c]); 00293 if ((prev_upper && cur_lower) || (prev_lower && cur_upper)) 00294 all_one_case = false; 00295 if (cur_upper) 00296 capitalized = false; 00297 prev_upper = cur_upper; 00298 prev_lower = cur_lower; 00299 } 00300 } else { 00301 UNICHARSET *unicharset = char_set->InternalUnicharset(); 00302 // Use UNICHARSET functions to determine case properties 00303 first_upper = unicharset->get_isupper(char_set->ClassID(str32[0])); 00304 first_lower = unicharset->get_islower(char_set->ClassID(str32[0])); 00305 if (first_upper) 00306 capitalized = true; 00307 prev_upper = first_upper; 00308 prev_lower = unicharset->get_islower(char_set->ClassID(str32[0])); 00309 00310 for (int c = 1; c < StrLen(str32); ++c) { 00311 cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c])); 00312 cur_lower = unicharset->get_islower(char_set->ClassID(str32[c])); 00313 if ((prev_upper && cur_lower) || (prev_lower && cur_upper)) 00314 all_one_case = false; 00315 if (cur_upper) 00316 capitalized = false; 00317 prev_upper = cur_upper; 00318 prev_lower = cur_lower; 00319 } 00320 } 00321 return all_one_case || capitalized; 00322 } 00323 00324 char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) { 00325 if (!char_set) { 00326 return NULL; 00327 } 00328 UNICHARSET *unicharset = char_set->InternalUnicharset(); 00329 int len = StrLen(str32); 00330 char_32 *lower = new char_32[len + 1]; 00331 if (!lower) 00332 return NULL; 00333 for (int i = 0; i < len; ++i) { 00334 char_32 ch = str32[i]; 00335 if (ch == INVALID_UNICHAR_ID) { 00336 delete [] lower; 00337 return NULL; 00338 } 00339 // convert upper-case characters to lower-case 00340 if (unicharset->get_isupper(char_set->ClassID(ch))) { 00341 UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch)); 00342 const char_32 *str32_lower = char_set->ClassString(uid_lower); 00343 // expect lower-case version of character to be a single character 00344 if (!str32_lower || StrLen(str32_lower) != 1) { 00345 delete [] lower; 00346 return NULL; 00347 } 00348 lower[i] = str32_lower[0]; 00349 } else { 00350 lower[i] = ch; 00351 } 00352 } 00353 lower[len] = 0; 00354 return lower; 00355 } 00356 00357 char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) { 00358 if (!char_set) { 00359 return NULL; 00360 } 00361 UNICHARSET *unicharset = char_set->InternalUnicharset(); 00362 int len = StrLen(str32); 00363 char_32 *upper = new char_32[len + 1]; 00364 if (!upper) 00365 return NULL; 00366 for (int i = 0; i < len; ++i) { 00367 char_32 ch = str32[i]; 00368 if (ch == INVALID_UNICHAR_ID) { 00369 delete [] upper; 00370 return NULL; 00371 } 00372 // convert lower-case characters to upper-case 00373 if (unicharset->get_islower(char_set->ClassID(ch))) { 00374 UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch)); 00375 const char_32 *str32_upper = char_set->ClassString(uid_upper); 00376 // expect upper-case version of character to be a single character 00377 if (!str32_upper || StrLen(str32_upper) != 1) { 00378 delete [] upper; 00379 return NULL; 00380 } 00381 upper[i] = str32_upper[0]; 00382 } else { 00383 upper[i] = ch; 00384 } 00385 } 00386 upper[len] = 0; 00387 return upper; 00388 } 00389 } // namespace tesseract