tesseract
3.03
|
00001 00002 // File: unicharset.cpp 00003 // Description: Unicode character/ligature set class. 00004 // Author: Thomas Kielbus 00005 // Created: Wed Jun 28 17:05:01 PDT 2006 00006 // 00007 // (C) Copyright 2006, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #include <assert.h> 00021 #include <stdio.h> 00022 #include <string.h> 00023 00024 #include "tesscallback.h" 00025 #include "tprintf.h" 00026 #include "unichar.h" 00027 #include "unicharset.h" 00028 #include "params.h" 00029 00030 // Special character used in representing character fragments. 00031 static const char kSeparator = '|'; 00032 // Special character used in representing 'natural' character fragments. 00033 static const char kNaturalFlag = 'n'; 00034 00035 static const int ISALPHA_MASK = 0x1; 00036 static const int ISLOWER_MASK = 0x2; 00037 static const int ISUPPER_MASK = 0x4; 00038 static const int ISDIGIT_MASK = 0x8; 00039 static const int ISPUNCTUATION_MASK = 0x10; 00040 00041 // Y coordinate threshold for determining cap-height vs x-height. 00042 // TODO(rays) Bring the global definition down to the ccutil library level, 00043 // so this constant is relative to some other constants. 00044 static const int kMeanlineThreshold = 220; 00045 // Let C be the number of alpha chars for which all tops exceed 00046 // kMeanlineThreshold, and X the number of alpha chars for which all 00047 // tops are below kMeanlineThreshold, then if X > C * 00048 // kMinXHeightFraction and C > X * kMinCapHeightFraction or more than 00049 // half the alpha characters have upper or lower case, then the 00050 // unicharset "has x-height". 00051 const double kMinXHeightFraction = 0.25; 00052 const double kMinCapHeightFraction = 0.05; 00053 00054 /*static */ 00055 const char* UNICHARSET::kCustomLigatures[][2] = { 00056 {"ct", "\uE003"}, // c + t -> U+E003 00057 {"ſh", "\uE006"}, // long-s + h -> U+E006 00058 {"ſi", "\uE007"}, // long-s + i -> U+E007 00059 {"ſl", "\uE008"}, // long-s + l -> U+E008 00060 {"ſſ", "\uE009"}, // long-s + long-s -> U+E009 00061 {NULL, NULL} 00062 }; 00063 00064 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum. 00065 const char* UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = { 00066 " ", 00067 "Joined", 00068 "|Broken|0|1" 00069 }; 00070 00071 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() { 00072 Init(); 00073 } 00074 00075 // Initialize all properties to sensible default values. 00076 void UNICHARSET::UNICHAR_PROPERTIES::Init() { 00077 isalpha = false; 00078 islower = false; 00079 isupper = false; 00080 isdigit = false; 00081 ispunctuation = false; 00082 isngram = false; 00083 enabled = false; 00084 SetRangesOpen(); 00085 script_id = 0; 00086 other_case = 0; 00087 mirror = 0; 00088 normed = ""; 00089 direction = UNICHARSET::U_LEFT_TO_RIGHT; 00090 fragment = NULL; 00091 } 00092 00093 // Sets all ranges wide open. Initialization default in case there are 00094 // no useful values available. 00095 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() { 00096 min_bottom = 0; 00097 max_bottom = MAX_UINT8; 00098 min_top = 0; 00099 max_top = MAX_UINT8; 00100 min_width = 0; 00101 max_width = MAX_INT16; 00102 min_bearing = 0; 00103 max_bearing = MAX_INT16; 00104 min_advance = 0; 00105 max_advance = MAX_INT16; 00106 } 00107 00108 // Sets all ranges to empty. Used before expanding with font-based data. 00109 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() { 00110 min_bottom = MAX_UINT8; 00111 max_bottom = 0; 00112 min_top = MAX_UINT8; 00113 max_top = 0; 00114 min_width = MAX_INT16; 00115 max_width = 0; 00116 min_bearing = MAX_INT16; 00117 max_bearing = 0; 00118 min_advance = MAX_INT16; 00119 max_advance = 0; 00120 } 00121 00122 // Returns true if any of the top/bottom/width/bearing/advance ranges is 00123 // emtpy. 00124 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const { 00125 return min_bottom > max_bottom || min_top > max_top || 00126 min_width > max_width || min_bearing > max_bearing || 00127 min_advance > max_advance; 00128 } 00129 00130 // Expands the ranges with the ranges from the src properties. 00131 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom( 00132 const UNICHAR_PROPERTIES& src) { 00133 UpdateRange(src.min_bottom, &min_bottom, &max_bottom); 00134 UpdateRange(src.max_bottom, &min_bottom, &max_bottom); 00135 UpdateRange(src.min_top, &min_top, &max_top); 00136 UpdateRange(src.max_top, &min_top, &max_top); 00137 UpdateRange(src.min_width, &min_width, &max_width); 00138 UpdateRange(src.max_width, &min_width, &max_width); 00139 UpdateRange(src.min_bearing, &min_bearing, &max_bearing); 00140 UpdateRange(src.max_bearing, &min_bearing, &max_bearing); 00141 UpdateRange(src.min_advance, &min_advance, &max_advance); 00142 UpdateRange(src.max_advance, &min_advance, &max_advance); 00143 } 00144 00145 // Copies the properties from src into this. 00146 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) { 00147 // Apart from the fragment, everything else can be done with a default copy. 00148 CHAR_FRAGMENT* saved_fragment = fragment; 00149 *this = src; // Bitwise copy. 00150 fragment = saved_fragment; 00151 } 00152 00153 UNICHARSET::UNICHARSET() : 00154 unichars(NULL), 00155 ids(), 00156 size_used(0), 00157 size_reserved(0), 00158 script_table(NULL), 00159 script_table_size_used(0), 00160 null_script("NULL") { 00161 clear(); 00162 for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) { 00163 unichar_insert(kSpecialUnicharCodes[i]); 00164 if (i == UNICHAR_JOINED) 00165 set_isngram(i, true); 00166 } 00167 } 00168 00169 UNICHARSET::~UNICHARSET() { 00170 clear(); 00171 } 00172 00173 void UNICHARSET::reserve(int unichars_number) { 00174 if (unichars_number > size_reserved) { 00175 UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number]; 00176 for (int i = 0; i < size_used; ++i) 00177 unichars_new[i] = unichars[i]; 00178 for (int j = size_used; j < unichars_number; ++j) { 00179 unichars_new[j].properties.script_id = add_script(null_script); 00180 } 00181 delete[] unichars; 00182 unichars = unichars_new; 00183 size_reserved = unichars_number; 00184 } 00185 } 00186 00187 const UNICHAR_ID 00188 UNICHARSET::unichar_to_id(const char* const unichar_repr) const { 00189 return ids.contains(unichar_repr) ? 00190 ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID; 00191 } 00192 00193 const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr, 00194 int length) const { 00195 assert(length > 0 && length <= UNICHAR_LEN); 00196 return ids.contains(unichar_repr, length) ? 00197 ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID; 00198 } 00199 00200 // Return the minimum number of bytes that matches a legal UNICHAR_ID, 00201 // while leaving the rest of the string encodable. Returns 0 if the 00202 // beginning of the string is not encodable. 00203 // WARNING: this function now encodes the whole string for precision. 00204 // Use encode_string in preference to repeatedly calling step. 00205 int UNICHARSET::step(const char* str) const { 00206 GenericVector<UNICHAR_ID> encoding; 00207 GenericVector<char> lengths; 00208 encode_string(str, true, &encoding, &lengths, NULL); 00209 if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0; 00210 return lengths[0]; 00211 } 00212 // As step except constraining the search to unichar-ids that are 00213 // self-normalized. Unlike step, does not encode the whole string, therefore 00214 // should be used on short strings (like those obtained from 00215 // get_normed_unichar.) 00216 int UNICHARSET::normed_step(const char* str) const { 00217 // Find the length of the first matching unicharset member. 00218 int length = ids.minmatch(str); 00219 if (length == 0) 00220 return 0; // Empty string or illegal char. 00221 00222 while (length <= UNICHAR_LEN) { 00223 if (ids.contains(str, length)) { 00224 int matched_id = unichar_to_id(str, length); 00225 const GenericVector<UNICHAR_ID>& matched_norms = normed_ids(matched_id); 00226 bool good_start = matched_norms.size() == 1 && 00227 matched_norms[0] == matched_id; 00228 if (str[length] == '\0') { 00229 return good_start ? length : 0; 00230 } 00231 if (normed_step(str + length) > 0) 00232 return length; // This length works! 00233 } else if (str[length] == '\0') { 00234 return 0; // Ran out of string. 00235 } 00236 ++length; 00237 } 00238 return 0; 00239 } 00240 00241 // Return whether the given UTF-8 string is encodable with this UNICHARSET. 00242 // If not encodable, write the first byte offset which cannot be converted 00243 // into the second (return) argument. 00244 bool UNICHARSET::encodable_string(const char *str, 00245 int *first_bad_position) const { 00246 GenericVector<UNICHAR_ID> encoding; 00247 return encode_string(str, true, &encoding, NULL, first_bad_position); 00248 } 00249 00250 // Encodes the given UTF-8 string with this UNICHARSET. 00251 // Returns true if the encoding succeeds completely, false if there is at 00252 // least one INVALID_UNICHAR_ID in the returned encoding, but in this case 00253 // the rest of the string is still encoded. 00254 // If lengths is not NULL, then it is filled with the corresponding 00255 // byte length of each encoded UNICHAR_ID. 00256 bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure, 00257 GenericVector<UNICHAR_ID>* encoding, 00258 GenericVector<char>* lengths, 00259 int* encoded_length) const { 00260 GenericVector<UNICHAR_ID> working_encoding; 00261 GenericVector<char> working_lengths; 00262 GenericVector<char> best_lengths; 00263 encoding->truncate(0); // Just in case str is empty. 00264 int str_length = strlen(str); 00265 int str_pos = 0; 00266 bool perfect = true; 00267 while (str_pos < str_length) { 00268 encode_string(str, str_pos, str_length, &working_encoding, &working_lengths, 00269 &str_pos, encoding, &best_lengths); 00270 if (str_pos < str_length) { 00271 // This is a non-match. Skip one utf-8 character. 00272 perfect = false; 00273 if (give_up_on_failure) break; 00274 int step = UNICHAR::utf8_step(str + str_pos); 00275 if (step == 0) step = 1; 00276 encoding->push_back(INVALID_UNICHAR_ID); 00277 best_lengths.push_back(step); 00278 str_pos += step; 00279 working_encoding = *encoding; 00280 working_lengths = best_lengths; 00281 } 00282 } 00283 if (lengths != NULL) *lengths = best_lengths; 00284 if (encoded_length != NULL) *encoded_length = str_pos; 00285 return perfect; 00286 } 00287 00288 const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const { 00289 if (id == INVALID_UNICHAR_ID) { 00290 return INVALID_UNICHAR; 00291 } 00292 ASSERT_HOST(id < this->size()); 00293 return unichars[id].representation; 00294 } 00295 00296 const char* const UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const { 00297 if (id == INVALID_UNICHAR_ID) { 00298 return INVALID_UNICHAR; 00299 } 00300 ASSERT_HOST(id < this->size()); 00301 // Resolve from the kCustomLigatures table if this is a private encoding. 00302 if (get_isprivate(id)) { 00303 const char* ch = id_to_unichar(id); 00304 for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) { 00305 if (!strcmp(ch, kCustomLigatures[i][1])) { 00306 return kCustomLigatures[i][0]; 00307 } 00308 } 00309 } 00310 // Otherwise return the stored representation. 00311 return unichars[id].representation; 00312 } 00313 00314 // Return a STRING that reformats the utf8 str into the str followed 00315 // by its hex unicodes. 00316 STRING UNICHARSET::debug_utf8_str(const char* str) { 00317 STRING result = str; 00318 result += " ["; 00319 int step = 1; 00320 // Chop into unicodes and code each as hex. 00321 for (int i = 0; str[i] != '\0'; i += step) { 00322 char hex[sizeof(int) * 2 + 1]; 00323 step = UNICHAR::utf8_step(str + i); 00324 if (step == 0) { 00325 step = 1; 00326 sprintf(hex, "%x", str[i]); 00327 } else { 00328 UNICHAR ch(str + i, step); 00329 sprintf(hex, "%x", ch.first_uni()); 00330 } 00331 result += hex; 00332 result += " "; 00333 } 00334 result += "]"; 00335 return result; 00336 } 00337 00338 // Return a STRING containing debug information on the unichar, including 00339 // the id_to_unichar, its hex unicodes and the properties. 00340 STRING UNICHARSET::debug_str(UNICHAR_ID id) const { 00341 if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id)); 00342 const CHAR_FRAGMENT *fragment = this->get_fragment(id); 00343 if (fragment) { 00344 return fragment->to_string(); 00345 } 00346 const char* str = id_to_unichar(id); 00347 STRING result = debug_utf8_str(str); 00348 // Append a for lower alpha, A for upper alpha, and x if alpha but neither. 00349 if (get_isalpha(id)) { 00350 if (get_islower(id)) 00351 result += "a"; 00352 else if (get_isupper(id)) 00353 result += "A"; 00354 else 00355 result += "x"; 00356 } 00357 // Append 0 if a digit. 00358 if (get_isdigit(id)) { 00359 result += "0"; 00360 } 00361 // Append p is a punctuation symbol. 00362 if (get_ispunctuation(id)) { 00363 result += "p"; 00364 } 00365 return result; 00366 } 00367 00368 // Sets the normed_ids vector from the normed string. normed_ids is not 00369 // stored in the file, and needs to be set when the UNICHARSET is loaded. 00370 void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) { 00371 unichars[unichar_id].properties.normed_ids.truncate(0); 00372 int length = unichars[unichar_id].properties.normed.length(); 00373 const char* normed_str = unichars[unichar_id].properties.normed.string(); 00374 int step = 0; 00375 for (int offset = 0; offset < length; offset+= step) { 00376 step = normed_step(normed_str + offset); 00377 if (step == 0) { 00378 unichars[unichar_id].properties.normed_ids.truncate(0); 00379 unichars[unichar_id].properties.normed_ids.push_back(unichar_id); 00380 break; 00381 } 00382 int normed_id = unichar_to_id(normed_str + offset, step); 00383 ASSERT_HOST(normed_id >= 0); 00384 unichars[unichar_id].properties.normed_ids.push_back(normed_id); 00385 } 00386 } 00387 00388 // Returns whether the unichar id represents a unicode value in the private use 00389 // area. We use this range only internally to represent uncommon ligatures 00390 // (eg. 'ct') that do not have regular unicode values. 00391 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const { 00392 UNICHAR uc(id_to_unichar(unichar_id), -1); 00393 int uni = uc.first_uni(); 00394 return (uni >= 0xE000 && uni <= 0xF8FF); 00395 } 00396 00397 00398 // Sets all ranges to empty, so they can be expanded to set the values. 00399 void UNICHARSET::set_ranges_empty() { 00400 for (int id = 0; id < size_used; ++id) { 00401 unichars[id].properties.SetRangesEmpty(); 00402 } 00403 } 00404 00405 // Sets all the properties for this unicharset given a src unicharset with 00406 // everything set. The unicharsets don't have to be the same, and graphemes 00407 // are correctly accounted for. 00408 void UNICHARSET::PartialSetPropertiesFromOther(int start_index, 00409 const UNICHARSET& src) { 00410 for (int ch = start_index; ch < size_used; ++ch) { 00411 const char* utf8 = id_to_unichar(ch); 00412 UNICHAR_PROPERTIES properties; 00413 if (src.GetStrProperties(utf8, &properties)) { 00414 // Setup the script_id, other_case, and mirror properly. 00415 const char* script = src.get_script_from_script_id(properties.script_id); 00416 properties.script_id = add_script(script); 00417 const char* other_case = src.id_to_unichar(properties.other_case); 00418 if (contains_unichar(other_case)) { 00419 properties.other_case = unichar_to_id(other_case); 00420 } else { 00421 properties.other_case = ch; 00422 } 00423 const char* mirror_str = src.id_to_unichar(properties.mirror); 00424 if (contains_unichar(mirror_str)) { 00425 properties.mirror = unichar_to_id(mirror_str); 00426 } else { 00427 properties.mirror = ch; 00428 } 00429 unichars[ch].properties.CopyFrom(properties); 00430 set_normed_ids(ch); 00431 } else { 00432 tprintf("Failed to get properties for index %d = %s\n", ch, utf8); 00433 } 00434 } 00435 } 00436 00437 // Expands the tops and bottoms and widths for this unicharset given a 00438 // src unicharset with ranges in it. The unicharsets don't have to be the 00439 // same, and graphemes are correctly accounted for. 00440 void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) { 00441 for (int ch = 0; ch < size_used; ++ch) { 00442 const char* utf8 = id_to_unichar(ch); 00443 UNICHAR_PROPERTIES properties; 00444 if (src.GetStrProperties(utf8, &properties)) { 00445 // Expand just the ranges from properties. 00446 unichars[ch].properties.ExpandRangesFrom(properties); 00447 } 00448 } 00449 } 00450 00451 // Makes this a copy of src. Clears this completely first, so the automattic 00452 // ids will not be present in this if not in src. 00453 void UNICHARSET::CopyFrom(const UNICHARSET& src) { 00454 clear(); 00455 AppendOtherUnicharset(src); 00456 } 00457 00458 // For each id in src, if it does not occur in this, add it, as in 00459 // SetPropertiesFromOther, otherwise expand the ranges, as in 00460 // ExpandRangesFromOther. 00461 void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) { 00462 int initial_used = size_used; 00463 for (int ch = 0; ch < src.size_used; ++ch) { 00464 const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties; 00465 const char* utf8 = src.id_to_unichar(ch); 00466 if (strcmp(utf8, " ") != 0 && src_props.AnyRangeEmpty()) { 00467 // Only use fully valid entries. 00468 tprintf("Bad properties for index %d, char %s: " 00469 "%d,%d %d,%d %d,%d %d,%d %d,%d\n", 00470 ch, utf8, src_props.min_bottom, src_props.max_bottom, 00471 src_props.min_top, src_props.max_top, 00472 src_props.min_width, src_props.max_width, 00473 src_props.min_bearing, src_props.max_bearing, 00474 src_props.min_advance, src_props.max_advance); 00475 continue; 00476 } 00477 int id = size_used; 00478 if (contains_unichar(utf8)) { 00479 id = unichar_to_id(utf8); 00480 // Just expand current ranges. 00481 unichars[id].properties.ExpandRangesFrom(src_props); 00482 } else { 00483 unichar_insert(utf8); 00484 unichars[id].properties.SetRangesEmpty(); 00485 } 00486 } 00487 // Set properties, including mirror and other_case, WITHOUT reordering 00488 // the unicharset. 00489 PartialSetPropertiesFromOther(initial_used, src); 00490 } 00491 00492 // Returns true if the acceptable ranges of the tops of the characters do 00493 // not overlap, making their x-height calculations distinct. 00494 bool UNICHARSET::SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const { 00495 int overlap = MIN(unichars[id1].properties.max_top, 00496 unichars[id2].properties.max_top) - 00497 MAX(unichars[id1].properties.min_top, 00498 unichars[id2].properties.min_top); 00499 return overlap <= 0; 00500 } 00501 00502 // Internal recursive version of encode_string above. 00503 // Seeks to encode the given string as a sequence of UNICHAR_IDs such that 00504 // each UNICHAR_ID uses the least possible part of the utf8 str. 00505 // It does this by depth-first tail recursion on increasing length matches 00506 // to the UNICHARSET, saving the first encountered result that encodes the 00507 // maximum total length of str. It stops on a failure to encode to make 00508 // the overall process of encoding a partially failed string more efficient. 00509 // See unicharset.h for definition of the args. 00510 void UNICHARSET::encode_string(const char* str, int str_index, int str_length, 00511 GenericVector<UNICHAR_ID>* encoding, 00512 GenericVector<char>* lengths, 00513 int* best_total_length, 00514 GenericVector<UNICHAR_ID>* best_encoding, 00515 GenericVector<char>* best_lengths) const { 00516 if (str_index > *best_total_length) { 00517 // This is the best result so far. 00518 *best_total_length = str_index; 00519 *best_encoding = *encoding; 00520 if (best_lengths != NULL) 00521 *best_lengths = *lengths; 00522 } 00523 if (str_index == str_length) return; 00524 int encoding_index = encoding->size(); 00525 // Find the length of the first matching unicharset member. 00526 int length = ids.minmatch(str + str_index); 00527 if (length == 0 || str_index + length > str_length) return; 00528 do { 00529 if (ids.contains(str + str_index, length)) { 00530 // Successful encoding so far. 00531 UNICHAR_ID id = ids.unichar_to_id(str + str_index, length); 00532 encoding->push_back(id); 00533 lengths->push_back(length); 00534 encode_string(str, str_index + length, str_length, encoding, lengths, 00535 best_total_length, best_encoding, best_lengths); 00536 if (*best_total_length == str_length) 00537 return; // Tail recursion success! 00538 // Failed with that length, truncate back and try again. 00539 encoding->truncate(encoding_index); 00540 lengths->truncate(encoding_index); 00541 } 00542 int step = UNICHAR::utf8_step(str + str_index + length); 00543 if (step == 0) step = 1; 00544 length += step; 00545 } while (length <= UNICHAR_LEN && str_index + length <= str_length); 00546 } 00547 00548 // Gets the properties for a grapheme string, combining properties for 00549 // multiple characters in a meaningful way where possible. 00550 // Returns false if no valid match was found in the unicharset. 00551 // NOTE that script_id, mirror, and other_case refer to this unicharset on 00552 // return and will need translation if the target unicharset is different. 00553 bool UNICHARSET::GetStrProperties(const char* utf8_str, 00554 UNICHAR_PROPERTIES* props) const { 00555 props->Init(); 00556 props->SetRangesEmpty(); 00557 props->min_advance = 0; 00558 props->max_advance = 0; 00559 int total_unicodes = 0; 00560 GenericVector<UNICHAR_ID> encoding; 00561 if (!encode_string(utf8_str, true, &encoding, NULL, NULL)) 00562 return false; // Some part was invalid. 00563 for (int i = 0; i < encoding.size(); ++i) { 00564 int id = encoding[i]; 00565 const UNICHAR_PROPERTIES& src_props = unichars[id].properties; 00566 // Logical OR all the bools. 00567 if (src_props.isalpha) props->isalpha = true; 00568 if (src_props.islower) props->islower = true; 00569 if (src_props.isupper) props->isupper = true; 00570 if (src_props.isdigit) props->isdigit = true; 00571 if (src_props.ispunctuation) props->ispunctuation = true; 00572 if (src_props.isngram) props->isngram = true; 00573 if (src_props.enabled) props->enabled = true; 00574 // Min/max the tops/bottoms. 00575 UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom); 00576 UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom); 00577 UpdateRange(src_props.min_top, &props->min_top, &props->max_top); 00578 UpdateRange(src_props.max_top, &props->min_top, &props->max_top); 00579 int bearing = ClipToRange(props->min_advance + src_props.min_bearing, 00580 -MAX_INT16, MAX_INT16); 00581 if (total_unicodes == 0 || bearing < props->min_bearing) 00582 props->min_bearing = bearing; 00583 bearing = ClipToRange(props->max_advance + src_props.max_bearing, 00584 -MAX_INT16, MAX_INT16); 00585 if (total_unicodes == 0 || bearing < props->max_bearing) 00586 props->max_bearing = bearing; 00587 props->min_advance = ClipToRange(props->min_advance + src_props.min_advance, 00588 -MAX_INT16, MAX_INT16); 00589 props->max_advance = ClipToRange(props->max_advance + src_props.max_advance, 00590 -MAX_INT16, MAX_INT16); 00591 // With a single width, just use the widths stored in the unicharset. 00592 props->min_width = src_props.min_width; 00593 props->max_width = src_props.max_width; 00594 // Use the first script id, other_case, mirror, direction. 00595 // Note that these will need translation, except direction. 00596 if (total_unicodes == 0) { 00597 props->script_id = src_props.script_id; 00598 props->other_case = src_props.other_case; 00599 props->mirror = src_props.mirror; 00600 props->direction = src_props.direction; 00601 } 00602 // The normed string for the compound character is the concatenation of 00603 // the normed versions of the individual characters. 00604 props->normed += src_props.normed; 00605 ++total_unicodes; 00606 } 00607 if (total_unicodes > 1) { 00608 // Estimate the total widths from the advance - bearing. 00609 props->min_width = ClipToRange(props->min_advance - props->max_bearing, 00610 -MAX_INT16, MAX_INT16); 00611 props->max_width = ClipToRange(props->max_advance - props->min_bearing, 00612 -MAX_INT16, MAX_INT16); 00613 } 00614 return total_unicodes > 0; 00615 } 00616 00617 // TODO(rays) clean-up the order of functions to match unicharset.h. 00618 00619 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const { 00620 unsigned int properties = 0; 00621 if (this->get_isalpha(id)) 00622 properties |= ISALPHA_MASK; 00623 if (this->get_islower(id)) 00624 properties |= ISLOWER_MASK; 00625 if (this->get_isupper(id)) 00626 properties |= ISUPPER_MASK; 00627 if (this->get_isdigit(id)) 00628 properties |= ISDIGIT_MASK; 00629 if (this->get_ispunctuation(id)) 00630 properties |= ISPUNCTUATION_MASK; 00631 return properties; 00632 } 00633 00634 char UNICHARSET::get_chartype(UNICHAR_ID id) const { 00635 if (this->get_isupper(id)) return 'A'; 00636 if (this->get_islower(id)) return 'a'; 00637 if (this->get_isalpha(id)) return 'x'; 00638 if (this->get_isdigit(id)) return '0'; 00639 if (this->get_ispunctuation(id)) return 'p'; 00640 return 0; 00641 } 00642 00643 void UNICHARSET::unichar_insert(const char* const unichar_repr) { 00644 if (!ids.contains(unichar_repr)) { 00645 if (strlen(unichar_repr) > UNICHAR_LEN) { 00646 fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n", 00647 int(strlen(unichar_repr)), unichar_repr); 00648 return; 00649 } 00650 if (size_used == size_reserved) { 00651 if (size_used == 0) 00652 reserve(8); 00653 else 00654 reserve(2 * size_used); 00655 } 00656 00657 strcpy(unichars[size_used].representation, unichar_repr); 00658 this->set_script(size_used, null_script); 00659 // If the given unichar_repr represents a fragmented character, set 00660 // fragment property to a pointer to CHAR_FRAGMENT class instance with 00661 // information parsed from the unichar representation. Use the script 00662 // of the base unichar for the fragmented character if possible. 00663 CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr); 00664 this->unichars[size_used].properties.fragment = frag; 00665 if (frag != NULL && this->contains_unichar(frag->get_unichar())) { 00666 this->unichars[size_used].properties.script_id = 00667 this->get_script(frag->get_unichar()); 00668 } 00669 this->unichars[size_used].properties.enabled = true; 00670 ids.insert(unichar_repr, size_used); 00671 ++size_used; 00672 } 00673 } 00674 00675 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const { 00676 return ids.contains(unichar_repr); 00677 } 00678 00679 bool UNICHARSET::contains_unichar(const char* const unichar_repr, 00680 int length) const { 00681 if (length == 0) { 00682 return false; 00683 } 00684 return ids.contains(unichar_repr, length); 00685 } 00686 00687 bool UNICHARSET::eq(UNICHAR_ID unichar_id, 00688 const char* const unichar_repr) const { 00689 return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0; 00690 } 00691 00692 bool UNICHARSET::save_to_file(FILE *file) const { 00693 fprintf(file, "%d\n", this->size()); 00694 for (UNICHAR_ID id = 0; id < this->size(); ++id) { 00695 int min_bottom, max_bottom, min_top, max_top; 00696 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top); 00697 int min_width, max_width; 00698 get_width_range(id, &min_width, &max_width); 00699 int min_bearing, max_bearing; 00700 get_bearing_range(id, &min_bearing, &max_bearing); 00701 int min_advance, max_advance; 00702 get_advance_range(id, &min_advance, &max_advance); 00703 unsigned int properties = this->get_properties(id); 00704 if (strcmp(this->id_to_unichar(id), " ") == 0) { 00705 fprintf(file, "%s %x %s %d\n", "NULL", properties, 00706 this->get_script_from_script_id(this->get_script(id)), 00707 this->get_other_case(id)); 00708 } else { 00709 fprintf(file, 00710 "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n", 00711 this->id_to_unichar(id), properties, 00712 min_bottom, max_bottom, min_top, max_top, min_width, max_width, 00713 min_bearing, max_bearing, min_advance, max_advance, 00714 this->get_script_from_script_id(this->get_script(id)), 00715 this->get_other_case(id), this->get_direction(id), 00716 this->get_mirror(id), this->get_normed_unichar(id), 00717 this->debug_str(id).string()); 00718 } 00719 } 00720 return true; 00721 } 00722 00723 class InMemoryFilePointer { 00724 public: 00725 InMemoryFilePointer(const char *memory, int mem_size) 00726 : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { } 00727 00728 char *fgets(char *orig_dst, int size) { 00729 const char *src_end = memory_ + mem_size_; 00730 char *dst_end = orig_dst + size - 1; 00731 if (size < 1) { 00732 return fgets_ptr_ < src_end ? orig_dst : NULL; 00733 } 00734 00735 char *dst = orig_dst; 00736 char ch = '^'; 00737 while (fgets_ptr_ < src_end && dst < dst_end && ch != '\n') { 00738 ch = *dst++ = *fgets_ptr_++; 00739 } 00740 *dst = 0; 00741 return (dst == orig_dst) ? NULL : orig_dst; 00742 } 00743 00744 private: 00745 const char *memory_; 00746 const char *fgets_ptr_; 00747 const int mem_size_; 00748 }; 00749 00750 bool UNICHARSET::load_from_inmemory_file( 00751 const char *memory, int mem_size, bool skip_fragments) { 00752 InMemoryFilePointer mem_fp(memory, mem_size); 00753 TessResultCallback2<char *, char *, int> *fgets_cb = 00754 NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets); 00755 bool success = load_via_fgets(fgets_cb, skip_fragments); 00756 delete fgets_cb; 00757 return success; 00758 } 00759 00760 class LocalFilePointer { 00761 public: 00762 LocalFilePointer(FILE *stream) : fp_(stream) {} 00763 char *fgets(char *dst, int size) { 00764 return ::fgets(dst, size, fp_); 00765 } 00766 private: 00767 FILE *fp_; 00768 }; 00769 00770 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) { 00771 LocalFilePointer lfp(file); 00772 TessResultCallback2<char *, char *, int> *fgets_cb = 00773 NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets); 00774 bool success = load_via_fgets(fgets_cb, skip_fragments); 00775 delete fgets_cb; 00776 return success; 00777 } 00778 00779 bool UNICHARSET::load_via_fgets( 00780 TessResultCallback2<char *, char *, int> *fgets_cb, 00781 bool skip_fragments) { 00782 int unicharset_size; 00783 char buffer[256]; 00784 00785 this->clear(); 00786 if (fgets_cb->Run(buffer, sizeof(buffer)) == NULL || 00787 sscanf(buffer, "%d", &unicharset_size) != 1) { 00788 return false; 00789 } 00790 this->reserve(unicharset_size); 00791 for (UNICHAR_ID id = 0; id < unicharset_size; ++id) { 00792 char unichar[256]; 00793 unsigned int properties; 00794 char script[64]; 00795 00796 strcpy(script, null_script); 00797 int min_bottom = 0; 00798 int max_bottom = MAX_UINT8; 00799 int min_top = 0; 00800 int max_top = MAX_UINT8; 00801 int min_width = 0; 00802 int max_width = MAX_INT16; 00803 int min_bearing = 0; 00804 int max_bearing = MAX_INT16; 00805 int min_advance = 0; 00806 int max_advance = MAX_INT16; 00807 // TODO(eger): check that this default it ok 00808 // after enabling BiDi iterator for Arabic+Cube. 00809 int direction = UNICHARSET::U_LEFT_TO_RIGHT; 00810 UNICHAR_ID other_case = id; 00811 UNICHAR_ID mirror = id; 00812 char normed[64]; 00813 int v = -1; 00814 if (fgets_cb->Run(buffer, sizeof (buffer)) == NULL || 00815 ((v = sscanf(buffer, 00816 "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d %63s", 00817 unichar, &properties, 00818 &min_bottom, &max_bottom, &min_top, &max_top, 00819 &min_width, &max_width, &min_bearing, &max_bearing, 00820 &min_advance, &max_advance, script, &other_case, 00821 &direction, &mirror, normed)) != 17 && 00822 (v = sscanf(buffer, 00823 "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d", 00824 unichar, &properties, 00825 &min_bottom, &max_bottom, &min_top, &max_top, 00826 &min_width, &max_width, &min_bearing, &max_bearing, 00827 &min_advance, &max_advance, 00828 script, &other_case, &direction, &mirror)) != 16 && 00829 (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d", 00830 unichar, &properties, 00831 &min_bottom, &max_bottom, &min_top, &max_top, 00832 script, &other_case, &direction, &mirror)) != 10 && 00833 (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties, 00834 &min_bottom, &max_bottom, &min_top, &max_top, 00835 script, &other_case)) != 8 && 00836 (v = sscanf(buffer, "%s %x %63s %d", unichar, &properties, 00837 script, &other_case)) != 4 && 00838 (v = sscanf(buffer, "%s %x %63s", 00839 unichar, &properties, script)) != 3 && 00840 (v = sscanf(buffer, "%s %x", unichar, &properties)) != 2)) { 00841 return false; 00842 } 00843 00844 // Skip fragments if needed. 00845 CHAR_FRAGMENT *frag = NULL; 00846 if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) { 00847 delete frag; 00848 continue; 00849 } 00850 // Insert unichar into unicharset and set its properties. 00851 if (strcmp(unichar, "NULL") == 0) 00852 this->unichar_insert(" "); 00853 else 00854 this->unichar_insert(unichar); 00855 00856 this->set_isalpha(id, properties & ISALPHA_MASK); 00857 this->set_islower(id, properties & ISLOWER_MASK); 00858 this->set_isupper(id, properties & ISUPPER_MASK); 00859 this->set_isdigit(id, properties & ISDIGIT_MASK); 00860 this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK); 00861 this->set_isngram(id, false); 00862 this->set_script(id, script); 00863 this->unichars[id].properties.enabled = true; 00864 this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top); 00865 this->set_width_range(id, min_width, max_width); 00866 this->set_bearing_range(id, min_bearing, max_bearing); 00867 this->set_advance_range(id, min_advance, max_advance); 00868 this->set_direction(id, static_cast<UNICHARSET::Direction>(direction)); 00869 ASSERT_HOST(other_case < unicharset_size); 00870 this->set_other_case(id, (v>3) ? other_case : id); 00871 ASSERT_HOST(mirror < unicharset_size); 00872 this->set_mirror(id, (v>8) ? mirror : id); 00873 this->set_normed(id, (v>16) ? normed : unichar); 00874 } 00875 post_load_setup(); 00876 return true; 00877 } 00878 00879 // Sets up internal data after loading the file, based on the char 00880 // properties. Called from load_from_file, but also needs to be run 00881 // during set_unicharset_properties. 00882 void UNICHARSET::post_load_setup() { 00883 // Number of alpha chars with the case property minus those without, 00884 // in order to determine that half the alpha chars have case. 00885 int net_case_alphas = 0; 00886 int x_height_alphas = 0; 00887 int cap_height_alphas = 0; 00888 top_bottom_set_ = false; 00889 for (UNICHAR_ID id = 0; id < size_used; ++id) { 00890 int min_bottom = 0; 00891 int max_bottom = MAX_UINT8; 00892 int min_top = 0; 00893 int max_top = MAX_UINT8; 00894 get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top); 00895 if (min_top > 0) 00896 top_bottom_set_ = true; 00897 if (get_isalpha(id)) { 00898 if (get_islower(id) || get_isupper(id)) 00899 ++net_case_alphas; 00900 else 00901 --net_case_alphas; 00902 if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold) 00903 ++x_height_alphas; 00904 else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold) 00905 ++cap_height_alphas; 00906 } 00907 set_normed_ids(id); 00908 } 00909 00910 script_has_upper_lower_ = net_case_alphas > 0; 00911 script_has_xheight_ = script_has_upper_lower_ || 00912 (x_height_alphas > cap_height_alphas * kMinXHeightFraction && 00913 cap_height_alphas > x_height_alphas * kMinCapHeightFraction); 00914 00915 null_sid_ = get_script_id_from_name(null_script); 00916 ASSERT_HOST(null_sid_ == 0); 00917 common_sid_ = get_script_id_from_name("Common"); 00918 latin_sid_ = get_script_id_from_name("Latin"); 00919 cyrillic_sid_ = get_script_id_from_name("Cyrillic"); 00920 greek_sid_ = get_script_id_from_name("Greek"); 00921 han_sid_ = get_script_id_from_name("Han"); 00922 hiragana_sid_ = get_script_id_from_name("Hiragana"); 00923 katakana_sid_ = get_script_id_from_name("Katakana"); 00924 00925 // Compute default script. Use the highest-counting alpha script, that is 00926 // not the common script, as that still contains some "alphas". 00927 int* script_counts = new int[script_table_size_used]; 00928 memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used); 00929 for (int id = 0; id < size_used; ++id) { 00930 if (get_isalpha(id)) { 00931 ++script_counts[get_script(id)]; 00932 } 00933 } 00934 default_sid_ = 0; 00935 for (int s = 1; s < script_table_size_used; ++s) { 00936 if (script_counts[s] > script_counts[default_sid_] && s != common_sid_) 00937 default_sid_ = s; 00938 } 00939 delete [] script_counts; 00940 } 00941 00942 // Returns true if right_to_left scripts are significant in the unicharset, 00943 // but without being so sensitive that "universal" unicharsets containing 00944 // characters from many scripts, like orientation and script detection, 00945 // look like they are right_to_left. 00946 bool UNICHARSET::major_right_to_left() const { 00947 int ltr_count = 0; 00948 int rtl_count = 0; 00949 for (int id = 0; id < size_used; ++id) { 00950 int dir = get_direction(id); 00951 if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++; 00952 if (dir == UNICHARSET::U_RIGHT_TO_LEFT || 00953 dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC || 00954 dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++; 00955 } 00956 return rtl_count > ltr_count; 00957 } 00958 00959 // Set a whitelist and/or blacklist of characters to recognize. 00960 // An empty or NULL whitelist enables everything (minus any blacklist). 00961 // An empty or NULL blacklist disables nothing. 00962 void UNICHARSET::set_black_and_whitelist(const char* blacklist, 00963 const char* whitelist) { 00964 bool def_enabled = whitelist == NULL || whitelist[0] == '\0'; 00965 // Set everything to default 00966 for (int ch = 0; ch < size_used; ++ch) 00967 unichars[ch].properties.enabled = def_enabled; 00968 if (!def_enabled) { 00969 // Enable the whitelist. 00970 GenericVector<UNICHAR_ID> encoding; 00971 encode_string(whitelist, false, &encoding, NULL, NULL); 00972 for (int i = 0; i < encoding.size(); ++i) { 00973 if (encoding[i] != INVALID_UNICHAR_ID) 00974 unichars[encoding[i]].properties.enabled = true; 00975 } 00976 } 00977 if (blacklist != NULL && blacklist[0] != '\0') { 00978 // Disable the blacklist. 00979 GenericVector<UNICHAR_ID> encoding; 00980 encode_string(blacklist, false, &encoding, NULL, NULL); 00981 for (int i = 0; i < encoding.size(); ++i) { 00982 if (encoding[i] != INVALID_UNICHAR_ID) 00983 unichars[encoding[i]].properties.enabled = false; 00984 } 00985 } 00986 } 00987 00988 int UNICHARSET::add_script(const char* script) { 00989 for (int i = 0; i < script_table_size_used; ++i) { 00990 if (strcmp(script, script_table[i]) == 0) 00991 return i; 00992 } 00993 if (script_table_size_reserved == 0) { 00994 script_table_size_reserved = 8; 00995 script_table = new char*[script_table_size_reserved]; 00996 } 00997 if (script_table_size_used + 1 >= script_table_size_reserved) { 00998 char** new_script_table = new char*[script_table_size_reserved * 2]; 00999 memcpy(new_script_table, script_table, script_table_size_reserved * sizeof(char*)); 01000 delete[] script_table; 01001 script_table = new_script_table; 01002 script_table_size_reserved = 2 * script_table_size_reserved; 01003 } 01004 script_table[script_table_size_used] = new char[strlen(script) + 1]; 01005 strcpy(script_table[script_table_size_used], script); 01006 return script_table_size_used++; 01007 } 01008 01009 // Returns the string that represents a fragment 01010 // with the given unichar, pos and total. 01011 STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total, 01012 bool natural) { 01013 if (total == 1) return STRING(unichar); 01014 STRING result = ""; 01015 result += kSeparator; 01016 result += unichar; 01017 char buffer[kMaxLen]; 01018 snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, 01019 natural ? kNaturalFlag : kSeparator, total); 01020 result += buffer; 01021 return result; 01022 } 01023 01024 CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) { 01025 const char *ptr = string; 01026 int len = strlen(string); 01027 if (len < kMinLen || *ptr != kSeparator) { 01028 return NULL; // this string can not represent a fragment 01029 } 01030 ptr++; // move to the next character 01031 int step = 0; 01032 while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) { 01033 step += UNICHAR::utf8_step(ptr + step); 01034 } 01035 if (step == 0 || step > UNICHAR_LEN) { 01036 return NULL; // no character for unichar or the character is too long 01037 } 01038 char unichar[UNICHAR_LEN + 1]; 01039 strncpy(unichar, ptr, step); 01040 unichar[step] = '\0'; // null terminate unichar 01041 ptr += step; // move to the next fragment separator 01042 int pos = 0; 01043 int total = 0; 01044 bool natural = false; 01045 char *end_ptr = NULL; 01046 for (int i = 0; i < 2; i++) { 01047 if (ptr > string + len || *ptr != kSeparator) { 01048 if (i == 1 && *ptr == kNaturalFlag) 01049 natural = true; 01050 else 01051 return NULL; // Failed to parse fragment representation. 01052 } 01053 ptr++; // move to the next character 01054 i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10)) 01055 : total = static_cast<int>(strtol(ptr, &end_ptr, 10)); 01056 ptr = end_ptr; 01057 } 01058 if (ptr != string + len) { 01059 return NULL; // malformed fragment representation 01060 } 01061 CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT(); 01062 fragment->set_all(unichar, pos, total, natural); 01063 return fragment; 01064 } 01065 01066 int UNICHARSET::get_script_id_from_name(const char* script_name) const { 01067 for (int i = 0; i < script_table_size_used; ++i) { 01068 if (strcmp(script_name, script_table[i]) == 0) 01069 return i; 01070 } 01071 return 0; // 0 is always the null_script 01072 }