tesseract
3.03
|
// File:        unicharset.h
// Description: Unicode character/ligature set class.
// Author:      Thomas Kielbus
// Created:     Wed Jun 28 17:05:01 PDT 2006
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#ifndef TESSERACT_CCUTIL_UNICHARSET_H__
#define TESSERACT_CCUTIL_UNICHARSET_H__

#include "errcode.h"
#include "genericvector.h"
#include "helpers.h"
#include "strngs.h"
#include "tesscallback.h"
#include "unichar.h"
#include "unicharmap.h"

// Enum holding special values of unichar_id. Every unicharset has these.
// Warning! Keep in sync with kSpecialUnicharCodes.
// SPECIAL_UNICHAR_CODES_COUNT is not a code itself: it is the number of
// special codes and is used to size the kSpecialUnicharCodes array.
enum SpecialUnicharCodes {
  UNICHAR_SPACE,
  UNICHAR_JOINED,
  UNICHAR_BROKEN,

  SPECIAL_UNICHAR_CODES_COUNT
};

// Describes one fragment (chunk) of a character: the unichar string of the
// whole character, the position of this fragment and the total number of
// fragments the character is split into.
class CHAR_FRAGMENT {
 public:
  // Minimum number of characters used for fragment representation.
  static const int kMinLen = 6;
  // Maximum number of characters used for fragment representation.
  static const int kMaxLen = 3 + UNICHAR_LEN + 2;
  // Maximum number of fragments per character.
  static const int kMaxChunks = 5;

  // Setters and Getters.
00051 inline void set_all(const char *unichar, int pos, int total, bool natural) { 00052 set_unichar(unichar); 00053 set_pos(pos); 00054 set_total(total); 00055 set_natural(natural); 00056 } 00057 inline void set_unichar(const char *uch) { 00058 strncpy(this->unichar, uch, UNICHAR_LEN); 00059 this->unichar[UNICHAR_LEN] = '\0'; 00060 } 00061 inline void set_pos(int p) { this->pos = p; } 00062 inline void set_total(int t) { this->total = t; } 00063 inline const char* get_unichar() const { return this->unichar; } 00064 inline int get_pos() const { return this->pos; } 00065 inline int get_total() const { return this->total; } 00066 00067 // Returns the string that represents a fragment 00068 // with the given unichar, pos and total. 00069 static STRING to_string(const char *unichar, int pos, int total, 00070 bool natural); 00071 // Returns the string that represents this fragment. 00072 STRING to_string() const { 00073 return to_string(unichar, pos, total, natural); 00074 } 00075 00076 // Checks whether a fragment has the same unichar, 00077 // position and total as the given inputs. 00078 inline bool equals(const char *other_unichar, 00079 int other_pos, int other_total) const { 00080 return (strcmp(this->unichar, other_unichar) == 0 && 00081 this->pos == other_pos && this->total == other_total); 00082 } 00083 inline bool equals(const CHAR_FRAGMENT *other) const { 00084 return this->equals(other->get_unichar(), 00085 other->get_pos(), 00086 other->get_total()); 00087 } 00088 00089 // Checks whether a given fragment is a continuation of this fragment. 00090 // Assumes that the given fragment pointer is not NULL. 00091 inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const { 00092 return (strcmp(this->unichar, fragment->get_unichar()) == 0 && 00093 this->total == fragment->get_total() && 00094 this->pos == fragment->get_pos() + 1); 00095 } 00096 00097 // Returns true if this fragment is a beginning fragment. 
00098 inline bool is_beginning() const { return this->pos == 0; } 00099 00100 // Returns true if this fragment is an ending fragment. 00101 inline bool is_ending() const { return this->pos == this->total-1; } 00102 00103 // Returns true if the fragment was a separate component to begin with, 00104 // ie did not need chopping to be isolated, but may have been separated 00105 // out from a multi-outline blob. 00106 inline bool is_natural() const { return natural; } 00107 void set_natural(bool value) { natural = value; } 00108 00109 // Parses the string to see whether it represents a character fragment 00110 // (rather than a regular character). If so, allocates memory for a new 00111 // CHAR_FRAGMENT instance and fills it in with the corresponding fragment 00112 // information. Fragments are of the form: 00113 // |m|1|2, meaning chunk 1 of 2 of character m, or 00114 // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed 00115 // to divide the parts, as they were already separate connected components. 00116 // 00117 // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT 00118 // instance, otherwise (if the string does not represent a fragment or it 00119 // looks like it does, but parsing it as a fragment fails) returns NULL. 00120 // 00121 // Note: The caller is responsible for deallocating memory 00122 // associated with the returned pointer. 00123 static CHAR_FRAGMENT *parse_from_string(const char *str); 00124 00125 private: 00126 char unichar[UNICHAR_LEN + 1]; 00127 // True if the fragment was a separate component to begin with, 00128 // ie did not need chopping to be isolated, but may have been separated 00129 // out from a multi-outline blob. 
// -- CHAR_FRAGMENT (continued): remaining data members and end of class --
  bool natural;
  inT16 pos;    // fragment position in the character
  inT16 total;  // total number of fragments in the character
};

// The UNICHARSET class is a utility class for Tesseract that holds the
// set of characters that are used by the engine. Each character is identified
// by a unique number, from 0 to (size - 1).
class UNICHARSET {
 public:
  // Custom list of characters and their ligature forms (UTF8)
  // These map to unicode values in the private use area (PUA) and are
  // supported by only few font families (eg. Wyld, Adobe Caslon Pro).
  static const char* kCustomLigatures[][2];

  // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
  static const char* kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];

  // ICU 2.0 UCharDirection enum (from third_party/icu/include/unicode/uchar.h)
  // U_CHAR_DIRECTION_COUNT is the number of direction values, not a
  // direction itself.
  enum Direction {
    U_LEFT_TO_RIGHT               = 0,
    U_RIGHT_TO_LEFT               = 1,
    U_EUROPEAN_NUMBER             = 2,
    U_EUROPEAN_NUMBER_SEPARATOR   = 3,
    U_EUROPEAN_NUMBER_TERMINATOR  = 4,
    U_ARABIC_NUMBER               = 5,
    U_COMMON_NUMBER_SEPARATOR     = 6,
    U_BLOCK_SEPARATOR             = 7,
    U_SEGMENT_SEPARATOR           = 8,
    U_WHITE_SPACE_NEUTRAL         = 9,
    U_OTHER_NEUTRAL               = 10,
    U_LEFT_TO_RIGHT_EMBEDDING     = 11,
    U_LEFT_TO_RIGHT_OVERRIDE      = 12,
    U_RIGHT_TO_LEFT_ARABIC        = 13,
    U_RIGHT_TO_LEFT_EMBEDDING     = 14,
    U_RIGHT_TO_LEFT_OVERRIDE      = 15,
    U_POP_DIRECTIONAL_FORMAT      = 16,
    U_DIR_NON_SPACING_MARK        = 17,
    U_BOUNDARY_NEUTRAL            = 18,
    U_CHAR_DIRECTION_COUNT
  };

  // Create an empty UNICHARSET
  UNICHARSET();

  ~UNICHARSET();

  // Return the UNICHAR_ID of a given unichar representation within the
  // UNICHARSET.
  const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;

  // Return the UNICHAR_ID of a given unichar representation within the
  // UNICHARSET.
// Only the first length characters from unichar_repr are used.
  const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
                                 int length) const;

  // Return the minimum number of bytes that matches a legal UNICHAR_ID,
  // while leaving the rest of the string encodable. Returns 0 if the
  // beginning of the string is not encodable.
  // WARNING: this function now encodes the whole string for precision.
  // Use encode_string in preference to repeatedly calling step.
  int step(const char* str) const;
  // As step except constraining the search to unichar-ids that are
  // self-normalized. Unlike step, does not encode the whole string, therefore
  // should be used on short strings (like those obtained from
  // get_normed_unichar.)
  int normed_step(const char* str) const;

  // Return whether the given UTF-8 string is encodable with this UNICHARSET.
  // If not encodable, write the first byte offset which cannot be converted
  // into the second (return) argument, first_bad_position.
  bool encodable_string(const char *str, int *first_bad_position) const;

  // Encodes the given UTF-8 string with this UNICHARSET.
  // Any part of the string that cannot be encoded (because the utf8 can't
  // be broken up into pieces that are in the unicharset) then:
  // if give_up_on_failure, stops and returns a partial encoding,
  // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
  // Returns true if the encoding succeeds completely, false if there is at
  // least one failure.
  // If lengths is not NULL, then it is filled with the corresponding
  // byte length of each encoded UNICHAR_ID.
  // If encoded_length is not NULL then on return it contains the length of
  // str that was encoded. (if give_up_on_failure the location of the first
  // failure, otherwise strlen(str).)
00215 bool encode_string(const char* str, bool give_up_on_failure, 00216 GenericVector<UNICHAR_ID>* encoding, 00217 GenericVector<char>* lengths, 00218 int* encoded_length) const; 00219 00220 // Return the unichar representation corresponding to the given UNICHAR_ID 00221 // within the UNICHARSET. 00222 const char* const id_to_unichar(UNICHAR_ID id) const; 00223 00224 // Return the UTF8 representation corresponding to the given UNICHAR_ID after 00225 // resolving any private encodings internal to Tesseract. This method is 00226 // preferrable to id_to_unichar for outputting text that will be visible to 00227 // external applications. 00228 const char* const id_to_unichar_ext(UNICHAR_ID id) const; 00229 00230 // Return a STRING that reformats the utf8 str into the str followed 00231 // by its hex unicodes. 00232 static STRING debug_utf8_str(const char* str); 00233 00234 // Return a STRING containing debug information on the unichar, including 00235 // the id_to_unichar, its hex unicodes and the properties. 00236 STRING debug_str(UNICHAR_ID id) const; 00237 STRING debug_str(const char * unichar_repr) const { 00238 return debug_str(unichar_to_id(unichar_repr)); 00239 } 00240 00241 // Add a unichar representation to the set. 00242 void unichar_insert(const char* const unichar_repr); 00243 00244 // Return true if the given unichar id exists within the set. 00245 // Relies on the fact that unichar ids are contiguous in the unicharset. 00246 bool contains_unichar_id(UNICHAR_ID unichar_id) const { 00247 return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used && 00248 unichar_id >= 0; 00249 } 00250 00251 // Return true if the given unichar representation exists within the set. 00252 bool contains_unichar(const char* const unichar_repr) const; 00253 bool contains_unichar(const char* const unichar_repr, int length) const; 00254 00255 // Return true if the given unichar representation corresponds to the given 00256 // UNICHAR_ID within the set. 
00257 bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const; 00258 00259 // Delete CHAR_FRAGMENTs stored in properties of unichars array. 00260 void delete_pointers_in_unichars() { 00261 for (int i = 0; i < size_used; ++i) { 00262 if (unichars[i].properties.fragment != NULL) { 00263 delete unichars[i].properties.fragment; 00264 unichars[i].properties.fragment = NULL; 00265 } 00266 } 00267 } 00268 00269 // Clear the UNICHARSET (all the previous data is lost). 00270 void clear() { 00271 if (script_table != NULL) { 00272 for (int i = 0; i < script_table_size_used; ++i) 00273 delete[] script_table[i]; 00274 delete[] script_table; 00275 script_table = NULL; 00276 script_table_size_used = 0; 00277 } 00278 if (unichars != NULL) { 00279 delete_pointers_in_unichars(); 00280 delete[] unichars; 00281 unichars = NULL; 00282 } 00283 script_table_size_reserved = 0; 00284 size_reserved = 0; 00285 size_used = 0; 00286 ids.clear(); 00287 top_bottom_set_ = false; 00288 script_has_upper_lower_ = false; 00289 script_has_xheight_ = false; 00290 null_sid_ = 0; 00291 common_sid_ = 0; 00292 latin_sid_ = 0; 00293 cyrillic_sid_ = 0; 00294 greek_sid_ = 0; 00295 han_sid_ = 0; 00296 hiragana_sid_ = 0; 00297 katakana_sid_ = 0; 00298 } 00299 00300 // Return the size of the set (the number of different UNICHAR it holds). 00301 int size() const { 00302 return size_used; 00303 } 00304 00305 // Reserve enough memory space for the given number of UNICHARS 00306 void reserve(int unichars_number); 00307 00308 // Opens the file indicated by filename and saves unicharset to that file. 00309 // Returns true if the operation is successful. 00310 bool save_to_file(const char * const filename) const { 00311 FILE* file = fopen(filename, "w+b"); 00312 if (file == NULL) return false; 00313 bool result = save_to_file(file); 00314 fclose(file); 00315 return result; 00316 } 00317 00318 // Saves the content of the UNICHARSET to the given file. 00319 // Returns true if the operation is successful. 
00320 bool save_to_file(FILE *file) const; 00321 00322 // Load a unicharset from a unicharset file that has been loaded into 00323 // the given memory buffer. 00324 // Returns true if the operation is successful. 00325 bool load_from_inmemory_file(const char* const memory, int mem_size, 00326 bool skip_fragments); 00327 // Returns true if the operation is successful. 00328 bool load_from_inmemory_file(const char* const memory, int mem_size) { 00329 return load_from_inmemory_file(memory, mem_size, false); 00330 } 00331 00332 // Opens the file indicated by filename and loads the UNICHARSET 00333 // from the given file. The previous data is lost. 00334 // Returns true if the operation is successful. 00335 bool load_from_file(const char* const filename, bool skip_fragments) { 00336 FILE* file = fopen(filename, "rb"); 00337 if (file == NULL) return false; 00338 bool result = load_from_file(file, skip_fragments); 00339 fclose(file); 00340 return result; 00341 } 00342 // returns true if the operation is successful. 00343 bool load_from_file(const char* const filename) { 00344 return load_from_file(filename, false); 00345 } 00346 00347 // Loads the UNICHARSET from the given file. The previous data is lost. 00348 // Returns true if the operation is successful. 00349 bool load_from_file(FILE *file, bool skip_fragments); 00350 bool load_from_file(FILE *file) { return load_from_file(file, false); } 00351 00352 // Sets up internal data after loading the file, based on the char 00353 // properties. Called from load_from_file, but also needs to be run 00354 // during set_unicharset_properties. 00355 void post_load_setup(); 00356 00357 // Returns true if right_to_left scripts are significant in the unicharset, 00358 // but without being so sensitive that "universal" unicharsets containing 00359 // characters from many scripts, like orientation and script detection, 00360 // look like they are right_to_left. 
// -- UNICHARSET (continued): white/blacklist and boolean property setters --
  bool major_right_to_left() const;

  // Set a whitelist and/or blacklist of characters to recognize.
  // An empty or NULL whitelist enables everything (minus any blacklist).
  // An empty or NULL blacklist disables nothing.
  // The blacklist overrides the whitelist.
  // Each list is a string of utf8 character strings. Boundaries between
  // unicharset units are worked out automatically, and characters not in
  // the unicharset are silently ignored.
  void set_black_and_whitelist(const char* blacklist, const char* whitelist);

  // The setters below index the unichars array directly and, unlike the
  // corresponding getters, perform no check on unichar_id.

  // Set the isalpha property of the given unichar to the given value.
  void set_isalpha(UNICHAR_ID unichar_id, bool value) {
    unichars[unichar_id].properties.isalpha = value;
  }

  // Set the islower property of the given unichar to the given value.
  void set_islower(UNICHAR_ID unichar_id, bool value) {
    unichars[unichar_id].properties.islower = value;
  }

  // Set the isupper property of the given unichar to the given value.
  void set_isupper(UNICHAR_ID unichar_id, bool value) {
    unichars[unichar_id].properties.isupper = value;
  }

  // Set the isdigit property of the given unichar to the given value.
  void set_isdigit(UNICHAR_ID unichar_id, bool value) {
    unichars[unichar_id].properties.isdigit = value;
  }

  // Set the ispunctuation property of the given unichar to the given value.
  void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
    unichars[unichar_id].properties.ispunctuation = value;
  }

  // Set the isngram property of the given unichar to the given value.
  void set_isngram(UNICHAR_ID unichar_id, bool value) {
    unichars[unichar_id].properties.isngram = value;
  }

  // Set the script name of the given unichar to the given value.
00403 // Value is copied and thus can be a temporary; 00404 void set_script(UNICHAR_ID unichar_id, const char* value) { 00405 unichars[unichar_id].properties.script_id = add_script(value); 00406 } 00407 00408 // Set other_case unichar id in the properties for the given unichar id. 00409 void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) { 00410 unichars[unichar_id].properties.other_case = other_case; 00411 } 00412 00413 // Set the direction property of the given unichar to the given value. 00414 void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) { 00415 unichars[unichar_id].properties.direction = value; 00416 } 00417 00418 // Set mirror unichar id in the properties for the given unichar id. 00419 void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) { 00420 unichars[unichar_id].properties.mirror = mirror; 00421 } 00422 00423 // Record normalized version of unichar with the given unichar_id. 00424 void set_normed(UNICHAR_ID unichar_id, const char* normed) { 00425 unichars[unichar_id].properties.normed = normed; 00426 unichars[unichar_id].properties.normed_ids.truncate(0); 00427 } 00428 // Sets the normed_ids vector from the normed string. normed_ids is not 00429 // stored in the file, and needs to be set when the UNICHARSET is loaded. 00430 void set_normed_ids(UNICHAR_ID unichar_id); 00431 00432 // Return the isalpha property of the given unichar. 00433 bool get_isalpha(UNICHAR_ID unichar_id) const { 00434 if (INVALID_UNICHAR_ID == unichar_id) return false; 00435 ASSERT_HOST(contains_unichar_id(unichar_id)); 00436 return unichars[unichar_id].properties.isalpha; 00437 } 00438 00439 // Return the islower property of the given unichar. 00440 bool get_islower(UNICHAR_ID unichar_id) const { 00441 if (INVALID_UNICHAR_ID == unichar_id) return false; 00442 ASSERT_HOST(contains_unichar_id(unichar_id)); 00443 return unichars[unichar_id].properties.islower; 00444 } 00445 00446 // Return the isupper property of the given unichar. 
00447 bool get_isupper(UNICHAR_ID unichar_id) const { 00448 if (INVALID_UNICHAR_ID == unichar_id) return false; 00449 ASSERT_HOST(contains_unichar_id(unichar_id)); 00450 return unichars[unichar_id].properties.isupper; 00451 } 00452 00453 // Return the isdigit property of the given unichar. 00454 bool get_isdigit(UNICHAR_ID unichar_id) const { 00455 if (INVALID_UNICHAR_ID == unichar_id) return false; 00456 ASSERT_HOST(contains_unichar_id(unichar_id)); 00457 return unichars[unichar_id].properties.isdigit; 00458 } 00459 00460 // Return the ispunctuation property of the given unichar. 00461 bool get_ispunctuation(UNICHAR_ID unichar_id) const { 00462 if (INVALID_UNICHAR_ID == unichar_id) return false; 00463 ASSERT_HOST(contains_unichar_id(unichar_id)); 00464 return unichars[unichar_id].properties.ispunctuation; 00465 } 00466 00467 // Return the isngram property of the given unichar. 00468 bool get_isngram(UNICHAR_ID unichar_id) const { 00469 if (INVALID_UNICHAR_ID == unichar_id) return false; 00470 ASSERT_HOST(contains_unichar_id(unichar_id)); 00471 return unichars[unichar_id].properties.isngram; 00472 } 00473 00474 // Returns whether the unichar id represents a unicode value in the private 00475 // use area. 00476 bool get_isprivate(UNICHAR_ID unichar_id) const; 00477 00478 // Returns true if the ids have useful min/max top/bottom values. 00479 bool top_bottom_useful() const { 00480 return top_bottom_set_; 00481 } 00482 // Sets all ranges to empty, so they can be expanded to set the values. 00483 void set_ranges_empty(); 00484 // Sets all the properties for this unicharset given a src_unicharset with 00485 // everything set. The unicharsets don't have to be the same, and graphemes 00486 // are correctly accounted for. 00487 void SetPropertiesFromOther(const UNICHARSET& src) { 00488 PartialSetPropertiesFromOther(0, src); 00489 } 00490 // Sets properties from Other, starting only at the given index. 
// -- UNICHARSET (continued): copying and merging from another UNICHARSET --
  void PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src);
  // Expands the tops and bottoms and widths for this unicharset given a
  // src_unicharset with ranges in it. The unicharsets don't have to be the
  // same, and graphemes are correctly accounted for.
  void ExpandRangesFromOther(const UNICHARSET& src);
  // Makes this a copy of src. Clears this completely first, so the automatic
  // ids will not be present in this if not in src.
  void CopyFrom(const UNICHARSET& src);
  // For each id in src, if it does not occur in this, add it, as in
  // SetPropertiesFromOther, otherwise expand the ranges, as in
  // ExpandRangesFromOther.
  void AppendOtherUnicharset(const UNICHARSET& src);
  // Returns true if the acceptable ranges of the tops of the characters do
  // not overlap, making their x-height calculations distinct.
  bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
  // Returns the min and max bottom and top of the given unichar in
  // baseline-normalized coordinates, ie, where the baseline is
  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
  // (See normalis.h for the definitions).
00510 void get_top_bottom(UNICHAR_ID unichar_id, 00511 int* min_bottom, int* max_bottom, 00512 int* min_top, int* max_top) const { 00513 if (INVALID_UNICHAR_ID == unichar_id) { 00514 *min_bottom = *min_top = 0; 00515 *max_bottom = *max_top = 256; // kBlnCellHeight 00516 return; 00517 } 00518 ASSERT_HOST(contains_unichar_id(unichar_id)); 00519 *min_bottom = unichars[unichar_id].properties.min_bottom; 00520 *max_bottom = unichars[unichar_id].properties.max_bottom; 00521 *min_top = unichars[unichar_id].properties.min_top; 00522 *max_top = unichars[unichar_id].properties.max_top; 00523 } 00524 void set_top_bottom(UNICHAR_ID unichar_id, 00525 int min_bottom, int max_bottom, 00526 int min_top, int max_top) { 00527 unichars[unichar_id].properties.min_bottom = 00528 static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8)); 00529 unichars[unichar_id].properties.max_bottom = 00530 static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8)); 00531 unichars[unichar_id].properties.min_top = 00532 static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8)); 00533 unichars[unichar_id].properties.max_top = 00534 static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8)); 00535 } 00536 // Returns the width range of the given unichar in baseline-normalized 00537 // coordinates, ie, where the baseline is kBlnBaselineOffset and the 00538 // meanline is kBlnBaselineOffset + kBlnXHeight. 00539 // (See normalis.h for the definitions). 
00540 void get_width_range(UNICHAR_ID unichar_id, 00541 int* min_width, int* max_width) const { 00542 if (INVALID_UNICHAR_ID == unichar_id) { 00543 *min_width = 0; 00544 *max_width = 256; // kBlnCellHeight; 00545 return; 00546 } 00547 ASSERT_HOST(contains_unichar_id(unichar_id)); 00548 *min_width = unichars[unichar_id].properties.min_width; 00549 *max_width = unichars[unichar_id].properties.max_width; 00550 } 00551 void set_width_range(UNICHAR_ID unichar_id, int min_width, int max_width) { 00552 unichars[unichar_id].properties.min_width = 00553 static_cast<inT16>(ClipToRange(min_width, 0, MAX_INT16)); 00554 unichars[unichar_id].properties.max_width = 00555 static_cast<inT16>(ClipToRange(max_width, 0, MAX_INT16)); 00556 } 00557 // Returns the range of the x-bearing of the given unichar in 00558 // baseline-normalized coordinates, ie, where the baseline is 00559 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight. 00560 // (See normalis.h for the definitions). 00561 void get_bearing_range(UNICHAR_ID unichar_id, 00562 int* min_bearing, int* max_bearing) const { 00563 if (INVALID_UNICHAR_ID == unichar_id) { 00564 *min_bearing = *max_bearing = 0; 00565 return; 00566 } 00567 ASSERT_HOST(contains_unichar_id(unichar_id)); 00568 *min_bearing = unichars[unichar_id].properties.min_bearing; 00569 *max_bearing = unichars[unichar_id].properties.max_bearing; 00570 } 00571 void set_bearing_range(UNICHAR_ID unichar_id, 00572 int min_bearing, int max_bearing) { 00573 unichars[unichar_id].properties.min_bearing = 00574 static_cast<inT16>(ClipToRange(min_bearing, 0, MAX_INT16)); 00575 unichars[unichar_id].properties.max_bearing = 00576 static_cast<inT16>(ClipToRange(max_bearing, 0, MAX_INT16)); 00577 } 00578 // Returns the range of the x-advance of the given unichar in 00579 // baseline-normalized coordinates, ie, where the baseline is 00580 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight. 00581 // (See normalis.h for the definitions). 
00582 void get_advance_range(UNICHAR_ID unichar_id, 00583 int* min_advance, int* max_advance) const { 00584 if (INVALID_UNICHAR_ID == unichar_id) { 00585 *min_advance = *max_advance = 0; 00586 return; 00587 } 00588 ASSERT_HOST(contains_unichar_id(unichar_id)); 00589 *min_advance = unichars[unichar_id].properties.min_advance; 00590 *max_advance = unichars[unichar_id].properties.max_advance; 00591 } 00592 void set_advance_range(UNICHAR_ID unichar_id, 00593 int min_advance, int max_advance) { 00594 unichars[unichar_id].properties.min_advance = 00595 static_cast<inT16>(ClipToRange(min_advance, 0, MAX_INT16)); 00596 unichars[unichar_id].properties.max_advance = 00597 static_cast<inT16>(ClipToRange(max_advance, 0, MAX_INT16)); 00598 } 00599 00600 // Return the script name of the given unichar. 00601 // The returned pointer will always be the same for the same script, it's 00602 // managed by unicharset and thus MUST NOT be deleted 00603 int get_script(UNICHAR_ID unichar_id) const { 00604 if (INVALID_UNICHAR_ID == unichar_id) return null_sid_; 00605 ASSERT_HOST(contains_unichar_id(unichar_id)); 00606 return unichars[unichar_id].properties.script_id; 00607 } 00608 00609 // Return the character properties, eg. alpha/upper/lower/digit/punct, 00610 // as a bit field of unsigned int. 00611 unsigned int get_properties(UNICHAR_ID unichar_id) const; 00612 00613 // Return the character property as a single char. If a character has 00614 // multiple attributes, the main property is defined by the following order: 00615 // upper_case : 'A' 00616 // lower_case : 'a' 00617 // alpha : 'x' 00618 // digit : '0' 00619 // punctuation: 'p' 00620 char get_chartype(UNICHAR_ID unichar_id) const; 00621 00622 // Get other_case unichar id in the properties for the given unichar id. 
00623 UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const { 00624 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; 00625 ASSERT_HOST(contains_unichar_id(unichar_id)); 00626 return unichars[unichar_id].properties.other_case; 00627 } 00628 00629 // Returns the direction property of the given unichar. 00630 Direction get_direction(UNICHAR_ID unichar_id) const { 00631 if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL; 00632 ASSERT_HOST(contains_unichar_id(unichar_id)); 00633 return unichars[unichar_id].properties.direction; 00634 } 00635 00636 // Get mirror unichar id in the properties for the given unichar id. 00637 UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const { 00638 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; 00639 ASSERT_HOST(contains_unichar_id(unichar_id)); 00640 return unichars[unichar_id].properties.mirror; 00641 } 00642 00643 // Returns UNICHAR_ID of the corresponding lower-case unichar. 00644 UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const { 00645 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; 00646 ASSERT_HOST(contains_unichar_id(unichar_id)); 00647 if (unichars[unichar_id].properties.islower) return unichar_id; 00648 return unichars[unichar_id].properties.other_case; 00649 } 00650 00651 // Returns UNICHAR_ID of the corresponding upper-case unichar. 00652 UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const { 00653 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; 00654 ASSERT_HOST(contains_unichar_id(unichar_id)); 00655 if (unichars[unichar_id].properties.isupper) return unichar_id; 00656 return unichars[unichar_id].properties.other_case; 00657 } 00658 00659 // Returns true if this UNICHARSET has the special codes in 00660 // SpecialUnicharCodes available. If false then there are normal unichars 00661 // at these codes and they should not be used. 
00662 bool has_special_codes() const { 00663 return get_fragment(UNICHAR_BROKEN) != NULL && 00664 strcmp(id_to_unichar(UNICHAR_BROKEN), 00665 kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0; 00666 } 00667 00668 // Return a pointer to the CHAR_FRAGMENT class if the given 00669 // unichar id represents a character fragment. 00670 const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const { 00671 if (INVALID_UNICHAR_ID == unichar_id) return NULL; 00672 ASSERT_HOST(contains_unichar_id(unichar_id)); 00673 return unichars[unichar_id].properties.fragment; 00674 } 00675 00676 // Return the isalpha property of the given unichar representation. 00677 bool get_isalpha(const char* const unichar_repr) const { 00678 return get_isalpha(unichar_to_id(unichar_repr)); 00679 } 00680 00681 // Return the islower property of the given unichar representation. 00682 bool get_islower(const char* const unichar_repr) const { 00683 return get_islower(unichar_to_id(unichar_repr)); 00684 } 00685 00686 // Return the isupper property of the given unichar representation. 00687 bool get_isupper(const char* const unichar_repr) const { 00688 return get_isupper(unichar_to_id(unichar_repr)); 00689 } 00690 00691 // Return the isdigit property of the given unichar representation. 00692 bool get_isdigit(const char* const unichar_repr) const { 00693 return get_isdigit(unichar_to_id(unichar_repr)); 00694 } 00695 00696 // Return the ispunctuation property of the given unichar representation. 00697 bool get_ispunctuation(const char* const unichar_repr) const { 00698 return get_ispunctuation(unichar_to_id(unichar_repr)); 00699 } 00700 00701 // Return the character properties, eg. 
alpha/upper/lower/digit/punct, 00702 // of the given unichar representation 00703 unsigned int get_properties(const char* const unichar_repr) const { 00704 return get_properties(unichar_to_id(unichar_repr)); 00705 } 00706 00707 char get_chartype(const char* const unichar_repr) const { 00708 return get_chartype(unichar_to_id(unichar_repr)); 00709 } 00710 00711 // Return the script name of the given unichar representation. 00712 // The returned pointer will always be the same for the same script, it's 00713 // managed by unicharset and thus MUST NOT be deleted 00714 int get_script(const char* const unichar_repr) const { 00715 return get_script(unichar_to_id(unichar_repr)); 00716 } 00717 00718 // Return a pointer to the CHAR_FRAGMENT class struct if the given 00719 // unichar representation represents a character fragment. 00720 const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const { 00721 if (unichar_repr == NULL || unichar_repr[0] == '\0' || 00722 !ids.contains(unichar_repr)) { 00723 return NULL; 00724 } 00725 return get_fragment(unichar_to_id(unichar_repr)); 00726 } 00727 00728 // Return the isalpha property of the given unichar representation. 00729 // Only the first length characters from unichar_repr are used. 00730 bool get_isalpha(const char* const unichar_repr, 00731 int length) const { 00732 return get_isalpha(unichar_to_id(unichar_repr, length)); 00733 } 00734 00735 // Return the islower property of the given unichar representation. 00736 // Only the first length characters from unichar_repr are used. 00737 bool get_islower(const char* const unichar_repr, 00738 int length) const { 00739 return get_islower(unichar_to_id(unichar_repr, length)); 00740 } 00741 00742 // Return the isupper property of the given unichar representation. 00743 // Only the first length characters from unichar_repr are used. 
00744 bool get_isupper(const char* const unichar_repr, 00745 int length) const { 00746 return get_isupper(unichar_to_id(unichar_repr, length)); 00747 } 00748 00749 // Return the isdigit property of the given unichar representation. 00750 // Only the first length characters from unichar_repr are used. 00751 bool get_isdigit(const char* const unichar_repr, 00752 int length) const { 00753 return get_isdigit(unichar_to_id(unichar_repr, length)); 00754 } 00755 00756 // Return the ispunctuation property of the given unichar representation. 00757 // Only the first length characters from unichar_repr are used. 00758 bool get_ispunctuation(const char* const unichar_repr, 00759 int length) const { 00760 return get_ispunctuation(unichar_to_id(unichar_repr, length)); 00761 } 00762 00763 // Returns normalized version of unichar with the given unichar_id. 00764 const char *get_normed_unichar(UNICHAR_ID unichar_id) const { 00765 return unichars[unichar_id].properties.normed.string(); 00766 } 00767 // Returns a vector of UNICHAR_IDs that represent the ids of the normalized 00768 // version of the given id. There may be more than one UNICHAR_ID in the 00769 // vector if unichar_id represents a ligature. 00770 const GenericVector<UNICHAR_ID>& normed_ids(UNICHAR_ID unichar_id) const { 00771 return unichars[unichar_id].properties.normed_ids; 00772 } 00773 00774 // Return the script name of the given unichar representation. 00775 // Only the first length characters from unichar_repr are used. 
00776 // The returned pointer will always be the same for the same script, it's 00777 // managed by unicharset and thus MUST NOT be deleted 00778 int get_script(const char* const unichar_repr, 00779 int length) const { 00780 return get_script(unichar_to_id(unichar_repr, length)); 00781 } 00782 00783 // Return the (current) number of scripts in the script table 00784 int get_script_table_size() const { 00785 return script_table_size_used; 00786 } 00787 00788 // Return the script string from its id 00789 const char* get_script_from_script_id(int id) const { 00790 if (id >= script_table_size_used || id < 0) 00791 return null_script; 00792 return script_table[id]; 00793 } 00794 00795 // Returns the id from the name of the script, or 0 if script is not found. 00796 // Note that this is an expensive operation since it involves iteratively 00797 // comparing strings in the script table. To avoid dependency on STL, we 00798 // won't use a hash. Instead, the calling function can use this to lookup 00799 // and save the ID for relevant scripts for fast comparisons later. 00800 int get_script_id_from_name(const char* script_name) const; 00801 00802 // Return true if the given script is the null script 00803 bool is_null_script(const char* script) const { 00804 return script == null_script; 00805 } 00806 00807 // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0, 00808 // then the returned pointer will be the same. 00809 // The script parameter is copied and thus can be a temporary. 00810 int add_script(const char* script); 00811 00812 // Return the enabled property of the given unichar. 
00813 bool get_enabled(UNICHAR_ID unichar_id) const { 00814 return unichars[unichar_id].properties.enabled; 00815 } 00816 00817 00818 int null_sid() const { return null_sid_; } 00819 int common_sid() const { return common_sid_; } 00820 int latin_sid() const { return latin_sid_; } 00821 int cyrillic_sid() const { return cyrillic_sid_; } 00822 int greek_sid() const { return greek_sid_; } 00823 int han_sid() const { return han_sid_; } 00824 int hiragana_sid() const { return hiragana_sid_; } 00825 int katakana_sid() const { return katakana_sid_; } 00826 int default_sid() const { return default_sid_; } 00827 00828 // Returns true if the unicharset has the concept of upper/lower case. 00829 bool script_has_upper_lower() const { 00830 return script_has_upper_lower_; 00831 } 00832 // Returns true if the unicharset has the concept of x-height. 00833 // script_has_xheight can be true even if script_has_upper_lower is not, 00834 // when the script has a sufficiently predominant top line with ascenders, 00835 // such as Devanagari and Thai. 00836 bool script_has_xheight() const { 00837 return script_has_xheight_; 00838 } 00839 00840 private: 00841 00842 struct UNICHAR_PROPERTIES { 00843 UNICHAR_PROPERTIES(); 00844 // Initializes all properties to sensible default values. 00845 void Init(); 00846 // Sets all ranges wide open. Initialization default in case there are 00847 // no useful values available. 00848 void SetRangesOpen(); 00849 // Sets all ranges to empty. Used before expanding with font-based data. 00850 void SetRangesEmpty(); 00851 // Returns true if any of the top/bottom/width/bearing/advance ranges is 00852 // emtpy. 00853 bool AnyRangeEmpty() const; 00854 // Expands the ranges with the ranges from the src properties. 00855 void ExpandRangesFrom(const UNICHAR_PROPERTIES& src); 00856 // Copies the properties from src into this. 
00857 void CopyFrom(const UNICHAR_PROPERTIES& src); 00858 00859 bool isalpha; 00860 bool islower; 00861 bool isupper; 00862 bool isdigit; 00863 bool ispunctuation; 00864 bool isngram; 00865 bool enabled; 00866 // Possible limits of the top and bottom of the bounding box in 00867 // baseline-normalized coordinates, ie, where the baseline is 00868 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight 00869 // (See normalis.h for the definitions). 00870 uinT8 min_bottom; 00871 uinT8 max_bottom; 00872 uinT8 min_top; 00873 uinT8 max_top; 00874 // Limits on the widths of bounding box, also in baseline-normalized coords. 00875 inT16 min_width; 00876 inT16 max_width; 00877 // Limits on the x-bearing and advance, also in baseline-normalized coords. 00878 inT16 min_bearing; 00879 inT16 max_bearing; 00880 inT16 min_advance; 00881 inT16 max_advance; 00882 int script_id; 00883 UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar 00884 Direction direction; // direction of this unichar 00885 // Mirror property is useful for reverse DAWG lookup for words in 00886 // right-to-left languages (e.g. "(word)" would be in 00887 // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string. 00888 // However, what we want in our DAWG is 00889 // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not 00890 // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'. 00891 UNICHAR_ID mirror; 00892 // A string of unichar_ids that represent the corresponding normed string. 00893 // For awkward characters like em-dash, this gives hyphen. 00894 // For ligatures, this gives the string of normal unichars. 00895 GenericVector<UNICHAR_ID> normed_ids; 00896 STRING normed; // normalized version of this unichar 00897 // Contains meta information about the fragment if a unichar represents 00898 // a fragment of a character, otherwise should be set to NULL. 
00899 // It is assumed that character fragments are added to the unicharset 00900 // after the corresponding 'base' characters. 00901 CHAR_FRAGMENT *fragment; 00902 }; 00903 00904 struct UNICHAR_SLOT { 00905 char representation[UNICHAR_LEN + 1]; 00906 UNICHAR_PROPERTIES properties; 00907 }; 00908 00909 // Internal recursive version of encode_string above. 00910 // str is the start of the whole string. 00911 // str_index is the current position in str. 00912 // str_length is the length of str. 00913 // encoding is a working encoding of str. 00914 // lengths is a working set of lengths of each element of encoding. 00915 // best_total_length is the longest length of str that has been successfully 00916 // encoded so far. 00917 // On return: 00918 // best_encoding contains the encoding that used the longest part of str. 00919 // best_lengths (may be null) contains the lengths of best_encoding. 00920 void encode_string(const char* str, int str_index, int str_length, 00921 GenericVector<UNICHAR_ID>* encoding, 00922 GenericVector<char>* lengths, 00923 int* best_total_length, 00924 GenericVector<UNICHAR_ID>* best_encoding, 00925 GenericVector<char>* best_lengths) const; 00926 00927 // Gets the properties for a grapheme string, combining properties for 00928 // multiple characters in a meaningful way where possible. 00929 // Returns false if no valid match was found in the unicharset. 00930 // NOTE that script_id, mirror, and other_case refer to this unicharset on 00931 // return and will need redirecting if the target unicharset is different. 00932 bool GetStrProperties(const char* utf8_str, 00933 UNICHAR_PROPERTIES* props) const; 00934 00935 // Load ourselves from a "file" where our only interface to the file is 00936 // an implementation of fgets(). This is the parsing primitive accessed by 00937 // the public routines load_from_file() and load_from_inmemory_file(). 
00938 bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb, 00939 bool skip_fragments); 00940 00941 UNICHAR_SLOT* unichars; 00942 UNICHARMAP ids; 00943 int size_used; 00944 int size_reserved; 00945 char** script_table; 00946 int script_table_size_used; 00947 int script_table_size_reserved; 00948 const char* null_script; 00949 // True if the unichars have their tops/bottoms set. 00950 bool top_bottom_set_; 00951 // True if the unicharset has significant upper/lower case chars. 00952 bool script_has_upper_lower_; 00953 // True if the unicharset has a significant mean-line with significant 00954 // ascenders above that. 00955 bool script_has_xheight_; 00956 00957 // A few convenient script name-to-id mapping without using hash. 00958 // These are initialized when unicharset file is loaded. Anything 00959 // missing from this list can be looked up using get_script_id_from_name. 00960 int null_sid_; 00961 int common_sid_; 00962 int latin_sid_; 00963 int cyrillic_sid_; 00964 int greek_sid_; 00965 int han_sid_; 00966 int hiragana_sid_; 00967 int katakana_sid_; 00968 // The most frequently occurring script in the charset. 00969 int default_sid_; 00970 }; 00971 00972 #endif // TESSERACT_CCUTIL_UNICHARSET_H__