tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccutil/unicharset.cpp
Go to the documentation of this file.
00001 
00002 // File:        unicharset.cpp
00003 // Description: Unicode character/ligature set class.
00004 // Author:      Thomas Kielbus
00005 // Created:     Wed Jun 28 17:05:01 PDT 2006
00006 //
00007 // (C) Copyright 2006, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #include <assert.h>
00021 #include <stdio.h>
00022 #include <string.h>
00023 
00024 #include "tesscallback.h"
00025 #include "tprintf.h"
00026 #include "unichar.h"
00027 #include "unicharset.h"
00028 #include "params.h"
00029 
// Character that delimits the fields of a character-fragment representation.
static const char kSeparator = '|';
// Flag character used when representing 'natural' character fragments.
static const char kNaturalFlag = 'n';

// Bit positions of the character-class flags packed by get_properties().
static const int ISALPHA_MASK = 1 << 0;
static const int ISLOWER_MASK = 1 << 1;
static const int ISUPPER_MASK = 1 << 2;
static const int ISDIGIT_MASK = 1 << 3;
static const int ISPUNCTUATION_MASK = 1 << 4;

// Y coordinate that separates cap-height tops from x-height tops.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
static const int kMeanlineThreshold = 220;
// With C = number of alpha chars whose tops all exceed kMeanlineThreshold and
// X = number of alpha chars whose tops are all below it, the unicharset
// "has x-height" if X > C * kMinXHeightFraction and
// C > X * kMinCapHeightFraction, or if more than half of the alpha characters
// have upper or lower case.
const double kMinXHeightFraction = 0.25;
const double kMinCapHeightFraction = 0.05;
00053 
00054 /*static */
00055 const char* UNICHARSET::kCustomLigatures[][2] = {
00056   {"ct", "\uE003"},  // c + t -> U+E003
00057   {"ſh", "\uE006"},  // long-s + h -> U+E006
00058   {"ſi", "\uE007"},  // long-s + i -> U+E007
00059   {"ſl", "\uE008"},  // long-s + l -> U+E008
00060   {"ſſ", "\uE009"},  // long-s + long-s -> U+E009
00061   {NULL, NULL}
00062 };
00063 
00064 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
00065 const char* UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = {
00066     " ",
00067     "Joined",
00068     "|Broken|0|1"
00069 };
00070 
00071 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
00072   Init();
00073 }
00074 
00075 // Initialize all properties to sensible default values.
00076 void UNICHARSET::UNICHAR_PROPERTIES::Init() {
00077   isalpha = false;
00078   islower = false;
00079   isupper = false;
00080   isdigit = false;
00081   ispunctuation = false;
00082   isngram = false;
00083   enabled = false;
00084   SetRangesOpen();
00085   script_id = 0;
00086   other_case = 0;
00087   mirror = 0;
00088   normed = "";
00089   direction = UNICHARSET::U_LEFT_TO_RIGHT;
00090   fragment = NULL;
00091 }
00092 
00093 // Sets all ranges wide open. Initialization default in case there are
00094 // no useful values available.
00095 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
00096   min_bottom = 0;
00097   max_bottom = MAX_UINT8;
00098   min_top = 0;
00099   max_top = MAX_UINT8;
00100   min_width = 0;
00101   max_width = MAX_INT16;
00102   min_bearing = 0;
00103   max_bearing = MAX_INT16;
00104   min_advance = 0;
00105   max_advance = MAX_INT16;
00106 }
00107 
00108 // Sets all ranges to empty. Used before expanding with font-based data.
00109 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
00110   min_bottom = MAX_UINT8;
00111   max_bottom = 0;
00112   min_top = MAX_UINT8;
00113   max_top = 0;
00114   min_width = MAX_INT16;
00115   max_width = 0;
00116   min_bearing = MAX_INT16;
00117   max_bearing = 0;
00118   min_advance = MAX_INT16;
00119   max_advance = 0;
00120 }
00121 
00122 // Returns true if any of the top/bottom/width/bearing/advance ranges is
00123 // emtpy.
00124 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
00125   return min_bottom > max_bottom || min_top > max_top ||
00126       min_width > max_width || min_bearing > max_bearing ||
00127       min_advance > max_advance;
00128 }
00129 
00130 // Expands the ranges with the ranges from the src properties.
00131 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
00132     const UNICHAR_PROPERTIES& src) {
00133   UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
00134   UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
00135   UpdateRange(src.min_top, &min_top, &max_top);
00136   UpdateRange(src.max_top, &min_top, &max_top);
00137   UpdateRange(src.min_width, &min_width, &max_width);
00138   UpdateRange(src.max_width, &min_width, &max_width);
00139   UpdateRange(src.min_bearing, &min_bearing, &max_bearing);
00140   UpdateRange(src.max_bearing, &min_bearing, &max_bearing);
00141   UpdateRange(src.min_advance, &min_advance, &max_advance);
00142   UpdateRange(src.max_advance, &min_advance, &max_advance);
00143 }
00144 
00145 // Copies the properties from src into this.
00146 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) {
00147   // Apart from the fragment, everything else can be done with a default copy.
00148   CHAR_FRAGMENT* saved_fragment = fragment;
00149   *this = src;  // Bitwise copy.
00150   fragment = saved_fragment;
00151 }
00152 
00153 UNICHARSET::UNICHARSET() :
00154     unichars(NULL),
00155     ids(),
00156     size_used(0),
00157     size_reserved(0),
00158     script_table(NULL),
00159     script_table_size_used(0),
00160     null_script("NULL") {
00161   clear();
00162   for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
00163     unichar_insert(kSpecialUnicharCodes[i]);
00164     if (i == UNICHAR_JOINED)
00165       set_isngram(i, true);
00166   }
00167 }
00168 
00169 UNICHARSET::~UNICHARSET() {
00170   clear();
00171 }
00172 
00173 void UNICHARSET::reserve(int unichars_number) {
00174   if (unichars_number > size_reserved) {
00175     UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
00176     for (int i = 0; i < size_used; ++i)
00177       unichars_new[i] = unichars[i];
00178     for (int j = size_used; j < unichars_number; ++j) {
00179       unichars_new[j].properties.script_id = add_script(null_script);
00180     }
00181     delete[] unichars;
00182     unichars = unichars_new;
00183     size_reserved = unichars_number;
00184   }
00185 }
00186 
00187 const UNICHAR_ID
00188 UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
00189   return ids.contains(unichar_repr) ?
00190     ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
00191 }
00192 
00193 const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
00194                                            int length) const {
00195   assert(length > 0 && length <= UNICHAR_LEN);
00196   return ids.contains(unichar_repr, length) ?
00197     ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
00198 }
00199 
00200 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
00201 // while leaving the rest of the string encodable. Returns 0 if the
00202 // beginning of the string is not encodable.
00203 // WARNING: this function now encodes the whole string for precision.
00204 // Use encode_string in preference to repeatedly calling step.
00205 int UNICHARSET::step(const char* str) const {
00206   GenericVector<UNICHAR_ID> encoding;
00207   GenericVector<char> lengths;
00208   encode_string(str, true, &encoding, &lengths, NULL);
00209   if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
00210   return lengths[0];
00211 }
00212 // As step except constraining the search to unichar-ids that are
00213 // self-normalized. Unlike step, does not encode the whole string, therefore
00214 // should be used on short strings (like those obtained from
00215 // get_normed_unichar.)
00216 int UNICHARSET::normed_step(const char* str) const {
00217   // Find the length of the first matching unicharset member.
00218   int length = ids.minmatch(str);
00219   if (length == 0)
00220     return 0;  // Empty string or illegal char.
00221 
00222   while (length <= UNICHAR_LEN) {
00223     if (ids.contains(str, length)) {
00224       int matched_id = unichar_to_id(str, length);
00225       const GenericVector<UNICHAR_ID>& matched_norms = normed_ids(matched_id);
00226       bool good_start = matched_norms.size() == 1 &&
00227                         matched_norms[0] == matched_id;
00228       if (str[length] == '\0') {
00229         return good_start ? length : 0;
00230       }
00231       if (normed_step(str + length) > 0)
00232         return length;  // This length works!
00233     } else if (str[length] == '\0') {
00234       return 0;  // Ran out of string.
00235     }
00236     ++length;
00237   }
00238   return 0;
00239 }
00240 
00241 // Return whether the given UTF-8 string is encodable with this UNICHARSET.
00242 // If not encodable, write the first byte offset which cannot be converted
00243 // into the second (return) argument.
00244 bool UNICHARSET::encodable_string(const char *str,
00245                                   int *first_bad_position) const {
00246   GenericVector<UNICHAR_ID> encoding;
00247   return encode_string(str, true, &encoding, NULL, first_bad_position);
00248 }
00249 
00250 // Encodes the given UTF-8 string with this UNICHARSET.
00251 // Returns true if the encoding succeeds completely, false if there is at
00252 // least one INVALID_UNICHAR_ID in the returned encoding, but in this case
00253 // the rest of the string is still encoded.
00254 // If lengths is not NULL, then it is filled with the corresponding
00255 // byte length of each encoded UNICHAR_ID.
00256 bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
00257                                GenericVector<UNICHAR_ID>* encoding,
00258                                GenericVector<char>* lengths,
00259                                int* encoded_length) const {
00260   GenericVector<UNICHAR_ID> working_encoding;
00261   GenericVector<char> working_lengths;
00262   GenericVector<char> best_lengths;
00263   encoding->truncate(0);  // Just in case str is empty.
00264   int str_length = strlen(str);
00265   int str_pos = 0;
00266   bool perfect = true;
00267   while (str_pos < str_length) {
00268     encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
00269                   &str_pos, encoding, &best_lengths);
00270     if (str_pos < str_length) {
00271       // This is a non-match. Skip one utf-8 character.
00272       perfect = false;
00273       if (give_up_on_failure) break;
00274       int step = UNICHAR::utf8_step(str + str_pos);
00275       if (step == 0) step = 1;
00276       encoding->push_back(INVALID_UNICHAR_ID);
00277       best_lengths.push_back(step);
00278       str_pos += step;
00279       working_encoding = *encoding;
00280       working_lengths = best_lengths;
00281     }
00282   }
00283   if (lengths != NULL) *lengths = best_lengths;
00284   if (encoded_length != NULL) *encoded_length = str_pos;
00285   return perfect;
00286 }
00287 
00288 const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
00289   if (id == INVALID_UNICHAR_ID) {
00290     return INVALID_UNICHAR;
00291   }
00292   ASSERT_HOST(id < this->size());
00293   return unichars[id].representation;
00294 }
00295 
00296 const char* const UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const {
00297   if (id == INVALID_UNICHAR_ID) {
00298     return INVALID_UNICHAR;
00299   }
00300   ASSERT_HOST(id < this->size());
00301   // Resolve from the kCustomLigatures table if this is a private encoding.
00302   if (get_isprivate(id)) {
00303     const char* ch = id_to_unichar(id);
00304     for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) {
00305       if (!strcmp(ch, kCustomLigatures[i][1])) {
00306         return kCustomLigatures[i][0];
00307       }
00308     }
00309   }
00310   // Otherwise return the stored representation.
00311   return unichars[id].representation;
00312 }
00313 
00314 // Return a STRING that reformats the utf8 str into the str followed
00315 // by its hex unicodes.
00316 STRING UNICHARSET::debug_utf8_str(const char* str) {
00317   STRING result = str;
00318   result += " [";
00319   int step = 1;
00320   // Chop into unicodes and code each as hex.
00321   for (int i = 0; str[i] != '\0'; i += step) {
00322     char hex[sizeof(int) * 2 + 1];
00323     step = UNICHAR::utf8_step(str + i);
00324     if (step == 0) {
00325       step = 1;
00326       sprintf(hex, "%x", str[i]);
00327     } else {
00328       UNICHAR ch(str + i, step);
00329       sprintf(hex, "%x", ch.first_uni());
00330     }
00331     result += hex;
00332     result += " ";
00333   }
00334   result += "]";
00335   return result;
00336 }
00337 
00338 // Return a STRING containing debug information on the unichar, including
00339 // the id_to_unichar, its hex unicodes and the properties.
00340 STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
00341   if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
00342   const CHAR_FRAGMENT *fragment = this->get_fragment(id);
00343   if (fragment) {
00344     return fragment->to_string();
00345   }
00346   const char* str = id_to_unichar(id);
00347   STRING result = debug_utf8_str(str);
00348   // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
00349   if (get_isalpha(id)) {
00350     if (get_islower(id))
00351       result += "a";
00352     else if (get_isupper(id))
00353       result += "A";
00354     else
00355       result += "x";
00356   }
00357   // Append 0 if a digit.
00358   if (get_isdigit(id)) {
00359     result += "0";
00360   }
00361   // Append p is a punctuation symbol.
00362   if (get_ispunctuation(id)) {
00363     result += "p";
00364   }
00365   return result;
00366 }
00367 
00368 // Sets the normed_ids vector from the normed string. normed_ids is not
00369 // stored in the file, and needs to be set when the UNICHARSET is loaded.
00370 void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
00371   unichars[unichar_id].properties.normed_ids.truncate(0);
00372   int length = unichars[unichar_id].properties.normed.length();
00373   const char* normed_str = unichars[unichar_id].properties.normed.string();
00374   int step = 0;
00375   for (int offset = 0; offset < length; offset+= step) {
00376     step = normed_step(normed_str + offset);
00377     if (step == 0) {
00378       unichars[unichar_id].properties.normed_ids.truncate(0);
00379       unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
00380       break;
00381     }
00382     int normed_id = unichar_to_id(normed_str + offset, step);
00383     ASSERT_HOST(normed_id >= 0);
00384     unichars[unichar_id].properties.normed_ids.push_back(normed_id);
00385   }
00386 }
00387 
00388 // Returns whether the unichar id represents a unicode value in the private use
00389 // area. We use this range only internally to represent uncommon ligatures
00390 // (eg. 'ct') that do not have regular unicode values.
00391 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
00392   UNICHAR uc(id_to_unichar(unichar_id), -1);
00393   int uni = uc.first_uni();
00394   return (uni >= 0xE000 && uni <= 0xF8FF);
00395 }
00396 
00397 
00398 // Sets all ranges to empty, so they can be expanded to set the values.
00399 void UNICHARSET::set_ranges_empty() {
00400   for (int id = 0; id < size_used; ++id) {
00401     unichars[id].properties.SetRangesEmpty();
00402   }
00403 }
00404 
00405 // Sets all the properties for this unicharset given a src unicharset with
00406 // everything set. The unicharsets don't have to be the same, and graphemes
00407 // are correctly accounted for.
00408 void UNICHARSET::PartialSetPropertiesFromOther(int start_index,
00409                                                const UNICHARSET& src) {
00410   for (int ch = start_index; ch < size_used; ++ch) {
00411     const char* utf8 = id_to_unichar(ch);
00412     UNICHAR_PROPERTIES properties;
00413     if (src.GetStrProperties(utf8, &properties)) {
00414       // Setup the script_id, other_case, and mirror properly.
00415       const char* script = src.get_script_from_script_id(properties.script_id);
00416       properties.script_id = add_script(script);
00417       const char* other_case = src.id_to_unichar(properties.other_case);
00418       if (contains_unichar(other_case)) {
00419         properties.other_case = unichar_to_id(other_case);
00420       } else {
00421         properties.other_case = ch;
00422       }
00423       const char* mirror_str = src.id_to_unichar(properties.mirror);
00424       if (contains_unichar(mirror_str)) {
00425         properties.mirror = unichar_to_id(mirror_str);
00426       } else {
00427         properties.mirror = ch;
00428       }
00429       unichars[ch].properties.CopyFrom(properties);
00430       set_normed_ids(ch);
00431     } else {
00432       tprintf("Failed to get properties for index %d = %s\n", ch, utf8);
00433     }
00434   }
00435 }
00436 
00437 // Expands the tops and bottoms and widths for this unicharset given a
00438 // src unicharset with ranges in it. The unicharsets don't have to be the
00439 // same, and graphemes are correctly accounted for.
00440 void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) {
00441   for (int ch = 0; ch < size_used; ++ch) {
00442     const char* utf8 = id_to_unichar(ch);
00443     UNICHAR_PROPERTIES properties;
00444     if (src.GetStrProperties(utf8, &properties)) {
00445       // Expand just the ranges from properties.
00446       unichars[ch].properties.ExpandRangesFrom(properties);
00447     }
00448   }
00449 }
00450 
00451 // Makes this a copy of src. Clears this completely first, so the automattic
00452 // ids will not be present in this if not in src.
00453 void UNICHARSET::CopyFrom(const UNICHARSET& src) {
00454   clear();
00455   AppendOtherUnicharset(src);
00456 }
00457 
00458 // For each id in src, if it does not occur in this, add it, as in
00459 // SetPropertiesFromOther, otherwise expand the ranges, as in
00460 // ExpandRangesFromOther.
00461 void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) {
00462   int initial_used = size_used;
00463   for (int ch = 0; ch < src.size_used; ++ch) {
00464     const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
00465     const char* utf8 = src.id_to_unichar(ch);
00466     if (strcmp(utf8, " ") != 0 && src_props.AnyRangeEmpty()) {
00467       // Only use fully valid entries.
00468       tprintf("Bad properties for index %d, char %s: "
00469               "%d,%d %d,%d %d,%d %d,%d %d,%d\n",
00470               ch, utf8, src_props.min_bottom, src_props.max_bottom,
00471               src_props.min_top, src_props.max_top,
00472               src_props.min_width, src_props.max_width,
00473               src_props.min_bearing, src_props.max_bearing,
00474               src_props.min_advance, src_props.max_advance);
00475       continue;
00476     }
00477     int id = size_used;
00478     if (contains_unichar(utf8)) {
00479       id = unichar_to_id(utf8);
00480       // Just expand current ranges.
00481       unichars[id].properties.ExpandRangesFrom(src_props);
00482     } else {
00483       unichar_insert(utf8);
00484       unichars[id].properties.SetRangesEmpty();
00485     }
00486   }
00487   // Set properties, including mirror and other_case, WITHOUT reordering
00488   // the unicharset.
00489   PartialSetPropertiesFromOther(initial_used, src);
00490 }
00491 
00492 // Returns true if the acceptable ranges of the tops of the characters do
00493 // not overlap, making their x-height calculations distinct.
00494 bool UNICHARSET::SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const {
00495   int overlap = MIN(unichars[id1].properties.max_top,
00496                     unichars[id2].properties.max_top) -
00497                 MAX(unichars[id1].properties.min_top,
00498                     unichars[id2].properties.min_top);
00499   return overlap <= 0;
00500 }
00501 
00502 // Internal recursive version of encode_string above.
00503 // Seeks to encode the given string as a sequence of UNICHAR_IDs such that
00504 // each UNICHAR_ID uses the least possible part of the utf8 str.
00505 // It does this by depth-first tail recursion on increasing length matches
00506 // to the UNICHARSET, saving the first encountered result that encodes the
00507 // maximum total length of str. It stops on a failure to encode to make
00508 // the overall process of encoding a partially failed string more efficient.
00509 // See unicharset.h for definition of the args.
00510 void UNICHARSET::encode_string(const char* str, int str_index, int str_length,
00511                                GenericVector<UNICHAR_ID>* encoding,
00512                                GenericVector<char>* lengths,
00513                                int* best_total_length,
00514                                GenericVector<UNICHAR_ID>* best_encoding,
00515                                GenericVector<char>* best_lengths) const {
00516   if (str_index > *best_total_length) {
00517     // This is the best result so far.
00518     *best_total_length = str_index;
00519     *best_encoding = *encoding;
00520     if (best_lengths != NULL)
00521       *best_lengths = *lengths;
00522   }
00523   if (str_index == str_length) return;
00524   int encoding_index = encoding->size();
00525   // Find the length of the first matching unicharset member.
00526   int length = ids.minmatch(str + str_index);
00527   if (length == 0 || str_index + length > str_length) return;
00528   do {
00529     if (ids.contains(str + str_index, length)) {
00530       // Successful encoding so far.
00531       UNICHAR_ID id = ids.unichar_to_id(str + str_index, length);
00532       encoding->push_back(id);
00533       lengths->push_back(length);
00534       encode_string(str, str_index + length, str_length, encoding, lengths,
00535                     best_total_length, best_encoding, best_lengths);
00536       if (*best_total_length == str_length)
00537         return;  // Tail recursion success!
00538       // Failed with that length, truncate back and try again.
00539       encoding->truncate(encoding_index);
00540       lengths->truncate(encoding_index);
00541     }
00542     int step = UNICHAR::utf8_step(str + str_index + length);
00543     if (step == 0) step = 1;
00544     length += step;
00545   } while (length <= UNICHAR_LEN && str_index + length <= str_length);
00546 }
00547 
00548 // Gets the properties for a grapheme string, combining properties for
00549 // multiple characters in a meaningful way where possible.
00550 // Returns false if no valid match was found in the unicharset.
00551 // NOTE that script_id, mirror, and other_case refer to this unicharset on
00552 // return and will need translation if the target unicharset is different.
00553 bool UNICHARSET::GetStrProperties(const char* utf8_str,
00554                                   UNICHAR_PROPERTIES* props) const {
00555   props->Init();
00556   props->SetRangesEmpty();
00557   props->min_advance = 0;
00558   props->max_advance = 0;
00559   int total_unicodes = 0;
00560   GenericVector<UNICHAR_ID> encoding;
00561   if (!encode_string(utf8_str, true, &encoding, NULL, NULL))
00562     return false;  // Some part was invalid.
00563   for (int i = 0; i < encoding.size(); ++i) {
00564     int id = encoding[i];
00565     const UNICHAR_PROPERTIES& src_props = unichars[id].properties;
00566     // Logical OR all the bools.
00567     if (src_props.isalpha) props->isalpha = true;
00568     if (src_props.islower) props->islower = true;
00569     if (src_props.isupper) props->isupper = true;
00570     if (src_props.isdigit) props->isdigit = true;
00571     if (src_props.ispunctuation) props->ispunctuation = true;
00572     if (src_props.isngram) props->isngram = true;
00573     if (src_props.enabled) props->enabled = true;
00574     // Min/max the tops/bottoms.
00575     UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
00576     UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
00577     UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
00578     UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
00579     int bearing = ClipToRange(props->min_advance + src_props.min_bearing,
00580                               -MAX_INT16, MAX_INT16);
00581     if (total_unicodes == 0 || bearing < props->min_bearing)
00582       props->min_bearing = bearing;
00583     bearing = ClipToRange(props->max_advance + src_props.max_bearing,
00584                           -MAX_INT16, MAX_INT16);
00585     if (total_unicodes == 0 || bearing < props->max_bearing)
00586       props->max_bearing = bearing;
00587     props->min_advance = ClipToRange(props->min_advance + src_props.min_advance,
00588                                      -MAX_INT16, MAX_INT16);
00589     props->max_advance = ClipToRange(props->max_advance + src_props.max_advance,
00590                                      -MAX_INT16, MAX_INT16);
00591     // With a single width, just use the widths stored in the unicharset.
00592     props->min_width = src_props.min_width;
00593     props->max_width = src_props.max_width;
00594     // Use the first script id, other_case, mirror, direction.
00595     // Note that these will need translation, except direction.
00596     if (total_unicodes == 0) {
00597       props->script_id = src_props.script_id;
00598       props->other_case = src_props.other_case;
00599       props->mirror = src_props.mirror;
00600       props->direction = src_props.direction;
00601     }
00602     // The normed string for the compound character is the concatenation of
00603     // the normed versions of the individual characters.
00604     props->normed += src_props.normed;
00605     ++total_unicodes;
00606   }
00607   if (total_unicodes > 1) {
00608     // Estimate the total widths from the advance - bearing.
00609     props->min_width = ClipToRange(props->min_advance - props->max_bearing,
00610                                    -MAX_INT16, MAX_INT16);
00611     props->max_width = ClipToRange(props->max_advance - props->min_bearing,
00612                                    -MAX_INT16, MAX_INT16);
00613   }
00614   return total_unicodes > 0;
00615 }
00616 
00617 // TODO(rays) clean-up the order of functions to match unicharset.h.
00618 
00619 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
00620   unsigned int properties = 0;
00621   if (this->get_isalpha(id))
00622     properties |= ISALPHA_MASK;
00623   if (this->get_islower(id))
00624     properties |= ISLOWER_MASK;
00625   if (this->get_isupper(id))
00626     properties |= ISUPPER_MASK;
00627   if (this->get_isdigit(id))
00628     properties |= ISDIGIT_MASK;
00629   if (this->get_ispunctuation(id))
00630     properties |= ISPUNCTUATION_MASK;
00631   return properties;
00632 }
00633 
00634 char UNICHARSET::get_chartype(UNICHAR_ID id) const {
00635   if (this->get_isupper(id)) return 'A';
00636   if (this->get_islower(id)) return 'a';
00637   if (this->get_isalpha(id)) return 'x';
00638   if (this->get_isdigit(id)) return '0';
00639   if (this->get_ispunctuation(id)) return 'p';
00640   return 0;
00641 }
00642 
00643 void UNICHARSET::unichar_insert(const char* const unichar_repr) {
00644   if (!ids.contains(unichar_repr)) {
00645     if (strlen(unichar_repr) > UNICHAR_LEN) {
00646       fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
00647               int(strlen(unichar_repr)), unichar_repr);
00648       return;
00649     }
00650     if (size_used == size_reserved) {
00651       if (size_used == 0)
00652         reserve(8);
00653       else
00654         reserve(2 * size_used);
00655     }
00656 
00657     strcpy(unichars[size_used].representation, unichar_repr);
00658     this->set_script(size_used, null_script);
00659     // If the given unichar_repr represents a fragmented character, set
00660     // fragment property to a pointer to CHAR_FRAGMENT class instance with
00661     // information parsed from the unichar representation. Use the script
00662     // of the base unichar for the fragmented character if possible.
00663     CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
00664     this->unichars[size_used].properties.fragment = frag;
00665     if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
00666       this->unichars[size_used].properties.script_id =
00667         this->get_script(frag->get_unichar());
00668     }
00669     this->unichars[size_used].properties.enabled = true;
00670     ids.insert(unichar_repr, size_used);
00671     ++size_used;
00672   }
00673 }
00674 
00675 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
00676   return ids.contains(unichar_repr);
00677 }
00678 
00679 bool UNICHARSET::contains_unichar(const char* const unichar_repr,
00680                                   int length) const {
00681   if (length == 0) {
00682     return false;
00683   }
00684   return ids.contains(unichar_repr, length);
00685 }
00686 
00687 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
00688                     const char* const unichar_repr) const {
00689   return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
00690 }
00691 
00692 bool UNICHARSET::save_to_file(FILE *file) const {
00693   fprintf(file, "%d\n", this->size());
00694   for (UNICHAR_ID id = 0; id < this->size(); ++id) {
00695     int min_bottom, max_bottom, min_top, max_top;
00696     get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
00697     int min_width, max_width;
00698     get_width_range(id, &min_width, &max_width);
00699     int min_bearing, max_bearing;
00700     get_bearing_range(id, &min_bearing, &max_bearing);
00701     int min_advance, max_advance;
00702     get_advance_range(id, &min_advance, &max_advance);
00703     unsigned int properties = this->get_properties(id);
00704     if (strcmp(this->id_to_unichar(id), " ") == 0) {
00705       fprintf(file, "%s %x %s %d\n", "NULL", properties,
00706               this->get_script_from_script_id(this->get_script(id)),
00707               this->get_other_case(id));
00708     } else {
00709       fprintf(file,
00710               "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n",
00711               this->id_to_unichar(id), properties,
00712               min_bottom, max_bottom, min_top, max_top, min_width, max_width,
00713               min_bearing, max_bearing, min_advance, max_advance,
00714               this->get_script_from_script_id(this->get_script(id)),
00715               this->get_other_case(id), this->get_direction(id),
00716               this->get_mirror(id), this->get_normed_unichar(id),
00717               this->debug_str(id).string());
00718     }
00719   }
00720   return true;
00721 }
00722 
// Reads lines from a block of memory through an fgets-like interface.
// The memory is not owned and is never written to.
class InMemoryFilePointer {
 public:
  // memory points at mem_size readable bytes; reading starts at the first.
  InMemoryFilePointer(const char *memory, int mem_size)
      : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { }

  // Copies bytes into orig_dst until a '\n' has been copied, size - 1 bytes
  // have been copied, or the memory is exhausted, then NUL-terminates.
  // Returns orig_dst if at least one byte was copied, NULL otherwise (so a
  // size of 1 always yields NULL). For size < 1 nothing is written and the
  // result is orig_dst while unread data remains, NULL otherwise.
  char *fgets(char *orig_dst, int size) {
    const char *src_end = memory_ + mem_size_;
    if (size < 1) {
      return fgets_ptr_ < src_end ? orig_dst : NULL;
    }
    // Computed only after the size check: for size < 1 this would form a
    // pointer before the start of the buffer, which is undefined behaviour.
    char *dst_end = orig_dst + size - 1;

    char *dst = orig_dst;
    char ch = '^';  // Any value other than '\n'.
    while (fgets_ptr_ < src_end && dst < dst_end && ch != '\n') {
      ch = *dst++ = *fgets_ptr_++;
    }
    *dst = 0;
    return (dst == orig_dst) ? NULL : orig_dst;
  }

 private:
  const char *memory_;     // Start of the memory block.
  const char *fgets_ptr_;  // Next byte to read.
  const int mem_size_;     // Number of bytes in the block.
};
00749 
00750 bool UNICHARSET::load_from_inmemory_file(
00751     const char *memory, int mem_size, bool skip_fragments) {
00752   InMemoryFilePointer mem_fp(memory, mem_size);
00753   TessResultCallback2<char *, char *, int> *fgets_cb =
00754       NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets);
00755   bool success = load_via_fgets(fgets_cb, skip_fragments);
00756   delete fgets_cb;
00757   return success;
00758 }
00759 
// Thin adapter that exposes ::fgets on a stdio stream as a member function,
// so it can be bound into a callback. The stream is borrowed: it is neither
// opened nor closed here.
class LocalFilePointer {
 public:
  // explicit: a FILE* must never silently convert into a LocalFilePointer.
  explicit LocalFilePointer(FILE *stream) : fp_(stream) {}

  // Forwards to ::fgets(dst, size, stream) and returns its result.
  char *fgets(char *dst, int size) {
    return ::fgets(dst, size, fp_);
  }

 private:
  FILE *fp_;  // Stream to read from; not owned.
};
00769 
00770 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
00771   LocalFilePointer lfp(file);
00772   TessResultCallback2<char *, char *, int> *fgets_cb =
00773       NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets);
00774   bool success = load_via_fgets(fgets_cb, skip_fragments);
00775   delete fgets_cb;
00776   return success;
00777 }
00778 
00779 bool UNICHARSET::load_via_fgets(
00780     TessResultCallback2<char *, char *, int> *fgets_cb,
00781     bool skip_fragments) {
00782   int unicharset_size;
00783   char buffer[256];
00784 
00785   this->clear();
00786   if (fgets_cb->Run(buffer, sizeof(buffer)) == NULL ||
00787       sscanf(buffer, "%d", &unicharset_size) != 1) {
00788     return false;
00789   }
00790   this->reserve(unicharset_size);
00791   for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
00792     char unichar[256];
00793     unsigned int properties;
00794     char script[64];
00795 
00796     strcpy(script, null_script);
00797     int min_bottom = 0;
00798     int max_bottom = MAX_UINT8;
00799     int min_top = 0;
00800     int max_top = MAX_UINT8;
00801     int min_width = 0;
00802     int max_width = MAX_INT16;
00803     int min_bearing = 0;
00804     int max_bearing = MAX_INT16;
00805     int min_advance = 0;
00806     int max_advance = MAX_INT16;
00807     // TODO(eger): check that this default it ok
00808     // after enabling BiDi iterator for Arabic+Cube.
00809     int direction = UNICHARSET::U_LEFT_TO_RIGHT;
00810     UNICHAR_ID other_case = id;
00811     UNICHAR_ID mirror = id;
00812     char normed[64];
00813     int v = -1;
00814     if (fgets_cb->Run(buffer, sizeof (buffer)) == NULL ||
00815         ((v = sscanf(buffer,
00816                      "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d %63s",
00817                      unichar, &properties,
00818                      &min_bottom, &max_bottom, &min_top, &max_top,
00819                      &min_width, &max_width, &min_bearing, &max_bearing,
00820                      &min_advance, &max_advance, script, &other_case,
00821                      &direction, &mirror, normed)) != 17 &&
00822          (v = sscanf(buffer,
00823                      "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d",
00824                      unichar, &properties,
00825                      &min_bottom, &max_bottom, &min_top, &max_top,
00826                      &min_width, &max_width, &min_bearing, &max_bearing,
00827                      &min_advance, &max_advance,
00828                      script, &other_case, &direction, &mirror)) != 16 &&
00829           (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d",
00830                       unichar, &properties,
00831                       &min_bottom, &max_bottom, &min_top, &max_top,
00832                       script, &other_case, &direction, &mirror)) != 10 &&
00833           (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
00834                       &min_bottom, &max_bottom, &min_top, &max_top,
00835                       script, &other_case)) != 8 &&
00836           (v = sscanf(buffer, "%s %x %63s %d", unichar, &properties,
00837                       script, &other_case)) != 4 &&
00838           (v = sscanf(buffer, "%s %x %63s",
00839                       unichar, &properties, script)) != 3 &&
00840           (v = sscanf(buffer, "%s %x", unichar, &properties)) != 2)) {
00841       return false;
00842     }
00843 
00844     // Skip fragments if needed.
00845     CHAR_FRAGMENT *frag = NULL;
00846     if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
00847       delete frag;
00848       continue;
00849     }
00850     // Insert unichar into unicharset and set its properties.
00851     if (strcmp(unichar, "NULL") == 0)
00852       this->unichar_insert(" ");
00853     else
00854       this->unichar_insert(unichar);
00855 
00856     this->set_isalpha(id, properties & ISALPHA_MASK);
00857     this->set_islower(id, properties & ISLOWER_MASK);
00858     this->set_isupper(id, properties & ISUPPER_MASK);
00859     this->set_isdigit(id, properties & ISDIGIT_MASK);
00860     this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
00861     this->set_isngram(id, false);
00862     this->set_script(id, script);
00863     this->unichars[id].properties.enabled = true;
00864     this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
00865     this->set_width_range(id, min_width, max_width);
00866     this->set_bearing_range(id, min_bearing, max_bearing);
00867     this->set_advance_range(id, min_advance, max_advance);
00868     this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
00869     ASSERT_HOST(other_case < unicharset_size);
00870     this->set_other_case(id, (v>3) ? other_case : id);
00871     ASSERT_HOST(mirror < unicharset_size);
00872     this->set_mirror(id, (v>8) ? mirror : id);
00873     this->set_normed(id, (v>16) ? normed : unichar);
00874   }
00875   post_load_setup();
00876   return true;
00877 }
00878 
00879 // Sets up internal data after loading the file, based on the char
00880 // properties. Called from load_from_file, but also needs to be run
00881 // during set_unicharset_properties.
00882 void UNICHARSET::post_load_setup() {
00883   // Number of alpha chars with the case property minus those without,
00884   // in order to determine that half the alpha chars have case.
00885   int net_case_alphas = 0;
00886   int x_height_alphas = 0;
00887   int cap_height_alphas = 0;
00888   top_bottom_set_ = false;
00889   for (UNICHAR_ID id = 0; id < size_used; ++id) {
00890     int min_bottom = 0;
00891     int max_bottom = MAX_UINT8;
00892     int min_top = 0;
00893     int max_top = MAX_UINT8;
00894     get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
00895     if (min_top > 0)
00896       top_bottom_set_ = true;
00897     if (get_isalpha(id)) {
00898       if (get_islower(id) || get_isupper(id))
00899         ++net_case_alphas;
00900       else
00901         --net_case_alphas;
00902       if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
00903         ++x_height_alphas;
00904       else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
00905         ++cap_height_alphas;
00906     }
00907     set_normed_ids(id);
00908   }
00909 
00910   script_has_upper_lower_ = net_case_alphas > 0;
00911   script_has_xheight_ = script_has_upper_lower_ ||
00912       (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
00913        cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
00914 
00915   null_sid_ = get_script_id_from_name(null_script);
00916   ASSERT_HOST(null_sid_ == 0);
00917   common_sid_ = get_script_id_from_name("Common");
00918   latin_sid_ = get_script_id_from_name("Latin");
00919   cyrillic_sid_ = get_script_id_from_name("Cyrillic");
00920   greek_sid_ = get_script_id_from_name("Greek");
00921   han_sid_ = get_script_id_from_name("Han");
00922   hiragana_sid_ = get_script_id_from_name("Hiragana");
00923   katakana_sid_ = get_script_id_from_name("Katakana");
00924 
00925   // Compute default script. Use the highest-counting alpha script, that is
00926   // not the common script, as that still contains some "alphas".
00927   int* script_counts = new int[script_table_size_used];
00928   memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
00929   for (int id = 0; id < size_used; ++id) {
00930     if (get_isalpha(id)) {
00931       ++script_counts[get_script(id)];
00932     }
00933   }
00934   default_sid_ = 0;
00935   for (int s = 1; s < script_table_size_used; ++s) {
00936     if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
00937       default_sid_ = s;
00938   }
00939   delete [] script_counts;
00940 }
00941 
00942 // Returns true if right_to_left scripts are significant in the unicharset,
00943 // but without being so sensitive that "universal" unicharsets containing
00944 // characters from many scripts, like orientation and script detection,
00945 // look like they are right_to_left.
00946 bool UNICHARSET::major_right_to_left() const {
00947   int ltr_count = 0;
00948   int rtl_count = 0;
00949   for (int id = 0; id < size_used; ++id) {
00950     int dir = get_direction(id);
00951     if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
00952     if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
00953         dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
00954         dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
00955   }
00956   return rtl_count > ltr_count;
00957 }
00958 
00959 // Set a whitelist and/or blacklist of characters to recognize.
00960 // An empty or NULL whitelist enables everything (minus any blacklist).
00961 // An empty or NULL blacklist disables nothing.
00962 void UNICHARSET::set_black_and_whitelist(const char* blacklist,
00963                                          const char* whitelist) {
00964   bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
00965   // Set everything to default
00966   for (int ch = 0; ch < size_used; ++ch)
00967     unichars[ch].properties.enabled = def_enabled;
00968   if (!def_enabled) {
00969     // Enable the whitelist.
00970     GenericVector<UNICHAR_ID> encoding;
00971     encode_string(whitelist, false, &encoding, NULL, NULL);
00972     for (int i = 0; i < encoding.size(); ++i) {
00973       if (encoding[i] != INVALID_UNICHAR_ID)
00974         unichars[encoding[i]].properties.enabled = true;
00975     }
00976   }
00977   if (blacklist != NULL && blacklist[0] != '\0') {
00978     // Disable the blacklist.
00979     GenericVector<UNICHAR_ID> encoding;
00980     encode_string(blacklist, false, &encoding, NULL, NULL);
00981     for (int i = 0; i < encoding.size(); ++i) {
00982       if (encoding[i] != INVALID_UNICHAR_ID)
00983         unichars[encoding[i]].properties.enabled = false;
00984     }
00985   }
00986 }
00987 
00988 int UNICHARSET::add_script(const char* script) {
00989   for (int i = 0; i < script_table_size_used; ++i) {
00990     if (strcmp(script, script_table[i]) == 0)
00991       return i;
00992   }
00993   if (script_table_size_reserved == 0) {
00994     script_table_size_reserved = 8;
00995     script_table = new char*[script_table_size_reserved];
00996   }
00997   if (script_table_size_used + 1 >= script_table_size_reserved) {
00998     char** new_script_table = new char*[script_table_size_reserved * 2];
00999     memcpy(new_script_table, script_table, script_table_size_reserved * sizeof(char*));
01000     delete[] script_table;
01001     script_table = new_script_table;
01002       script_table_size_reserved = 2 * script_table_size_reserved;
01003   }
01004   script_table[script_table_size_used] = new char[strlen(script) + 1];
01005   strcpy(script_table[script_table_size_used], script);
01006   return script_table_size_used++;
01007 }
01008 
01009 // Returns the string that represents a fragment
01010 // with the given unichar, pos and total.
01011 STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
01012                                 bool natural) {
01013   if (total == 1) return STRING(unichar);
01014   STRING result = "";
01015   result += kSeparator;
01016   result += unichar;
01017   char buffer[kMaxLen];
01018   snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos,
01019            natural ? kNaturalFlag : kSeparator, total);
01020   result += buffer;
01021   return result;
01022 }
01023 
01024 CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
01025   const char *ptr = string;
01026   int len = strlen(string);
01027   if (len < kMinLen || *ptr != kSeparator) {
01028     return NULL;  // this string can not represent a fragment
01029   }
01030   ptr++;  // move to the next character
01031   int step = 0;
01032   while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
01033     step += UNICHAR::utf8_step(ptr + step);
01034   }
01035   if (step == 0 || step > UNICHAR_LEN) {
01036     return NULL;  // no character for unichar or the character is too long
01037   }
01038   char unichar[UNICHAR_LEN + 1];
01039   strncpy(unichar, ptr, step);
01040   unichar[step] = '\0';  // null terminate unichar
01041   ptr += step;  // move to the next fragment separator
01042   int pos = 0;
01043   int total = 0;
01044   bool natural = false;
01045   char *end_ptr = NULL;
01046   for (int i = 0; i < 2; i++) {
01047     if (ptr > string + len || *ptr != kSeparator) {
01048       if (i == 1 && *ptr == kNaturalFlag)
01049         natural = true;
01050       else
01051         return NULL;  // Failed to parse fragment representation.
01052     }
01053     ptr++;  // move to the next character
01054     i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
01055       : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
01056     ptr = end_ptr;
01057   }
01058   if (ptr != string + len) {
01059     return NULL;  // malformed fragment representation
01060   }
01061   CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
01062   fragment->set_all(unichar, pos, total, natural);
01063   return fragment;
01064 }
01065 
01066 int UNICHARSET::get_script_id_from_name(const char* script_name) const {
01067   for (int i = 0; i < script_table_size_used; ++i) {
01068     if (strcmp(script_name, script_table[i]) == 0)
01069       return i;
01070   }
01071   return 0;  // 0 is always the null_script
01072 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines