tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccutil/unicharset.h
Go to the documentation of this file.
00001 
00002 // File:        unicharset.h
00003 // Description: Unicode character/ligature set class.
00004 // Author:      Thomas Kielbus
00005 // Created:     Wed Jun 28 17:05:01 PDT 2006
00006 //
00007 // (C) Copyright 2006, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__
00021 #define TESSERACT_CCUTIL_UNICHARSET_H__
00022 
00023 #include "errcode.h"
00024 #include "genericvector.h"
00025 #include "helpers.h"
00026 #include "strngs.h"
00027 #include "tesscallback.h"
00028 #include "unichar.h"
00029 #include "unicharmap.h"
00030 
00031 // Enum holding special values of unichar_id. Every unicharset has these.
00032 // Warning! Keep in sync with kSpecialUnicharCodes.
enum SpecialUnicharCodes {
  UNICHAR_SPACE = 0,
  UNICHAR_JOINED = 1,
  UNICHAR_BROKEN = 2,

  // Must stay last so that it equals the number of codes listed above.
  SPECIAL_UNICHAR_CODES_COUNT = 3
};
00040 
00041 class CHAR_FRAGMENT {
00042  public:
00043   // Minimum number of characters used for fragment representation.
00044   static const int kMinLen = 6;
00045   // Maximum number of characters used for fragment representation.
00046   static const int kMaxLen = 3 + UNICHAR_LEN + 2;
00047   // Maximum number of fragments per character.
00048   static const int kMaxChunks = 5;
00049 
00050   // Setters and Getters.
00051   inline void set_all(const char *unichar, int pos, int total, bool natural) {
00052     set_unichar(unichar);
00053     set_pos(pos);
00054     set_total(total);
00055     set_natural(natural);
00056   }
00057   inline void set_unichar(const char *uch) {
00058     strncpy(this->unichar, uch, UNICHAR_LEN);
00059     this->unichar[UNICHAR_LEN] = '\0';
00060   }
00061   inline void set_pos(int p) { this->pos = p; }
00062   inline void set_total(int t) { this->total = t; }
00063   inline const char* get_unichar() const { return this->unichar; }
00064   inline int get_pos() const { return this->pos; }
00065   inline int get_total() const { return this->total; }
00066 
00067   // Returns the string that represents a fragment
00068   // with the given unichar, pos and total.
00069   static STRING to_string(const char *unichar, int pos, int total,
00070                           bool natural);
00071   // Returns the string that represents this fragment.
00072   STRING to_string() const {
00073     return to_string(unichar, pos, total, natural);
00074   }
00075 
00076   // Checks whether a fragment has the same unichar,
00077   // position and total as the given inputs.
00078   inline bool equals(const char *other_unichar,
00079                      int other_pos, int other_total) const {
00080     return (strcmp(this->unichar, other_unichar) == 0 &&
00081             this->pos == other_pos && this->total == other_total);
00082   }
00083   inline bool equals(const CHAR_FRAGMENT *other) const {
00084     return this->equals(other->get_unichar(),
00085                         other->get_pos(),
00086                         other->get_total());
00087   }
00088 
00089   // Checks whether a given fragment is a continuation of this fragment.
00090   // Assumes that the given fragment pointer is not NULL.
00091   inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
00092     return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
00093             this->total == fragment->get_total() &&
00094             this->pos == fragment->get_pos() + 1);
00095   }
00096 
00097   // Returns true if this fragment is a beginning fragment.
00098   inline bool is_beginning() const { return this->pos == 0; }
00099 
00100   // Returns true if this fragment is an ending fragment.
00101   inline bool is_ending() const { return this->pos == this->total-1; }
00102 
00103   // Returns true if the fragment was a separate component to begin with,
00104   // ie did not need chopping to be isolated, but may have been separated
00105   // out from a multi-outline blob.
00106   inline bool is_natural() const { return natural; }
00107   void set_natural(bool value) { natural = value; }
00108 
00109   // Parses the string to see whether it represents a character fragment
00110   // (rather than a regular character). If so, allocates memory for a new
00111   // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
00112   // information. Fragments are of the form:
00113   // |m|1|2, meaning chunk 1 of 2 of character m, or
00114   // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
00115   // to divide the parts, as they were already separate connected components.
00116   //
00117   // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
00118   // instance, otherwise (if the string does not represent a fragment or it
00119   // looks like it does, but parsing it as a fragment fails) returns NULL.
00120   //
00121   // Note: The caller is responsible for deallocating memory
00122   // associated with the returned pointer.
00123   static CHAR_FRAGMENT *parse_from_string(const char *str);
00124 
00125  private:
00126   char unichar[UNICHAR_LEN + 1];
00127   // True if the fragment was a separate component to begin with,
00128   // ie did not need chopping to be isolated, but may have been separated
00129   // out from a multi-outline blob.
00130   bool natural;
00131   inT16 pos;    // fragment position in the character
00132   inT16 total;  // total number of fragments in the character
00133 };
00134 
00135 // The UNICHARSET class is a utility class for Tesseract that holds the
00136 // set of characters that are used by the engine. Each character is identified
00137 // by a unique number, from 0 to (size - 1).
00138 class UNICHARSET {
00139  public:
00140   // Custom list of characters and their ligature forms (UTF8)
00141   // These map to unicode values in the private use area (PUC) and are supported
00142   // by only few font families (eg. Wyld, Adobe Caslon Pro).
00143   static const char* kCustomLigatures[][2];
00144 
00145   // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
00146   static const char* kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT];
00147 
00148   // ICU 2.0 UCharDirection enum (from third_party/icu/include/unicode/uchar.h)
00149   enum Direction {
00150       U_LEFT_TO_RIGHT               = 0,
00151       U_RIGHT_TO_LEFT               = 1,
00152       U_EUROPEAN_NUMBER             = 2,
00153       U_EUROPEAN_NUMBER_SEPARATOR   = 3,
00154       U_EUROPEAN_NUMBER_TERMINATOR  = 4,
00155       U_ARABIC_NUMBER               = 5,
00156       U_COMMON_NUMBER_SEPARATOR     = 6,
00157       U_BLOCK_SEPARATOR             = 7,
00158       U_SEGMENT_SEPARATOR           = 8,
00159       U_WHITE_SPACE_NEUTRAL         = 9,
00160       U_OTHER_NEUTRAL               = 10,
00161       U_LEFT_TO_RIGHT_EMBEDDING     = 11,
00162       U_LEFT_TO_RIGHT_OVERRIDE      = 12,
00163       U_RIGHT_TO_LEFT_ARABIC        = 13,
00164       U_RIGHT_TO_LEFT_EMBEDDING     = 14,
00165       U_RIGHT_TO_LEFT_OVERRIDE      = 15,
00166       U_POP_DIRECTIONAL_FORMAT      = 16,
00167       U_DIR_NON_SPACING_MARK        = 17,
00168       U_BOUNDARY_NEUTRAL            = 18,
00169       U_CHAR_DIRECTION_COUNT
00170   };
00171 
00172   // Create an empty UNICHARSET
00173   UNICHARSET();
00174 
00175   ~UNICHARSET();
00176 
00177   // Return the UNICHAR_ID of a given unichar representation within the
00178   // UNICHARSET.
00179   const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
00180 
00181   // Return the UNICHAR_ID of a given unichar representation within the
00182   // UNICHARSET. Only the first length characters from unichar_repr are used.
00183   const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
00184                                  int length) const;
00185 
00186   // Return the minimum number of bytes that matches a legal UNICHAR_ID,
00187   // while leaving the rest of the string encodable. Returns 0 if the
00188   // beginning of the string is not encodable.
00189   // WARNING: this function now encodes the whole string for precision.
00190   // Use encode_string in preference to repeatedly calling step.
00191   int step(const char* str) const;
00192   // As step except constraining the search to unichar-ids that are
00193   // self-normalized. Unlike step, does not encode the whole string, therefore
00194   // should be used on short strings (like those obtained from
00195   // get_normed_unichar.)
00196   int normed_step(const char* str) const;
00197 
00198   // Return whether the given UTF-8 string is encodable with this UNICHARSET.
00199   // If not encodable, write the first byte offset which cannot be converted
00200   // into the second (return) argument.
00201   bool encodable_string(const char *str, int *first_bad_position) const;
00202 
00203   // Encodes the given UTF-8 string with this UNICHARSET.
00204   // If any part of the string cannot be encoded (because the utf8 can't
00205   // be broken up into pieces that are in the unicharset), then:
00206   // if give_up_on_failure, stops and returns a partial encoding,
00207   // else continues and inserts an INVALID_UNICHAR_ID in the returned encoding.
00208   // Returns true if the encoding succeeds completely, false if there is at
00209   // least one failure.
00210   // If lengths is not NULL, then it is filled with the corresponding
00211   // byte length of each encoded UNICHAR_ID.
00212   // If encoded_length is not NULL then on return it contains the length of
00213   // str that was encoded. (if give_up_on_failure the location of the first
00214   // failure, otherwise strlen(str).)
00215   bool encode_string(const char* str, bool give_up_on_failure,
00216                      GenericVector<UNICHAR_ID>* encoding,
00217                      GenericVector<char>* lengths,
00218                      int* encoded_length) const;
00219 
00220   // Return the unichar representation corresponding to the given UNICHAR_ID
00221   // within the UNICHARSET.
00222   const char* const id_to_unichar(UNICHAR_ID id) const;
00223 
00224   // Return the UTF8 representation corresponding to the given UNICHAR_ID after
00225   // resolving any private encodings internal to Tesseract. This method is
00226   // preferable to id_to_unichar for outputting text that will be visible to
00227   // external applications.
00228   const char* const id_to_unichar_ext(UNICHAR_ID id) const;
00229 
00230   // Return a STRING that reformats the utf8 str into the str followed
00231   // by its hex unicodes.
00232   static STRING debug_utf8_str(const char* str);
00233 
00234   // Return a STRING containing debug information on the unichar, including
00235   // the id_to_unichar, its hex unicodes and the properties.
00236   STRING debug_str(UNICHAR_ID id) const;
00237   STRING debug_str(const char * unichar_repr) const {
00238     return debug_str(unichar_to_id(unichar_repr));
00239   }
00240 
00241   // Add a unichar representation to the set.
00242   void unichar_insert(const char* const unichar_repr);
00243 
00244   // Return true if the given unichar id exists within the set.
00245   // Relies on the fact that unichar ids are contiguous in the unicharset.
00246   bool contains_unichar_id(UNICHAR_ID unichar_id) const {
00247     return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
00248         unichar_id >= 0;
00249   }
00250 
00251   // Return true if the given unichar representation exists within the set.
00252   bool contains_unichar(const char* const unichar_repr) const;
00253   bool contains_unichar(const char* const unichar_repr, int length) const;
00254 
00255   // Return true if the given unichar representation corresponds to the given
00256   // UNICHAR_ID within the set.
00257   bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
00258 
00259   // Delete CHAR_FRAGMENTs stored in properties of unichars array.
00260   void delete_pointers_in_unichars() {
00261     for (int i = 0; i < size_used; ++i) {
00262       if (unichars[i].properties.fragment != NULL) {
00263         delete unichars[i].properties.fragment;
00264         unichars[i].properties.fragment = NULL;
00265       }
00266     }
00267   }
00268 
00269   // Clear the UNICHARSET (all the previous data is lost).
00270   void clear() {
00271     if (script_table != NULL) {
00272       for (int i = 0; i < script_table_size_used; ++i)
00273         delete[] script_table[i];
00274       delete[] script_table;
00275       script_table = NULL;
00276       script_table_size_used = 0;
00277     }
00278     if (unichars != NULL) {
00279       delete_pointers_in_unichars();
00280       delete[] unichars;
00281       unichars = NULL;
00282     }
00283     script_table_size_reserved = 0;
00284     size_reserved = 0;
00285     size_used = 0;
00286     ids.clear();
00287     top_bottom_set_ = false;
00288     script_has_upper_lower_ = false;
00289     script_has_xheight_ = false;
00290     null_sid_ = 0;
00291     common_sid_ = 0;
00292     latin_sid_ = 0;
00293     cyrillic_sid_ = 0;
00294     greek_sid_ = 0;
00295     han_sid_ = 0;
00296     hiragana_sid_ = 0;
00297     katakana_sid_ = 0;
00298   }
00299 
00300   // Return the size of the set (the number of different UNICHAR it holds).
00301   int size() const {
00302     return size_used;
00303   }
00304 
00305   // Reserve enough memory space for the given number of UNICHARS
00306   void reserve(int unichars_number);
00307 
00308   // Opens the file indicated by filename and saves unicharset to that file.
00309   // Returns true if the operation is successful.
00310   bool save_to_file(const char * const filename) const {
00311     FILE* file = fopen(filename, "w+b");
00312     if (file == NULL) return false;
00313     bool result = save_to_file(file);
00314     fclose(file);
00315     return result;
00316   }
00317 
00318   // Saves the content of the UNICHARSET to the given file.
00319   // Returns true if the operation is successful.
00320   bool save_to_file(FILE *file) const;
00321 
00322   // Load a unicharset from a unicharset file that has been loaded into
00323   // the given memory buffer.
00324   // Returns true if the operation is successful.
00325   bool load_from_inmemory_file(const char* const memory, int mem_size,
00326                                bool skip_fragments);
00327   // Returns true if the operation is successful.
00328   bool load_from_inmemory_file(const char* const memory, int mem_size) {
00329     return load_from_inmemory_file(memory, mem_size, false);
00330   }
00331 
00332   // Opens the file indicated by filename and loads the UNICHARSET
00333   // from the given file. The previous data is lost.
00334   // Returns true if the operation is successful.
00335   bool load_from_file(const char* const filename, bool skip_fragments) {
00336     FILE* file = fopen(filename, "rb");
00337     if (file == NULL) return false;
00338     bool result = load_from_file(file, skip_fragments);
00339     fclose(file);
00340     return result;
00341   }
00342   // returns true if the operation is successful.
00343   bool load_from_file(const char* const filename) {
00344     return load_from_file(filename, false);
00345   }
00346 
00347   // Loads the UNICHARSET from the given file. The previous data is lost.
00348   // Returns true if the operation is successful.
00349   bool load_from_file(FILE *file, bool skip_fragments);
00350   bool load_from_file(FILE *file) { return load_from_file(file, false); }
00351 
00352   // Sets up internal data after loading the file, based on the char
00353   // properties. Called from load_from_file, but also needs to be run
00354   // during set_unicharset_properties.
00355   void post_load_setup();
00356 
00357   // Returns true if right_to_left scripts are significant in the unicharset,
00358   // but without being so sensitive that "universal" unicharsets containing
00359   // characters from many scripts, like orientation and script detection,
00360   // look like they are right_to_left.
00361   bool major_right_to_left() const;
00362 
00363   // Set a whitelist and/or blacklist of characters to recognize.
00364   // An empty or NULL whitelist enables everything (minus any blacklist).
00365   // An empty or NULL blacklist disables nothing.
00366   // The blacklist overrides the whitelist.
00367   // Each list is a string of utf8 character strings. Boundaries between
00368   // unicharset units are worked out automatically, and characters not in
00369   // the unicharset are silently ignored.
00370   void set_black_and_whitelist(const char* blacklist, const char* whitelist);
00371 
00372   // Set the isalpha property of the given unichar to the given value.
00373   void set_isalpha(UNICHAR_ID unichar_id, bool value) {
00374     unichars[unichar_id].properties.isalpha = value;
00375   }
00376 
00377   // Set the islower property of the given unichar to the given value.
00378   void set_islower(UNICHAR_ID unichar_id, bool value) {
00379     unichars[unichar_id].properties.islower = value;
00380   }
00381 
00382   // Set the isupper property of the given unichar to the given value.
00383   void set_isupper(UNICHAR_ID unichar_id, bool value) {
00384     unichars[unichar_id].properties.isupper = value;
00385   }
00386 
00387   // Set the isdigit property of the given unichar to the given value.
00388   void set_isdigit(UNICHAR_ID unichar_id, bool value) {
00389     unichars[unichar_id].properties.isdigit = value;
00390   }
00391 
00392   // Set the ispunctuation property of the given unichar to the given value.
00393   void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
00394     unichars[unichar_id].properties.ispunctuation = value;
00395   }
00396 
00397   // Set the isngram property of the given unichar to the given value.
00398   void set_isngram(UNICHAR_ID unichar_id, bool value) {
00399     unichars[unichar_id].properties.isngram = value;
00400   }
00401 
00402   // Set the script name of the given unichar to the given value.
00403   // Value is copied and thus can be a temporary.
00404   void set_script(UNICHAR_ID unichar_id, const char* value) {
00405     unichars[unichar_id].properties.script_id = add_script(value);
00406   }
00407 
00408   // Set other_case unichar id in the properties for the given unichar id.
00409   void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
00410     unichars[unichar_id].properties.other_case = other_case;
00411   }
00412 
00413   // Set the direction property of the given unichar to the given value.
00414   void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) {
00415     unichars[unichar_id].properties.direction = value;
00416   }
00417 
00418   // Set mirror unichar id in the properties for the given unichar id.
00419   void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
00420     unichars[unichar_id].properties.mirror = mirror;
00421   }
00422 
00423   // Record normalized version of unichar with the given unichar_id.
00424   void set_normed(UNICHAR_ID unichar_id, const char* normed) {
00425     unichars[unichar_id].properties.normed = normed;
00426     unichars[unichar_id].properties.normed_ids.truncate(0);
00427   }
00428   // Sets the normed_ids vector from the normed string. normed_ids is not
00429   // stored in the file, and needs to be set when the UNICHARSET is loaded.
00430   void set_normed_ids(UNICHAR_ID unichar_id);
00431 
00432   // Return the isalpha property of the given unichar.
00433   bool get_isalpha(UNICHAR_ID unichar_id) const {
00434     if (INVALID_UNICHAR_ID == unichar_id) return false;
00435     ASSERT_HOST(contains_unichar_id(unichar_id));
00436     return unichars[unichar_id].properties.isalpha;
00437   }
00438 
00439   // Return the islower property of the given unichar.
00440   bool get_islower(UNICHAR_ID unichar_id) const {
00441     if (INVALID_UNICHAR_ID == unichar_id) return false;
00442     ASSERT_HOST(contains_unichar_id(unichar_id));
00443     return unichars[unichar_id].properties.islower;
00444   }
00445 
00446   // Return the isupper property of the given unichar.
00447   bool get_isupper(UNICHAR_ID unichar_id) const {
00448     if (INVALID_UNICHAR_ID == unichar_id) return false;
00449     ASSERT_HOST(contains_unichar_id(unichar_id));
00450     return unichars[unichar_id].properties.isupper;
00451   }
00452 
00453   // Return the isdigit property of the given unichar.
00454   bool get_isdigit(UNICHAR_ID unichar_id) const {
00455     if (INVALID_UNICHAR_ID == unichar_id) return false;
00456     ASSERT_HOST(contains_unichar_id(unichar_id));
00457     return unichars[unichar_id].properties.isdigit;
00458   }
00459 
00460   // Return the ispunctuation property of the given unichar.
00461   bool get_ispunctuation(UNICHAR_ID unichar_id) const {
00462     if (INVALID_UNICHAR_ID == unichar_id) return false;
00463     ASSERT_HOST(contains_unichar_id(unichar_id));
00464     return unichars[unichar_id].properties.ispunctuation;
00465   }
00466 
00467   // Return the isngram property of the given unichar.
00468   bool get_isngram(UNICHAR_ID unichar_id) const {
00469     if (INVALID_UNICHAR_ID == unichar_id) return false;
00470     ASSERT_HOST(contains_unichar_id(unichar_id));
00471     return unichars[unichar_id].properties.isngram;
00472   }
00473 
00474   // Returns whether the unichar id represents a unicode value in the private
00475   // use area.
00476   bool get_isprivate(UNICHAR_ID unichar_id) const;
00477 
00478   // Returns true if the ids have useful min/max top/bottom values.
00479   bool top_bottom_useful() const {
00480     return top_bottom_set_;
00481   }
00482   // Sets all ranges to empty, so they can be expanded to set the values.
00483   void set_ranges_empty();
00484   // Sets all the properties for this unicharset given a src_unicharset with
00485   // everything set. The unicharsets don't have to be the same, and graphemes
00486   // are correctly accounted for.
00487   void SetPropertiesFromOther(const UNICHARSET& src) {
00488     PartialSetPropertiesFromOther(0, src);
00489   }
00490   // Sets properties from Other, starting only at the given index.
00491   void PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src);
00492   // Expands the tops and bottoms and widths for this unicharset given a
00493   // src_unicharset with ranges in it. The unicharsets don't have to be the
00494   // same, and graphemes are correctly accounted for.
00495   void ExpandRangesFromOther(const UNICHARSET& src);
00496   // Makes this a copy of src. Clears this completely first, so the automatic
00497   // ids will not be present in this if not in src.
00498   void CopyFrom(const UNICHARSET& src);
00499   // For each id in src, if it does not occur in this, add it, as in
00500   // SetPropertiesFromOther, otherwise expand the ranges, as in
00501   // ExpandRangesFromOther.
00502   void AppendOtherUnicharset(const UNICHARSET& src);
00503   // Returns true if the acceptable ranges of the tops of the characters do
00504   // not overlap, making their x-height calculations distinct.
00505   bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const;
00506   // Returns the min and max bottom and top of the given unichar in
00507   // baseline-normalized coordinates, ie, where the baseline is
00508   // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
00509   // (See normalis.h for the definitions).
00510   void get_top_bottom(UNICHAR_ID unichar_id,
00511                       int* min_bottom, int* max_bottom,
00512                       int* min_top, int* max_top) const {
00513     if (INVALID_UNICHAR_ID == unichar_id) {
00514       *min_bottom = *min_top = 0;
00515       *max_bottom = *max_top = 256;  // kBlnCellHeight
00516       return;
00517     }
00518     ASSERT_HOST(contains_unichar_id(unichar_id));
00519     *min_bottom = unichars[unichar_id].properties.min_bottom;
00520     *max_bottom = unichars[unichar_id].properties.max_bottom;
00521     *min_top = unichars[unichar_id].properties.min_top;
00522     *max_top = unichars[unichar_id].properties.max_top;
00523   }
00524   void set_top_bottom(UNICHAR_ID unichar_id,
00525                       int min_bottom, int max_bottom,
00526                       int min_top, int max_top) {
00527     unichars[unichar_id].properties.min_bottom =
00528         static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
00529     unichars[unichar_id].properties.max_bottom =
00530         static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
00531     unichars[unichar_id].properties.min_top =
00532         static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
00533     unichars[unichar_id].properties.max_top =
00534         static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
00535   }
00536   // Returns the width range of the given unichar in baseline-normalized
00537   // coordinates, ie, where the baseline is kBlnBaselineOffset and the
00538   // meanline is kBlnBaselineOffset + kBlnXHeight.
00539   // (See normalis.h for the definitions).
00540   void get_width_range(UNICHAR_ID unichar_id,
00541                        int* min_width, int* max_width) const {
00542     if (INVALID_UNICHAR_ID == unichar_id) {
00543       *min_width = 0;
00544       *max_width = 256;  // kBlnCellHeight;
00545       return;
00546     }
00547     ASSERT_HOST(contains_unichar_id(unichar_id));
00548     *min_width = unichars[unichar_id].properties.min_width;
00549     *max_width = unichars[unichar_id].properties.max_width;
00550   }
00551   void set_width_range(UNICHAR_ID unichar_id, int min_width, int max_width) {
00552     unichars[unichar_id].properties.min_width =
00553         static_cast<inT16>(ClipToRange(min_width, 0, MAX_INT16));
00554     unichars[unichar_id].properties.max_width =
00555         static_cast<inT16>(ClipToRange(max_width, 0, MAX_INT16));
00556   }
00557   // Returns the range of the x-bearing of the given unichar in
00558   // baseline-normalized coordinates, ie, where the baseline is
00559   // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight.
00560   // (See normalis.h for the definitions).
00561   void get_bearing_range(UNICHAR_ID unichar_id,
00562                          int* min_bearing, int* max_bearing) const {
00563     if (INVALID_UNICHAR_ID == unichar_id) {
00564       *min_bearing = *max_bearing = 0;
00565       return;
00566     }
00567     ASSERT_HOST(contains_unichar_id(unichar_id));
00568     *min_bearing = unichars[unichar_id].properties.min_bearing;
00569     *max_bearing = unichars[unichar_id].properties.max_bearing;
00570   }
00571   void set_bearing_range(UNICHAR_ID unichar_id,
00572                          int min_bearing, int max_bearing) {
00573     unichars[unichar_id].properties.min_bearing =
00574         static_cast<inT16>(ClipToRange(min_bearing, 0, MAX_INT16));
00575     unichars[unichar_id].properties.max_bearing =
00576         static_cast<inT16>(ClipToRange(max_bearing, 0, MAX_INT16));
00577   }
00578   // Returns the range of the x-advance of the given unichar in
00579   // baseline-normalized coordinates, ie, where the baseline is
00580   // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight.
00581   // (See normalis.h for the definitions).
00582   void get_advance_range(UNICHAR_ID unichar_id,
00583                          int* min_advance, int* max_advance) const {
00584     if (INVALID_UNICHAR_ID == unichar_id) {
00585       *min_advance = *max_advance = 0;
00586       return;
00587     }
00588     ASSERT_HOST(contains_unichar_id(unichar_id));
00589     *min_advance = unichars[unichar_id].properties.min_advance;
00590     *max_advance = unichars[unichar_id].properties.max_advance;
00591   }
00592   void set_advance_range(UNICHAR_ID unichar_id,
00593                          int min_advance, int max_advance) {
00594     unichars[unichar_id].properties.min_advance =
00595         static_cast<inT16>(ClipToRange(min_advance, 0, MAX_INT16));
00596     unichars[unichar_id].properties.max_advance =
00597         static_cast<inT16>(ClipToRange(max_advance, 0, MAX_INT16));
00598   }
00599 
00600   // Return the script id of the given unichar.
00601   // Returns null_sid_ when unichar_id is INVALID_UNICHAR_ID.
00602   // The result is an integer id, not a pointer, so there is nothing to delete.
00603   int get_script(UNICHAR_ID unichar_id) const {
00604     if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
00605     ASSERT_HOST(contains_unichar_id(unichar_id));
00606     return unichars[unichar_id].properties.script_id;
00607   }
00608 
00609   // Return the character properties, eg. alpha/upper/lower/digit/punct,
00610   // as a bit field of unsigned int.
00611   unsigned int get_properties(UNICHAR_ID unichar_id) const;
00612 
00613   // Return the character property as a single char.  If a character has
00614   // multiple attributes, the main property is defined by the following order:
00615   //   upper_case : 'A'
00616   //   lower_case : 'a'
00617   //   alpha      : 'x'
00618   //   digit      : '0'
00619   //   punctuation: 'p'
00620   char get_chartype(UNICHAR_ID unichar_id) const;
00621 
00622   // Get other_case unichar id in the properties for the given unichar id.
00623   UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
00624     if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
00625     ASSERT_HOST(contains_unichar_id(unichar_id));
00626     return unichars[unichar_id].properties.other_case;
00627   }
00628 
00629   // Returns the direction property of the given unichar.
00630   Direction get_direction(UNICHAR_ID unichar_id) const {
00631      if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
00632      ASSERT_HOST(contains_unichar_id(unichar_id));
00633      return unichars[unichar_id].properties.direction;
00634    }
00635 
00636   // Get mirror unichar id in the properties for the given unichar id.
00637   UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
00638     if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
00639     ASSERT_HOST(contains_unichar_id(unichar_id));
00640     return unichars[unichar_id].properties.mirror;
00641   }
00642 
00643   // Returns UNICHAR_ID of the corresponding lower-case unichar.
00644   UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
00645     if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
00646     ASSERT_HOST(contains_unichar_id(unichar_id));
00647     if (unichars[unichar_id].properties.islower) return unichar_id;
00648     return unichars[unichar_id].properties.other_case;
00649   }
00650 
00651   // Returns UNICHAR_ID of the corresponding upper-case unichar.
00652   UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
00653     if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
00654     ASSERT_HOST(contains_unichar_id(unichar_id));
00655     if (unichars[unichar_id].properties.isupper) return unichar_id;
00656     return unichars[unichar_id].properties.other_case;
00657   }
00658 
00659   // Returns true if this UNICHARSET has the special codes in
00660   // SpecialUnicharCodes available. If false then there are normal unichars
00661   // at these codes and they should not be used.
00662   bool has_special_codes() const {
00663     return get_fragment(UNICHAR_BROKEN) != NULL &&
00664         strcmp(id_to_unichar(UNICHAR_BROKEN),
00665                kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
00666   }
00667 
00668   // Return a pointer to the CHAR_FRAGMENT class if the given
00669   // unichar id represents a character fragment.
00670   const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
00671     if (INVALID_UNICHAR_ID == unichar_id) return NULL;
00672     ASSERT_HOST(contains_unichar_id(unichar_id));
00673     return unichars[unichar_id].properties.fragment;
00674   }
00675 
00676   // Return the isalpha property of the given unichar representation.
00677   bool get_isalpha(const char* const unichar_repr) const {
00678     return get_isalpha(unichar_to_id(unichar_repr));
00679   }
00680 
00681   // Return the islower property of the given unichar representation.
00682   bool get_islower(const char* const unichar_repr) const {
00683     return get_islower(unichar_to_id(unichar_repr));
00684   }
00685 
00686   // Return the isupper property of the given unichar representation.
00687   bool get_isupper(const char* const unichar_repr) const {
00688     return get_isupper(unichar_to_id(unichar_repr));
00689   }
00690 
00691   // Return the isdigit property of the given unichar representation.
00692   bool get_isdigit(const char* const unichar_repr) const {
00693     return get_isdigit(unichar_to_id(unichar_repr));
00694   }
00695 
00696   // Return the ispunctuation property of the given unichar representation.
00697   bool get_ispunctuation(const char* const unichar_repr) const {
00698     return get_ispunctuation(unichar_to_id(unichar_repr));
00699   }
00700 
00701   // Return the character properties, eg. alpha/upper/lower/digit/punct,
00702   // of the given unichar representation
00703   unsigned int get_properties(const char* const unichar_repr) const {
00704     return get_properties(unichar_to_id(unichar_repr));
00705   }
00706 
00707   char get_chartype(const char* const unichar_repr) const {
00708     return get_chartype(unichar_to_id(unichar_repr));
00709   }
00710 
00711   // Return the script name of the given unichar representation.
00712   // The returned pointer will always be the same for the same script, it's
00713   // managed by unicharset and thus MUST NOT be deleted
00714   int get_script(const char* const unichar_repr) const {
00715     return get_script(unichar_to_id(unichar_repr));
00716   }
00717 
00718   // Return a pointer to the CHAR_FRAGMENT class struct if the given
00719   // unichar representation represents a character fragment.
00720   const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
00721     if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
00722         !ids.contains(unichar_repr)) {
00723       return NULL;
00724     }
00725     return get_fragment(unichar_to_id(unichar_repr));
00726   }
00727 
00728   // Return the isalpha property of the given unichar representation.
00729   // Only the first length characters from unichar_repr are used.
00730   bool get_isalpha(const char* const unichar_repr,
00731                int length) const {
00732     return get_isalpha(unichar_to_id(unichar_repr, length));
00733   }
00734 
00735   // Return the islower property of the given unichar representation.
00736   // Only the first length characters from unichar_repr are used.
00737   bool get_islower(const char* const unichar_repr,
00738                int length) const {
00739     return get_islower(unichar_to_id(unichar_repr, length));
00740   }
00741 
00742   // Return the isupper property of the given unichar representation.
00743   // Only the first length characters from unichar_repr are used.
00744   bool get_isupper(const char* const unichar_repr,
00745                int length) const {
00746     return get_isupper(unichar_to_id(unichar_repr, length));
00747   }
00748 
00749   // Return the isdigit property of the given unichar representation.
00750   // Only the first length characters from unichar_repr are used.
00751   bool get_isdigit(const char* const unichar_repr,
00752                int length) const {
00753     return get_isdigit(unichar_to_id(unichar_repr, length));
00754   }
00755 
00756   // Return the ispunctuation property of the given unichar representation.
00757   // Only the first length characters from unichar_repr are used.
00758   bool get_ispunctuation(const char* const unichar_repr,
00759                           int length) const {
00760     return get_ispunctuation(unichar_to_id(unichar_repr, length));
00761   }
00762 
00763   // Returns normalized version of unichar with the given unichar_id.
00764   const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
00765     return unichars[unichar_id].properties.normed.string();
00766   }
00767   // Returns a vector of UNICHAR_IDs that represent the ids of the normalized
00768   // version of the given id. There may be more than one UNICHAR_ID in the
00769   // vector if unichar_id represents a ligature.
00770   const GenericVector<UNICHAR_ID>& normed_ids(UNICHAR_ID unichar_id) const {
00771     return unichars[unichar_id].properties.normed_ids;
00772   }
00773 
00774   // Return the script name of the given unichar representation.
00775   // Only the first length characters from unichar_repr are used.
00776   // The returned pointer will always be the same for the same script, it's
00777   // managed by unicharset and thus MUST NOT be deleted
00778   int get_script(const char* const unichar_repr,
00779                  int length) const {
00780     return get_script(unichar_to_id(unichar_repr, length));
00781   }
00782 
00783   // Return the (current) number of scripts in the script table
00784   int get_script_table_size() const {
00785     return script_table_size_used;
00786   }
00787 
00788   // Return the script string from its id
00789   const char* get_script_from_script_id(int id) const {
00790     if (id >= script_table_size_used || id < 0)
00791       return null_script;
00792     return script_table[id];
00793   }
00794 
00795   // Returns the id from the name of the script, or 0 if script is not found.
00796   // Note that this is an expensive operation since it involves iteratively
00797   // comparing strings in the script table.  To avoid dependency on STL, we
00798   // won't use a hash.  Instead, the calling function can use this to lookup
00799   // and save the ID for relevant scripts for fast comparisons later.
00800   int get_script_id_from_name(const char* script_name) const;
00801 
00802   // Return true if the given script is the null script
00803   bool is_null_script(const char* script) const {
00804     return script == null_script;
00805   }
00806 
// Uniquifies the given script: for two scripts a and b with
// strcmp(a, b) == 0, the same result is returned.
// The script parameter is copied and thus can be a temporary.
// NOTE(review): the return type is int, presumably the script's id in the
// script table - confirm against the definition.
int add_script(const char *script);
00811 
00812   // Return the enabled property of the given unichar.
00813   bool get_enabled(UNICHAR_ID unichar_id) const {
00814     return unichars[unichar_id].properties.enabled;
00815   }
00816 
00817 
00818   int null_sid() const { return null_sid_; }
00819   int common_sid() const { return common_sid_; }
00820   int latin_sid() const { return latin_sid_; }
00821   int cyrillic_sid() const { return cyrillic_sid_; }
00822   int greek_sid() const { return greek_sid_; }
00823   int han_sid() const { return han_sid_; }
00824   int hiragana_sid() const { return hiragana_sid_; }
00825   int katakana_sid() const { return katakana_sid_; }
00826   int default_sid() const { return default_sid_; }
00827 
00828   // Returns true if the unicharset has the concept of upper/lower case.
00829   bool script_has_upper_lower() const {
00830     return script_has_upper_lower_;
00831   }
00832   // Returns true if the unicharset has the concept of x-height.
00833   // script_has_xheight can be true even if script_has_upper_lower is not,
00834   // when the script has a sufficiently predominant top line with ascenders,
00835   // such as Devanagari and Thai.
00836   bool script_has_xheight() const {
00837     return script_has_xheight_;
00838   }
00839 
00840  private:
00841 
00842   struct UNICHAR_PROPERTIES {
00843     UNICHAR_PROPERTIES();
00844     // Initializes all properties to sensible default values.
00845     void Init();
00846     // Sets all ranges wide open. Initialization default in case there are
00847     // no useful values available.
00848     void SetRangesOpen();
00849     // Sets all ranges to empty. Used before expanding with font-based data.
00850     void SetRangesEmpty();
00851     // Returns true if any of the top/bottom/width/bearing/advance ranges is
00852     // emtpy.
00853     bool AnyRangeEmpty() const;
00854     // Expands the ranges with the ranges from the src properties.
00855     void ExpandRangesFrom(const UNICHAR_PROPERTIES& src);
00856     // Copies the properties from src into this.
00857     void CopyFrom(const UNICHAR_PROPERTIES& src);
00858 
00859     bool  isalpha;
00860     bool  islower;
00861     bool  isupper;
00862     bool  isdigit;
00863     bool  ispunctuation;
00864     bool  isngram;
00865     bool  enabled;
00866     // Possible limits of the top and bottom of the bounding box in
00867     // baseline-normalized coordinates, ie, where the baseline is
00868     // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
00869     // (See normalis.h for the definitions).
00870     uinT8 min_bottom;
00871     uinT8 max_bottom;
00872     uinT8 min_top;
00873     uinT8 max_top;
00874     // Limits on the widths of bounding box, also in baseline-normalized coords.
00875     inT16 min_width;
00876     inT16 max_width;
00877     // Limits on the x-bearing and advance, also in baseline-normalized coords.
00878     inT16 min_bearing;
00879     inT16 max_bearing;
00880     inT16 min_advance;
00881     inT16 max_advance;
00882     int   script_id;
00883     UNICHAR_ID other_case;  // id of the corresponding upper/lower case unichar
00884     Direction direction;  // direction of this unichar
00885     // Mirror property is useful for reverse DAWG lookup for words in
00886     // right-to-left languages (e.g. "(word)" would be in
00887     // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.
00888     // However, what we want in our DAWG is
00889     // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
00890     // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
00891     UNICHAR_ID mirror;
00892     // A string of unichar_ids that represent the corresponding normed string.
00893     // For awkward characters like em-dash, this gives hyphen.
00894     // For ligatures, this gives the string of normal unichars.
00895     GenericVector<UNICHAR_ID> normed_ids;
00896     STRING normed;  // normalized version of this unichar
00897     // Contains meta information about the fragment if a unichar represents
00898     // a fragment of a character, otherwise should be set to NULL.
00899     // It is assumed that character fragments are added to the unicharset
00900     // after the corresponding 'base' characters.
00901     CHAR_FRAGMENT *fragment;
00902   };
00903 
00904   struct UNICHAR_SLOT {
00905     char representation[UNICHAR_LEN + 1];
00906     UNICHAR_PROPERTIES properties;
00907   };
00908 
00909   // Internal recursive version of encode_string above.
00910   // str is the start of the whole string.
00911   // str_index is the current position in str.
00912   // str_length is the length of str.
00913   // encoding is a working encoding of str.
00914   // lengths is a working set of lengths of each element of encoding.
00915   // best_total_length is the longest length of str that has been successfully
00916   // encoded so far.
00917   // On return:
00918   // best_encoding contains the encoding that used the longest part of str.
00919   // best_lengths (may be null) contains the lengths of best_encoding.
00920   void encode_string(const char* str, int str_index, int str_length,
00921                      GenericVector<UNICHAR_ID>* encoding,
00922                      GenericVector<char>* lengths,
00923                      int* best_total_length,
00924                      GenericVector<UNICHAR_ID>* best_encoding,
00925                      GenericVector<char>* best_lengths) const;
00926 
00927   // Gets the properties for a grapheme string, combining properties for
00928   // multiple characters in a meaningful way where possible.
00929   // Returns false if no valid match was found in the unicharset.
00930   // NOTE that script_id, mirror, and other_case refer to this unicharset on
00931   // return and will need redirecting if the target unicharset is different.
00932   bool GetStrProperties(const char* utf8_str,
00933                         UNICHAR_PROPERTIES* props) const;
00934 
00935   // Load ourselves from a "file" where our only interface to the file is
00936   // an implementation of fgets().  This is the parsing primitive accessed by
00937   // the public routines load_from_file() and load_from_inmemory_file().
00938   bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
00939                       bool skip_fragments);
00940 
00941   UNICHAR_SLOT* unichars;
00942   UNICHARMAP ids;
00943   int size_used;
00944   int size_reserved;
00945   char** script_table;
00946   int script_table_size_used;
00947   int script_table_size_reserved;
00948   const char* null_script;
00949   // True if the unichars have their tops/bottoms set.
00950   bool top_bottom_set_;
00951   // True if the unicharset has significant upper/lower case chars.
00952   bool script_has_upper_lower_;
00953   // True if the unicharset has a significant mean-line with significant
00954   // ascenders above that.
00955   bool script_has_xheight_;
00956 
00957   // A few convenient script name-to-id mapping without using hash.
00958   // These are initialized when unicharset file is loaded.  Anything
00959   // missing from this list can be looked up using get_script_id_from_name.
00960   int null_sid_;
00961   int common_sid_;
00962   int latin_sid_;
00963   int cyrillic_sid_;
00964   int greek_sid_;
00965   int han_sid_;
00966   int hiragana_sid_;
00967   int katakana_sid_;
00968   // The most frequently occurring script in the charset.
00969   int default_sid_;
00970 };
00971 
00972 #endif  // TESSERACT_CCUTIL_UNICHARSET_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines