// tesseract 3.03
00001 00002 // File: ambigs.h 00003 // Description: Constants, flags, functions for dealing with 00004 // ambiguities (training and recognition). 00005 // Author: Daria Antonova 00006 // Created: Mon Aug 23 11:26:43 PDT 2008 00007 // 00008 // (C) Copyright 2008, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #ifndef TESSERACT_CCUTIL_AMBIGS_H_ 00022 #define TESSERACT_CCUTIL_AMBIGS_H_ 00023 00024 #include "elst.h" 00025 #include "tprintf.h" 00026 #include "unichar.h" 00027 #include "unicharset.h" 00028 #include "genericvector.h" 00029 00030 #define MAX_AMBIG_SIZE 10 00031 00032 namespace tesseract { 00033 00034 typedef GenericVector<UNICHAR_ID> UnicharIdVector; 00035 00036 static const int kUnigramAmbigsBufferSize = 1000; 00037 static const char kAmbigNgramSeparator[] = { ' ', '\0' }; 00038 static const char kAmbigDelimiters[] = "\t "; 00039 static const char kIllegalMsg[] = 00040 "Illegal ambiguity specification on line %d\n"; 00041 static const char kIllegalUnicharMsg[] = 00042 "Illegal unichar %s in ambiguity specification\n"; 00043 00044 enum AmbigType { 00045 NOT_AMBIG, // the ngram pair is not ambiguous 00046 REPLACE_AMBIG, // ocred ngram should always be substituted with correct 00047 DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1) 00048 SIMILAR_AMBIG, // use pairwise classifier for ocred/correct pair (1-1) 00049 CASE_AMBIG, // this is a case ambiguity (1-1) 00050 
00051 AMBIG_TYPE_COUNT // number of enum entries 00052 }; 00053 00054 // A collection of utility functions for arrays of UNICHAR_IDs that are 00055 // terminated by INVALID_UNICHAR_ID. 00056 class UnicharIdArrayUtils { 00057 public: 00058 // Compares two arrays of unichar ids. Returns -1 if the length of array1 is 00059 // less than length of array2, if any array1[i] is less than array2[i]. 00060 // Returns 0 if the arrays are equal, 1 otherwise. 00061 // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID. 00062 static inline int compare(const UNICHAR_ID array1[], 00063 const UNICHAR_ID array2[]) { 00064 const UNICHAR_ID *ptr1 = array1; 00065 const UNICHAR_ID *ptr2 = array2; 00066 while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) { 00067 if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1; 00068 ++ptr1; 00069 ++ptr2; 00070 } 00071 if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0; 00072 return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1; 00073 } 00074 00075 // Look uid in the vector of uids. If found, the index of the matched 00076 // element is returned. Otherwise, it returns -1. 00077 static inline int find_in(const UnicharIdVector& uid_vec, 00078 const UNICHAR_ID uid) { 00079 for (int i = 0; i < uid_vec.size(); ++i) 00080 if (uid_vec[i] == uid) return i; 00081 return -1; 00082 } 00083 00084 // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied. 00085 // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID 00086 // and that dst has enough space for all the elements from src. 00087 static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) { 00088 int i = 0; 00089 do { 00090 dst[i] = src[i]; 00091 } while (dst[i++] != INVALID_UNICHAR_ID); 00092 return i - 1; 00093 } 00094 00095 // Prints unichars corresponding to the unichar_ids in the given array. 00096 // The function assumes that array is terminated by INVALID_UNICHAR_ID. 
00097 static inline void print(const UNICHAR_ID array[], 00098 const UNICHARSET &unicharset) { 00099 const UNICHAR_ID *ptr = array; 00100 if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]"); 00101 while (*ptr != INVALID_UNICHAR_ID) { 00102 tprintf("%s ", unicharset.id_to_unichar(*ptr++)); 00103 } 00104 tprintf("( "); 00105 ptr = array; 00106 while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++); 00107 tprintf(")\n"); 00108 } 00109 }; 00110 00111 // AMBIG_SPEC_LIST stores a list of dangerous ambigs that 00112 // start with the same unichar (e.g. r->t rn->m rr1->m). 00113 class AmbigSpec : public ELIST_LINK { 00114 public: 00115 AmbigSpec(); 00116 ~AmbigSpec() {} 00117 00118 // Comparator function for sorting AmbigSpec_LISTs. The lists will 00119 // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors 00120 // in a a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1]. 00121 static int compare_ambig_specs(const void *spec1, const void *spec2) { 00122 const AmbigSpec *s1 = 00123 *reinterpret_cast<const AmbigSpec * const *>(spec1); 00124 const AmbigSpec *s2 = 00125 *reinterpret_cast<const AmbigSpec * const *>(spec2); 00126 int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram); 00127 if (result != 0) return result; 00128 return UnicharIdArrayUtils::compare(s1->correct_fragments, 00129 s2->correct_fragments); 00130 } 00131 00132 UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; 00133 UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1]; 00134 UNICHAR_ID correct_ngram_id; 00135 AmbigType type; 00136 int wrong_ngram_size; 00137 }; 00138 ELISTIZEH(AmbigSpec); 00139 00140 // AMBIG_TABLE[i] stores a set of ambiguities whose 00141 // wrong ngram starts with unichar id i. 
00142 typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector; 00143 00144 class UnicharAmbigs { 00145 public: 00146 UnicharAmbigs() {} 00147 ~UnicharAmbigs() { 00148 replace_ambigs_.delete_data_pointers(); 00149 dang_ambigs_.delete_data_pointers(); 00150 one_to_one_definite_ambigs_.delete_data_pointers(); 00151 } 00152 00153 const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; } 00154 const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; } 00155 00156 // Initializes the ambigs by adding a NULL pointer to each table. 00157 void InitUnicharAmbigs(const UNICHARSET& unicharset, 00158 bool use_ambigs_for_adaption); 00159 00160 // Loads the universal ambigs that are useful for any language. 00161 void LoadUniversal(const UNICHARSET& encoder_set, UNICHARSET* unicharset); 00162 00163 // Fills in two ambiguity tables (replaceable and dangerous) with information 00164 // read from the ambigs file. An ambiguity table is an array of lists. 00165 // The array is indexed by a class id. Each entry in the table provides 00166 // a list of potential ambiguities which can start with the corresponding 00167 // character. For example the ambiguity "rn -> m", would be located in the 00168 // table at index of unicharset.unichar_to_id('r'). 00169 // In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in 00170 // one_to_one_definite_ambigs_. This vector is also indexed by the class id 00171 // of the wrong part of the ambiguity and each entry contains a vector of 00172 // unichar ids that are ambiguous to it. 00173 // encoder_set is used to encode the ambiguity strings, undisturbed by new 00174 // unichar_ids that may be created by adding the ambigs. 00175 void LoadUnicharAmbigs(const UNICHARSET& encoder_set, 00176 TFile *ambigs_file, int debug_level, 00177 bool use_ambigs_for_adaption, UNICHARSET *unicharset); 00178 00179 // Returns definite 1-1 ambigs for the given unichar id. 
00180 inline const UnicharIdVector *OneToOneDefiniteAmbigs( 00181 UNICHAR_ID unichar_id) const { 00182 if (one_to_one_definite_ambigs_.empty()) return NULL; 00183 return one_to_one_definite_ambigs_[unichar_id]; 00184 } 00185 00186 // Returns a pointer to the vector with all unichar ids that appear in the 00187 // 'correct' part of the ambiguity pair when the given unichar id appears 00188 // in the 'wrong' part of the ambiguity. E.g. if DangAmbigs file consist of 00189 // m->rn,rn->m,m->iii, UnicharAmbigsForAdaption() called with unichar id of 00190 // m will return a pointer to a vector with unichar ids of r,n,i. 00191 inline const UnicharIdVector *AmbigsForAdaption( 00192 UNICHAR_ID unichar_id) const { 00193 if (ambigs_for_adaption_.empty()) return NULL; 00194 return ambigs_for_adaption_[unichar_id]; 00195 } 00196 00197 // Similar to the above, but return the vector of unichar ids for which 00198 // the given unichar_id is an ambiguity (appears in the 'wrong' part of 00199 // some ambiguity pair). 
00200 inline const UnicharIdVector *ReverseAmbigsForAdaption( 00201 UNICHAR_ID unichar_id) const { 00202 if (reverse_ambigs_for_adaption_.empty()) return NULL; 00203 return reverse_ambigs_for_adaption_[unichar_id]; 00204 } 00205 00206 private: 00207 bool ParseAmbiguityLine(int line_num, int version, int debug_level, 00208 const UNICHARSET &unicharset, char *buffer, 00209 int *test_ambig_part_size, 00210 UNICHAR_ID *test_unichar_ids, 00211 int *replacement_ambig_part_size, 00212 char *replacement_string, int *type); 00213 bool InsertIntoTable(UnicharAmbigsVector &table, 00214 int test_ambig_part_size, UNICHAR_ID *test_unichar_ids, 00215 int replacement_ambig_part_size, 00216 const char *replacement_string, int type, 00217 AmbigSpec *ambig_spec, UNICHARSET *unicharset); 00218 00219 UnicharAmbigsVector dang_ambigs_; 00220 UnicharAmbigsVector replace_ambigs_; 00221 GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_; 00222 GenericVector<UnicharIdVector *> ambigs_for_adaption_; 00223 GenericVector<UnicharIdVector *> reverse_ambigs_for_adaption_; 00224 }; 00225 00226 } // namespace tesseract 00227 00228 #endif // TESSERACT_CCUTIL_AMBIGS_H_