tesseract
3.03
|
00001 00002 // File: ambigs.cc 00003 // Description: Functions for dealing with ambiguities 00004 // (training and recognition). 00005 // Author: Daria Antonova 00006 // Created: Mon Feb 5 11:26:43 PDT 2009 00007 // 00008 // (C) Copyright 2008, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #include "ambigs.h" 00022 00023 #include <stdio.h> 00024 #include "helpers.h" 00025 #include "universalambigs.h" 00026 00027 #ifdef _WIN32 00028 #ifndef __GNUC__ 00029 #define strtok_r strtok_s 00030 #else 00031 #include "strtok_r.h" 00032 #endif /* __GNUC__ */ 00033 #endif /* _WIN32 */ 00034 00035 namespace tesseract { 00036 00037 // Maximum line size: 00038 // 10 for sizes of ambigs, tabs, abmig type and newline 00039 // UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig 00040 const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1); 00041 00042 AmbigSpec::AmbigSpec() { 00043 wrong_ngram[0] = INVALID_UNICHAR_ID; 00044 correct_fragments[0] = INVALID_UNICHAR_ID; 00045 correct_ngram_id = INVALID_UNICHAR_ID; 00046 type = NOT_AMBIG; 00047 wrong_ngram_size = 0; 00048 } 00049 00050 ELISTIZE(AmbigSpec); 00051 00052 // Initializes the ambigs by adding a NULL pointer to each table. 00053 void UnicharAmbigs::InitUnicharAmbigs(const UNICHARSET& unicharset, 00054 bool use_ambigs_for_adaption) { 00055 for (int i = 0; i < unicharset.size(); ++i) { 00056 replace_ambigs_.push_back(NULL); 00057 dang_ambigs_.push_back(NULL); 00058 one_to_one_definite_ambigs_.push_back(NULL); 00059 if (use_ambigs_for_adaption) { 00060 ambigs_for_adaption_.push_back(NULL); 00061 reverse_ambigs_for_adaption_.push_back(NULL); 00062 } 00063 } 00064 } 00065 00066 // Loads the universal ambigs that are useful for any language. 00067 void UnicharAmbigs::LoadUniversal(const UNICHARSET& encoder_set, 00068 UNICHARSET* unicharset) { 00069 TFile file; 00070 if (!file.Open(kUniversalAmbigsFile, ksizeofUniversalAmbigsFile)) return; 00071 LoadUnicharAmbigs(encoder_set, &file, 0, false, unicharset); 00072 } 00073 00074 void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET& encoder_set, 00075 TFile *ambig_file, 00076 int debug_level, 00077 bool use_ambigs_for_adaption, 00078 UNICHARSET *unicharset) { 00079 int i, j; 00080 UnicharIdVector *adaption_ambigs_entry; 00081 if (debug_level) tprintf("Reading ambiguities\n"); 00082 00083 int test_ambig_part_size; 00084 int replacement_ambig_part_size; 00085 // The space for buffer is allocated on the heap to avoid 00086 // GCC frame size warning. 00087 const int kBufferSize = 10 + 2 * kMaxAmbigStringSize; 00088 char *buffer = new char[kBufferSize]; 00089 char replacement_string[kMaxAmbigStringSize]; 00090 UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1]; 00091 int line_num = 0; 00092 int type = NOT_AMBIG; 00093 00094 // Determine the version of the ambigs file. 00095 int version = 0; 00096 ASSERT_HOST(ambig_file->FGets(buffer, kBufferSize) != NULL && 00097 strlen(buffer) > 0); 00098 if (*buffer == 'v') { 00099 version = static_cast<int>(strtol(buffer+1, NULL, 10)); 00100 ++line_num; 00101 } else { 00102 ambig_file->Rewind(); 00103 } 00104 while (ambig_file->FGets(buffer, kBufferSize) != NULL) { 00105 chomp_string(buffer); 00106 if (debug_level > 2) tprintf("read line %s\n", buffer); 00107 ++line_num; 00108 if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set, 00109 buffer, &test_ambig_part_size, test_unichar_ids, 00110 &replacement_ambig_part_size, 00111 replacement_string, &type)) continue; 00112 // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST. 00113 AmbigSpec *ambig_spec = new AmbigSpec(); 00114 if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ 00115 : dang_ambigs_, 00116 test_ambig_part_size, test_unichar_ids, 00117 replacement_ambig_part_size, replacement_string, type, 00118 ambig_spec, unicharset)) 00119 continue; 00120 00121 // Update one_to_one_definite_ambigs_. 00122 if (test_ambig_part_size == 1 && 00123 replacement_ambig_part_size == 1 && type == DEFINITE_AMBIG) { 00124 if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == NULL) { 00125 one_to_one_definite_ambigs_[test_unichar_ids[0]] = new UnicharIdVector(); 00126 } 00127 one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back( 00128 ambig_spec->correct_ngram_id); 00129 } 00130 // Update ambigs_for_adaption_. 00131 if (use_ambigs_for_adaption) { 00132 GenericVector<UNICHAR_ID> encoding; 00133 // Silently ignore invalid strings, as before, so it is safe to use a 00134 // universal ambigs file. 00135 if (unicharset->encode_string(replacement_string, true, &encoding, 00136 NULL, NULL)) { 00137 for (i = 0; i < test_ambig_part_size; ++i) { 00138 if (ambigs_for_adaption_[test_unichar_ids[i]] == NULL) { 00139 ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector(); 00140 } 00141 adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]]; 00142 for (int r = 0; r < encoding.size(); ++r) { 00143 UNICHAR_ID id_to_insert = encoding[r]; 00144 ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID); 00145 // Add the new unichar id to adaption_ambigs_entry (only if the 00146 // vector does not already contain it) keeping it in sorted order. 00147 for (j = 0; j < adaption_ambigs_entry->size() && 00148 (*adaption_ambigs_entry)[j] > id_to_insert; ++j); 00149 if (j < adaption_ambigs_entry->size()) { 00150 if ((*adaption_ambigs_entry)[j] != id_to_insert) { 00151 adaption_ambigs_entry->insert(id_to_insert, j); 00152 } 00153 } else { 00154 adaption_ambigs_entry->push_back(id_to_insert); 00155 } 00156 } 00157 } 00158 } 00159 } 00160 } 00161 delete[] buffer; 00162 00163 // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector. 00164 if (use_ambigs_for_adaption) { 00165 for (i = 0; i < ambigs_for_adaption_.size(); ++i) { 00166 adaption_ambigs_entry = ambigs_for_adaption_[i]; 00167 if (adaption_ambigs_entry == NULL) continue; 00168 for (j = 0; j < adaption_ambigs_entry->size(); ++j) { 00169 UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j]; 00170 if (reverse_ambigs_for_adaption_[ambig_id] == NULL) { 00171 reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector(); 00172 } 00173 reverse_ambigs_for_adaption_[ambig_id]->push_back(i); 00174 } 00175 } 00176 } 00177 00178 // Print what was read from the input file. 00179 if (debug_level > 1) { 00180 for (int tbl = 0; tbl < 2; ++tbl) { 00181 const UnicharAmbigsVector &print_table = 00182 (tbl == 0) ? replace_ambigs_ : dang_ambigs_; 00183 for (i = 0; i < print_table.size(); ++i) { 00184 AmbigSpec_LIST *lst = print_table[i]; 00185 if (lst == NULL) continue; 00186 if (!lst->empty()) { 00187 tprintf("%s Ambiguities for %s:\n", 00188 (tbl == 0) ? "Replaceable" : "Dangerous", 00189 unicharset->debug_str(i).string()); 00190 } 00191 AmbigSpec_IT lst_it(lst); 00192 for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) { 00193 AmbigSpec *ambig_spec = lst_it.data(); 00194 tprintf("wrong_ngram:"); 00195 UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset); 00196 tprintf("correct_fragments:"); 00197 UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset); 00198 } 00199 } 00200 } 00201 if (use_ambigs_for_adaption) { 00202 for (int vec_id = 0; vec_id < 2; ++vec_id) { 00203 const GenericVector<UnicharIdVector *> &vec = (vec_id == 0) ? 00204 ambigs_for_adaption_ : reverse_ambigs_for_adaption_; 00205 for (i = 0; i < vec.size(); ++i) { 00206 adaption_ambigs_entry = vec[i]; 00207 if (adaption_ambigs_entry != NULL) { 00208 tprintf("%sAmbigs for adaption for %s:\n", 00209 (vec_id == 0) ? "" : "Reverse ", 00210 unicharset->debug_str(i).string()); 00211 for (j = 0; j < adaption_ambigs_entry->size(); ++j) { 00212 tprintf("%s ", unicharset->debug_str( 00213 (*adaption_ambigs_entry)[j]).string()); 00214 } 00215 tprintf("\n"); 00216 } 00217 } 00218 } 00219 } 00220 } 00221 } 00222 00223 bool UnicharAmbigs::ParseAmbiguityLine( 00224 int line_num, int version, int debug_level, const UNICHARSET &unicharset, 00225 char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids, 00226 int *replacement_ambig_part_size, char *replacement_string, int *type) { 00227 if (version > 1) { 00228 // Simpler format is just wrong-string correct-string type\n. 00229 STRING input(buffer); 00230 GenericVector<STRING> fields; 00231 input.split(' ', &fields); 00232 if (fields.size() != 3) { 00233 if (debug_level) tprintf(kIllegalMsg, line_num); 00234 return false; 00235 } 00236 // Encode wrong-string. 00237 GenericVector<UNICHAR_ID> unichars; 00238 if (!unicharset.encode_string(fields[0].string(), true, &unichars, NULL, 00239 NULL)) { 00240 return false; 00241 } 00242 *test_ambig_part_size = unichars.size(); 00243 if (*test_ambig_part_size > MAX_AMBIG_SIZE) { 00244 if (debug_level) 00245 tprintf("Too many unichars in ambiguity on line %d\n", line_num); 00246 return false; 00247 } 00248 // Copy encoded string to output. 00249 for (int i = 0; i < unichars.size(); ++i) 00250 test_unichar_ids[i] = unichars[i]; 00251 test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID; 00252 // Encode replacement-string to check validity. 00253 if (!unicharset.encode_string(fields[1].string(), true, &unichars, NULL, 00254 NULL)) { 00255 return false; 00256 } 00257 *replacement_ambig_part_size = unichars.size(); 00258 if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) { 00259 if (debug_level) 00260 tprintf("Too many unichars in ambiguity on line %d\n", line_num); 00261 return false; 00262 } 00263 if (sscanf(fields[2].string(), "%d", type) != 1) { 00264 if (debug_level) tprintf(kIllegalMsg, line_num); 00265 return false; 00266 } 00267 snprintf(replacement_string, kMaxAmbigStringSize, "%s", fields[1].string()); 00268 return true; 00269 } 00270 int i; 00271 char *token; 00272 char *next_token; 00273 if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) || 00274 !sscanf(token, "%d", test_ambig_part_size) || 00275 *test_ambig_part_size <= 0) { 00276 if (debug_level) tprintf(kIllegalMsg, line_num); 00277 return false; 00278 } 00279 if (*test_ambig_part_size > MAX_AMBIG_SIZE) { 00280 if (debug_level) 00281 tprintf("Too many unichars in ambiguity on line %d\n", line_num); 00282 return false; 00283 } 00284 for (i = 0; i < *test_ambig_part_size; ++i) { 00285 if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break; 00286 if (!unicharset.contains_unichar(token)) { 00287 if (debug_level) tprintf(kIllegalUnicharMsg, token); 00288 break; 00289 } 00290 test_unichar_ids[i] = unicharset.unichar_to_id(token); 00291 } 00292 test_unichar_ids[i] = INVALID_UNICHAR_ID; 00293 00294 if (i != *test_ambig_part_size || 00295 !(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) || 00296 !sscanf(token, "%d", replacement_ambig_part_size) || 00297 *replacement_ambig_part_size <= 0) { 00298 if (debug_level) tprintf(kIllegalMsg, line_num); 00299 return false; 00300 } 00301 if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) { 00302 if (debug_level) 00303 tprintf("Too many unichars in ambiguity on line %d\n", line_num); 00304 return false; 00305 } 00306 replacement_string[0] = '\0'; 00307 for (i = 0; i < *replacement_ambig_part_size; ++i) { 00308 if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break; 00309 strcat(replacement_string, token); 00310 if (!unicharset.contains_unichar(token)) { 00311 if (debug_level) tprintf(kIllegalUnicharMsg, token); 00312 break; 00313 } 00314 } 00315 if (i != *replacement_ambig_part_size) { 00316 if (debug_level) tprintf(kIllegalMsg, line_num); 00317 return false; 00318 } 00319 if (version > 0) { 00320 // The next field being true indicates that the abiguity should 00321 // always be substituted (e.g. '' should always be changed to "). 00322 // For such "certain" n -> m ambigs tesseract will insert character 00323 // fragments for the n pieces in the unicharset. AmbigsFound() 00324 // will then replace the incorrect ngram with the character 00325 // fragments of the correct character (or ngram if m > 1). 00326 // Note that if m > 1, an ngram will be inserted into the 00327 // modified word, not the individual unigrams. Tesseract 00328 // has limited support for ngram unichar (e.g. dawg permuter). 00329 if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) || 00330 !sscanf(token, "%d", type)) { 00331 if (debug_level) tprintf(kIllegalMsg, line_num); 00332 return false; 00333 } 00334 } 00335 return true; 00336 } 00337 00338 bool UnicharAmbigs::InsertIntoTable( 00339 UnicharAmbigsVector &table, int test_ambig_part_size, 00340 UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size, 00341 const char *replacement_string, int type, 00342 AmbigSpec *ambig_spec, UNICHARSET *unicharset) { 00343 ambig_spec->type = static_cast<AmbigType>(type); 00344 if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 && 00345 unicharset->to_lower(test_unichar_ids[0]) == 00346 unicharset->to_lower(unicharset->unichar_to_id(replacement_string))) { 00347 ambig_spec->type = CASE_AMBIG; 00348 } 00349 00350 ambig_spec->wrong_ngram_size = 00351 UnicharIdArrayUtils::copy(test_unichar_ids, ambig_spec->wrong_ngram); 00352 00353 // Since we need to maintain a constant number of unichar positions in 00354 // order to construct ambig_blob_choices vector in NoDangerousAmbig(), for 00355 // each n->m ambiguity we will have to place n character fragments of the 00356 // correct ngram into the corresponding positions in the vector (e.g. given 00357 // "vvvvw" and vvvv->ww we will place v and |ww|0|4 into position 0, v and 00358 // |ww|1|4 into position 1 and so on. The correct ngram is reconstructed 00359 // from fragments by dawg_permute_and_select(). 00360 00361 // Insert the corresponding correct ngram into the unicharset. 00362 // Unicharset code assumes that the "base" ngram is inserted into 00363 // the unicharset before fragments of this ngram are inserted. 00364 unicharset->unichar_insert(replacement_string); 00365 ambig_spec->correct_ngram_id = 00366 unicharset->unichar_to_id(replacement_string); 00367 if (replacement_ambig_part_size > 1) { 00368 unicharset->set_isngram(ambig_spec->correct_ngram_id, true); 00369 } 00370 // Add the corresponding fragments of the wrong ngram to unicharset. 00371 int i; 00372 for (i = 0; i < test_ambig_part_size; ++i) { 00373 UNICHAR_ID unichar_id; 00374 if (test_ambig_part_size == 1) { 00375 unichar_id = ambig_spec->correct_ngram_id; 00376 } else { 00377 STRING frag_str = CHAR_FRAGMENT::to_string( 00378 replacement_string, i, test_ambig_part_size, false); 00379 unicharset->unichar_insert(frag_str.string()); 00380 unichar_id = unicharset->unichar_to_id(frag_str.string()); 00381 } 00382 ambig_spec->correct_fragments[i] = unichar_id; 00383 } 00384 ambig_spec->correct_fragments[i] = INVALID_UNICHAR_ID; 00385 00386 // Add AmbigSpec for this ambiguity to the corresponding AmbigSpec_LIST. 00387 // Keep AmbigSpec_LISTs sorted by AmbigSpec.wrong_ngram. 00388 if (table[test_unichar_ids[0]] == NULL) { 00389 table[test_unichar_ids[0]] = new AmbigSpec_LIST(); 00390 } 00391 if (table[test_unichar_ids[0]]->add_sorted( 00392 AmbigSpec::compare_ambig_specs, true, ambig_spec)) 00393 return true; 00394 delete ambig_spec; 00395 return false; 00396 } 00397 00398 } // namespace tesseract