tesseract 3.03
/******************************************************************************
 ** Filename:    stopper.c
 ** Purpose:     Stopping criteria for word classifier.
 ** Author:      Dan Johnson
 ** History:     Mon Apr 29 14:56:49 1991, DSJ, Created.
 **
 ** (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 ******************************************************************************/

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <math.h>

#include "stopper.h"
#include "ambigs.h"
#include "ccutil.h"
#include "const.h"
#include "danerror.h"
#include "dict.h"
#include "efio.h"
#include "helpers.h"
#include "matchdefs.h"
#include "pageres.h"
#include "params.h"
#include "ratngs.h"
#include "scanutils.h"
#include "unichar.h"

#ifdef _MSC_VER
#pragma warning(disable:4244)  // Conversion warnings
#pragma warning(disable:4800)  // int/bool warnings
#endif

using tesseract::ScriptPos;

namespace tesseract {

bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice,
                            XHeightConsistencyEnum xheight_consistency) {
  float CertaintyThreshold = stopper_nondict_certainty_base;
  int WordSize;

  if (stopper_no_acceptable_choices) return false;

  if (best_choice.length() == 0) return false;

  bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
  bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
  bool is_case_ok = case_ok(best_choice, getUnicharset());

  if (stopper_debug_level >= 1) {
    const char *xht = "UNKNOWN";
    switch (xheight_consistency) {
      case XH_GOOD:         xht = "NORMAL"; break;
      case XH_SUBNORMAL:    xht = "SUBNORMAL"; break;
      case XH_INCONSISTENT: xht = "INCONSISTENT"; break;
      default: xht = "UNKNOWN";
    }
    tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
            best_choice.unichar_string().string(),
            (is_valid_word ? 'y' : 'n'),
            (is_case_ok ? 'y' : 'n'),
            xht,
            best_choice.min_x_height(),
            best_choice.max_x_height());
  }
  // Do not accept invalid words in PASS1.
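  // reject_offset_ is 0.0 during pass 1 (see SettupStopperPass1()) and is set
  // to stopper_phase2_certainty_rejection_offset for pass 2 (see
  // SettupStopperPass2()), so this check only bars non-dictionary words on
  // the first pass.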
  if (reject_offset_ <= 0.0f && !is_valid_word) return false;
  if (is_valid_word && is_case_ok) {
    WordSize = LengthOfShortestAlphaRun(best_choice);
    WordSize -= stopper_smallword_size;
    if (WordSize < 0)
      WordSize = 0;
    CertaintyThreshold += WordSize * stopper_certainty_per_char;
  }

  if (stopper_debug_level >= 1)
    tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
            best_choice.rating(), best_choice.certainty(), CertaintyThreshold);

  if (no_dang_ambigs &&
      best_choice.certainty() > CertaintyThreshold &&
      xheight_consistency < XH_INCONSISTENT &&
      UniformCertainties(best_choice)) {
    return true;
  } else {
    if (stopper_debug_level >= 1) {
      tprintf("AcceptableChoice() returned false"
              " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
              no_dang_ambigs, best_choice.certainty(),
              CertaintyThreshold,
              UniformCertainties(best_choice));
    }
    return false;
  }
}

bool Dict::AcceptableResult(WERD_RES* word) {
  if (word->best_choice == NULL) return false;
  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
  int WordSize;

  if (stopper_debug_level >= 1) {
    tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
            word->best_choice->debug_string().string(),
            (valid_word(*word->best_choice) ? 'y' : 'n'),
            (case_ok(*word->best_choice, getUnicharset()) ? 'y' : 'n'),
            word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
            word->best_choices.singleton() ? 'n' : 'y');
  }

  if (word->best_choice->length() == 0 || !word->best_choices.singleton())
    return false;
  if (valid_word(*word->best_choice) &&
      case_ok(*word->best_choice, getUnicharset())) {
    WordSize = LengthOfShortestAlphaRun(*word->best_choice);
    WordSize -= stopper_smallword_size;
    if (WordSize < 0)
      WordSize = 0;
    CertaintyThreshold += WordSize * stopper_certainty_per_char;
  }

  if (stopper_debug_level >= 1)
    tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
            word->best_choice->certainty(), CertaintyThreshold);

  if (word->best_choice->certainty() > CertaintyThreshold &&
      !stopper_no_acceptable_choices) {
    if (stopper_debug_level >= 1)
      tprintf("ACCEPTED\n");
    return true;
  } else {
    if (stopper_debug_level >= 1)
      tprintf("REJECTED\n");
    return false;
  }
}

bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
                            DANGERR *fixpt,
                            bool fix_replaceable,
                            MATRIX *ratings) {
  if (stopper_debug_level > 2) {
    tprintf("\nRunning NoDangerousAmbig() for %s\n",
            best_choice->debug_string().string());
  }

  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
  // for each unichar id in BestChoice.
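  // The function ultimately returns !ambigs_found, i.e. true when no
  // alternative dictionary word can be assembled from the dangerous-ambiguity
  // entries collected below.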
  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
  int i;
  bool ambigs_found = false;
  // For each position in best_choice:
  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
  // -- look for ambiguities corresponding to wrong_ngram in the list while
  //    adding the following unichar_ids from best_choice to wrong_ngram
  //
  // Repeat the above procedure twice: the first time look through
  // ambigs to be replaced and replace all the ambiguities found;
  // the second time look through dangerous ambiguities and construct
  // ambig_blob_choices with a fake blob choice for each ambiguity
  // and pass them to dawg_permute_and_select() to search for
  // ambiguous words in the dictionaries.
  //
  // Note that during the execution of the for loop (on the first pass)
  // if replacements are made the length of best_choice might change.
  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
    bool replace = (fix_replaceable && pass == 0);
    const UnicharAmbigsVector &table = replace ?
      getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
    if (!replace) {
      // Initialize ambig_blob_choices with lists containing a single
      // unichar id for the corresponding position in best_choice.
      // A best_choice consisting of only the original letters will
      // have a rating of 0.0.
      for (i = 0; i < best_choice->length(); ++i) {
        BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
        BLOB_CHOICE_IT lst_it(lst);
        // TODO(rays/antonova) Put real xheights and y shifts here.
        lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
                                          0.0, 0.0, -1, -1, -1, 0, 1, 0,
                                          BCC_AMBIG));
        ambig_blob_choices.push_back(lst);
      }
    }
    UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
    int wrong_ngram_index;
    int next_index;
    int blob_index = 0;
    for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
         ++i) {
      UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
      if (stopper_debug_level > 2) {
        tprintf("Looking for %s ngrams starting with %s:\n",
                replace ? "replaceable" : "ambiguous",
                getUnicharset().debug_str(curr_unichar_id).string());
      }
      int num_wrong_blobs = best_choice->state(i);
      wrong_ngram_index = 0;
      wrong_ngram[wrong_ngram_index] = curr_unichar_id;
      if (curr_unichar_id == INVALID_UNICHAR_ID ||
          curr_unichar_id >= table.size() ||
          table[curr_unichar_id] == NULL) {
        continue;  // there is no ambig spec for this unichar id
      }
      AmbigSpec_IT spec_it(table[curr_unichar_id]);
      for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
        const AmbigSpec *ambig_spec = spec_it.data();
        wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
        int compare = UnicharIdArrayUtils::compare(wrong_ngram,
                                                   ambig_spec->wrong_ngram);
        if (stopper_debug_level > 2) {
          tprintf("candidate ngram: ");
          UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
          tprintf("current ngram from spec: ");
          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
          tprintf("comparison result: %d\n", compare);
        }
        if (compare == 0) {
          // Record the place where we found an ambiguity.
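          // Each DANGERR_INFO entry stores the blob range covered by the
          // wrong ngram, whether this pass replaces it, whether the correct
          // ngram is itself an ngram unichar, and its leftmost unichar id.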
          if (fixpt != NULL) {
            UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
            fixpt->push_back(DANGERR_INFO(
                blob_index, blob_index + num_wrong_blobs, replace,
                getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
                leftmost_id));
            if (stopper_debug_level > 1) {
              tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
                      blob_index + num_wrong_blobs, false,
                      getUnicharset().get_isngram(
                          ambig_spec->correct_ngram_id),
                      getUnicharset().id_to_unichar(leftmost_id));
            }
          }

          if (replace) {
            if (stopper_debug_level > 2) {
              tprintf("replace ambiguity with %s : ",
                      getUnicharset().id_to_unichar(
                          ambig_spec->correct_ngram_id));
              UnicharIdArrayUtils::print(
                  ambig_spec->correct_fragments, getUnicharset());
            }
            ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
                         ambig_spec->correct_ngram_id,
                         best_choice, ratings);
          } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
            // We found a dangerous ambiguity - update ambig_blob_choices.
            if (stopper_debug_level > 2) {
              tprintf("found ambiguity: ");
              UnicharIdArrayUtils::print(
                  ambig_spec->correct_fragments, getUnicharset());
            }
            ambigs_found = true;
            for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
                 ++tmp_index) {
              // Add a blob choice for the corresponding fragment of the
              // ambiguity. These fake blob choices are initialized with
              // negative ratings (which are not possible for real blob
              // choices), so that dawg_permute_and_select() considers any
              // word not consisting of only the original letters a better
              // choice and stops searching for alternatives once such a
              // choice is found.
              BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
              bc_it.add_to_end(new BLOB_CHOICE(
                  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
                  -1, -1, -1, 0, 1, 0, BCC_AMBIG));
            }
          }
          spec_it.forward();
        } else if (compare == -1) {
          if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
              ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
            // Add the next unichar id to wrong_ngram and keep looking for
            // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
            wrong_ngram[++wrong_ngram_index] =
              best_choice->unichar_id(next_index);
            num_wrong_blobs += best_choice->state(next_index);
          } else {
            break;  // no more matching ambigs in this AMBIG_SPEC_LIST
          }
        } else {
          spec_it.forward();
        }
      }  // end searching AmbigSpec_LIST
    }  // end searching best_choice
  }  // end searching replace and dangerous ambigs

  // If any ambiguities were found permute the constructed ambig_blob_choices
  // to see if an alternative dictionary word can be found.
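  // The fake ambiguity choices added above carry negative ratings while the
  // original letters have rating 0.0, so dawg_permute_and_select() can only
  // return a word with a negative total rating if it used at least one
  // ambiguity substitution; that is what the rating() < 0.0 test below checks.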
  if (ambigs_found) {
    if (stopper_debug_level > 2) {
      tprintf("\nResulting ambig_blob_choices:\n");
      for (i = 0; i < ambig_blob_choices.length(); ++i) {
        print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
        tprintf("\n");
      }
    }
    WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
    ambigs_found = (alt_word->rating() < 0.0);
    if (ambigs_found) {
      if (stopper_debug_level >= 1) {
        tprintf("Stopper: Possible ambiguous word = %s\n",
                alt_word->debug_string().string());
      }
      if (fixpt != NULL) {
        // Note: Currently character choices combined from fragments can only
        // be generated by NoDangerousAmbig(). This code should be updated if
        // the capability to produce classifications combined from character
        // fragments is added to other functions.
        int orig_i = 0;
        for (i = 0; i < alt_word->length(); ++i) {
          const UNICHARSET &uchset = getUnicharset();
          bool replacement_is_ngram =
              uchset.get_isngram(alt_word->unichar_id(i));
          UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
          if (replacement_is_ngram) {
            // We have to extract the leftmost unichar from the ngram.
            const char *str = uchset.id_to_unichar(leftmost_id);
            int step = uchset.step(str);
            if (step) leftmost_id = uchset.unichar_to_id(str, step);
          }
          int end_i = orig_i + alt_word->state(i);
          if (alt_word->state(i) > 1 ||
              (orig_i + 1 == end_i && replacement_is_ngram)) {
            // Compute proper blob indices.
            int blob_start = 0;
            for (int j = 0; j < orig_i; ++j)
              blob_start += best_choice->state(j);
            int blob_end = blob_start;
            for (int j = orig_i; j < end_i; ++j)
              blob_end += best_choice->state(j);
            fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
                                          replacement_is_ngram, leftmost_id));
            if (stopper_debug_level > 1) {
              tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
                      true, replacement_is_ngram,
                      uchset.id_to_unichar(leftmost_id));
            }
          }
          orig_i += alt_word->state(i);
        }
      }
    }
    delete alt_word;
  }
  if (output_ambig_words_file_ != NULL) {
    fprintf(output_ambig_words_file_, "\n");
  }

  ambig_blob_choices.delete_data_pointers();
  return !ambigs_found;
}

void Dict::EndDangerousAmbigs() {}

void Dict::SettupStopperPass1() {
  reject_offset_ = 0.0;
}

void Dict::SettupStopperPass2() {
  reject_offset_ = stopper_phase2_certainty_rejection_offset;
}

void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
                        UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
                        MATRIX *ratings) {
  int num_blobs_to_replace = 0;
  int begin_blob_index = 0;
  int i;
  // Rating and certainty for the new BLOB_CHOICE are derived from the
  // replaced choices.
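  // The rating is the sum over the replaced choices; the certainty is their
  // average (note the division by wrong_ngram_size after the loop).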
  float new_rating = 0.0f;
  float new_certainty = 0.0f;
  BLOB_CHOICE* old_choice = NULL;
  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
    if (i >= wrong_ngram_begin_index) {
      int num_blobs = werd_choice->state(i);
      int col = begin_blob_index + num_blobs_to_replace;
      int row = col + num_blobs - 1;
      BLOB_CHOICE_LIST* choices = ratings->get(col, row);
      ASSERT_HOST(choices != NULL);
      old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
      ASSERT_HOST(old_choice != NULL);
      new_rating += old_choice->rating();
      new_certainty += old_choice->certainty();
      num_blobs_to_replace += num_blobs;
    } else {
      begin_blob_index += werd_choice->state(i);
    }
  }
  new_certainty /= wrong_ngram_size;
  // If there is no entry in the ratings matrix, add it.
  MATRIX_COORD coord(begin_blob_index,
                     begin_blob_index + num_blobs_to_replace - 1);
  if (!coord.Valid(*ratings)) {
    ratings->IncreaseBandSize(coord.row - coord.col + 1);
  }
  if (ratings->get(coord.col, coord.row) == NULL)
    ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
  if (choice != NULL) {
    // Already there. Upgrade if new rating better.
    if (new_rating < choice->rating())
      choice->set_rating(new_rating);
    if (new_certainty < choice->certainty())
      choice->set_certainty(new_certainty);
    // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
  } else {
    // Need a new choice with the correct_ngram_id.
    choice = new BLOB_CHOICE(*old_choice);
    choice->set_unichar_id(correct_ngram_id);
    choice->set_rating(new_rating);
    choice->set_certainty(new_certainty);
    choice->set_classifier(BCC_AMBIG);
    choice->set_matrix_cell(coord.col, coord.row);
    BLOB_CHOICE_IT it(new_choices);
    it.add_to_end(choice);
  }
  // Remove current unichar from werd_choice. On the last iteration
  // set the correct replacement unichar instead of removing a unichar.
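  // Removals happen at wrong_ngram_begin_index + 1 so that the entry at
  // wrong_ngram_begin_index survives to receive the replacement blob choice.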
  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
       ++replaced_count) {
    if (replaced_count + 1 == wrong_ngram_size) {
      werd_choice->set_blob_choice(wrong_ngram_begin_index,
                                   num_blobs_to_replace, choice);
    } else {
      werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
    }
  }
  if (stopper_debug_level >= 1) {
    werd_choice->print("ReplaceAmbig() ");
    tprintf("Modified blob_choices: ");
    print_ratings_list("\n", new_choices, getUnicharset());
  }
}

int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) {
  int shortest = MAX_INT32;
  int curr_len = 0;
  for (int w = 0; w < WordChoice.length(); ++w) {
    if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) {
      curr_len++;
    } else if (curr_len > 0) {
      if (curr_len < shortest) shortest = curr_len;
      curr_len = 0;
    }
  }
  if (curr_len > 0 && curr_len < shortest) {
    shortest = curr_len;
  } else if (shortest == MAX_INT32) {
    shortest = 0;
  }
  return shortest;
}

int Dict::UniformCertainties(const WERD_CHOICE& word) {
  float Certainty;
  float WorstCertainty = MAX_FLOAT32;
  float CertaintyThreshold;
  FLOAT64 TotalCertainty;
  FLOAT64 TotalCertaintySquared;
  FLOAT64 Variance;
  FLOAT32 Mean, StdDev;
  int word_length = word.length();

  if (word_length < 3)
    return true;

  TotalCertainty = TotalCertaintySquared = 0.0;
  for (int i = 0; i < word_length; ++i) {
    Certainty = word.certainty(i);
    TotalCertainty += Certainty;
    TotalCertaintySquared += Certainty * Certainty;
    if (Certainty < WorstCertainty)
      WorstCertainty = Certainty;
  }

  // Subtract off worst certainty from statistics.
  word_length--;
  TotalCertainty -= WorstCertainty;
  TotalCertaintySquared -= WorstCertainty * WorstCertainty;

  Mean = TotalCertainty / word_length;
  Variance = ((word_length * TotalCertaintySquared -
               TotalCertainty * TotalCertainty) /
              (word_length * (word_length - 1)));
  if (Variance < 0.0)
    Variance = 0.0;
  StdDev = sqrt(Variance);

  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
  if (CertaintyThreshold > stopper_nondict_certainty_base)
    CertaintyThreshold = stopper_nondict_certainty_base;

  if (word.certainty() < CertaintyThreshold) {
    if (stopper_debug_level >= 1)
      tprintf("Stopper: Non-uniform certainty = %4.1f"
              " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
              word.certainty(), Mean, StdDev, CertaintyThreshold);
    return false;
  } else {
    return true;
  }
}

}  // namespace tesseract