tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/dict/stopper.cpp
Go to the documentation of this file.
00001 /******************************************************************************
00002  **     Filename:    stopper.c
00003  **     Purpose:     Stopping criteria for word classifier.
00004  **     Author:      Dan Johnson
00005  **     History:     Mon Apr 29 14:56:49 1991, DSJ, Created.
00006  **
00007  **     (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00018 
00019 #include <stdio.h>
00020 #include <string.h>
00021 #include <ctype.h>
00022 #include <math.h>
00023 
00024 #include "stopper.h"
00025 #include "ambigs.h"
00026 #include "ccutil.h"
00027 #include "const.h"
00028 #include "danerror.h"
00029 #include "dict.h"
00030 #include "efio.h"
00031 #include "helpers.h"
00032 #include "matchdefs.h"
00033 #include "pageres.h"
00034 #include "params.h"
00035 #include "ratngs.h"
00036 #include "scanutils.h"
00037 #include "unichar.h"
00038 
00039 #ifdef _MSC_VER
00040 #pragma warning(disable:4244)  // Conversion warnings
00041 #pragma warning(disable:4800)  // int/bool warnings
00042 #endif
00043 
00044 using tesseract::ScriptPos;
00049 namespace tesseract {
00050 
00051 bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice,
00052                             XHeightConsistencyEnum xheight_consistency) {
00053   float CertaintyThreshold = stopper_nondict_certainty_base;
00054   int WordSize;
00055 
00056   if (stopper_no_acceptable_choices) return false;
00057 
00058   if (best_choice.length() == 0) return false;
00059 
00060   bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
00061   bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
00062   bool is_case_ok = case_ok(best_choice, getUnicharset());
00063 
00064   if (stopper_debug_level >= 1) {
00065     const char *xht = "UNKNOWN";
00066     switch (xheight_consistency) {
00067       case XH_GOOD:  xht = "NORMAL"; break;
00068       case XH_SUBNORMAL:  xht = "SUBNORMAL"; break;
00069       case XH_INCONSISTENT:  xht = "INCONSISTENT"; break;
00070       default: xht = "UNKNOWN";
00071     }
00072     tprintf("\nStopper:  %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
00073             best_choice.unichar_string().string(),
00074             (is_valid_word ? 'y' : 'n'),
00075             (is_case_ok ? 'y' : 'n'),
00076             xht,
00077             best_choice.min_x_height(),
00078             best_choice.max_x_height());
00079   }
00080   // Do not accept invalid words in PASS1.
00081   if (reject_offset_ <= 0.0f && !is_valid_word) return false;
00082   if (is_valid_word && is_case_ok) {
00083     WordSize = LengthOfShortestAlphaRun(best_choice);
00084     WordSize -= stopper_smallword_size;
00085     if (WordSize < 0)
00086       WordSize = 0;
00087     CertaintyThreshold += WordSize * stopper_certainty_per_char;
00088   }
00089 
00090   if (stopper_debug_level >= 1)
00091     tprintf("Stopper:  Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
00092             best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
00093 
00094   if (no_dang_ambigs &&
00095       best_choice.certainty() > CertaintyThreshold &&
00096       xheight_consistency < XH_INCONSISTENT &&
00097       UniformCertainties(best_choice)) {
00098     return true;
00099   } else {
00100     if (stopper_debug_level >= 1) {
00101       tprintf("AcceptableChoice() returned false"
00102               " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
00103               no_dang_ambigs, best_choice.certainty(),
00104               CertaintyThreshold,
00105               UniformCertainties(best_choice));
00106     }
00107     return false;
00108   }
00109 }
00110 
00111 bool Dict::AcceptableResult(WERD_RES* word) {
00112   if (word->best_choice == NULL) return false;
00113   float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
00114   int WordSize;
00115 
00116   if (stopper_debug_level >= 1) {
00117     tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
00118             word->best_choice->debug_string().string(),
00119             (valid_word(*word->best_choice) ? 'y' : 'n'),
00120             (case_ok(*word->best_choice, getUnicharset()) ? 'y' : 'n'),
00121             word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
00122             word->best_choices.singleton() ? 'n' : 'y');
00123   }
00124 
00125   if (word->best_choice->length() == 0 || !word->best_choices.singleton())
00126     return false;
00127   if (valid_word(*word->best_choice) &&
00128       case_ok(*word->best_choice, getUnicharset())) {
00129     WordSize = LengthOfShortestAlphaRun(*word->best_choice);
00130     WordSize -= stopper_smallword_size;
00131     if (WordSize < 0)
00132       WordSize = 0;
00133     CertaintyThreshold += WordSize * stopper_certainty_per_char;
00134   }
00135 
00136   if (stopper_debug_level >= 1)
00137     tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f   ",
00138             word->best_choice->certainty(), CertaintyThreshold);
00139 
00140   if (word->best_choice->certainty() > CertaintyThreshold &&
00141       !stopper_no_acceptable_choices) {
00142     if (stopper_debug_level >= 1)
00143       tprintf("ACCEPTED\n");
00144     return true;
00145   } else {
00146     if (stopper_debug_level >= 1)
00147       tprintf("REJECTED\n");
00148     return false;
00149   }
00150 }
00151 
00152 bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
00153                             DANGERR *fixpt,
00154                             bool fix_replaceable,
00155                             MATRIX *ratings) {
00156   if (stopper_debug_level > 2) {
00157     tprintf("\nRunning NoDangerousAmbig() for %s\n",
00158             best_choice->debug_string().string());
00159   }
00160 
00161   // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
00162   // for each unichar id in BestChoice.
00163   BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
00164   int i;
00165   bool ambigs_found = false;
00166   // For each position in best_choice:
00167   // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
00168   // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
00169   // -- look for ambiguities corresponding to wrong_ngram in the list while
00170   //    adding the following unichar_ids from best_choice to wrong_ngram
00171   //
00172   // Repeat the above procedure twice: first time look through
00173   // ambigs to be replaced and replace all the ambiguities found;
00174   // second time look through dangerous ambiguities and construct
00175   // ambig_blob_choices with fake a blob choice for each ambiguity
00176   // and pass them to dawg_permute_and_select() to search for
00177   // ambiguous words in the dictionaries.
00178   //
00179   // Note that during the execution of the for loop (on the first pass)
00180   // if replacements are made the length of best_choice might change.
00181   for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
00182     bool replace = (fix_replaceable && pass == 0);
00183     const UnicharAmbigsVector &table = replace ?
00184       getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
00185     if (!replace) {
00186       // Initialize ambig_blob_choices with lists containing a single
00187       // unichar id for the correspoding position in best_choice.
00188       // best_choice consisting from only the original letters will
00189       // have a rating of 0.0.
00190       for (i = 0; i < best_choice->length(); ++i) {
00191         BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
00192         BLOB_CHOICE_IT lst_it(lst);
00193         // TODO(rays/antonova) Put real xheights and y shifts here.
00194         lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
00195                                           0.0, 0.0, -1, -1, -1, 0, 1, 0,
00196                                           BCC_AMBIG));
00197         ambig_blob_choices.push_back(lst);
00198       }
00199     }
00200     UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
00201     int wrong_ngram_index;
00202     int next_index;
00203     int blob_index = 0;
00204     for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
00205          ++i) {
00206       UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
00207       if (stopper_debug_level > 2) {
00208         tprintf("Looking for %s ngrams starting with %s:\n",
00209                 replace ? "replaceable" : "ambiguous",
00210                 getUnicharset().debug_str(curr_unichar_id).string());
00211       }
00212       int num_wrong_blobs = best_choice->state(i);
00213       wrong_ngram_index = 0;
00214       wrong_ngram[wrong_ngram_index] = curr_unichar_id;
00215       if (curr_unichar_id == INVALID_UNICHAR_ID ||
00216           curr_unichar_id >= table.size() ||
00217           table[curr_unichar_id] == NULL) {
00218         continue;  // there is no ambig spec for this unichar id
00219       }
00220       AmbigSpec_IT spec_it(table[curr_unichar_id]);
00221       for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
00222         const AmbigSpec *ambig_spec = spec_it.data();
00223         wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
00224         int compare = UnicharIdArrayUtils::compare(wrong_ngram,
00225                                                    ambig_spec->wrong_ngram);
00226         if (stopper_debug_level > 2) {
00227           tprintf("candidate ngram: ");
00228           UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
00229           tprintf("current ngram from spec: ");
00230           UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
00231           tprintf("comparison result: %d\n", compare);
00232         }
00233         if (compare == 0) {
00234           // Record the place where we found an ambiguity.
00235           if (fixpt != NULL) {
00236             UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
00237             fixpt->push_back(DANGERR_INFO(
00238                 blob_index, blob_index + num_wrong_blobs, replace,
00239                 getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
00240                 leftmost_id));
00241             if (stopper_debug_level > 1) {
00242               tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
00243                       blob_index + num_wrong_blobs, false,
00244                       getUnicharset().get_isngram(
00245                           ambig_spec->correct_ngram_id),
00246                       getUnicharset().id_to_unichar(leftmost_id));
00247             }
00248           }
00249 
00250           if (replace) {
00251             if (stopper_debug_level > 2) {
00252               tprintf("replace ambiguity with %s : ",
00253                       getUnicharset().id_to_unichar(
00254                           ambig_spec->correct_ngram_id));
00255               UnicharIdArrayUtils::print(
00256                   ambig_spec->correct_fragments, getUnicharset());
00257             }
00258             ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
00259                          ambig_spec->correct_ngram_id,
00260                          best_choice, ratings);
00261           } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
00262             // We found dang ambig - update ambig_blob_choices.
00263             if (stopper_debug_level > 2) {
00264               tprintf("found ambiguity: ");
00265               UnicharIdArrayUtils::print(
00266                   ambig_spec->correct_fragments, getUnicharset());
00267             }
00268             ambigs_found = true;
00269             for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
00270                  ++tmp_index) {
00271               // Add a blob choice for the corresponding fragment of the
00272               // ambiguity. These fake blob choices are initialized with
00273               // negative ratings (which are not possible for real blob
00274               // choices), so that dawg_permute_and_select() considers any
00275               // word not consisting of only the original letters a better
00276               // choice and stops searching for alternatives once such a
00277               // choice is found.
00278               BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
00279               bc_it.add_to_end(new BLOB_CHOICE(
00280                   ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
00281                   -1, -1, -1, 0, 1, 0, BCC_AMBIG));
00282             }
00283           }
00284           spec_it.forward();
00285         } else if (compare == -1) {
00286           if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
00287               ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
00288             // Add the next unichar id to wrong_ngram and keep looking for
00289             // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
00290             wrong_ngram[++wrong_ngram_index] =
00291               best_choice->unichar_id(next_index);
00292             num_wrong_blobs += best_choice->state(next_index);
00293           } else {
00294             break;  // no more matching ambigs in this AMBIG_SPEC_LIST
00295           }
00296         } else {
00297           spec_it.forward();
00298         }
00299       }  // end searching AmbigSpec_LIST
00300     }  // end searching best_choice
00301   }  // end searching replace and dangerous ambigs
00302 
00303   // If any ambiguities were found permute the constructed ambig_blob_choices
00304   // to see if an alternative dictionary word can be found.
00305   if (ambigs_found) {
00306     if (stopper_debug_level > 2) {
00307       tprintf("\nResulting ambig_blob_choices:\n");
00308       for (i = 0; i < ambig_blob_choices.length(); ++i) {
00309         print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
00310         tprintf("\n");
00311       }
00312     }
00313     WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
00314     ambigs_found = (alt_word->rating() < 0.0);
00315     if (ambigs_found) {
00316       if (stopper_debug_level >= 1) {
00317         tprintf ("Stopper: Possible ambiguous word = %s\n",
00318                  alt_word->debug_string().string());
00319       }
00320       if (fixpt != NULL) {
00321         // Note: Currently character choices combined from fragments can only
00322         // be generated by NoDangrousAmbigs(). This code should be updated if
00323         // the capability to produce classifications combined from character
00324         // fragments is added to other functions.
00325         int orig_i = 0;
00326         for (i = 0; i < alt_word->length(); ++i) {
00327           const UNICHARSET &uchset = getUnicharset();
00328           bool replacement_is_ngram =
00329               uchset.get_isngram(alt_word->unichar_id(i));
00330           UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
00331           if (replacement_is_ngram) {
00332             // we have to extract the leftmost unichar from the ngram.
00333             const char *str = uchset.id_to_unichar(leftmost_id);
00334             int step = uchset.step(str);
00335             if (step) leftmost_id = uchset.unichar_to_id(str, step);
00336           }
00337           int end_i = orig_i + alt_word->state(i);
00338           if (alt_word->state(i) > 1 ||
00339               (orig_i + 1 == end_i && replacement_is_ngram)) {
00340             // Compute proper blob indices.
00341             int blob_start = 0;
00342             for (int j = 0; j < orig_i; ++j)
00343               blob_start += best_choice->state(j);
00344             int blob_end = blob_start;
00345             for (int j = orig_i; j < end_i; ++j)
00346               blob_end += best_choice->state(j);
00347             fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
00348                                           replacement_is_ngram, leftmost_id));
00349             if (stopper_debug_level > 1) {
00350               tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
00351                       true, replacement_is_ngram,
00352                       uchset.id_to_unichar(leftmost_id));
00353             }
00354           }
00355           orig_i += alt_word->state(i);
00356         }
00357       }
00358     }
00359     delete alt_word;
00360   }
00361   if (output_ambig_words_file_ != NULL) {
00362     fprintf(output_ambig_words_file_, "\n");
00363   }
00364 
00365   ambig_blob_choices.delete_data_pointers();
00366   return !ambigs_found;
00367 }
00368 
00369 void Dict::EndDangerousAmbigs() {}
00370 
00371 void Dict::SettupStopperPass1() {
00372   reject_offset_ = 0.0;
00373 }
00374 
00375 void Dict::SettupStopperPass2() {
00376   reject_offset_ = stopper_phase2_certainty_rejection_offset;
00377 }
00378 
00379 void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
00380                         UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
00381                         MATRIX *ratings) {
00382   int num_blobs_to_replace = 0;
00383   int begin_blob_index = 0;
00384   int i;
00385   // Rating and certainty for the new BLOB_CHOICE are derived from the
00386   // replaced choices.
00387   float new_rating = 0.0f;
00388   float new_certainty = 0.0f;
00389   BLOB_CHOICE* old_choice = NULL;
00390   for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
00391     if (i >= wrong_ngram_begin_index) {
00392       int num_blobs = werd_choice->state(i);
00393       int col = begin_blob_index + num_blobs_to_replace;
00394       int row = col + num_blobs - 1;
00395       BLOB_CHOICE_LIST* choices = ratings->get(col, row);
00396       ASSERT_HOST(choices != NULL);
00397       old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
00398       ASSERT_HOST(old_choice != NULL);
00399       new_rating += old_choice->rating();
00400       new_certainty += old_choice->certainty();
00401       num_blobs_to_replace += num_blobs;
00402     } else {
00403       begin_blob_index += werd_choice->state(i);
00404     }
00405   }
00406   new_certainty /= wrong_ngram_size;
00407   // If there is no entry in the ratings matrix, add it.
00408   MATRIX_COORD coord(begin_blob_index,
00409                      begin_blob_index + num_blobs_to_replace - 1);
00410   if (!coord.Valid(*ratings)) {
00411     ratings->IncreaseBandSize(coord.row - coord.col + 1);
00412   }
00413   if (ratings->get(coord.col, coord.row) == NULL)
00414     ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
00415   BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
00416   BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
00417   if (choice != NULL) {
00418     // Already there. Upgrade if new rating better.
00419     if (new_rating < choice->rating())
00420       choice->set_rating(new_rating);
00421     if (new_certainty < choice->certainty())
00422       choice->set_certainty(new_certainty);
00423     // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
00424   } else {
00425     // Need a new choice with the correct_ngram_id.
00426     choice = new BLOB_CHOICE(*old_choice);
00427     choice->set_unichar_id(correct_ngram_id);
00428     choice->set_rating(new_rating);
00429     choice->set_certainty(new_certainty);
00430     choice->set_classifier(BCC_AMBIG);
00431     choice->set_matrix_cell(coord.col, coord.row);
00432     BLOB_CHOICE_IT it (new_choices);
00433     it.add_to_end(choice);
00434   }
00435   // Remove current unichar from werd_choice. On the last iteration
00436   // set the correct replacement unichar instead of removing a unichar.
00437   for (int replaced_count = 0; replaced_count < wrong_ngram_size;
00438        ++replaced_count) {
00439     if (replaced_count + 1 == wrong_ngram_size) {
00440       werd_choice->set_blob_choice(wrong_ngram_begin_index,
00441                                    num_blobs_to_replace, choice);
00442     } else {
00443       werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
00444     }
00445   }
00446   if (stopper_debug_level >= 1) {
00447       werd_choice->print("ReplaceAmbig() ");
00448       tprintf("Modified blob_choices: ");
00449       print_ratings_list("\n", new_choices, getUnicharset());
00450   }
00451 }
00452 
00453 int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) {
00454   int shortest = MAX_INT32;
00455   int curr_len = 0;
00456   for (int w = 0; w < WordChoice.length(); ++w) {
00457     if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) {
00458       curr_len++;
00459     } else if (curr_len > 0) {
00460       if (curr_len < shortest) shortest = curr_len;
00461       curr_len = 0;
00462     }
00463   }
00464   if (curr_len > 0 && curr_len < shortest) {
00465     shortest = curr_len;
00466   } else if (shortest == MAX_INT32) {
00467     shortest = 0;
00468   }
00469   return shortest;
00470 }
00471 
00472 int Dict::UniformCertainties(const WERD_CHOICE& word) {
00473   float Certainty;
00474   float WorstCertainty = MAX_FLOAT32;
00475   float CertaintyThreshold;
00476   FLOAT64 TotalCertainty;
00477   FLOAT64 TotalCertaintySquared;
00478   FLOAT64 Variance;
00479   FLOAT32 Mean, StdDev;
00480   int word_length = word.length();
00481 
00482   if (word_length < 3)
00483     return true;
00484 
00485   TotalCertainty = TotalCertaintySquared = 0.0;
00486   for (int i = 0; i < word_length; ++i) {
00487     Certainty = word.certainty(i);
00488     TotalCertainty += Certainty;
00489     TotalCertaintySquared += Certainty * Certainty;
00490     if (Certainty < WorstCertainty)
00491       WorstCertainty = Certainty;
00492   }
00493 
00494   // Subtract off worst certainty from statistics.
00495   word_length--;
00496   TotalCertainty -= WorstCertainty;
00497   TotalCertaintySquared -= WorstCertainty * WorstCertainty;
00498 
00499   Mean = TotalCertainty / word_length;
00500   Variance = ((word_length * TotalCertaintySquared -
00501     TotalCertainty * TotalCertainty) /
00502     (word_length * (word_length - 1)));
00503   if (Variance < 0.0)
00504     Variance = 0.0;
00505   StdDev = sqrt(Variance);
00506 
00507   CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
00508   if (CertaintyThreshold > stopper_nondict_certainty_base)
00509     CertaintyThreshold = stopper_nondict_certainty_base;
00510 
00511   if (word.certainty() < CertaintyThreshold) {
00512     if (stopper_debug_level >= 1)
00513       tprintf("Stopper: Non-uniform certainty = %4.1f"
00514               " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
00515               word.certainty(), Mean, StdDev, CertaintyThreshold);
00516     return false;
00517   } else {
00518     return true;
00519   }
00520 }
00521 
00522 } // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines