tesseract
3.03
|
00001 00002 // File: blamer.h 00003 // Description: Module allowing precise error causes to be allocated. 00004 // Author: Rike Antonova 00005 // Refactored: Ray Smith 00006 // Created: Mon Feb 04 14:37:01 PST 2013 00007 // 00008 // (C) Copyright 2013, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #ifndef TESSERACT_CCSTRUCT_BLAMER_H_ 00022 #define TESSERACT_CCSTRUCT_BLAMER_H_ 00023 00024 #include <stdio.h> 00025 #include "boxword.h" 00026 #include "genericvector.h" 00027 #include "matrix.h" 00028 #include "params_training_featdef.h" 00029 #include "ratngs.h" 00030 #include "strngs.h" 00031 #include "tesscallback.h" 00032 00033 static const inT16 kBlamerBoxTolerance = 5; 00034 00035 // Enum for expressing the source of error. 00036 // Note: Please update kIncorrectResultReasonNames when modifying this enum. 00037 enum IncorrectResultReason { 00038 // The text recorded in best choice == truth text 00039 IRR_CORRECT, 00040 // Either: Top choice is incorrect and is a dictionary word (language model 00041 // is unlikely to help correct such errors, so blame the classifier). 00042 // Or: the correct unichar was not included in shortlist produced by the 00043 // classifier at all. 00044 IRR_CLASSIFIER, 00045 // Chopper have not found one or more splits that correspond to the correct 00046 // character bounding boxes recorded in BlamerBundle::truth_word. 00047 IRR_CHOPPER, 00048 // Classifier did include correct unichars for each blob in the correct 00049 // segmentation, however its rating could have been too bad to allow the 00050 // language model to pull out the correct choice. On the other hand the 00051 // strength of the language model might have been too weak to favor the 00052 // correct answer, this we call this case a classifier-language model 00053 // tradeoff error. 00054 IRR_CLASS_LM_TRADEOFF, 00055 // Page layout failed to produce the correct bounding box. Blame page layout 00056 // if the truth was not found for the word, which implies that the bounding 00057 // box of the word was incorrect (no truth word had a similar bounding box). 00058 IRR_PAGE_LAYOUT, 00059 // SegSearch heuristic prevented one or more blobs from the correct 00060 // segmentation state to be classified (e.g. the blob was too wide). 00061 IRR_SEGSEARCH_HEUR, 00062 // The correct segmentaiton state was not explored because of poor SegSearch 00063 // pain point prioritization. We blame SegSearch pain point prioritization 00064 // if the best rating of a choice constructed from correct segmentation is 00065 // better than that of the best choice (i.e. if we got to explore the correct 00066 // segmentation state, language model would have picked the correct choice). 00067 IRR_SEGSEARCH_PP, 00068 // Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word, 00069 // and thus use the old language model (permuters). 00070 // TODO(antonova): integrate the new language mode with chopper 00071 IRR_CLASS_OLD_LM_TRADEOFF, 00072 // If there is an incorrect adaptive template match with a better score than 00073 // a correct one (either pre-trained or adapted), mark this as adaption error. 00074 IRR_ADAPTION, 00075 // split_and_recog_word() failed to find a suitable split in truth. 00076 IRR_NO_TRUTH_SPLIT, 00077 // Truth is not available for this word (e.g. when words in corrected content 00078 // file are turned into ~~~~ because an appropriate alignment was not found. 00079 IRR_NO_TRUTH, 00080 // The text recorded in best choice != truth text, but none of the above 00081 // reasons are set. 00082 IRR_UNKNOWN, 00083 00084 IRR_NUM_REASONS 00085 }; 00086 00087 // Blamer-related information to determine the source of errors. 00088 struct BlamerBundle { 00089 static const char *IncorrectReasonName(IncorrectResultReason irr); 00090 BlamerBundle() : truth_has_char_boxes_(false), 00091 incorrect_result_reason_(IRR_CORRECT), 00092 lattice_data_(NULL) { ClearResults(); } 00093 BlamerBundle(const BlamerBundle &other) { 00094 this->CopyTruth(other); 00095 this->CopyResults(other); 00096 } 00097 ~BlamerBundle() { delete[] lattice_data_; } 00098 00099 // Accessors. 00100 STRING TruthString() const { 00101 STRING truth_str; 00102 for (int i = 0; i < truth_text_.length(); ++i) 00103 truth_str += truth_text_[i]; 00104 return truth_str; 00105 } 00106 IncorrectResultReason incorrect_result_reason() const { 00107 return incorrect_result_reason_; 00108 } 00109 bool NoTruth() const { 00110 return incorrect_result_reason_ == IRR_NO_TRUTH || 00111 incorrect_result_reason_ == IRR_PAGE_LAYOUT; 00112 } 00113 bool HasDebugInfo() const { 00114 return debug_.length() > 0 || misadaption_debug_.length() > 0; 00115 } 00116 const STRING& debug() const { 00117 return debug_; 00118 } 00119 const STRING& misadaption_debug() const { 00120 return misadaption_debug_; 00121 } 00122 void UpdateBestRating(float rating) { 00123 if (rating < best_correctly_segmented_rating_) 00124 best_correctly_segmented_rating_ = rating; 00125 } 00126 int correct_segmentation_length() const { 00127 return correct_segmentation_cols_.length(); 00128 } 00129 // Returns true if the given ratings matrix col,row position is included 00130 // in the correct segmentation path at the given index. 00131 bool MatrixPositionCorrect(int index, const MATRIX_COORD& coord) { 00132 return correct_segmentation_cols_[index] == coord.col && 00133 correct_segmentation_rows_[index] == coord.row; 00134 } 00135 void set_best_choice_is_dict_and_top_choice(bool value) { 00136 best_choice_is_dict_and_top_choice_ = value; 00137 } 00138 const char* lattice_data() const { 00139 return lattice_data_; 00140 } 00141 int lattice_size() const { 00142 return lattice_size_; // size of lattice_data in bytes 00143 } 00144 void set_lattice_data(const char* data, int size) { 00145 lattice_size_ = size; 00146 delete [] lattice_data_; 00147 lattice_data_ = new char[lattice_size_]; 00148 memcpy(lattice_data_, data, lattice_size_); 00149 } 00150 const tesseract::ParamsTrainingBundle& params_training_bundle() const { 00151 return params_training_bundle_; 00152 } 00153 // Adds a new ParamsTrainingHypothesis to the current hypothesis list. 00154 void AddHypothesis(const tesseract::ParamsTrainingHypothesis& hypo) { 00155 params_training_bundle_.AddHypothesis(hypo); 00156 } 00157 00158 // Functions to setup the blamer. 00159 // Whole word string, whole word bounding box. 00160 void SetWordTruth(const UNICHARSET& unicharset, 00161 const char* truth_str, const TBOX& word_box); 00162 // Single "character" string, "character" bounding box. 00163 // May be called multiple times to indicate the characters in a word. 00164 void SetSymbolTruth(const UNICHARSET& unicharset, 00165 const char* char_str, const TBOX& char_box); 00166 // Marks that there is something wrong with the truth text, like it contains 00167 // reject characters. 00168 void SetRejectedTruth(); 00169 00170 // Returns true if the provided word_choice is correct. 00171 bool ChoiceIsCorrect(const WERD_CHOICE* word_choice) const; 00172 00173 void ClearResults() { 00174 norm_truth_word_.DeleteAllBoxes(); 00175 norm_box_tolerance_ = 0; 00176 if (!NoTruth()) incorrect_result_reason_ = IRR_CORRECT; 00177 debug_ = ""; 00178 segsearch_is_looking_for_blame_ = false; 00179 best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating; 00180 correct_segmentation_cols_.clear(); 00181 correct_segmentation_rows_.clear(); 00182 best_choice_is_dict_and_top_choice_ = false; 00183 delete[] lattice_data_; 00184 lattice_data_ = NULL; 00185 lattice_size_ = 0; 00186 } 00187 void CopyTruth(const BlamerBundle &other) { 00188 truth_has_char_boxes_ = other.truth_has_char_boxes_; 00189 truth_word_ = other.truth_word_; 00190 truth_text_ = other.truth_text_; 00191 incorrect_result_reason_ = 00192 (other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT); 00193 } 00194 void CopyResults(const BlamerBundle &other) { 00195 norm_truth_word_ = other.norm_truth_word_; 00196 norm_box_tolerance_ = other.norm_box_tolerance_; 00197 incorrect_result_reason_ = other.incorrect_result_reason_; 00198 segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_; 00199 best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_; 00200 correct_segmentation_cols_ = other.correct_segmentation_cols_; 00201 correct_segmentation_rows_ = other.correct_segmentation_rows_; 00202 best_choice_is_dict_and_top_choice_ = 00203 other.best_choice_is_dict_and_top_choice_; 00204 if (other.lattice_data_ != NULL) { 00205 lattice_data_ = new char[other.lattice_size_]; 00206 memcpy(lattice_data_, other.lattice_data_, other.lattice_size_); 00207 lattice_size_ = other.lattice_size_; 00208 } else { 00209 lattice_data_ = NULL; 00210 } 00211 } 00212 const char *IncorrectReason() const; 00213 00214 // Appends choice and truth details to the given debug string. 00215 void FillDebugString(const STRING &msg, const WERD_CHOICE *choice, 00216 STRING *debug); 00217 00218 // Sets up the norm_truth_word from truth_word using the given DENORM. 00219 void SetupNormTruthWord(const DENORM& denorm); 00220 00221 // Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty 00222 // bundles) where the right edge/ of the left-hand word is word1_right, 00223 // and the left edge of the right-hand word is word2_left. 00224 void SplitBundle(int word1_right, int word2_left, bool debug, 00225 BlamerBundle* bundle1, BlamerBundle* bundle2) const; 00226 // "Joins" the blames from bundle1 and bundle2 into *this. 00227 void JoinBlames(const BlamerBundle& bundle1, const BlamerBundle& bundle2, 00228 bool debug); 00229 00230 // If a blob with the same bounding box as one of the truth character 00231 // bounding boxes is not classified as the corresponding truth character 00232 // blames character classifier for incorrect answer. 00233 void BlameClassifier(const UNICHARSET& unicharset, 00234 const TBOX& blob_box, 00235 const BLOB_CHOICE_LIST& choices, 00236 bool debug); 00237 00238 00239 // Checks whether chops were made at all the character bounding box 00240 // boundaries in word->truth_word. If not - blames the chopper for an 00241 // incorrect answer. 00242 void SetChopperBlame(const WERD_RES* word, bool debug); 00243 // Blames the classifier or the language model if, after running only the 00244 // chopper, best_choice is incorrect and no blame has been yet set. 00245 // Blames the classifier if best_choice is classifier's top choice and is a 00246 // dictionary word (i.e. language model could not have helped). 00247 // Otherwise, blames the language model (formerly permuter word adjustment). 00248 void BlameClassifierOrLangModel( 00249 const WERD_RES* word, 00250 const UNICHARSET& unicharset, bool valid_permuter, bool debug); 00251 // Sets up the correct_segmentation_* to mark the correct bounding boxes. 00252 void SetupCorrectSegmentation(const TWERD* word, bool debug); 00253 00254 // Returns true if a guided segmentation search is needed. 00255 bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const; 00256 // Setup ready to guide the segmentation search to the correct segmentation. 00257 // The callback pp_cb is used to avoid a cyclic dependency. 00258 // It calls into LMPainPoints::GenerateForBlamer by pre-binding the 00259 // WERD_RES, and the LMPainPoints itself. 00260 // pp_cb must be a permanent callback, and should be deleted by the caller. 00261 void InitForSegSearch(const WERD_CHOICE *best_choice, 00262 MATRIX* ratings, UNICHAR_ID wildcard_id, 00263 bool debug, STRING *debug_str, 00264 TessResultCallback2<bool, int, int>* pp_cb); 00265 // Returns true if the guided segsearch is in progress. 00266 bool GuidedSegsearchStillGoing() const; 00267 // The segmentation search has ended. Sets the blame appropriately. 00268 void FinishSegSearch(const WERD_CHOICE *best_choice, 00269 bool debug, STRING *debug_str); 00270 00271 // If the bundle is null or still does not indicate the correct result, 00272 // fix it and use some backup reason for the blame. 00273 static void LastChanceBlame(bool debug, WERD_RES* word); 00274 00275 // Sets the misadaption debug if this word is incorrect, as this word is 00276 // being adapted to. 00277 void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug); 00278 00279 private: 00280 void SetBlame(IncorrectResultReason irr, const STRING &msg, 00281 const WERD_CHOICE *choice, bool debug) { 00282 incorrect_result_reason_ = irr; 00283 debug_ = IncorrectReason(); 00284 debug_ += " to blame: "; 00285 FillDebugString(msg, choice, &debug_); 00286 if (debug) tprintf("SetBlame(): %s", debug_.string()); 00287 } 00288 00289 private: 00290 // Set to true when bounding boxes for individual unichars are recorded. 00291 bool truth_has_char_boxes_; 00292 // The true_word (in the original image coordinate space) contains ground 00293 // truth bounding boxes for this WERD_RES. 00294 tesseract::BoxWord truth_word_; 00295 // Same as above, but in normalized coordinates 00296 // (filled in by WERD_RES::SetupForRecognition()). 00297 tesseract::BoxWord norm_truth_word_; 00298 // Tolerance for bounding box comparisons in normalized space. 00299 int norm_box_tolerance_; 00300 // Contains ground truth unichar for each of the bounding boxes in truth_word. 00301 GenericVector<STRING> truth_text_; 00302 // The reason for incorrect OCR result. 00303 IncorrectResultReason incorrect_result_reason_; 00304 // Debug text associated with the blame. 00305 STRING debug_; 00306 // Misadaption debug information (filled in if this word was misadapted to). 00307 STRING misadaption_debug_; 00308 // Variables used by the segmentation search when looking for the blame. 00309 // Set to true while segmentation search is continued after the usual 00310 // termination condition in order to look for the blame. 00311 bool segsearch_is_looking_for_blame_; 00312 // Best rating for correctly segmented path 00313 // (set and used by SegSearch when looking for blame). 00314 float best_correctly_segmented_rating_; 00315 // Vectors populated by SegSearch to indicate column and row indices that 00316 // correspond to blobs with correct bounding boxes. 00317 GenericVector<int> correct_segmentation_cols_; 00318 GenericVector<int> correct_segmentation_rows_; 00319 // Set to true if best choice is a dictionary word and 00320 // classifier's top choice. 00321 bool best_choice_is_dict_and_top_choice_; 00322 // Serialized segmentation search lattice. 00323 char *lattice_data_; 00324 int lattice_size_; // size of lattice_data in bytes 00325 // Information about hypotheses (paths) explored by the segmentation search. 00326 tesseract::ParamsTrainingBundle params_training_bundle_; 00327 }; 00328 00329 00330 #endif // TESSERACT_CCSTRUCT_BLAMER_H_