tesseract
3.03
|
00001 00002 // File: blamer.cpp 00003 // Description: Module allowing precise error causes to be allocated. 00004 // Author: Rike Antonova 00005 // Refactored: Ray Smith 00006 // Created: Mon Feb 04 14:37:01 PST 2013 00007 // 00008 // (C) Copyright 2013, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #include "blamer.h" 00022 #include "blobs.h" 00023 #include "matrix.h" 00024 #include "normalis.h" 00025 #include "pageres.h" 00026 00027 // Names for each value of IncorrectResultReason enum. Keep in sync. 00028 const char kBlameCorrect[] = "corr"; 00029 const char kBlameClassifier[] = "cl"; 00030 const char kBlameChopper[] = "chop"; 00031 const char kBlameClassLMTradeoff[] = "cl/LM"; 00032 const char kBlamePageLayout[] = "pglt"; 00033 const char kBlameSegsearchHeur[] = "ss_heur"; 00034 const char kBlameSegsearchPP[] = "ss_pp"; 00035 const char kBlameClassOldLMTradeoff[] = "cl/old_LM"; 00036 const char kBlameAdaption[] = "adapt"; 00037 const char kBlameNoTruthSplit[] = "no_tr_spl"; 00038 const char kBlameNoTruth[] = "no_tr"; 00039 const char kBlameUnknown[] = "unkn"; 00040 00041 const char * const kIncorrectResultReasonNames[] = { 00042 kBlameCorrect, 00043 kBlameClassifier, 00044 kBlameChopper, 00045 kBlameClassLMTradeoff, 00046 kBlamePageLayout, 00047 kBlameSegsearchHeur, 00048 kBlameSegsearchPP, 00049 kBlameClassOldLMTradeoff, 00050 kBlameAdaption, 00051 kBlameNoTruthSplit, 00052 kBlameNoTruth, 00053 kBlameUnknown 00054 }; 00055 00056 const char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) { 00057 return kIncorrectResultReasonNames[irr]; 00058 } 00059 00060 const char *BlamerBundle::IncorrectReason() const { 00061 return kIncorrectResultReasonNames[incorrect_result_reason_]; 00062 } 00063 00064 // Functions to setup the blamer. 00065 // Whole word string, whole word bounding box. 00066 void BlamerBundle::SetWordTruth(const UNICHARSET& unicharset, 00067 const char* truth_str, const TBOX& word_box) { 00068 truth_word_.InsertBox(0, word_box); 00069 truth_has_char_boxes_ = false; 00070 // Encode the string as UNICHAR_IDs. 00071 GenericVector<UNICHAR_ID> encoding; 00072 GenericVector<char> lengths; 00073 unicharset.encode_string(truth_str, false, &encoding, &lengths, NULL); 00074 int total_length = 0; 00075 for (int i = 0; i < encoding.size(); total_length += lengths[i++]) { 00076 STRING uch(truth_str + total_length); 00077 uch.truncate_at(lengths[i] - total_length); 00078 UNICHAR_ID id = encoding[i]; 00079 if (id != INVALID_UNICHAR_ID) uch = unicharset.get_normed_unichar(id); 00080 truth_text_.push_back(uch); 00081 } 00082 } 00083 00084 // Single "character" string, "character" bounding box. 00085 // May be called multiple times to indicate the characters in a word. 00086 void BlamerBundle::SetSymbolTruth(const UNICHARSET& unicharset, 00087 const char* char_str, const TBOX& char_box) { 00088 STRING symbol_str(char_str); 00089 UNICHAR_ID id = unicharset.unichar_to_id(char_str); 00090 if (id != INVALID_UNICHAR_ID) { 00091 STRING normed_uch(unicharset.get_normed_unichar(id)); 00092 if (normed_uch.length() > 0) symbol_str = normed_uch; 00093 } 00094 int length = truth_word_.length(); 00095 truth_text_.push_back(symbol_str); 00096 truth_word_.InsertBox(length, char_box); 00097 if (length == 0) 00098 truth_has_char_boxes_ = true; 00099 else if (truth_word_.BlobBox(length - 1) == char_box) 00100 truth_has_char_boxes_ = false; 00101 } 00102 00103 // Marks that there is something wrong with the truth text, like it contains 00104 // reject characters. 00105 void BlamerBundle::SetRejectedTruth() { 00106 incorrect_result_reason_ = IRR_NO_TRUTH; 00107 truth_has_char_boxes_ = false; 00108 } 00109 00110 // Returns true if the provided word_choice is correct. 00111 bool BlamerBundle::ChoiceIsCorrect(const WERD_CHOICE* word_choice) const { 00112 if (word_choice == NULL) return false; 00113 const UNICHARSET* uni_set = word_choice->unicharset(); 00114 STRING normed_choice_str; 00115 for (int i = 0; i < word_choice->length(); ++i) { 00116 normed_choice_str += 00117 uni_set->get_normed_unichar(word_choice->unichar_id(i)); 00118 } 00119 STRING truth_str = TruthString(); 00120 return truth_str == normed_choice_str; 00121 } 00122 00123 void BlamerBundle::FillDebugString(const STRING &msg, 00124 const WERD_CHOICE *choice, 00125 STRING *debug) { 00126 (*debug) += "Truth "; 00127 for (int i = 0; i < this->truth_text_.length(); ++i) { 00128 (*debug) += this->truth_text_[i]; 00129 } 00130 if (!this->truth_has_char_boxes_) (*debug) += " (no char boxes)"; 00131 if (choice != NULL) { 00132 (*debug) += " Choice "; 00133 STRING choice_str; 00134 choice->string_and_lengths(&choice_str, NULL); 00135 (*debug) += choice_str; 00136 } 00137 if (msg.length() > 0) { 00138 (*debug) += "\n"; 00139 (*debug) += msg; 00140 } 00141 (*debug) += "\n"; 00142 } 00143 00144 // Sets up the norm_truth_word from truth_word using the given DENORM. 00145 void BlamerBundle::SetupNormTruthWord(const DENORM& denorm) { 00146 // TODO(rays) Is this the last use of denorm in WERD_RES and can it go? 00147 norm_box_tolerance_ = kBlamerBoxTolerance * denorm.x_scale(); 00148 TPOINT topleft; 00149 TPOINT botright; 00150 TPOINT norm_topleft; 00151 TPOINT norm_botright; 00152 for (int b = 0; b < truth_word_.length(); ++b) { 00153 const TBOX &box = truth_word_.BlobBox(b); 00154 topleft.x = box.left(); 00155 topleft.y = box.top(); 00156 botright.x = box.right(); 00157 botright.y = box.bottom(); 00158 denorm.NormTransform(NULL, topleft, &norm_topleft); 00159 denorm.NormTransform(NULL, botright, &norm_botright); 00160 TBOX norm_box(norm_topleft.x, norm_botright.y, 00161 norm_botright.x, norm_topleft.y); 00162 norm_truth_word_.InsertBox(b, norm_box); 00163 } 00164 } 00165 00166 // Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty 00167 // bundles) where the right edge/ of the left-hand word is word1_right, 00168 // and the left edge of the right-hand word is word2_left. 00169 void BlamerBundle::SplitBundle(int word1_right, int word2_left, bool debug, 00170 BlamerBundle* bundle1, 00171 BlamerBundle* bundle2) const { 00172 STRING debug_str; 00173 // Find truth boxes that correspond to the split in the blobs. 00174 int b; 00175 int begin2_truth_index = -1; 00176 if (incorrect_result_reason_ != IRR_NO_TRUTH && 00177 truth_has_char_boxes_) { 00178 debug_str = "Looking for truth split at"; 00179 debug_str.add_str_int(" end1_x ", word1_right); 00180 debug_str.add_str_int(" begin2_x ", word2_left); 00181 debug_str += "\nnorm_truth_word boxes:\n"; 00182 if (norm_truth_word_.length() > 1) { 00183 norm_truth_word_.BlobBox(0).print_to_str(&debug_str); 00184 for (b = 1; b < norm_truth_word_.length(); ++b) { 00185 norm_truth_word_.BlobBox(b).print_to_str(&debug_str); 00186 if ((abs(word1_right - norm_truth_word_.BlobBox(b - 1).right()) < 00187 norm_box_tolerance_) && 00188 (abs(word2_left - norm_truth_word_.BlobBox(b).left()) < 00189 norm_box_tolerance_)) { 00190 begin2_truth_index = b; 00191 debug_str += "Split found"; 00192 break; 00193 } 00194 } 00195 debug_str += '\n'; 00196 } 00197 } 00198 // Populate truth information in word and word2 with the first and second 00199 // part of the original truth. 00200 if (begin2_truth_index > 0) { 00201 bundle1->truth_has_char_boxes_ = true; 00202 bundle1->norm_box_tolerance_ = norm_box_tolerance_; 00203 bundle2->truth_has_char_boxes_ = true; 00204 bundle2->norm_box_tolerance_ = norm_box_tolerance_; 00205 BlamerBundle *curr_bb = bundle1; 00206 for (b = 0; b < norm_truth_word_.length(); ++b) { 00207 if (b == begin2_truth_index) curr_bb = bundle2; 00208 curr_bb->norm_truth_word_.InsertBox(b, norm_truth_word_.BlobBox(b)); 00209 curr_bb->truth_word_.InsertBox(b, truth_word_.BlobBox(b)); 00210 curr_bb->truth_text_.push_back(truth_text_[b]); 00211 } 00212 } else if (incorrect_result_reason_ == IRR_NO_TRUTH) { 00213 bundle1->incorrect_result_reason_ = IRR_NO_TRUTH; 00214 bundle2->incorrect_result_reason_ = IRR_NO_TRUTH; 00215 } else { 00216 debug_str += "Truth split not found"; 00217 debug_str += truth_has_char_boxes_ ? 00218 "\n" : " (no truth char boxes)\n"; 00219 bundle1->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, NULL, debug); 00220 bundle2->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, NULL, debug); 00221 } 00222 } 00223 00224 // "Joins" the blames from bundle1 and bundle2 into *this. 00225 void BlamerBundle::JoinBlames(const BlamerBundle& bundle1, 00226 const BlamerBundle& bundle2, bool debug) { 00227 STRING debug_str; 00228 IncorrectResultReason irr = incorrect_result_reason_; 00229 if (irr != IRR_NO_TRUTH_SPLIT) debug_str = ""; 00230 if (bundle1.incorrect_result_reason_ != IRR_CORRECT && 00231 bundle1.incorrect_result_reason_ != IRR_NO_TRUTH && 00232 bundle1.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) { 00233 debug_str += "Blame from part 1: "; 00234 debug_str += bundle1.debug_; 00235 irr = bundle1.incorrect_result_reason_; 00236 } 00237 if (bundle2.incorrect_result_reason_ != IRR_CORRECT && 00238 bundle2.incorrect_result_reason_ != IRR_NO_TRUTH && 00239 bundle2.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) { 00240 debug_str += "Blame from part 2: "; 00241 debug_str += bundle2.debug_; 00242 if (irr == IRR_CORRECT) { 00243 irr = bundle2.incorrect_result_reason_; 00244 } else if (irr != bundle2.incorrect_result_reason_) { 00245 irr = IRR_UNKNOWN; 00246 } 00247 } 00248 incorrect_result_reason_ = irr; 00249 if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) { 00250 SetBlame(irr, debug_str, NULL, debug); 00251 } 00252 } 00253 00254 // If a blob with the same bounding box as one of the truth character 00255 // bounding boxes is not classified as the corresponding truth character 00256 // blames character classifier for incorrect answer. 00257 void BlamerBundle::BlameClassifier(const UNICHARSET& unicharset, 00258 const TBOX& blob_box, 00259 const BLOB_CHOICE_LIST& choices, 00260 bool debug) { 00261 if (!truth_has_char_boxes_ || 00262 incorrect_result_reason_ != IRR_CORRECT) 00263 return; // Nothing to do here. 00264 00265 for (int b = 0; b < norm_truth_word_.length(); ++b) { 00266 const TBOX &truth_box = norm_truth_word_.BlobBox(b); 00267 // Note that we are more strict on the bounding box boundaries here 00268 // than in other places (chopper, segmentation search), since we do 00269 // not have the ability to check the previous and next bounding box. 00270 if (blob_box.x_almost_equal(truth_box, norm_box_tolerance_/2)) { 00271 bool found = false; 00272 bool incorrect_adapted = false; 00273 UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID; 00274 const char *truth_str = truth_text_[b].string(); 00275 // We promise not to modify the list or its contents, using a 00276 // const BLOB_CHOICE* below. 00277 BLOB_CHOICE_IT choices_it(const_cast<BLOB_CHOICE_LIST*>(&choices)); 00278 for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); 00279 choices_it.forward()) { 00280 const BLOB_CHOICE* choice = choices_it.data(); 00281 if (strcmp(truth_str, unicharset.get_normed_unichar( 00282 choice->unichar_id())) == 0) { 00283 found = true; 00284 break; 00285 } else if (choice->IsAdapted()) { 00286 incorrect_adapted = true; 00287 incorrect_adapted_id = choice->unichar_id(); 00288 } 00289 } // end choices_it for loop 00290 if (!found) { 00291 STRING debug_str = "unichar "; 00292 debug_str += truth_str; 00293 debug_str += " not found in classification list"; 00294 SetBlame(IRR_CLASSIFIER, debug_str, NULL, debug); 00295 } else if (incorrect_adapted) { 00296 STRING debug_str = "better rating for adapted "; 00297 debug_str += unicharset.id_to_unichar(incorrect_adapted_id); 00298 debug_str += " than for correct "; 00299 debug_str += truth_str; 00300 SetBlame(IRR_ADAPTION, debug_str, NULL, debug); 00301 } 00302 break; 00303 } 00304 } // end iterating over blamer_bundle->norm_truth_word 00305 } 00306 00307 // Checks whether chops were made at all the character bounding box 00308 // boundaries in word->truth_word. If not - blames the chopper for an 00309 // incorrect answer. 00310 void BlamerBundle::SetChopperBlame(const WERD_RES* word, bool debug) { 00311 if (NoTruth() || !truth_has_char_boxes_ || 00312 word->chopped_word->blobs.empty()) { 00313 return; 00314 } 00315 STRING debug_str; 00316 bool missing_chop = false; 00317 int num_blobs = word->chopped_word->blobs.size(); 00318 int box_index = 0; 00319 int blob_index = 0; 00320 inT16 truth_x; 00321 while (box_index < truth_word_.length() && blob_index < num_blobs) { 00322 truth_x = norm_truth_word_.BlobBox(box_index).right(); 00323 TBLOB * curr_blob = word->chopped_word->blobs[blob_index]; 00324 if (curr_blob->bounding_box().right() < truth_x - norm_box_tolerance_) { 00325 ++blob_index; 00326 continue; // encountered an extra chop, keep looking 00327 } else if (curr_blob->bounding_box().right() > 00328 truth_x + norm_box_tolerance_) { 00329 missing_chop = true; 00330 break; 00331 } else { 00332 ++blob_index; 00333 } 00334 } 00335 if (missing_chop || box_index < norm_truth_word_.length()) { 00336 STRING debug_str; 00337 if (missing_chop) { 00338 debug_str.add_str_int("Detected missing chop (tolerance=", 00339 norm_box_tolerance_); 00340 debug_str += ") at Bounding Box="; 00341 TBLOB * curr_blob = word->chopped_word->blobs[blob_index]; 00342 curr_blob->bounding_box().print_to_str(&debug_str); 00343 debug_str.add_str_int("\nNo chop for truth at x=", truth_x); 00344 } else { 00345 debug_str.add_str_int("Missing chops for last ", 00346 norm_truth_word_.length() - box_index); 00347 debug_str += " truth box(es)"; 00348 } 00349 debug_str += "\nMaximally chopped word boxes:\n"; 00350 for (blob_index = 0; blob_index < num_blobs; ++blob_index) { 00351 TBLOB * curr_blob = word->chopped_word->blobs[blob_index]; 00352 curr_blob->bounding_box().print_to_str(&debug_str); 00353 debug_str += '\n'; 00354 } 00355 debug_str += "Truth bounding boxes:\n"; 00356 for (box_index = 0; box_index < norm_truth_word_.length(); ++box_index) { 00357 norm_truth_word_.BlobBox(box_index).print_to_str(&debug_str); 00358 debug_str += '\n'; 00359 } 00360 SetBlame(IRR_CHOPPER, debug_str, word->best_choice, debug); 00361 } 00362 } 00363 00364 // Blames the classifier or the language model if, after running only the 00365 // chopper, best_choice is incorrect and no blame has been yet set. 00366 // Blames the classifier if best_choice is classifier's top choice and is a 00367 // dictionary word (i.e. language model could not have helped). 00368 // Otherwise, blames the language model (formerly permuter word adjustment). 00369 void BlamerBundle::BlameClassifierOrLangModel( 00370 const WERD_RES* word, 00371 const UNICHARSET& unicharset, bool valid_permuter, bool debug) { 00372 if (valid_permuter) { 00373 // Find out whether best choice is a top choice. 00374 best_choice_is_dict_and_top_choice_ = true; 00375 for (int i = 0; i < word->best_choice->length(); ++i) { 00376 BLOB_CHOICE_IT blob_choice_it(word->GetBlobChoices(i)); 00377 ASSERT_HOST(!blob_choice_it.empty()); 00378 BLOB_CHOICE *first_choice = NULL; 00379 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); 00380 blob_choice_it.forward()) { // find first non-fragment choice 00381 if (!(unicharset.get_fragment(blob_choice_it.data()->unichar_id()))) { 00382 first_choice = blob_choice_it.data(); 00383 break; 00384 } 00385 } 00386 ASSERT_HOST(first_choice != NULL); 00387 if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) { 00388 best_choice_is_dict_and_top_choice_ = false; 00389 break; 00390 } 00391 } 00392 } 00393 STRING debug_str; 00394 if (best_choice_is_dict_and_top_choice_) { 00395 debug_str = "Best choice is: incorrect, top choice, dictionary word"; 00396 debug_str += " with permuter "; 00397 debug_str += word->best_choice->permuter_name(); 00398 } else { 00399 debug_str = "Classifier/Old LM tradeoff is to blame"; 00400 } 00401 SetBlame(best_choice_is_dict_and_top_choice_ ? IRR_CLASSIFIER 00402 : IRR_CLASS_OLD_LM_TRADEOFF, 00403 debug_str, word->best_choice, debug); 00404 } 00405 00406 // Sets up the correct_segmentation_* to mark the correct bounding boxes. 00407 void BlamerBundle::SetupCorrectSegmentation(const TWERD* word, bool debug) { 00408 params_training_bundle_.StartHypothesisList(); 00409 if (incorrect_result_reason_ != IRR_CORRECT || !truth_has_char_boxes_) 00410 return; // Nothing to do here. 00411 00412 STRING debug_str; 00413 debug_str += "Blamer computing correct_segmentation_cols\n"; 00414 int curr_box_col = 0; 00415 int next_box_col = 0; 00416 int num_blobs = word->NumBlobs(); 00417 if (num_blobs == 0) return; // No blobs to play with. 00418 int blob_index = 0; 00419 inT16 next_box_x = word->blobs[blob_index]->bounding_box().right(); 00420 for (int truth_idx = 0; blob_index < num_blobs && 00421 truth_idx < norm_truth_word_.length(); 00422 ++blob_index) { 00423 ++next_box_col; 00424 inT16 curr_box_x = next_box_x; 00425 if (blob_index + 1 < num_blobs) 00426 next_box_x = word->blobs[blob_index + 1]->bounding_box().right(); 00427 inT16 truth_x = norm_truth_word_.BlobBox(truth_idx).right(); 00428 debug_str.add_str_int("Box x coord vs. truth: ", curr_box_x); 00429 debug_str.add_str_int(" ", truth_x); 00430 debug_str += "\n"; 00431 if (curr_box_x > (truth_x + norm_box_tolerance_)) { 00432 break; // failed to find a matching box 00433 } else if (curr_box_x >= truth_x - norm_box_tolerance_ && // matched 00434 (blob_index + 1 >= num_blobs || // next box can't be included 00435 next_box_x > truth_x + norm_box_tolerance_)) { 00436 correct_segmentation_cols_.push_back(curr_box_col); 00437 correct_segmentation_rows_.push_back(next_box_col-1); 00438 ++truth_idx; 00439 debug_str.add_str_int("col=", curr_box_col); 00440 debug_str.add_str_int(" row=", next_box_col-1); 00441 debug_str += "\n"; 00442 curr_box_col = next_box_col; 00443 } 00444 } 00445 if (blob_index < num_blobs || // trailing blobs 00446 correct_segmentation_cols_.length() != norm_truth_word_.length()) { 00447 debug_str.add_str_int("Blamer failed to find correct segmentation" 00448 " (tolerance=", norm_box_tolerance_); 00449 if (blob_index >= num_blobs) debug_str += " blob == NULL"; 00450 debug_str += ")\n"; 00451 debug_str.add_str_int(" path length ", correct_segmentation_cols_.length()); 00452 debug_str.add_str_int(" vs. truth ", norm_truth_word_.length()); 00453 debug_str += "\n"; 00454 SetBlame(IRR_UNKNOWN, debug_str, NULL, debug); 00455 correct_segmentation_cols_.clear(); 00456 correct_segmentation_rows_.clear(); 00457 } 00458 } 00459 00460 // Returns true if a guided segmentation search is needed. 00461 bool BlamerBundle::GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const { 00462 return incorrect_result_reason_ == IRR_CORRECT && 00463 !segsearch_is_looking_for_blame_ && 00464 truth_has_char_boxes_ && 00465 !ChoiceIsCorrect(best_choice); 00466 } 00467 00468 // Setup ready to guide the segmentation search to the correct segmentation. 00469 // The callback pp_cb is used to avoid a cyclic dependency. 00470 // It calls into LMPainPoints::GenerateForBlamer by pre-binding the 00471 // WERD_RES, and the LMPainPoints itself. 00472 // pp_cb must be a permanent callback, and should be deleted by the caller. 00473 void BlamerBundle::InitForSegSearch(const WERD_CHOICE *best_choice, 00474 MATRIX* ratings, UNICHAR_ID wildcard_id, 00475 bool debug, STRING *debug_str, 00476 TessResultCallback2<bool, int, int>* cb) { 00477 segsearch_is_looking_for_blame_ = true; 00478 if (debug) { 00479 tprintf("segsearch starting to look for blame\n"); 00480 } 00481 // Fill pain points for any unclassifed blob corresponding to the 00482 // correct segmentation state. 00483 *debug_str += "Correct segmentation:\n"; 00484 for (int idx = 0; idx < correct_segmentation_cols_.length(); ++idx) { 00485 debug_str->add_str_int("col=", correct_segmentation_cols_[idx]); 00486 debug_str->add_str_int(" row=", correct_segmentation_rows_[idx]); 00487 *debug_str += "\n"; 00488 if (!ratings->Classified(correct_segmentation_cols_[idx], 00489 correct_segmentation_rows_[idx], 00490 wildcard_id) && 00491 !cb->Run(correct_segmentation_cols_[idx], 00492 correct_segmentation_rows_[idx])) { 00493 segsearch_is_looking_for_blame_ = false; 00494 *debug_str += "\nFailed to insert pain point\n"; 00495 SetBlame(IRR_SEGSEARCH_HEUR, *debug_str, best_choice, debug); 00496 break; 00497 } 00498 } // end for blamer_bundle->correct_segmentation_cols/rows 00499 } 00500 // Returns true if the guided segsearch is in progress. 00501 bool BlamerBundle::GuidedSegsearchStillGoing() const { 00502 return segsearch_is_looking_for_blame_; 00503 } 00504 00505 // The segmentation search has ended. Sets the blame appropriately. 00506 void BlamerBundle::FinishSegSearch(const WERD_CHOICE *best_choice, 00507 bool debug, STRING *debug_str) { 00508 // If we are still looking for blame (i.e. best_choice is incorrect, but a 00509 // path representing the correct segmentation could be constructed), we can 00510 // blame segmentation search pain point prioritization if the rating of the 00511 // path corresponding to the correct segmentation is better than that of 00512 // best_choice (i.e. language model would have done the correct thing, but 00513 // because of poor pain point prioritization the correct segmentation was 00514 // never explored). Otherwise we blame the tradeoff between the language model 00515 // and the classifier, since even after exploring the path corresponding to 00516 // the correct segmentation incorrect best_choice would have been chosen. 00517 // One special case when we blame the classifier instead is when best choice 00518 // is incorrect, but it is a dictionary word and it classifier's top choice. 00519 if (segsearch_is_looking_for_blame_) { 00520 segsearch_is_looking_for_blame_ = false; 00521 if (best_choice_is_dict_and_top_choice_) { 00522 *debug_str = "Best choice is: incorrect, top choice, dictionary word"; 00523 *debug_str += " with permuter "; 00524 *debug_str += best_choice->permuter_name(); 00525 SetBlame(IRR_CLASSIFIER, *debug_str, best_choice, debug); 00526 } else if (best_correctly_segmented_rating_ < 00527 best_choice->rating()) { 00528 *debug_str += "Correct segmentation state was not explored"; 00529 SetBlame(IRR_SEGSEARCH_PP, *debug_str, best_choice, debug); 00530 } else { 00531 if (best_correctly_segmented_rating_ >= 00532 WERD_CHOICE::kBadRating) { 00533 *debug_str += "Correct segmentation paths were pruned by LM\n"; 00534 } else { 00535 debug_str->add_str_double("Best correct segmentation rating ", 00536 best_correctly_segmented_rating_); 00537 debug_str->add_str_double(" vs. best choice rating ", 00538 best_choice->rating()); 00539 } 00540 SetBlame(IRR_CLASS_LM_TRADEOFF, *debug_str, best_choice, debug); 00541 } 00542 } 00543 } 00544 00545 // If the bundle is null or still does not indicate the correct result, 00546 // fix it and use some backup reason for the blame. 00547 void BlamerBundle::LastChanceBlame(bool debug, WERD_RES* word) { 00548 if (word->blamer_bundle == NULL) { 00549 word->blamer_bundle = new BlamerBundle(); 00550 word->blamer_bundle->SetBlame(IRR_PAGE_LAYOUT, "LastChanceBlame", 00551 word->best_choice, debug); 00552 } else if (word->blamer_bundle->incorrect_result_reason_ == IRR_NO_TRUTH) { 00553 word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth", 00554 word->best_choice, debug); 00555 } else { 00556 bool correct = word->blamer_bundle->ChoiceIsCorrect(word->best_choice); 00557 IncorrectResultReason irr = word->blamer_bundle->incorrect_result_reason_; 00558 if (irr == IRR_CORRECT && !correct) { 00559 STRING debug_str = "Choice is incorrect after recognition"; 00560 word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug_str, word->best_choice, 00561 debug); 00562 } else if (irr != IRR_CORRECT && correct) { 00563 if (debug) { 00564 tprintf("Corrected %s\n", word->blamer_bundle->debug_.string()); 00565 } 00566 word->blamer_bundle->incorrect_result_reason_ = IRR_CORRECT; 00567 word->blamer_bundle->debug_ = ""; 00568 } 00569 } 00570 } 00571 00572 // Sets the misadaption debug if this word is incorrect, as this word is 00573 // being adapted to. 00574 void BlamerBundle::SetMisAdaptionDebug(const WERD_CHOICE *best_choice, 00575 bool debug) { 00576 if (incorrect_result_reason_ != IRR_NO_TRUTH && 00577 !ChoiceIsCorrect(best_choice)) { 00578 misadaption_debug_ ="misadapt to word ("; 00579 misadaption_debug_ += best_choice->permuter_name(); 00580 misadaption_debug_ += "): "; 00581 FillDebugString("", best_choice, &misadaption_debug_); 00582 if (debug) { 00583 tprintf("%s\n", misadaption_debug_.string()); 00584 } 00585 } 00586 } 00587