tesseract
3.03
|
00001 /********************************************************************** 00002 * File: pageres.cpp (Formerly page_res.c) 00003 * Description: Results classes used by control.c 00004 * Author: Phil Cheatle 00005 * Created: Tue Sep 22 08:42:49 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 #include <stdlib.h> 00020 #ifdef __UNIX__ 00021 #include <assert.h> 00022 #endif 00023 #include "blamer.h" 00024 #include "pageres.h" 00025 #include "blobs.h" 00026 00027 ELISTIZE (BLOCK_RES) 00028 CLISTIZE (BLOCK_RES) ELISTIZE (ROW_RES) ELISTIZE (WERD_RES) 00029 00030 // Gain factor for computing thresholds that determine the ambiguity of a word. 00031 static const double kStopperAmbiguityThresholdGain = 8.0; 00032 // Constant offset for computing thresholds that determine the ambiguity of a 00033 // word. 00034 static const double kStopperAmbiguityThresholdOffset = 1.5; 00035 // Max number of broken pieces to associate. 00036 const int kWordrecMaxNumJoinChunks = 4; 00037 00038 // Computes and returns a threshold of certainty difference used to determine 00039 // which words to keep, based on the adjustment factors of the two words. 00040 // TODO(rays) This is horrible. Replace with an enhance params training model. 00041 static double StopperAmbigThreshold(double f1, double f2) { 00042 return (f2 - f1) * kStopperAmbiguityThresholdGain - 00043 kStopperAmbiguityThresholdOffset; 00044 } 00045 00046 /************************************************************************* 00047 * PAGE_RES::PAGE_RES 00048 * 00049 * Constructor for page results 00050 *************************************************************************/ 00051 PAGE_RES::PAGE_RES( 00052 BLOCK_LIST *the_block_list, 00053 WERD_CHOICE **prev_word_best_choice_ptr) { 00054 Init(); 00055 BLOCK_IT block_it(the_block_list); 00056 BLOCK_RES_IT block_res_it(&block_res_list); 00057 for (block_it.mark_cycle_pt(); 00058 !block_it.cycled_list(); block_it.forward()) { 00059 block_res_it.add_to_end(new BLOCK_RES(block_it.data())); 00060 } 00061 prev_word_best_choice = prev_word_best_choice_ptr; 00062 } 00063 00064 /************************************************************************* 00065 * BLOCK_RES::BLOCK_RES 00066 * 00067 * Constructor for BLOCK results 00068 *************************************************************************/ 00069 00070 BLOCK_RES::BLOCK_RES(BLOCK *the_block) { 00071 ROW_IT row_it (the_block->row_list ()); 00072 ROW_RES_IT row_res_it(&row_res_list); 00073 00074 char_count = 0; 00075 rej_count = 0; 00076 font_class = -1; //not assigned 00077 x_height = -1.0; 00078 font_assigned = FALSE; 00079 bold = FALSE; 00080 italic = FALSE; 00081 row_count = 0; 00082 00083 block = the_block; 00084 00085 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00086 row_res_it.add_to_end(new ROW_RES(row_it.data())); 00087 } 00088 } 00089 00090 00091 /************************************************************************* 00092 * ROW_RES::ROW_RES 00093 * 00094 * Constructor for ROW results 00095 *************************************************************************/ 00096 00097 ROW_RES::ROW_RES(ROW *the_row) { 00098 WERD_IT word_it(the_row->word_list()); 00099 WERD_RES_IT word_res_it(&word_res_list); 00100 WERD_RES *combo = NULL; // current combination of fuzzies 00101 WERD_RES *word_res; // current word 00102 WERD *copy_word; 00103 00104 char_count = 0; 00105 rej_count = 0; 00106 whole_word_rej_count = 0; 00107 00108 row = the_row; 00109 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00110 word_res = new WERD_RES(word_it.data()); 00111 word_res->x_height = the_row->x_height(); 00112 00113 if (word_res->word->flag(W_FUZZY_NON)) { 00114 ASSERT_HOST(combo != NULL); 00115 word_res->part_of_combo = TRUE; 00116 combo->copy_on(word_res); 00117 } 00118 if (word_it.data_relative(1)->flag(W_FUZZY_NON)) { 00119 if (combo == NULL) { 00120 copy_word = new WERD; 00121 //deep copy 00122 *copy_word = *(word_it.data()); 00123 combo = new WERD_RES(copy_word); 00124 combo->x_height = the_row->x_height(); 00125 combo->combination = TRUE; 00126 word_res_it.add_to_end(combo); 00127 } 00128 word_res->part_of_combo = TRUE; 00129 } else { 00130 combo = NULL; 00131 } 00132 word_res_it.add_to_end(word_res); 00133 } 00134 } 00135 00136 00137 WERD_RES& WERD_RES::operator=(const WERD_RES & source) { 00138 this->ELIST_LINK::operator=(source); 00139 Clear(); 00140 if (source.combination) { 00141 word = new WERD; 00142 *word = *(source.word); // deep copy 00143 } else { 00144 word = source.word; // pt to same word 00145 } 00146 if (source.bln_boxes != NULL) 00147 bln_boxes = new tesseract::BoxWord(*source.bln_boxes); 00148 if (source.chopped_word != NULL) 00149 chopped_word = new TWERD(*source.chopped_word); 00150 if (source.rebuild_word != NULL) 00151 rebuild_word = new TWERD(*source.rebuild_word); 00152 // TODO(rays) Do we ever need to copy the seam_array? 00153 blob_row = source.blob_row; 00154 denorm = source.denorm; 00155 if (source.box_word != NULL) 00156 box_word = new tesseract::BoxWord(*source.box_word); 00157 best_state = source.best_state; 00158 correct_text = source.correct_text; 00159 blob_widths = source.blob_widths; 00160 blob_gaps = source.blob_gaps; 00161 // None of the uses of operator= require the ratings matrix to be copied, 00162 // so don't as it would be really slow. 00163 00164 // Copy the cooked choices. 00165 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&source.best_choices)); 00166 WERD_CHOICE_IT wc_dest_it(&best_choices); 00167 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) { 00168 const WERD_CHOICE *choice = wc_it.data(); 00169 wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice)); 00170 } 00171 if (!wc_dest_it.empty()) { 00172 wc_dest_it.move_to_first(); 00173 best_choice = wc_dest_it.data(); 00174 best_choice_fontinfo_ids = source.best_choice_fontinfo_ids; 00175 } else { 00176 best_choice = NULL; 00177 if (!best_choice_fontinfo_ids.empty()) { 00178 best_choice_fontinfo_ids.clear(); 00179 } 00180 } 00181 00182 if (source.raw_choice != NULL) { 00183 raw_choice = new WERD_CHOICE(*source.raw_choice); 00184 } else { 00185 raw_choice = NULL; 00186 } 00187 if (source.ep_choice != NULL) { 00188 ep_choice = new WERD_CHOICE(*source.ep_choice); 00189 } else { 00190 ep_choice = NULL; 00191 } 00192 reject_map = source.reject_map; 00193 combination = source.combination; 00194 part_of_combo = source.part_of_combo; 00195 CopySimpleFields(source); 00196 if (source.blamer_bundle != NULL) { 00197 blamer_bundle = new BlamerBundle(*(source.blamer_bundle)); 00198 } 00199 return *this; 00200 } 00201 00202 // Copies basic fields that don't involve pointers that might be useful 00203 // to copy when making one WERD_RES from another. 00204 void WERD_RES::CopySimpleFields(const WERD_RES& source) { 00205 tess_failed = source.tess_failed; 00206 tess_accepted = source.tess_accepted; 00207 tess_would_adapt = source.tess_would_adapt; 00208 done = source.done; 00209 unlv_crunch_mode = source.unlv_crunch_mode; 00210 small_caps = source.small_caps; 00211 italic = source.italic; 00212 bold = source.bold; 00213 fontinfo = source.fontinfo; 00214 fontinfo2 = source.fontinfo2; 00215 fontinfo_id_count = source.fontinfo_id_count; 00216 fontinfo_id2_count = source.fontinfo_id2_count; 00217 x_height = source.x_height; 00218 caps_height = source.caps_height; 00219 guessed_x_ht = source.guessed_x_ht; 00220 guessed_caps_ht = source.guessed_caps_ht; 00221 reject_spaces = source.reject_spaces; 00222 uch_set = source.uch_set; 00223 tesseract = source.tesseract; 00224 } 00225 00226 // Initializes a blank (default constructed) WERD_RES from one that has 00227 // already been recognized. 00228 // Use SetupFor*Recognition afterwards to complete the setup and make 00229 // it ready for a retry recognition. 00230 void WERD_RES::InitForRetryRecognition(const WERD_RES& source) { 00231 word = source.word; 00232 CopySimpleFields(source); 00233 if (source.blamer_bundle != NULL) { 00234 blamer_bundle = new BlamerBundle(); 00235 blamer_bundle->CopyTruth(*source.blamer_bundle); 00236 } 00237 } 00238 00239 // Sets up the members used in recognition: bln_boxes, chopped_word, 00240 // seam_array, denorm. Returns false if 00241 // the word is empty and sets up fake results. If use_body_size is 00242 // true and row->body_size is set, then body_size will be used for 00243 // blob normalization instead of xheight + ascrise. This flag is for 00244 // those languages that are using CJK pitch model and thus it has to 00245 // be true if and only if tesseract->textord_use_cjk_fp_model is 00246 // true. 00247 // If allow_detailed_fx is true, the feature extractor will receive fine 00248 // precision outline information, allowing smoother features and better 00249 // features on low resolution images. 00250 // The norm_mode_hint sets the default mode for normalization in absence 00251 // of any of the above flags. 00252 // norm_box is used to override the word bounding box to determine the 00253 // normalization scale and offset. 00254 // Returns false if the word is empty and sets up fake results. 00255 bool WERD_RES::SetupForRecognition(const UNICHARSET& unicharset_in, 00256 tesseract::Tesseract* tess, Pix* pix, 00257 int norm_mode, 00258 const TBOX* norm_box, 00259 bool numeric_mode, 00260 bool use_body_size, 00261 bool allow_detailed_fx, 00262 ROW *row, const BLOCK* block) { 00263 tesseract::OcrEngineMode norm_mode_hint = 00264 static_cast<tesseract::OcrEngineMode>(norm_mode); 00265 tesseract = tess; 00266 POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL; 00267 if ((norm_mode_hint != tesseract::OEM_CUBE_ONLY && 00268 word->cblob_list()->empty()) || (pb != NULL && !pb->IsText())) { 00269 // Empty words occur when all the blobs have been moved to the rej_blobs 00270 // list, which seems to occur frequently in junk. 00271 SetupFake(unicharset_in); 00272 word->set_flag(W_REP_CHAR, false); 00273 return false; 00274 } 00275 ClearResults(); 00276 SetupWordScript(unicharset_in); 00277 chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word); 00278 float word_xheight = use_body_size && row != NULL && row->body_size() > 0.0f 00279 ? row->body_size() : x_height; 00280 chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE), 00281 word_xheight, numeric_mode, norm_mode_hint, 00282 norm_box, &denorm); 00283 blob_row = row; 00284 SetupBasicsFromChoppedWord(unicharset_in); 00285 SetupBlamerBundle(); 00286 int num_blobs = chopped_word->NumBlobs(); 00287 ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks); 00288 tess_failed = false; 00289 return true; 00290 } 00291 00292 // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty 00293 // accumulators from a made chopped word. We presume the fields are already 00294 // empty. 00295 void WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) { 00296 bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word); 00297 start_seam_list(chopped_word, &seam_array); 00298 SetupBlobWidthsAndGaps(); 00299 ClearWordChoices(); 00300 } 00301 00302 // Sets up the members used in recognition for an empty recognition result: 00303 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. 00304 void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) { 00305 ClearResults(); 00306 SetupWordScript(unicharset_in); 00307 chopped_word = new TWERD; 00308 rebuild_word = new TWERD; 00309 bln_boxes = new tesseract::BoxWord; 00310 box_word = new tesseract::BoxWord; 00311 int blob_count = word->cblob_list()->length(); 00312 if (blob_count > 0) { 00313 BLOB_CHOICE** fake_choices = new BLOB_CHOICE*[blob_count]; 00314 // For non-text blocks, just pass any blobs through to the box_word 00315 // and call the word failed with a fake classification. 00316 C_BLOB_IT b_it(word->cblob_list()); 00317 int blob_id = 0; 00318 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 00319 TBOX box = b_it.data()->bounding_box(); 00320 box_word->InsertBox(box_word->length(), box); 00321 fake_choices[blob_id++] = new BLOB_CHOICE(0, 10.0f, -1.0f, 00322 -1, -1, -1, 0, 0, 0, BCC_FAKE); 00323 } 00324 FakeClassifyWord(blob_count, fake_choices); 00325 delete [] fake_choices; 00326 } else { 00327 WERD_CHOICE* word = new WERD_CHOICE(&unicharset_in); 00328 word->make_bad(); 00329 LogNewRawChoice(word); 00330 // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice. 00331 LogNewCookedChoice(1, false, word); 00332 } 00333 tess_failed = true; 00334 } 00335 00336 void WERD_RES::SetupWordScript(const UNICHARSET& uch) { 00337 uch_set = &uch; 00338 int script = uch.default_sid(); 00339 word->set_script_id(script); 00340 word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight()); 00341 word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid()); 00342 } 00343 00344 // Sets up the blamer_bundle if it is not null, using the initialized denorm. 00345 void WERD_RES::SetupBlamerBundle() { 00346 if (blamer_bundle != NULL) { 00347 blamer_bundle->SetupNormTruthWord(denorm); 00348 } 00349 } 00350 00351 // Computes the blob_widths and blob_gaps from the chopped_word. 00352 void WERD_RES::SetupBlobWidthsAndGaps() { 00353 blob_widths.truncate(0); 00354 blob_gaps.truncate(0); 00355 int num_blobs = chopped_word->NumBlobs(); 00356 for (int b = 0; b < num_blobs; ++b) { 00357 TBLOB *blob = chopped_word->blobs[b]; 00358 TBOX box = blob->bounding_box(); 00359 blob_widths.push_back(box.width()); 00360 if (b + 1 < num_blobs) { 00361 blob_gaps.push_back( 00362 chopped_word->blobs[b + 1]->bounding_box().left() - box.right()); 00363 } 00364 } 00365 } 00366 00367 // Updates internal data to account for a new SEAM (chop) at the given 00368 // blob_number. Fixes the ratings matrix and states in the choices, as well 00369 // as the blob widths and gaps. 00370 void WERD_RES::InsertSeam(int blob_number, SEAM* seam) { 00371 // Insert the seam into the SEAMS array. 00372 insert_seam(chopped_word, blob_number, seam, &seam_array); 00373 if (ratings != NULL) { 00374 // Expand the ratings matrix. 00375 ratings = ratings->ConsumeAndMakeBigger(blob_number); 00376 // Fix all the segmentation states. 00377 if (raw_choice != NULL) 00378 raw_choice->UpdateStateForSplit(blob_number); 00379 WERD_CHOICE_IT wc_it(&best_choices); 00380 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) { 00381 WERD_CHOICE* choice = wc_it.data(); 00382 choice->UpdateStateForSplit(blob_number); 00383 } 00384 SetupBlobWidthsAndGaps(); 00385 } 00386 } 00387 00388 // Returns true if all the word choices except the first have adjust_factors 00389 // worse than the given threshold. 00390 bool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const { 00391 // The choices are not changed by this iteration. 00392 WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&best_choices)); 00393 for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) { 00394 WERD_CHOICE* choice = wc_it.data(); 00395 if (choice->adjust_factor() <= threshold) 00396 return false; 00397 } 00398 return true; 00399 } 00400 00401 // Returns true if the current word is ambiguous (by number of answers or 00402 // by dangerous ambigs.) 00403 bool WERD_RES::IsAmbiguous() { 00404 return !best_choices.singleton() || best_choice->dangerous_ambig_found(); 00405 } 00406 00407 // Returns true if the ratings matrix size matches the sum of each of the 00408 // segmentation states. 00409 bool WERD_RES::StatesAllValid() { 00410 int ratings_dim = ratings->dimension(); 00411 if (raw_choice->TotalOfStates() != ratings_dim) { 00412 tprintf("raw_choice has total of states = %d vs ratings dim of %d\n", 00413 raw_choice->TotalOfStates(), ratings_dim); 00414 return false; 00415 } 00416 WERD_CHOICE_IT it(&best_choices); 00417 int index = 0; 00418 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) { 00419 WERD_CHOICE* choice = it.data(); 00420 if (choice->TotalOfStates() != ratings_dim) { 00421 tprintf("Cooked #%d has total of states = %d vs ratings dim of %d\n", 00422 choice->TotalOfStates(), ratings_dim); 00423 return false; 00424 } 00425 } 00426 return true; 00427 } 00428 00429 // Prints a list of words found if debug is true or the word result matches 00430 // the word_to_debug. 00431 void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) { 00432 if (debug || 00433 (word_to_debug != NULL && *word_to_debug != '\0' && best_choice != NULL && 00434 best_choice->unichar_string() == STRING(word_to_debug))) { 00435 if (raw_choice != NULL) 00436 raw_choice->print("\nBest Raw Choice"); 00437 00438 WERD_CHOICE_IT it(&best_choices); 00439 int index = 0; 00440 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) { 00441 WERD_CHOICE* choice = it.data(); 00442 STRING label; 00443 label.add_str_int("\nCooked Choice #", index); 00444 choice->print(label.string()); 00445 } 00446 } 00447 } 00448 00449 // Removes from best_choices all choices which are not within a reasonable 00450 // range of the best choice. 00451 // TODO(rays) incorporate the information used here into the params training 00452 // re-ranker, in place of this heuristic that is based on the previous 00453 // adjustment factor. 00454 void WERD_RES::FilterWordChoices(int debug_level) { 00455 if (best_choice == NULL || best_choices.singleton()) 00456 return; 00457 00458 if (debug_level >= 2) 00459 best_choice->print("\nFiltering against best choice"); 00460 WERD_CHOICE_IT it(&best_choices); 00461 int index = 0; 00462 for (it.forward(); !it.at_first(); it.forward(), ++index) { 00463 WERD_CHOICE* choice = it.data(); 00464 float threshold = StopperAmbigThreshold(best_choice->adjust_factor(), 00465 choice->adjust_factor()); 00466 // i, j index the blob choice in choice, best_choice. 00467 // chunk is an index into the chopped_word blobs (AKA chunks). 00468 // Since the two words may use different segmentations of the chunks, we 00469 // iterate over the chunks to find out whether a comparable blob 00470 // classification is much worse than the best result. 00471 int i = 0, j = 0, chunk = 0; 00472 // Each iteration of the while deals with 1 chunk. On entry choice_chunk 00473 // and best_chunk are the indices of the first chunk in the NEXT blob, 00474 // i.e. we don't have to increment i, j while chunk < choice_chunk and 00475 // best_chunk respectively. 00476 int choice_chunk = choice->state(0), best_chunk = best_choice->state(0); 00477 while (i < choice->length() && j < best_choice->length()) { 00478 if (choice->unichar_id(i) != best_choice->unichar_id(j) && 00479 choice->certainty(i) - best_choice->certainty(j) < threshold) { 00480 if (debug_level >= 2) { 00481 STRING label; 00482 label.add_str_int("\nDiscarding bad choice #", index); 00483 choice->print(label.string()); 00484 tprintf("i %d j %d Chunk %d Choice->Blob[i].Certainty %.4g" 00485 " BestChoice->ChunkCertainty[Chunk] %g Threshold %g\n", 00486 i, j, chunk, choice->certainty(i), 00487 best_choice->certainty(j), threshold); 00488 } 00489 delete it.extract(); 00490 break; 00491 } 00492 ++chunk; 00493 // If needed, advance choice_chunk to keep up with chunk. 00494 while (choice_chunk < chunk && ++i < choice->length()) 00495 choice_chunk += choice->state(i); 00496 // If needed, advance best_chunk to keep up with chunk. 00497 while (best_chunk < chunk && ++j < best_choice->length()) 00498 best_chunk += best_choice->state(j); 00499 } 00500 } 00501 } 00502 00503 void WERD_RES::ComputeAdaptionThresholds(float certainty_scale, 00504 float min_rating, 00505 float max_rating, 00506 float rating_margin, 00507 float* thresholds) { 00508 int chunk = 0; 00509 int end_chunk = best_choice->state(0); 00510 int end_raw_chunk = raw_choice->state(0); 00511 int raw_blob = 0; 00512 for (int i = 0; i < best_choice->length(); i++, thresholds++) { 00513 float avg_rating = 0.0f; 00514 int num_error_chunks = 0; 00515 00516 // For each chunk in best choice blob i, count non-matching raw results. 00517 while (chunk < end_chunk) { 00518 if (chunk >= end_raw_chunk) { 00519 ++raw_blob; 00520 end_raw_chunk += raw_choice->state(raw_blob); 00521 } 00522 if (best_choice->unichar_id(i) != 00523 raw_choice->unichar_id(raw_blob)) { 00524 avg_rating += raw_choice->certainty(raw_blob); 00525 ++num_error_chunks; 00526 } 00527 ++chunk; 00528 } 00529 00530 if (num_error_chunks > 0) { 00531 avg_rating /= num_error_chunks; 00532 *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin); 00533 } else { 00534 *thresholds = max_rating; 00535 } 00536 00537 if (*thresholds > max_rating) 00538 *thresholds = max_rating; 00539 if (*thresholds < min_rating) 00540 *thresholds = min_rating; 00541 } 00542 } 00543 00544 // Saves a copy of the word_choice if it has the best unadjusted rating. 00545 // Returns true if the word_choice was the new best. 00546 bool WERD_RES::LogNewRawChoice(WERD_CHOICE* word_choice) { 00547 if (raw_choice == NULL || word_choice->rating() < raw_choice->rating()) { 00548 delete raw_choice; 00549 raw_choice = new WERD_CHOICE(*word_choice); 00550 raw_choice->set_permuter(TOP_CHOICE_PERM); 00551 return true; 00552 } 00553 return false; 00554 } 00555 00556 // Consumes word_choice by adding it to best_choices, (taking ownership) if 00557 // the certainty for word_choice is some distance of the best choice in 00558 // best_choices, or by deleting the word_choice and returning false. 00559 // The best_choices list is kept in sorted order by rating. Duplicates are 00560 // removed, and the list is kept no longer than max_num_choices in length. 00561 // Returns true if the word_choice is still a valid pointer. 00562 bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug, 00563 WERD_CHOICE* word_choice) { 00564 if (best_choice != NULL) { 00565 // Throw out obviously bad choices to save some work. 00566 // TODO(rays) Get rid of this! This piece of code produces different 00567 // results according to the order in which words are found, which is an 00568 // undesirable behavior. It would be better to keep all the choices and 00569 // prune them later when more information is available. 00570 float max_certainty_delta = 00571 StopperAmbigThreshold(best_choice->adjust_factor(), 00572 word_choice->adjust_factor()); 00573 if (max_certainty_delta > -kStopperAmbiguityThresholdOffset) 00574 max_certainty_delta = -kStopperAmbiguityThresholdOffset; 00575 if (word_choice->certainty() - best_choice->certainty() < 00576 max_certainty_delta) { 00577 if (debug) { 00578 STRING bad_string; 00579 word_choice->string_and_lengths(&bad_string, NULL); 00580 tprintf("Discarding choice \"%s\" with an overly low certainty" 00581 " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n", 00582 bad_string.string(), word_choice->certainty(), 00583 best_choice->certainty(), 00584 max_certainty_delta + best_choice->certainty()); 00585 } 00586 delete word_choice; 00587 return false; 00588 } 00589 } 00590 00591 // Insert in the list in order of increasing rating, but knock out worse 00592 // string duplicates. 00593 WERD_CHOICE_IT it(&best_choices); 00594 const STRING& new_str = word_choice->unichar_string(); 00595 bool inserted = false; 00596 int num_choices = 0; 00597 if (!it.empty()) { 00598 do { 00599 WERD_CHOICE* choice = it.data(); 00600 if (choice->rating() > word_choice->rating() && !inserted) { 00601 // Time to insert. 00602 it.add_before_stay_put(word_choice); 00603 inserted = true; 00604 if (num_choices == 0) 00605 best_choice = word_choice; // This is the new best. 00606 ++num_choices; 00607 } 00608 if (choice->unichar_string() == new_str) { 00609 if (inserted) { 00610 // New is better. 00611 delete it.extract(); 00612 } else { 00613 // Old is better. 00614 if (debug) { 00615 tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n", 00616 new_str.string(), word_choice->rating(), choice->rating()); 00617 } 00618 delete word_choice; 00619 return false; 00620 } 00621 } else { 00622 ++num_choices; 00623 if (num_choices > max_num_choices) 00624 delete it.extract(); 00625 } 00626 it.forward(); 00627 } while (!it.at_first()); 00628 } 00629 if (!inserted && num_choices < max_num_choices) { 00630 it.add_to_end(word_choice); 00631 inserted = true; 00632 if (num_choices == 0) 00633 best_choice = word_choice; // This is the new best. 00634 } 00635 if (debug) { 00636 if (inserted) 00637 tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary"); 00638 else 00639 tprintf("Poor"); 00640 word_choice->print(" Word Choice"); 00641 } 00642 if (!inserted) { 00643 delete word_choice; 00644 return false; 00645 } 00646 return true; 00647 } 00648 00649 00650 // Simple helper moves the ownership of the pointer data from src to dest, 00651 // first deleting anything in dest, and nulling out src afterwards. 00652 template<class T> static void MovePointerData(T** dest, T**src) { 00653 delete *dest; 00654 *dest = *src; 00655 *src = NULL; 00656 } 00657 00658 // Prints a brief list of all the best choices. 00659 void WERD_RES::PrintBestChoices() const { 00660 STRING alternates_str; 00661 WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST*>(&best_choices)); 00662 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00663 if (!it.at_first()) alternates_str += "\", \""; 00664 alternates_str += it.data()->unichar_string(); 00665 } 00666 tprintf("Alternates for \"%s\": {\"%s\"}\n", 00667 best_choice->unichar_string().string(), alternates_str.string()); 00668 } 00669 00670 // Returns the sum of the widths of the blob between start_blob and last_blob 00671 // inclusive. 00672 int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) { 00673 int result = 0; 00674 for (int b = start_blob; b <= last_blob; ++b) { 00675 result += blob_widths[b]; 00676 if (b < last_blob) 00677 result += blob_gaps[b]; 00678 } 00679 return result; 00680 } 00681 // Returns the width of a gap between the specified blob and the next one. 00682 int WERD_RES::GetBlobsGap(int blob_index) { 00683 if (blob_index < 0 || blob_index >= blob_gaps.size()) 00684 return 0; 00685 return blob_gaps[blob_index]; 00686 } 00687 00688 // Returns the BLOB_CHOICE corresponding to the given index in the 00689 // best choice word taken from the appropriate cell in the ratings MATRIX. 00690 // Borrowed pointer, so do not delete. May return NULL if there is no 00691 // BLOB_CHOICE matching the unichar_id at the given index. 00692 BLOB_CHOICE* WERD_RES::GetBlobChoice(int index) const { 00693 if (index < 0 || index >= best_choice->length()) return NULL; 00694 BLOB_CHOICE_LIST* choices = GetBlobChoices(index); 00695 return FindMatchingChoice(best_choice->unichar_id(index), choices); 00696 } 00697 00698 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the 00699 // best choice word taken from the appropriate cell in the ratings MATRIX. 00700 // Borrowed pointer, so do not delete. 00701 BLOB_CHOICE_LIST* WERD_RES::GetBlobChoices(int index) const { 00702 return best_choice->blob_choices(index, ratings); 00703 } 00704 00705 // Moves the results fields from word to this. This takes ownership of all 00706 // the data, so src can be destructed. 00707 void WERD_RES::ConsumeWordResults(WERD_RES* word) { 00708 denorm = word->denorm; 00709 blob_row = word->blob_row; 00710 MovePointerData(&chopped_word, &word->chopped_word); 00711 MovePointerData(&rebuild_word, &word->rebuild_word); 00712 MovePointerData(&box_word, &word->box_word); 00713 seam_array.delete_data_pointers(); 00714 seam_array = word->seam_array; 00715 word->seam_array.clear(); 00716 best_state.move(&word->best_state); 00717 correct_text.move(&word->correct_text); 00718 blob_widths.move(&word->blob_widths); 00719 blob_gaps.move(&word->blob_gaps); 00720 if (ratings != NULL) ratings->delete_matrix_pointers(); 00721 MovePointerData(&ratings, &word->ratings); 00722 best_choice = word->best_choice; 00723 MovePointerData(&raw_choice, &word->raw_choice); 00724 best_choices.clear(); 00725 WERD_CHOICE_IT wc_it(&best_choices); 00726 wc_it.add_list_after(&word->best_choices); 00727 reject_map = word->reject_map; 00728 if (word->blamer_bundle != NULL) { 00729 assert(blamer_bundle != NULL); 00730 blamer_bundle->CopyResults(*(word->blamer_bundle)); 00731 } 00732 CopySimpleFields(*word); 00733 } 00734 00735 // Replace the best choice and rebuild box word. 00736 // choice must be from the current best_choices list. 00737 void WERD_RES::ReplaceBestChoice(WERD_CHOICE* choice) { 00738 best_choice = choice; 00739 RebuildBestState(); 00740 SetupBoxWord(); 00741 // Make up a fake reject map of the right length to keep the 00742 // rejection pass happy. 00743 reject_map.initialise(best_state.length()); 00744 done = tess_accepted = tess_would_adapt = true; 00745 SetScriptPositions(); 00746 } 00747 00748 // Builds the rebuild_word and sets the best_state from the chopped_word and 00749 // the best_choice->state. 00750 void WERD_RES::RebuildBestState() { 00751 ASSERT_HOST(best_choice != NULL); 00752 if (rebuild_word != NULL) 00753 delete rebuild_word; 00754 rebuild_word = new TWERD; 00755 if (seam_array.empty()) 00756 start_seam_list(chopped_word, &seam_array); 00757 best_state.truncate(0); 00758 int start = 0; 00759 for (int i = 0; i < best_choice->length(); ++i) { 00760 int length = best_choice->state(i); 00761 best_state.push_back(length); 00762 if (length > 1) 00763 join_pieces(seam_array, start, start + length - 1, chopped_word); 00764 TBLOB* blob = chopped_word->blobs[start]; 00765 rebuild_word->blobs.push_back(new TBLOB(*blob)); 00766 if (length > 1) 00767 break_pieces(seam_array, start, start + length - 1, chopped_word); 00768 start += length; 00769 } 00770 } 00771 00772 // Copies the chopped_word to the rebuild_word, faking a best_state as well. 00773 // Also sets up the output box_word. 00774 void WERD_RES::CloneChoppedToRebuild() { 00775 if (rebuild_word != NULL) 00776 delete rebuild_word; 00777 rebuild_word = new TWERD(*chopped_word); 00778 SetupBoxWord(); 00779 int word_len = box_word->length(); 00780 best_state.reserve(word_len); 00781 correct_text.reserve(word_len); 00782 for (int i = 0; i < word_len; ++i) { 00783 best_state.push_back(1); 00784 correct_text.push_back(STRING("")); 00785 } 00786 } 00787 00788 // Sets/replaces the box_word with one made from the rebuild_word. 00789 void WERD_RES::SetupBoxWord() { 00790 if (box_word != NULL) 00791 delete box_word; 00792 rebuild_word->ComputeBoundingBoxes(); 00793 box_word = tesseract::BoxWord::CopyFromNormalized(rebuild_word); 00794 box_word->ClipToOriginalWord(denorm.block(), word); 00795 } 00796 00797 // Sets up the script positions in the output best_choice using the best_choice 00798 // to get the unichars, and the unicharset to get the target positions. 00799 void WERD_RES::SetScriptPositions() { 00800 best_choice->SetScriptPositions(small_caps, chopped_word); 00801 } 00802 // Sets all the blobs in all the words (raw choice and best choices) to be 00803 // the given position. (When a sub/superscript is recognized as a separate 00804 // word, it falls victim to the rule that a whole word cannot be sub or 00805 // superscript, so this function overrides that problem.) 00806 void WERD_RES::SetAllScriptPositions(tesseract::ScriptPos position) { 00807 raw_choice->SetAllScriptPositions(position); 00808 WERD_CHOICE_IT wc_it(&best_choices); 00809 for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) 00810 wc_it.data()->SetAllScriptPositions(position); 00811 } 00812 00813 // Classifies the word with some already-calculated BLOB_CHOICEs. 00814 // The choices are an array of blob_count pointers to BLOB_CHOICE, 00815 // providing a single classifier result for each blob. 00816 // The BLOB_CHOICEs are consumed and the word takes ownership. 00817 // The number of blobs in the box_word must match blob_count. 00818 void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) { 00819 // Setup the WERD_RES. 00820 ASSERT_HOST(box_word != NULL); 00821 ASSERT_HOST(blob_count == box_word->length()); 00822 ClearWordChoices(); 00823 ClearRatings(); 00824 ratings = new MATRIX(blob_count, 1); 00825 for (int c = 0; c < blob_count; ++c) { 00826 BLOB_CHOICE_LIST* choice_list = new BLOB_CHOICE_LIST; 00827 BLOB_CHOICE_IT choice_it(choice_list); 00828 choice_it.add_after_then_move(choices[c]); 00829 ratings->put(c, c, choice_list); 00830 } 00831 FakeWordFromRatings(); 00832 reject_map.initialise(blob_count); 00833 } 00834 00835 // Creates a WERD_CHOICE for the word using the top choices from the leading 00836 // diagonal of the ratings matrix. 00837 void WERD_RES::FakeWordFromRatings() { 00838 int num_blobs = ratings->dimension(); 00839 WERD_CHOICE* word_choice = new WERD_CHOICE(uch_set, num_blobs); 00840 word_choice->set_permuter(TOP_CHOICE_PERM); 00841 for (int b = 0; b < num_blobs; ++b) { 00842 UNICHAR_ID unichar_id = UNICHAR_SPACE; 00843 float rating = MAX_INT32; 00844 float certainty = -MAX_INT32; 00845 BLOB_CHOICE_LIST* choices = ratings->get(b, b); 00846 if (choices != NULL && !choices->empty()) { 00847 BLOB_CHOICE_IT bc_it(choices); 00848 BLOB_CHOICE* choice = bc_it.data(); 00849 unichar_id = choice->unichar_id(); 00850 rating = choice->rating(); 00851 certainty = choice->certainty(); 00852 } 00853 word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating, 00854 certainty); 00855 } 00856 LogNewRawChoice(word_choice); 00857 // Ownership of word_choice taken by word here. 00858 LogNewCookedChoice(1, false, word_choice); 00859 } 00860 00861 // Copies the best_choice strings to the correct_text for adaption/training. 00862 void WERD_RES::BestChoiceToCorrectText() { 00863 correct_text.clear(); 00864 ASSERT_HOST(best_choice != NULL); 00865 for (int i = 0; i < best_choice->length(); ++i) { 00866 UNICHAR_ID choice_id = best_choice->unichar_id(i); 00867 const char* blob_choice = uch_set->id_to_unichar(choice_id); 00868 correct_text.push_back(STRING(blob_choice)); 00869 } 00870 } 00871 00872 // Merges 2 adjacent blobs in the result if the permanent callback 00873 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent 00874 // callback box_cb is NULL or returns true, setting the merged blob 00875 // result to the class returned from class_cb. 00876 // Returns true if anything was merged. 00877 bool WERD_RES::ConditionalBlobMerge( 00878 TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb, 00879 TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb) { 00880 ASSERT_HOST(best_choice->length() == 0 || ratings != NULL); 00881 bool modified = false; 00882 for (int i = 0; i + 1 < best_choice->length(); ++i) { 00883 UNICHAR_ID new_id = class_cb->Run(best_choice->unichar_id(i), 00884 best_choice->unichar_id(i+1)); 00885 if (new_id != INVALID_UNICHAR_ID && 00886 (box_cb == NULL || box_cb->Run(box_word->BlobBox(i), 00887 box_word->BlobBox(i + 1)))) { 00888 // Raw choice should not be fixed. 00889 best_choice->set_unichar_id(new_id, i); 00890 modified = true; 00891 MergeAdjacentBlobs(i); 00892 const MATRIX_COORD& coord = best_choice->MatrixCoord(i); 00893 if (!coord.Valid(*ratings)) { 00894 ratings->IncreaseBandSize(coord.row + 1 - coord.col); 00895 } 00896 BLOB_CHOICE_LIST* blob_choices = GetBlobChoices(i); 00897 if (FindMatchingChoice(new_id, blob_choices) == NULL) { 00898 // Insert a fake result. 00899 BLOB_CHOICE* blob_choice = new BLOB_CHOICE; 00900 blob_choice->set_unichar_id(new_id); 00901 BLOB_CHOICE_IT bc_it(blob_choices); 00902 bc_it.add_before_then_move(blob_choice); 00903 } 00904 } 00905 } 00906 delete class_cb; 00907 delete box_cb; 00908 return modified; 00909 } 00910 00911 // Merges 2 adjacent blobs in the result (index and index+1) and corrects 00912 // all the data to account for the change. 00913 void WERD_RES::MergeAdjacentBlobs(int index) { 00914 if (reject_map.length() == best_choice->length()) 00915 reject_map.remove_pos(index); 00916 best_choice->remove_unichar_id(index + 1); 00917 rebuild_word->MergeBlobs(index, index + 2); 00918 box_word->MergeBoxes(index, index + 2); 00919 if (index + 1 < best_state.length()) { 00920 best_state[index] += best_state[index + 1]; 00921 best_state.remove(index + 1); 00922 } 00923 } 00924 00925 // TODO(tkielbus) Decide between keeping this behavior here or modifying the 00926 // training data. 00927 00928 // Utility function for fix_quotes 00929 // Return true if the next character in the string (given the UTF8 length in 00930 // bytes) is a quote character. 00931 static int is_simple_quote(const char* signed_str, int length) { 00932 const unsigned char* str = 00933 reinterpret_cast<const unsigned char*>(signed_str); 00934 // Standard 1 byte quotes. 00935 return (length == 1 && (*str == '\'' || *str == '`')) || 00936 // UTF-8 3 bytes curved quotes. 00937 (length == 3 && ((*str == 0xe2 && 00938 *(str + 1) == 0x80 && 00939 *(str + 2) == 0x98) || 00940 (*str == 0xe2 && 00941 *(str + 1) == 0x80 && 00942 *(str + 2) == 0x99))); 00943 } 00944 00945 // Callback helper for fix_quotes returns a double quote if both 00946 // arguments are quote, otherwise INVALID_UNICHAR_ID. 00947 UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) { 00948 const char *ch = uch_set->id_to_unichar(id1); 00949 const char *next_ch = uch_set->id_to_unichar(id2); 00950 if (is_simple_quote(ch, strlen(ch)) && 00951 is_simple_quote(next_ch, strlen(next_ch))) 00952 return uch_set->unichar_to_id("\""); 00953 return INVALID_UNICHAR_ID; 00954 } 00955 00956 // Change pairs of quotes to double quotes. 00957 void WERD_RES::fix_quotes() { 00958 if (!uch_set->contains_unichar("\"") || 00959 !uch_set->get_enabled(uch_set->unichar_to_id("\""))) 00960 return; // Don't create it if it is disallowed. 00961 00962 ConditionalBlobMerge( 00963 NewPermanentTessCallback(this, &WERD_RES::BothQuotes), 00964 NULL); 00965 } 00966 00967 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both 00968 // arguments are hyphen, otherwise INVALID_UNICHAR_ID. 00969 UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) { 00970 const char *ch = uch_set->id_to_unichar(id1); 00971 const char *next_ch = uch_set->id_to_unichar(id2); 00972 if (strlen(ch) == 1 && strlen(next_ch) == 1 && 00973 (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~')) 00974 return uch_set->unichar_to_id("-"); 00975 return INVALID_UNICHAR_ID; 00976 } 00977 00978 // Callback helper for fix_hyphens returns true if box1 and box2 overlap 00979 // (assuming both on the same textline, are in order and a chopped em dash.) 00980 bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) { 00981 return box1.right() >= box2.left(); 00982 } 00983 00984 // Change pairs of hyphens to a single hyphen if the bounding boxes touch 00985 // Typically a long dash which has been segmented. 00986 void WERD_RES::fix_hyphens() { 00987 if (!uch_set->contains_unichar("-") || 00988 !uch_set->get_enabled(uch_set->unichar_to_id("-"))) 00989 return; // Don't create it if it is disallowed. 00990 00991 ConditionalBlobMerge( 00992 NewPermanentTessCallback(this, &WERD_RES::BothHyphens), 00993 NewPermanentTessCallback(this, &WERD_RES::HyphenBoxesOverlap)); 00994 } 00995 00996 // Callback helper for merge_tess_fails returns a space if both 00997 // arguments are space, otherwise INVALID_UNICHAR_ID. 00998 UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) { 00999 if (id1 == id2 && id1 == uch_set->unichar_to_id(" ")) 01000 return id1; 01001 else 01002 return INVALID_UNICHAR_ID; 01003 } 01004 01005 // Change pairs of tess failures to a single one 01006 void WERD_RES::merge_tess_fails() { 01007 if (ConditionalBlobMerge( 01008 NewPermanentTessCallback(this, &WERD_RES::BothSpaces), NULL)) { 01009 int len = best_choice->length(); 01010 ASSERT_HOST(reject_map.length() == len); 01011 ASSERT_HOST(box_word->length() == len); 01012 } 01013 } 01014 01015 // Returns true if the collection of count pieces, starting at start, are all 01016 // natural connected components, ie there are no real chops involved. 01017 bool WERD_RES::PiecesAllNatural(int start, int count) const { 01018 // all seams must have no splits. 01019 for (int index = start; index < start + count - 1; ++index) { 01020 if (index >= 0 && index < seam_array.size()) { 01021 SEAM* seam = seam_array[index]; 01022 if (seam != NULL && seam->split1 != NULL) 01023 return false; 01024 } 01025 } 01026 return true; 01027 } 01028 01029 01030 WERD_RES::~WERD_RES () { 01031 Clear(); 01032 } 01033 01034 void WERD_RES::InitNonPointers() { 01035 tess_failed = FALSE; 01036 tess_accepted = FALSE; 01037 tess_would_adapt = FALSE; 01038 done = FALSE; 01039 unlv_crunch_mode = CR_NONE; 01040 small_caps = false; 01041 italic = FALSE; 01042 bold = FALSE; 01043 // The fontinfos and tesseract count as non-pointers as they point to 01044 // data owned elsewhere. 01045 fontinfo = NULL; 01046 fontinfo2 = NULL; 01047 tesseract = NULL; 01048 fontinfo_id_count = 0; 01049 fontinfo_id2_count = 0; 01050 x_height = 0.0; 01051 caps_height = 0.0; 01052 guessed_x_ht = TRUE; 01053 guessed_caps_ht = TRUE; 01054 combination = FALSE; 01055 part_of_combo = FALSE; 01056 reject_spaces = FALSE; 01057 } 01058 01059 void WERD_RES::InitPointers() { 01060 word = NULL; 01061 bln_boxes = NULL; 01062 blob_row = NULL; 01063 uch_set = NULL; 01064 chopped_word = NULL; 01065 rebuild_word = NULL; 01066 box_word = NULL; 01067 ratings = NULL; 01068 best_choice = NULL; 01069 raw_choice = NULL; 01070 ep_choice = NULL; 01071 blamer_bundle = NULL; 01072 } 01073 01074 void WERD_RES::Clear() { 01075 if (word != NULL && combination) { 01076 delete word; 01077 } 01078 word = NULL; 01079 delete blamer_bundle; 01080 blamer_bundle = NULL; 01081 ClearResults(); 01082 } 01083 01084 void WERD_RES::ClearResults() { 01085 done = false; 01086 fontinfo = NULL; 01087 fontinfo2 = NULL; 01088 fontinfo_id_count = 0; 01089 fontinfo_id2_count = 0; 01090 if (bln_boxes != NULL) { 01091 delete bln_boxes; 01092 bln_boxes = NULL; 01093 } 01094 blob_row = NULL; 01095 if (chopped_word != NULL) { 01096 delete chopped_word; 01097 chopped_word = NULL; 01098 } 01099 if (rebuild_word != NULL) { 01100 delete rebuild_word; 01101 rebuild_word = NULL; 01102 } 01103 if (box_word != NULL) { 01104 delete box_word; 01105 box_word = NULL; 01106 } 01107 best_state.clear(); 01108 correct_text.clear(); 01109 seam_array.delete_data_pointers(); 01110 seam_array.clear(); 01111 blob_widths.clear(); 01112 blob_gaps.clear(); 01113 ClearRatings(); 01114 ClearWordChoices(); 01115 if (blamer_bundle != NULL) blamer_bundle->ClearResults(); 01116 } 01117 void WERD_RES::ClearWordChoices() { 01118 best_choice = NULL; 01119 if (raw_choice != NULL) { 01120 delete raw_choice; 01121 raw_choice = NULL; 01122 } 01123 best_choices.clear(); 01124 if (ep_choice != NULL) { 01125 delete ep_choice; 01126 ep_choice = NULL; 01127 } 01128 } 01129 void WERD_RES::ClearRatings() { 01130 if (ratings != NULL) { 01131 ratings->delete_matrix_pointers(); 01132 delete ratings; 01133 ratings = NULL; 01134 } 01135 } 01136 01137 01138 bool PAGE_RES_IT::operator ==(const PAGE_RES_IT &other) const { 01139 return word_res == other.word_res && 01140 row_res == other.row_res && 01141 block_res == other.block_res; 01142 } 01143 01144 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const { 01145 ASSERT_HOST(page_res == other.page_res); 01146 if (other.block_res == NULL) { 01147 // other points to the end of the page. 01148 if (block_res == NULL) 01149 return 0; 01150 return -1; 01151 } 01152 if (block_res == NULL) { 01153 return 1; // we point to the end of the page. 01154 } 01155 if (block_res == other.block_res) { 01156 if (other.row_res == NULL || row_res == NULL) { 01157 // this should only happen if we hit an image block. 01158 return 0; 01159 } 01160 if (row_res == other.row_res) { 01161 // we point to the same block and row. 01162 ASSERT_HOST(other.word_res != NULL && word_res != NULL); 01163 if (word_res == other.word_res) { 01164 // we point to the same word! 01165 return 0; 01166 } 01167 01168 WERD_RES_IT word_res_it(&row_res->word_res_list); 01169 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); 01170 word_res_it.forward()) { 01171 if (word_res_it.data() == word_res) { 01172 return -1; 01173 } else if (word_res_it.data() == other.word_res) { 01174 return 1; 01175 } 01176 } 01177 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL); 01178 } 01179 01180 // we both point to the same block, but different rows. 01181 ROW_RES_IT row_res_it(&block_res->row_res_list); 01182 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); 01183 row_res_it.forward()) { 01184 if (row_res_it.data() == row_res) { 01185 return -1; 01186 } else if (row_res_it.data() == other.row_res) { 01187 return 1; 01188 } 01189 } 01190 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL); 01191 } 01192 01193 // We point to different blocks. 01194 BLOCK_RES_IT block_res_it(&page_res->block_res_list); 01195 for (block_res_it.mark_cycle_pt(); 01196 !block_res_it.cycled_list(); block_res_it.forward()) { 01197 if (block_res_it.data() == block_res) { 01198 return -1; 01199 } else if (block_res_it.data() == other.block_res) { 01200 return 1; 01201 } 01202 } 01203 // Shouldn't happen... 01204 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL); 01205 return 0; 01206 } 01207 01208 // Inserts the new_word and a corresponding WERD_RES before the current 01209 // position. The simple fields of the WERD_RES are copied from clone_res and 01210 // the resulting WERD_RES is returned for further setup with best_choice etc. 01211 WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res, 01212 WERD* new_word) { 01213 // Insert new_word into the ROW. 01214 WERD_IT w_it(row()->row->word_list()); 01215 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 01216 WERD* word = w_it.data(); 01217 if (word == word_res->word) 01218 break; 01219 } 01220 ASSERT_HOST(!w_it.cycled_list()); 01221 w_it.add_before_then_move(new_word); 01222 // Make a WERD_RES for the new_word. 01223 WERD_RES* new_res = new WERD_RES(new_word); 01224 new_res->CopySimpleFields(clone_res); 01225 // Insert into the appropriate place in the ROW_RES. 01226 WERD_RES_IT wr_it(&row()->word_res_list); 01227 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { 01228 WERD_RES* word = wr_it.data(); 01229 if (word == word_res) 01230 break; 01231 } 01232 ASSERT_HOST(!wr_it.cycled_list()); 01233 wr_it.add_before_then_move(new_res); 01234 if (wr_it.at_first()) { 01235 // This is the new first word, so reset the member iterator so it 01236 // detects the cycled_list state correctly. 01237 ResetWordIterator(); 01238 } 01239 return new_res; 01240 } 01241 01242 // Deletes the current WERD_RES and its underlying WERD. 01243 void PAGE_RES_IT::DeleteCurrentWord() { 01244 // Check that this word is as we expect. part_of_combos are NEVER iterated 01245 // by the normal iterator, so we should never be trying to delete them. 01246 ASSERT_HOST(!word_res->part_of_combo); 01247 if (!word_res->combination) { 01248 // Combinations own their own word, so we won't find the word on the 01249 // row's word_list, but it is legitimate to try to delete them. 01250 // Delete word from the ROW when not a combination. 01251 WERD_IT w_it(row()->row->word_list()); 01252 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 01253 if (w_it.data() == word_res->word) { 01254 break; 01255 } 01256 } 01257 ASSERT_HOST(!w_it.cycled_list()); 01258 delete w_it.extract(); 01259 } 01260 // Remove the WERD_RES for the new_word. 01261 // Remove the WORD_RES from the ROW_RES. 01262 WERD_RES_IT wr_it(&row()->word_res_list); 01263 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { 01264 if (wr_it.data() == word_res) { 01265 word_res = NULL; 01266 break; 01267 } 01268 } 01269 ASSERT_HOST(!wr_it.cycled_list()); 01270 delete wr_it.extract(); 01271 ResetWordIterator(); 01272 } 01273 01274 /************************************************************************* 01275 * PAGE_RES_IT::restart_page 01276 * 01277 * Set things up at the start of the page 01278 *************************************************************************/ 01279 01280 WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) { 01281 block_res_it.set_to_list(&page_res->block_res_list); 01282 block_res_it.mark_cycle_pt(); 01283 prev_block_res = NULL; 01284 prev_row_res = NULL; 01285 prev_word_res = NULL; 01286 block_res = NULL; 01287 row_res = NULL; 01288 word_res = NULL; 01289 next_block_res = NULL; 01290 next_row_res = NULL; 01291 next_word_res = NULL; 01292 internal_forward(true, empty_ok); 01293 return internal_forward(false, empty_ok); 01294 } 01295 01296 // Recovers from operations on the current word, such as in InsertCloneWord 01297 // and DeleteCurrentWord. 01298 // Resets the word_res_it so that it is one past the next_word_res, as 01299 // it should be after internal_forward. If next_row_res != row_res, 01300 // then the next_word_res is in the next row, so there is no need to do 01301 // anything, since operations on the current word will not have disturbed 01302 // the word_res_it. 01303 void PAGE_RES_IT::ResetWordIterator() { 01304 if (row_res == next_row_res) { 01305 // Reset the member iterator so it can move forward and detect the 01306 // cycled_list state correctly. 01307 word_res_it.move_to_first(); 01308 word_res_it.mark_cycle_pt(); 01309 while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res) 01310 word_res_it.forward(); 01311 ASSERT_HOST(!word_res_it.cycled_list()); 01312 word_res_it.forward(); 01313 } 01314 } 01315 01316 /************************************************************************* 01317 * PAGE_RES_IT::internal_forward 01318 * 01319 * Find the next word on the page. If empty_ok is true, then non-text blocks 01320 * and text blocks with no text are visited as if they contain a single 01321 * imaginary word in a single imaginary row. (word() and row() both return NULL 01322 * in such a block and the return value is NULL.) 01323 * If empty_ok is false, the old behaviour is maintained. Each real word 01324 * is visited and empty and non-text blocks and rows are skipped. 01325 * new_block is used to initialize the iterators for a new block. 01326 * The iterator maintains pointers to block, row and word for the previous, 01327 * current and next words. These are correct, regardless of block/row 01328 * boundaries. NULL values denote start and end of the page. 01329 *************************************************************************/ 01330 01331 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) { 01332 bool new_row = false; 01333 01334 prev_block_res = block_res; 01335 prev_row_res = row_res; 01336 prev_word_res = word_res; 01337 block_res = next_block_res; 01338 row_res = next_row_res; 01339 word_res = next_word_res; 01340 next_block_res = NULL; 01341 next_row_res = NULL; 01342 next_word_res = NULL; 01343 01344 while (!block_res_it.cycled_list()) { 01345 if (new_block) { 01346 new_block = false; 01347 row_res_it.set_to_list(&block_res_it.data()->row_res_list); 01348 row_res_it.mark_cycle_pt(); 01349 if (row_res_it.empty() && empty_ok) { 01350 next_block_res = block_res_it.data(); 01351 break; 01352 } 01353 new_row = true; 01354 } 01355 while (!row_res_it.cycled_list()) { 01356 if (new_row) { 01357 new_row = false; 01358 word_res_it.set_to_list(&row_res_it.data()->word_res_list); 01359 word_res_it.mark_cycle_pt(); 01360 } 01361 // Skip any part_of_combo words. 01362 while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo) 01363 word_res_it.forward(); 01364 if (!word_res_it.cycled_list()) { 01365 next_block_res = block_res_it.data(); 01366 next_row_res = row_res_it.data(); 01367 next_word_res = word_res_it.data(); 01368 word_res_it.forward(); 01369 goto foundword; 01370 } 01371 // end of row reached 01372 row_res_it.forward(); 01373 new_row = true; 01374 } 01375 // end of block reached 01376 block_res_it.forward(); 01377 new_block = true; 01378 } 01379 foundword: 01380 // Update prev_word_best_choice pointer. 01381 if (page_res != NULL && page_res->prev_word_best_choice != NULL) { 01382 *page_res->prev_word_best_choice = 01383 (new_block || prev_word_res == NULL) ? NULL : prev_word_res->best_choice; 01384 } 01385 return word_res; 01386 } 01387 01388 /************************************************************************* 01389 * PAGE_RES_IT::restart_row() 01390 * 01391 * Move to the beginning (leftmost word) of the current row. 01392 *************************************************************************/ 01393 WERD_RES *PAGE_RES_IT::restart_row() { 01394 ROW_RES *row = this->row(); 01395 if (!row) return NULL; 01396 for (restart_page(); this->row() != row; forward()) { 01397 // pass 01398 } 01399 return word(); 01400 } 01401 01402 /************************************************************************* 01403 * PAGE_RES_IT::forward_paragraph 01404 * 01405 * Move to the beginning of the next paragraph, allowing empty blocks. 01406 *************************************************************************/ 01407 01408 WERD_RES *PAGE_RES_IT::forward_paragraph() { 01409 while (block_res == next_block_res && 01410 (next_row_res != NULL && next_row_res->row != NULL && 01411 row_res->row->para() == next_row_res->row->para())) { 01412 internal_forward(false, true); 01413 } 01414 return internal_forward(false, true); 01415 } 01416 01417 /************************************************************************* 01418 * PAGE_RES_IT::forward_block 01419 * 01420 * Move to the beginning of the next block, allowing empty blocks. 01421 *************************************************************************/ 01422 01423 WERD_RES *PAGE_RES_IT::forward_block() { 01424 while (block_res == next_block_res) { 01425 internal_forward(false, true); 01426 } 01427 return internal_forward(false, true); 01428 } 01429 01430 void PAGE_RES_IT::rej_stat_word() { 01431 inT16 chars_in_word; 01432 inT16 rejects_in_word = 0; 01433 01434 chars_in_word = word_res->reject_map.length (); 01435 page_res->char_count += chars_in_word; 01436 block_res->char_count += chars_in_word; 01437 row_res->char_count += chars_in_word; 01438 01439 rejects_in_word = word_res->reject_map.reject_count (); 01440 01441 page_res->rej_count += rejects_in_word; 01442 block_res->rej_count += rejects_in_word; 01443 row_res->rej_count += rejects_in_word; 01444 if (chars_in_word == rejects_in_word) 01445 row_res->whole_word_rej_count += rejects_in_word; 01446 }