tesseract
3.03
|
00001 /****************************************************************** 00002 * File: control.cpp (Formerly control.c) 00003 * Description: Module-independent matcher controller. 00004 * Author: Ray Smith 00005 * Created: Thu Apr 23 11:09:58 BST 1992 00006 * ReHacked: Tue Sep 22 08:42:49 BST 1992 Phil Cheatle 00007 * 00008 * (C) Copyright 1992, Hewlett-Packard Ltd. 00009 ** Licensed under the Apache License, Version 2.0 (the "License"); 00010 ** you may not use this file except in compliance with the License. 00011 ** You may obtain a copy of the License at 00012 ** http://www.apache.org/licenses/LICENSE-2.0 00013 ** Unless required by applicable law or agreed to in writing, software 00014 ** distributed under the License is distributed on an "AS IS" BASIS, 00015 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 ** See the License for the specific language governing permissions and 00017 ** limitations under the License. 00018 * 00019 **********************************************************************/ 00020 00021 #include <string.h> 00022 #include <math.h> 00023 #ifdef __UNIX__ 00024 #include <assert.h> 00025 #include <unistd.h> 00026 #include <errno.h> 00027 #endif 00028 #include <ctype.h> 00029 #include "ocrclass.h" 00030 #include "werdit.h" 00031 #include "drawfx.h" 00032 #include "tessbox.h" 00033 #include "tessvars.h" 00034 #include "pgedit.h" 00035 #include "reject.h" 00036 #include "fixspace.h" 00037 #include "docqual.h" 00038 #include "control.h" 00039 #include "secname.h" 00040 #include "output.h" 00041 #include "callcpp.h" 00042 #include "globals.h" 00043 #include "sorthelper.h" 00044 #include "tesseractclass.h" 00045 00046 // Include automatically generated configuration file if running autoconf. 00047 #ifdef HAVE_CONFIG_H 00048 #include "config_auto.h" 00049 #endif 00050 00051 #define MIN_FONT_ROW_COUNT 8 00052 #define MAX_XHEIGHT_DIFF 3 00053 00054 const char* const kBackUpConfigFile = "tempconfigdata.config"; 00055 // Multiple of x-height to make a repeated word have spaces in it. 00056 const double kRepcharGapThreshold = 0.5; 00057 // Min believable x-height for any text when refitting as a fraction of 00058 // original x-height 00059 const double kMinRefitXHeightFraction = 0.5; 00060 00061 00070 namespace tesseract { 00071 void Tesseract::recog_pseudo_word(PAGE_RES* page_res, 00072 TBOX &selection_box) { 00073 WERD *word; 00074 ROW *pseudo_row; // row of word 00075 BLOCK *pseudo_block; // block of word 00076 00077 word = make_pseudo_word(page_res, selection_box, 00078 pseudo_block, pseudo_row); 00079 if (word != NULL) { 00080 WERD_RES word_res(word); 00081 recog_interactive(pseudo_block, pseudo_row, &word_res); 00082 delete word; 00083 } 00084 } 00085 00086 00096 BOOL8 Tesseract::recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res) { 00097 inT16 char_qual; 00098 inT16 good_char_qual; 00099 00100 WordData word_data(block, row, word_res); 00101 SetupWordPassN(2, &word_data); 00102 classify_word_and_language(&Tesseract::classify_word_pass2, &word_data); 00103 if (tessedit_debug_quality_metrics) { 00104 word_char_quality(word_res, row, &char_qual, &good_char_qual); 00105 tprintf 00106 ("\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n", 00107 word_res->reject_map.length(), word_blob_quality(word_res, row), 00108 word_outline_errs(word_res), char_qual, good_char_qual); 00109 } 00110 return TRUE; 00111 } 00112 00113 // Helper function to check for a target word and handle it appropriately. 00114 // Inspired by Jetsoft's requirement to process only single words on pass2 00115 // and beyond. 00116 // If word_config is not null: 00117 // If the word_box and target_word_box overlap, read the word_config file 00118 // else reset to previous config data. 00119 // return true. 00120 // else 00121 // If the word_box and target_word_box overlap or pass <= 1, return true. 00122 // Note that this function uses a fixed temporary file for storing the previous 00123 // configs, so it is neither thread-safe, nor process-safe, but the assumption 00124 // is that it will only be used for one debug window at a time. 00125 // 00126 // Since this function is used for debugging (and not to change OCR results) 00127 // set only debug params from the word config file. 00128 bool Tesseract::ProcessTargetWord(const TBOX& word_box, 00129 const TBOX& target_word_box, 00130 const char* word_config, 00131 int pass) { 00132 if (word_config != NULL) { 00133 if (word_box.major_overlap(target_word_box)) { 00134 if (backup_config_file_ == NULL) { 00135 backup_config_file_ = kBackUpConfigFile; 00136 FILE* config_fp = fopen(backup_config_file_, "wb"); 00137 ParamUtils::PrintParams(config_fp, params()); 00138 fclose(config_fp); 00139 ParamUtils::ReadParamsFile(word_config, 00140 SET_PARAM_CONSTRAINT_DEBUG_ONLY, 00141 params()); 00142 } 00143 } else { 00144 if (backup_config_file_ != NULL) { 00145 ParamUtils::ReadParamsFile(backup_config_file_, 00146 SET_PARAM_CONSTRAINT_DEBUG_ONLY, 00147 params()); 00148 backup_config_file_ = NULL; 00149 } 00150 } 00151 } else if (pass > 1 && !word_box.major_overlap(target_word_box)) { 00152 return false; 00153 } 00154 return true; 00155 } 00156 00157 // If tesseract is to be run, sets the words up ready for it. 00158 void Tesseract::SetupAllWordsPassN(int pass_n, 00159 const TBOX* target_word_box, 00160 const char* word_config, 00161 PAGE_RES* page_res, 00162 GenericVector<WordData>* words) { 00163 // Prepare all the words. 00164 PAGE_RES_IT page_res_it(page_res); 00165 for (page_res_it.restart_page(); page_res_it.word() != NULL; 00166 page_res_it.forward()) { 00167 if (pass_n == 1) 00168 page_res_it.word()->SetupFake(unicharset); 00169 if (target_word_box == NULL || 00170 ProcessTargetWord(page_res_it.word()->word->bounding_box(), 00171 *target_word_box, word_config, 1)) { 00172 words->push_back(WordData(page_res_it)); 00173 } 00174 } 00175 // Setup all the words for recognition with polygonal approximation. 00176 for (int w = 0; w < words->size(); ++w) { 00177 SetupWordPassN(pass_n, &(*words)[w]); 00178 if (w > 0) (*words)[w].prev_word = &(*words)[w - 1]; 00179 } 00180 } 00181 00182 // Sets up the single word ready for whichever engine is to be run. 00183 void Tesseract::SetupWordPassN(int pass_n, WordData* word) { 00184 if (pass_n == 1 || !word->word->done || tessedit_training_tess) { 00185 if (pass_n == 2) { 00186 // TODO(rays) Should we do this on pass1 too? 00187 word->word->caps_height = 0.0; 00188 if (word->word->x_height == 0.0f) 00189 word->word->x_height = word->row->x_height(); 00190 } 00191 // Cube doesn't get setup for pass2. 00192 if (pass_n != 2 || tessedit_ocr_engine_mode != OEM_CUBE_ONLY) { 00193 word->word->SetupForRecognition( 00194 unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL, 00195 classify_bln_numeric_mode, textord_use_cjk_fp_model, 00196 poly_allow_detailed_fx, word->row, word->block); 00197 } 00198 } 00199 if (!sub_langs_.empty()) { 00200 if (word->lang_words.size() != sub_langs_.size()) { 00201 // Setup the words for all the sub-languages now. 00202 WERD_RES empty; 00203 word->lang_words.init_to_size(sub_langs_.size(), empty); 00204 } 00205 for (int s = 0; s < sub_langs_.size(); ++s) { 00206 Tesseract* lang_t = sub_langs_[s]; 00207 if (pass_n == 1 || (lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY && 00208 (!word->lang_words[s].done || lang_t->tessedit_training_tess))) { 00209 word->lang_words[s].InitForRetryRecognition(*word->word); 00210 word->lang_words[s].SetupForRecognition( 00211 lang_t->unicharset, lang_t, BestPix(), 00212 lang_t->tessedit_ocr_engine_mode, NULL, 00213 lang_t->classify_bln_numeric_mode, 00214 lang_t->textord_use_cjk_fp_model, 00215 lang_t->poly_allow_detailed_fx, word->row, word->block); 00216 } 00217 } 00218 } 00219 } 00220 00221 00222 // Runs word recognition on all the words. 00223 bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor, 00224 GenericVector<WordData>* words) { 00225 // TODO(rays) Before this loop can be parallelized (it would yield a massive 00226 // speed-up) all remaining member globals need to be converted to local/heap 00227 // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be 00228 // added. The results will be significantly different with adaption on, and 00229 // deterioration will need investigation. 00230 for (int w = 0; w < words->size(); ++w) { 00231 WordData* word = &(*words)[w]; 00232 if (monitor != NULL) { 00233 monitor->ocr_alive = TRUE; 00234 if (pass_n == 1) 00235 monitor->progress = 30 + 50 * w / words->size(); 00236 else 00237 monitor->progress = 80 + 10 * w / words->size(); 00238 if (monitor->deadline_exceeded() || 00239 (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, 00240 words->size()))) { 00241 // Timeout. Fake out the rest of the words. 00242 for (; w < words->size(); ++w) { 00243 (*words)[w].word->SetupFake(unicharset); 00244 } 00245 return false; 00246 } 00247 } 00248 if (word->word->tess_failed) continue; 00249 WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1 00250 : &Tesseract::classify_word_pass2; 00251 classify_word_and_language(recognizer, word); 00252 if (tessedit_dump_choices) { 00253 word_dumper(NULL, word->row, word->word); 00254 tprintf("Pass%d: %s [%s]\n", pass_n, 00255 word->word->best_choice->unichar_string().string(), 00256 word->word->best_choice->debug_string().string()); 00257 } 00258 } 00259 return true; 00260 } 00261 00283 bool Tesseract::recog_all_words(PAGE_RES* page_res, 00284 ETEXT_DESC* monitor, 00285 const TBOX* target_word_box, 00286 const char* word_config, 00287 int dopasses) { 00288 PAGE_RES_IT page_res_it(page_res); 00289 00290 if (tessedit_minimal_rej_pass1) { 00291 tessedit_test_adaption.set_value (TRUE); 00292 tessedit_minimal_rejection.set_value (TRUE); 00293 } 00294 00295 if (dopasses==0 || dopasses==1) { 00296 page_res_it.restart_page(); 00297 // ****************** Pass 1 ******************* 00298 00299 // Clear adaptive classifier at the beginning of the page if it is full. 00300 // This is done only at the beginning of the page to ensure that the 00301 // classifier is not reset at an arbitrary point while processing the page, 00302 // which would cripple Passes 2+ if the reset happens towards the end of 00303 // Pass 1 on a page with very difficult text. 00304 // TODO(daria): preemptively clear the classifier if it is almost full. 00305 if (AdaptiveClassifierIsFull()) ResetAdaptiveClassifierInternal(); 00306 // Now check the sub-langs as well. 00307 for (int i = 0; i < sub_langs_.size(); ++i) { 00308 if (sub_langs_[i]->AdaptiveClassifierIsFull()) 00309 sub_langs_[i]->ResetAdaptiveClassifierInternal(); 00310 } 00311 // Set up all words ready for recognition, so that if parallelism is on 00312 // all the input and output classes are ready to run the classifier. 00313 GenericVector<WordData> words; 00314 SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words); 00315 if (tessedit_parallelize) { 00316 PrerecAllWordsPar(words); 00317 } 00318 00319 stats_.word_count = words.size(); 00320 00321 stats_.dict_words = 0; 00322 stats_.doc_blob_quality = 0; 00323 stats_.doc_outline_errs = 0; 00324 stats_.doc_char_quality = 0; 00325 stats_.good_char_count = 0; 00326 stats_.doc_good_char_quality = 0; 00327 00328 most_recently_used_ = this; 00329 // Run pass 1 word recognition. 00330 if (!RecogAllWordsPassN(1, monitor, &words)) return false; 00331 // Pass 1 post-processing. 00332 while (page_res_it.word() != NULL) { 00333 if (page_res_it.word()->word->flag(W_REP_CHAR)) { 00334 fix_rep_char(&page_res_it); 00335 page_res_it.forward(); 00336 continue; 00337 } 00338 00339 // Count dict words. 00340 if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) 00341 ++(stats_.dict_words); 00342 00343 // Update misadaption log (we only need to do it on pass 1, since 00344 // adaption only happens on this pass). 00345 if (page_res_it.word()->blamer_bundle != NULL && 00346 page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) { 00347 page_res->misadaption_log.push_back( 00348 page_res_it.word()->blamer_bundle->misadaption_debug()); 00349 } 00350 00351 page_res_it.forward(); 00352 } 00353 } 00354 00355 if (dopasses == 1) return true; 00356 00357 // ****************** Pass 2 ******************* 00358 if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption) { 00359 page_res_it.restart_page(); 00360 GenericVector<WordData> words; 00361 SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words); 00362 if (tessedit_parallelize) { 00363 PrerecAllWordsPar(words); 00364 } 00365 most_recently_used_ = this; 00366 // Run pass 2 word recognition. 00367 if (!RecogAllWordsPassN(2, monitor, &words)) return false; 00368 // Pass 2 post-processing. 00369 while (page_res_it.word() != NULL) { 00370 WERD_RES* word = page_res_it.word(); 00371 if (word->word->flag(W_REP_CHAR) && !word->done) { 00372 fix_rep_char(&page_res_it); 00373 page_res_it.forward(); 00374 continue; 00375 } 00376 page_res_it.forward(); 00377 } 00378 } 00379 00380 // The next passes can only be run if tesseract has been used, as cube 00381 // doesn't set all the necessary outputs in WERD_RES. 00382 if (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY || 00383 tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { 00384 // ****************** Pass 3 ******************* 00385 // Fix fuzzy spaces. 00386 set_global_loc_code(LOC_FUZZY_SPACE); 00387 00388 if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces 00389 && !tessedit_word_for_word && !right_to_left()) 00390 fix_fuzzy_spaces(monitor, stats_.word_count, page_res); 00391 00392 // ****************** Pass 4 ******************* 00393 if (tessedit_enable_bigram_correction) bigram_correction_pass(page_res); 00394 00395 // ****************** Pass 5,6 ******************* 00396 rejection_passes(page_res, monitor, target_word_box, word_config); 00397 00398 // ****************** Pass 7 ******************* 00399 // Cube combiner. 00400 // If cube is loaded and its combiner is present, run it. 00401 if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { 00402 run_cube_combiner(page_res); 00403 } 00404 00405 // ****************** Pass 8 ******************* 00406 font_recognition_pass(page_res); 00407 00408 // ****************** Pass 9 ******************* 00409 // Check the correctness of the final results. 00410 blamer_pass(page_res); 00411 } 00412 script_pos_pass(page_res); 00413 00414 // Write results pass. 00415 set_global_loc_code(LOC_WRITE_RESULTS); 00416 // This is now redundant, but retained commented so show how to obtain 00417 // bounding boxes and style information. 00418 00419 // changed by jetsoft 00420 // needed for dll to output memory structure 00421 if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) 00422 output_pass(page_res_it, target_word_box); 00423 // end jetsoft 00424 PageSegMode pageseg_mode = static_cast<PageSegMode>( 00425 static_cast<int>(tessedit_pageseg_mode)); 00426 textord_.CleanupSingleRowResult(pageseg_mode, page_res); 00427 00428 if (monitor != NULL) { 00429 monitor->progress = 100; 00430 } 00431 return true; 00432 } 00433 00434 void Tesseract::bigram_correction_pass(PAGE_RES *page_res) { 00435 PAGE_RES_IT word_it(page_res); 00436 00437 WERD_RES *w_prev = NULL; 00438 WERD_RES *w = word_it.word(); 00439 while (1) { 00440 w_prev = w; 00441 while (word_it.forward() != NULL && 00442 (!word_it.word() || word_it.word()->part_of_combo)) { 00443 // advance word_it, skipping over parts of combos 00444 } 00445 if (!word_it.word()) break; 00446 w = word_it.word(); 00447 if (!w || !w_prev || w->uch_set != w_prev->uch_set) { 00448 continue; 00449 } 00450 if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) { 00451 if (tessedit_bigram_debug) { 00452 tprintf("Skipping because one of the words is W_REP_CHAR\n"); 00453 } 00454 continue; 00455 } 00456 // Two words sharing the same language model, excellent! 00457 GenericVector<WERD_CHOICE *> overrides_word1; 00458 GenericVector<WERD_CHOICE *> overrides_word2; 00459 00460 STRING orig_w1_str = w_prev->best_choice->unichar_string(); 00461 STRING orig_w2_str = w->best_choice->unichar_string(); 00462 WERD_CHOICE prev_best(w->uch_set); 00463 { 00464 int w1start, w1end; 00465 w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end); 00466 prev_best = w_prev->best_choice->shallow_copy(w1start, w1end); 00467 } 00468 WERD_CHOICE this_best(w->uch_set); 00469 { 00470 int w2start, w2end; 00471 w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end); 00472 this_best = w->best_choice->shallow_copy(w2start, w2end); 00473 } 00474 00475 if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) { 00476 if (tessedit_bigram_debug) { 00477 tprintf("Top choice \"%s %s\" verified by bigram model.\n", 00478 orig_w1_str.string(), orig_w2_str.string()); 00479 } 00480 continue; 00481 } 00482 if (tessedit_bigram_debug > 2) { 00483 tprintf("Examining alt choices for \"%s %s\".\n", 00484 orig_w1_str.string(), orig_w2_str.string()); 00485 } 00486 if (tessedit_bigram_debug > 1) { 00487 if (!w_prev->best_choices.singleton()) { 00488 w_prev->PrintBestChoices(); 00489 } 00490 if (!w->best_choices.singleton()) { 00491 w->PrintBestChoices(); 00492 } 00493 } 00494 float best_rating = 0.0; 00495 int best_idx = 0; 00496 WERD_CHOICE_IT prev_it(&w_prev->best_choices); 00497 for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) { 00498 WERD_CHOICE *p1 = prev_it.data(); 00499 WERD_CHOICE strip1(w->uch_set); 00500 { 00501 int p1start, p1end; 00502 p1->GetNonSuperscriptSpan(&p1start, &p1end); 00503 strip1 = p1->shallow_copy(p1start, p1end); 00504 } 00505 WERD_CHOICE_IT w_it(&w->best_choices); 00506 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00507 WERD_CHOICE *p2 = w_it.data(); 00508 WERD_CHOICE strip2(w->uch_set); 00509 { 00510 int p2start, p2end; 00511 p2->GetNonSuperscriptSpan(&p2start, &p2end); 00512 strip2 = p2->shallow_copy(p2start, p2end); 00513 } 00514 if (w->tesseract->getDict().valid_bigram(strip1, strip2)) { 00515 overrides_word1.push_back(p1); 00516 overrides_word2.push_back(p2); 00517 if (overrides_word1.size() == 1 || 00518 p1->rating() + p2->rating() < best_rating) { 00519 best_rating = p1->rating() + p2->rating(); 00520 best_idx = overrides_word1.size() - 1; 00521 } 00522 } 00523 } 00524 } 00525 if (overrides_word1.size() >= 1) { 00526 // Excellent, we have some bigram matches. 00527 if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, 00528 *overrides_word1[best_idx]) && 00529 EqualIgnoringCaseAndTerminalPunct(*w->best_choice, 00530 *overrides_word2[best_idx])) { 00531 if (tessedit_bigram_debug > 1) { 00532 tprintf("Top choice \"%s %s\" verified (sans case) by bigram " 00533 "model.\n", orig_w1_str.string(), orig_w2_str.string()); 00534 } 00535 continue; 00536 } 00537 STRING new_w1_str = overrides_word1[best_idx]->unichar_string(); 00538 STRING new_w2_str = overrides_word2[best_idx]->unichar_string(); 00539 if (new_w1_str != orig_w1_str) { 00540 w_prev->ReplaceBestChoice(overrides_word1[best_idx]); 00541 } 00542 if (new_w2_str != orig_w2_str) { 00543 w->ReplaceBestChoice(overrides_word2[best_idx]); 00544 } 00545 if (tessedit_bigram_debug > 0) { 00546 STRING choices_description; 00547 int num_bigram_choices 00548 = overrides_word1.size() * overrides_word2.size(); 00549 if (num_bigram_choices == 1) { 00550 choices_description = "This was the unique bigram choice."; 00551 } else { 00552 if (tessedit_bigram_debug > 1) { 00553 STRING bigrams_list; 00554 const int kMaxChoicesToPrint = 20; 00555 for (int i = 0; i < overrides_word1.size() && 00556 i < kMaxChoicesToPrint; i++) { 00557 if (i > 0) { bigrams_list += ", "; } 00558 WERD_CHOICE *p1 = overrides_word1[i]; 00559 WERD_CHOICE *p2 = overrides_word2[i]; 00560 bigrams_list += p1->unichar_string() + " " + p2->unichar_string(); 00561 if (i == kMaxChoicesToPrint) { 00562 bigrams_list += " ..."; 00563 } 00564 } 00565 choices_description = "There were many choices: {"; 00566 choices_description += bigrams_list; 00567 choices_description += "}"; 00568 } else { 00569 choices_description.add_str_int("There were ", num_bigram_choices); 00570 choices_description += " compatible bigrams."; 00571 } 00572 } 00573 tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", 00574 orig_w1_str.string(), orig_w2_str.string(), 00575 new_w1_str.string(), new_w2_str.string(), 00576 choices_description.string()); 00577 } 00578 } 00579 } 00580 } 00581 00582 void Tesseract::rejection_passes(PAGE_RES* page_res, 00583 ETEXT_DESC* monitor, 00584 const TBOX* target_word_box, 00585 const char* word_config) { 00586 PAGE_RES_IT page_res_it(page_res); 00587 // ****************** Pass 5 ******************* 00588 // Gather statistics on rejects. 00589 int word_index = 0; 00590 while (!tessedit_test_adaption && page_res_it.word() != NULL) { 00591 set_global_loc_code(LOC_MM_ADAPT); 00592 WERD_RES* word = page_res_it.word(); 00593 word_index++; 00594 if (monitor != NULL) { 00595 monitor->ocr_alive = TRUE; 00596 monitor->progress = 95 + 5 * word_index / stats_.word_count; 00597 } 00598 if (word->rebuild_word == NULL) { 00599 // Word was not processed by tesseract. 00600 page_res_it.forward(); 00601 continue; 00602 } 00603 check_debug_pt(word, 70); 00604 00605 // changed by jetsoft 00606 // specific to its needs to extract one word when need 00607 if (target_word_box && 00608 !ProcessTargetWord(word->word->bounding_box(), 00609 *target_word_box, word_config, 4)) { 00610 page_res_it.forward(); 00611 continue; 00612 } 00613 // end jetsoft 00614 00615 page_res_it.rej_stat_word(); 00616 int chars_in_word = word->reject_map.length(); 00617 int rejects_in_word = word->reject_map.reject_count(); 00618 00619 int blob_quality = word_blob_quality(word, page_res_it.row()->row); 00620 stats_.doc_blob_quality += blob_quality; 00621 int outline_errs = word_outline_errs(word); 00622 stats_.doc_outline_errs += outline_errs; 00623 inT16 all_char_quality; 00624 inT16 accepted_all_char_quality; 00625 word_char_quality(word, page_res_it.row()->row, 00626 &all_char_quality, &accepted_all_char_quality); 00627 stats_.doc_char_quality += all_char_quality; 00628 uinT8 permuter_type = word->best_choice->permuter(); 00629 if ((permuter_type == SYSTEM_DAWG_PERM) || 00630 (permuter_type == FREQ_DAWG_PERM) || 00631 (permuter_type == USER_DAWG_PERM)) { 00632 stats_.good_char_count += chars_in_word - rejects_in_word; 00633 stats_.doc_good_char_quality += accepted_all_char_quality; 00634 } 00635 check_debug_pt(word, 80); 00636 if (tessedit_reject_bad_qual_wds && 00637 (blob_quality == 0) && (outline_errs >= chars_in_word)) 00638 word->reject_map.rej_word_bad_quality(); 00639 check_debug_pt(word, 90); 00640 page_res_it.forward(); 00641 } 00642 00643 if (tessedit_debug_quality_metrics) { 00644 tprintf 00645 ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f" 00646 " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n", 00647 page_res->char_count, page_res->rej_count, 00648 page_res->rej_count / static_cast<float>(page_res->char_count), 00649 stats_.doc_blob_quality, 00650 stats_.doc_blob_quality / static_cast<float>(page_res->char_count), 00651 stats_.doc_outline_errs, 00652 stats_.doc_outline_errs / static_cast<float>(page_res->char_count), 00653 stats_.doc_char_quality, 00654 stats_.doc_char_quality / static_cast<float>(page_res->char_count), 00655 stats_.doc_good_char_quality, 00656 (stats_.good_char_count > 0) ? 00657 (stats_.doc_good_char_quality / 00658 static_cast<float>(stats_.good_char_count)) : 0.0); 00659 } 00660 BOOL8 good_quality_doc = 00661 ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= 00662 quality_rej_pc) && 00663 (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= 00664 quality_blob_pc) && 00665 (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= 00666 quality_outline_pc) && 00667 (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= 00668 quality_char_pc); 00669 00670 // ****************** Pass 6 ******************* 00671 // Do whole document or whole block rejection pass 00672 if (!tessedit_test_adaption) { 00673 set_global_loc_code(LOC_DOC_BLK_REJ); 00674 quality_based_rejection(page_res_it, good_quality_doc); 00675 } 00676 } 00677 00678 void Tesseract::blamer_pass(PAGE_RES* page_res) { 00679 if (!wordrec_run_blamer) return; 00680 PAGE_RES_IT page_res_it(page_res); 00681 for (page_res_it.restart_page(); page_res_it.word() != NULL; 00682 page_res_it.forward()) { 00683 WERD_RES *word = page_res_it.word(); 00684 BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word); 00685 page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++; 00686 } 00687 tprintf("Blame reasons:\n"); 00688 for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) { 00689 tprintf("%s %d\n", BlamerBundle::IncorrectReasonName( 00690 static_cast<IncorrectResultReason>(bl)), 00691 page_res->blame_reasons[bl]); 00692 } 00693 if (page_res->misadaption_log.length() > 0) { 00694 tprintf("Misadaption log:\n"); 00695 for (int i = 0; i < page_res->misadaption_log.length(); ++i) { 00696 tprintf("%s\n", page_res->misadaption_log[i].string()); 00697 } 00698 } 00699 } 00700 00701 // Sets script positions and detects smallcaps on all output words. 00702 void Tesseract::script_pos_pass(PAGE_RES* page_res) { 00703 PAGE_RES_IT page_res_it(page_res); 00704 for (page_res_it.restart_page(); page_res_it.word() != NULL; 00705 page_res_it.forward()) { 00706 WERD_RES* word = page_res_it.word(); 00707 if (word->word->flag(W_REP_CHAR)) { 00708 page_res_it.forward(); 00709 continue; 00710 } 00711 float x_height = page_res_it.block()->block->x_height(); 00712 float word_x_height = word->x_height; 00713 if (word_x_height < word->best_choice->min_x_height() || 00714 word_x_height > word->best_choice->max_x_height()) { 00715 word_x_height = (word->best_choice->min_x_height() + 00716 word->best_choice->max_x_height()) / 2.0f; 00717 } 00718 // Test for small caps. Word capheight must be close to block xheight, 00719 // and word must contain no lower case letters, and at least one upper case. 00720 double small_cap_xheight = x_height * kXHeightCapRatio; 00721 double small_cap_delta = (x_height - small_cap_xheight) / 2.0; 00722 if (word->uch_set->script_has_xheight() && 00723 small_cap_xheight - small_cap_delta <= word_x_height && 00724 word_x_height <= small_cap_xheight + small_cap_delta) { 00725 // Scan for upper/lower. 00726 int num_upper = 0; 00727 int num_lower = 0; 00728 for (int i = 0; i < word->best_choice->length(); ++i) { 00729 if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) 00730 ++num_upper; 00731 else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) 00732 ++num_lower; 00733 } 00734 if (num_upper > 0 && num_lower == 0) 00735 word->small_caps = true; 00736 } 00737 word->SetScriptPositions(); 00738 } 00739 } 00740 00741 // Helper returns true if the new_word is better than the word, using a 00742 // simple test of better certainty AND rating (to reduce false positives 00743 // from cube) or a dictionary vs non-dictionary word. 00744 static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word, 00745 double rating_ratio, 00746 double certainty_margin) { 00747 if (new_word.best_choice == NULL) { 00748 return false; // New one no good. 00749 } 00750 if (word.best_choice == NULL) { 00751 return true; // Old one no good. 00752 } 00753 if (new_word.best_choice->certainty() > word.best_choice->certainty() && 00754 new_word.best_choice->rating() < word.best_choice->rating()) { 00755 return true; // New word has better confidence. 00756 } 00757 if (!Dict::valid_word_permuter(word.best_choice->permuter(), false) && 00758 Dict::valid_word_permuter(new_word.best_choice->permuter(), false) && 00759 new_word.best_choice->rating() < 00760 word.best_choice->rating() * rating_ratio && 00761 new_word.best_choice->certainty() > 00762 word.best_choice->certainty() - certainty_margin) { 00763 return true; // New word is from a dictionary. 00764 } 00765 return false; // New word is no better. 00766 } 00767 00768 // Helper to recognize the word using the given (language-specific) tesseract. 00769 // Returns true if the result was better than previously. 00770 bool Tesseract::RetryWithLanguage(const WERD_RES& best_word, 00771 WordData* word_data, WERD_RES* word, 00772 WordRecognizer recognizer) { 00773 if (classify_debug_level || cube_debug_level) { 00774 tprintf("Retrying word using lang %s, oem %d\n", 00775 lang.string(), static_cast<int>(tessedit_ocr_engine_mode)); 00776 } 00777 // Run the recognizer on the word. 00778 // Initial version is a bit of a hack based on better certainty and rating 00779 // (to reduce false positives from cube) or a dictionary vs non-dictionary 00780 // word. 00781 (this->*recognizer)(word_data, word); 00782 bool new_is_better = NewWordBetter(best_word, *word, 00783 classify_max_rating_ratio, 00784 classify_max_certainty_margin); 00785 if (classify_debug_level || cube_debug_level) { 00786 if (word->best_choice == NULL) { 00787 tprintf("NULL result %s better!\n", 00788 new_is_better ? "IS" : "NOT"); 00789 } else { 00790 tprintf("New result %s better:%s, r=%g, c=%g\n", 00791 new_is_better ? "IS" : "NOT", 00792 word->best_choice->unichar_string().string(), 00793 word->best_choice->rating(), 00794 word->best_choice->certainty()); 00795 } 00796 } 00797 return new_is_better; 00798 } 00799 00800 // Generic function for classifying a word. Can be used either for pass1 or 00801 // pass2 according to the function passed to recognizer. 00802 // word block and row are the current location in the document's PAGE_RES. 00803 // Recognizes in the current language, and if successful that is all. 00804 // If recognition was not successful, tries all available languages until 00805 // it gets a successful result or runs out of languages. Keeps the best result. 00806 void Tesseract::classify_word_and_language(WordRecognizer recognizer, 00807 WordData* word_data) { 00808 // Points to the best result. May be word or in lang_words. 00809 WERD_RES* word = word_data->word; 00810 clock_t start_t = clock(); 00811 if (classify_debug_level || cube_debug_level) { 00812 tprintf("Processing word with lang %s at:", 00813 most_recently_used_->lang.string()); 00814 word->word->bounding_box().print(); 00815 } 00816 const char* result_type = "Initial"; 00817 bool initially_done = !word->tess_failed && word->done; 00818 if (initially_done) { 00819 // If done on pass1, leave it as-is. 00820 most_recently_used_ = word->tesseract; 00821 result_type = "Already done"; 00822 } else { 00823 if (most_recently_used_ != this) { 00824 // Point to the word for most_recently_used_. 00825 for (int s = 0; s < sub_langs_.size(); ++s) { 00826 if (most_recently_used_ == sub_langs_[s]) { 00827 word = &word_data->lang_words[s]; 00828 break; 00829 } 00830 } 00831 } 00832 (most_recently_used_->*recognizer)(word_data, word); 00833 if (!word->tess_failed && word->tess_accepted) 00834 result_type = "Accepted"; 00835 } 00836 if (classify_debug_level || cube_debug_level) { 00837 tprintf("%s result: %s r=%.4g, c=%.4g, accepted=%d, adaptable=%d" 00838 " xht=[%g,%g]\n", 00839 result_type, 00840 word->best_choice->unichar_string().string(), 00841 word->best_choice->rating(), 00842 word->best_choice->certainty(), 00843 word->tess_accepted, word->tess_would_adapt, 00844 word->best_choice->min_x_height(), 00845 word->best_choice->max_x_height()); 00846 } 00847 if (word->tess_failed || !word->tess_accepted) { 00848 // Try all the other languages to see if they are any better. 00849 Tesseract* previous_used = most_recently_used_; 00850 if (most_recently_used_ != this) { 00851 if (classify_debug_level) { 00852 tprintf("Retrying with main-Tesseract, lang: %s\n", lang.string()); 00853 } 00854 if (word_data->word->tesseract == this) { 00855 // This is pass1, and we are trying the main language. 00856 if (RetryWithLanguage(*word, word_data, word_data->word, recognizer)) { 00857 most_recently_used_ = this; 00858 word = word_data->word; 00859 } 00860 } else { 00861 // This is pass2, and we are trying the main language again, but it 00862 // has no word allocated to it, so we must re-initialize it. 00863 WERD_RES main_word(*word_data->word); 00864 main_word.InitForRetryRecognition(*word_data->word); 00865 main_word.SetupForRecognition(unicharset, this, BestPix(), 00866 tessedit_ocr_engine_mode, NULL, 00867 classify_bln_numeric_mode, 00868 textord_use_cjk_fp_model, 00869 poly_allow_detailed_fx, 00870 word_data->row, word_data->block); 00871 if (RetryWithLanguage(*word, word_data, &main_word, recognizer)) { 00872 most_recently_used_ = this; 00873 word_data->word->ConsumeWordResults(&main_word); 00874 word = word_data->word; 00875 } 00876 } 00877 if (!word->tess_failed && word->tess_accepted) 00878 return; // No need to look at the others. 00879 } 00880 00881 for (int i = 0; i < sub_langs_.size(); ++i) { 00882 if (sub_langs_[i] != previous_used) { 00883 if (classify_debug_level) { 00884 tprintf("Retrying with sub-Tesseract[%d] lang: %s\n", 00885 i, sub_langs_[i]->lang.string()); 00886 } 00887 if (sub_langs_[i]->RetryWithLanguage(*word, word_data, 00888 &word_data->lang_words[i], 00889 recognizer)) { 00890 most_recently_used_ = sub_langs_[i]; 00891 word = &word_data->lang_words[i]; 00892 if (!word->tess_failed && word->tess_accepted) 00893 break; // No need to look at the others. 00894 } 00895 } 00896 } 00897 } 00898 if (word != word_data->word) { 00899 // Move the result for the best language to the main word. 00900 word_data->word->ConsumeWordResults(word); 00901 } 00902 clock_t ocr_t = clock(); 00903 if (tessedit_timing_debug) { 00904 tprintf("%s (ocr took %.2f sec)\n", 00905 word->best_choice->unichar_string().string(), 00906 static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC); 00907 } 00908 } 00909 00916 void Tesseract::classify_word_pass1(WordData* word_data, WERD_RES* word) { 00917 ROW* row = word_data->row; 00918 BLOCK* block = word_data->block; 00919 prev_word_best_choice_ = word_data->prev_word != NULL 00920 ? word_data->prev_word->word->best_choice : NULL; 00921 // If we only intend to run cube - run it and return. 00922 if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { 00923 cube_word_pass1(block, row, word); 00924 return; 00925 } 00926 match_word_pass_n(1, word, row, block); 00927 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) { 00928 word->tess_would_adapt = AdaptableWord(word); 00929 bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode); 00930 00931 if (adapt_ok) { 00932 // Send word to adaptive classifier for training. 00933 word->BestChoiceToCorrectText(); 00934 LearnWord(NULL, word); 00935 // Mark misadaptions if running blamer. 00936 if (word->blamer_bundle != NULL) { 00937 word->blamer_bundle->SetMisAdaptionDebug(word->best_choice, 00938 wordrec_debug_blamer); 00939 } 00940 } 00941 00942 if (tessedit_enable_doc_dict && !word->IsAmbiguous()) 00943 tess_add_doc_word(word->best_choice); 00944 } 00945 } 00946 00947 // Helper to report the result of the xheight fix. 00948 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, 00949 WERD_RES* word, WERD_RES* new_word) { 00950 tprintf("New XHT Match:%s = %s ", 00951 word->best_choice->unichar_string().string(), 00952 word->best_choice->debug_string().string()); 00953 word->reject_map.print(debug_fp); 00954 tprintf(" -> %s = %s ", 00955 new_word->best_choice->unichar_string().string(), 00956 new_word->best_choice->debug_string().string()); 00957 new_word->reject_map.print(debug_fp); 00958 tprintf(" %s->%s %s %s\n", 00959 word->guessed_x_ht ? "GUESS" : "CERT", 00960 new_word->guessed_x_ht ? "GUESS" : "CERT", 00961 new_x_ht > 0.1 ? "STILL DOUBT" : "OK", 00962 accept_new_word ? "ACCEPTED" : ""); 00963 } 00964 00965 // Run the x-height fix-up, based on min/max top/bottom information in 00966 // unicharset. 00967 // Returns true if the word was changed. 00968 // See the comment in fixxht.cpp for a description of the overall process. 00969 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) { 00970 bool accept_new_x_ht = false; 00971 int original_misfits = CountMisfitTops(word); 00972 if (original_misfits == 0) 00973 return false; 00974 float new_x_ht = ComputeCompatibleXheight(word); 00975 if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) { 00976 WERD_RES new_x_ht_word(word->word); 00977 if (word->blamer_bundle != NULL) { 00978 new_x_ht_word.blamer_bundle = new BlamerBundle(); 00979 new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle)); 00980 } 00981 new_x_ht_word.x_height = new_x_ht; 00982 new_x_ht_word.caps_height = 0.0; 00983 new_x_ht_word.SetupForRecognition( 00984 unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL, 00985 classify_bln_numeric_mode, textord_use_cjk_fp_model, 00986 poly_allow_detailed_fx, row, block); 00987 match_word_pass_n(2, &new_x_ht_word, row, block); 00988 if (!new_x_ht_word.tess_failed) { 00989 int new_misfits = CountMisfitTops(&new_x_ht_word); 00990 if (debug_x_ht_level >= 1) { 00991 tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", 00992 original_misfits, word->x_height, 00993 new_misfits, new_x_ht); 00994 tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", 00995 word->best_choice->rating(), word->best_choice->certainty(), 00996 new_x_ht_word.best_choice->rating(), 00997 new_x_ht_word.best_choice->certainty()); 00998 } 00999 // The misfits must improve and either the rating or certainty. 01000 accept_new_x_ht = new_misfits < original_misfits && 01001 (new_x_ht_word.best_choice->certainty() > 01002 word->best_choice->certainty() || 01003 new_x_ht_word.best_choice->rating() < 01004 word->best_choice->rating()); 01005 if (debug_x_ht_level >= 1) { 01006 ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word); 01007 } 01008 } 01009 if (accept_new_x_ht) { 01010 word->ConsumeWordResults(&new_x_ht_word); 01011 return true; 01012 } 01013 } 01014 return false; 01015 } 01016 01023 void Tesseract::classify_word_pass2(WordData* word_data, WERD_RES* word) { 01024 // Return if we do not want to run Tesseract. 01025 if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY && 01026 tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED) 01027 return; 01028 ROW* row = word_data->row; 01029 BLOCK* block = word_data->block; 01030 prev_word_best_choice_ = word_data->prev_word != NULL 01031 ? word_data->prev_word->word->best_choice : NULL; 01032 01033 set_global_subloc_code(SUBLOC_NORM); 01034 check_debug_pt(word, 30); 01035 if (!word->done || tessedit_training_tess) { 01036 word->caps_height = 0.0; 01037 if (word->x_height == 0.0f) 01038 word->x_height = row->x_height(); 01039 match_word_pass_n(2, word, row, block); 01040 check_debug_pt(word, 40); 01041 } 01042 01043 SubAndSuperscriptFix(word); 01044 01045 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) { 01046 if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() && 01047 block->classify_rotation().y() == 0.0f) { 01048 // Use the tops and bottoms since they are available. 01049 TrainedXheightFix(word, block, row); 01050 } 01051 01052 set_global_subloc_code(SUBLOC_NORM); 01053 } 01054 #ifndef GRAPHICS_DISABLED 01055 if (tessedit_display_outwords) { 01056 if (fx_win == NULL) 01057 create_fx_win(); 01058 clear_fx_win(); 01059 word->rebuild_word->plot(fx_win); 01060 TBOX wbox = word->rebuild_word->bounding_box(); 01061 fx_win->ZoomToRectangle(wbox.left(), wbox.top(), 01062 wbox.right(), wbox.bottom()); 01063 ScrollView::Update(); 01064 } 01065 #endif 01066 set_global_subloc_code(SUBLOC_NORM); 01067 check_debug_pt(word, 50); 01068 } 01069 01070 01077 void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, 01078 ROW *row, BLOCK* block) { 01079 if (word->tess_failed) return; 01080 tess_segment_pass_n(pass_n, word); 01081 01082 if (!word->tess_failed) { 01083 if (!word->word->flag (W_REP_CHAR)) { 01084 word->fix_quotes(); 01085 if (tessedit_fix_hyphens) 01086 word->fix_hyphens(); 01087 /* Dont trust fix_quotes! - though I think I've fixed the bug */ 01088 if (word->best_choice->length() != word->box_word->length()) { 01089 tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;" 01090 " #Blobs=%d\n", 01091 word->best_choice->debug_string().string(), 01092 word->best_choice->length(), 01093 word->box_word->length()); 01094 01095 } 01096 word->tess_accepted = tess_acceptable_word(word); 01097 01098 // Also sets word->done flag 01099 make_reject_map(word, row, pass_n); 01100 } 01101 } 01102 set_word_fonts(word); 01103 01104 ASSERT_HOST(word->raw_choice != NULL); 01105 } 01106 01107 // Helper to return the best rated BLOB_CHOICE in the whole word that matches 01108 // the given char_id, or NULL if none can be found. 01109 static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id, 01110 WERD_RES* word_res) { 01111 // Find the corresponding best BLOB_CHOICE from any position in the word_res. 01112 BLOB_CHOICE* best_choice = NULL; 01113 for (int i = 0; i < word_res->best_choice->length(); ++i) { 01114 BLOB_CHOICE* choice = FindMatchingChoice(char_id, 01115 word_res->GetBlobChoices(i)); 01116 if (choice != NULL) { 01117 if (best_choice == NULL || choice->rating() < best_choice->rating()) 01118 best_choice = choice; 01119 } 01120 } 01121 return best_choice; 01122 } 01123 01124 // Helper to insert blob_choice in each location in the leader word if there is 01125 // no matching BLOB_CHOICE there already, and correct any incorrect results 01126 // in the best_choice. 01127 static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice, 01128 WERD_RES* word_res) { 01129 WERD_CHOICE* word = word_res->best_choice; 01130 for (int i = 0; i < word_res->best_choice->length(); ++i) { 01131 BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(), 01132 word_res->GetBlobChoices(i)); 01133 if (choice == NULL) { 01134 BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i)); 01135 choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice)); 01136 } 01137 } 01138 // Correct any incorrect results in word. 01139 for (int i = 0; i < word->length(); ++i) { 01140 if (word->unichar_id(i) != blob_choice->unichar_id()) 01141 word->set_unichar_id(blob_choice->unichar_id(), i); 01142 } 01143 } 01144 01152 void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) { 01153 WERD_RES *word_res = page_res_it->word(); 01154 const WERD_CHOICE &word = *(word_res->best_choice); 01155 01156 // Find the frequency of each unique character in the word. 01157 UNICHAR_ID space = word_res->uch_set->unichar_to_id(" "); 01158 SortHelper<UNICHAR_ID> rep_ch(word.length()); 01159 for (int i = 0; i < word.length(); ++i) { 01160 if (word.unichar_id(i) != space) 01161 rep_ch.Add(word.unichar_id(i), 1); 01162 } 01163 01164 // Find the most frequent result. 01165 UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char 01166 int max_count = rep_ch.MaxCount(&maxch_id); 01167 // Find the best exemplar of a classifier result for maxch_id. 01168 BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res); 01169 if (best_choice == NULL) { 01170 tprintf("Failed to find a choice for %s, occurring %d times\n", 01171 word_res->uch_set->debug_str(maxch_id).string(), max_count); 01172 return; 01173 } 01174 word_res->done = TRUE; 01175 01176 // Measure the mean space. 01177 int total_gap = 0; 01178 int gap_count = 0; 01179 WERD* werd = word_res->word; 01180 C_BLOB_IT blob_it(werd->cblob_list()); 01181 C_BLOB* prev_blob = blob_it.data(); 01182 for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) { 01183 C_BLOB* blob = blob_it.data(); 01184 int gap = blob->bounding_box().left(); 01185 gap -= prev_blob->bounding_box().right(); 01186 total_gap += gap; 01187 ++gap_count; 01188 prev_blob = blob; 01189 } 01190 if (total_gap > word_res->x_height * gap_count * kRepcharGapThreshold) { 01191 // Needs spaces between. 01192 ExplodeRepeatedWord(best_choice, page_res_it); 01193 } else { 01194 // Just correct existing classification. 01195 CorrectRepcharChoices(best_choice, word_res); 01196 word_res->reject_map.initialise(word.length()); 01197 } 01198 } 01199 01200 // Explode the word at the given iterator location into individual words 01201 // of a single given unichar_id defined by best_choice. 01202 // The original word is deleted, and the replacements copy most of their 01203 // fields from the original. 01204 void Tesseract::ExplodeRepeatedWord(BLOB_CHOICE* best_choice, 01205 PAGE_RES_IT* page_res_it) { 01206 WERD_RES *word_res = page_res_it->word(); 01207 ASSERT_HOST(best_choice != NULL); 01208 01209 // Make a new word for each blob in the original. 01210 WERD* werd = word_res->word; 01211 C_BLOB_IT blob_it(werd->cblob_list()); 01212 for (; !blob_it.empty(); blob_it.forward()) { 01213 bool first_blob = blob_it.at_first(); 01214 bool last_blob = blob_it.at_last(); 01215 WERD* blob_word = werd->ConstructFromSingleBlob(first_blob, last_blob, 01216 blob_it.extract()); 01217 // Note that blamer_bundle (truth information) is not copied, which is 01218 // desirable, since the newly inserted words would not have the original 01219 // bounding box corresponding to the one recorded in truth fields. 01220 WERD_RES* rep_word = 01221 page_res_it->InsertSimpleCloneWord(*word_res, blob_word); 01222 // Setup the single char WERD_RES 01223 if (rep_word->SetupForRecognition(*word_res->uch_set, this, BestPix(), 01224 tessedit_ocr_engine_mode, NULL, false, 01225 textord_use_cjk_fp_model, 01226 poly_allow_detailed_fx, 01227 page_res_it->row()->row, 01228 page_res_it->block()->block)) { 01229 rep_word->CloneChoppedToRebuild(); 01230 BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice); 01231 rep_word->FakeClassifyWord(1, &blob_choice); 01232 } 01233 } 01234 page_res_it->DeleteCurrentWord(); 01235 } 01236 01237 ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string( 01238 const UNICHARSET& char_set, const char *s, const char *lengths) { 01239 int i = 0; 01240 int offset = 0; 01241 int leading_punct_count; 01242 int upper_count = 0; 01243 int hyphen_pos = -1; 01244 ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE; 01245 01246 if (strlen (lengths) > 20) 01247 return word_type; 01248 01249 /* Single Leading punctuation char*/ 01250 01251 if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset])) 01252 offset += lengths[i++]; 01253 leading_punct_count = i; 01254 01255 /* Initial cap */ 01256 while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) { 01257 offset += lengths[i++]; 01258 upper_count++; 01259 } 01260 if (upper_count > 1) { 01261 word_type = AC_UPPER_CASE; 01262 } else { 01263 /* Lower case word, possibly with an initial cap */ 01264 while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) { 01265 offset += lengths[i++]; 01266 } 01267 if (i - leading_punct_count < quality_min_initial_alphas_reqd) 01268 goto not_a_word; 01269 /* 01270 Allow a single hyphen in a lower case word 01271 - dont trust upper case - I've seen several cases of "H" -> "I-I" 01272 */ 01273 if (lengths[i] == 1 && s[offset] == '-') { 01274 hyphen_pos = i; 01275 offset += lengths[i++]; 01276 if (s[offset] != '\0') { 01277 while ((s[offset] != '\0') && 01278 char_set.get_islower(s + offset, lengths[i])) { 01279 offset += lengths[i++]; 01280 } 01281 if (i < hyphen_pos + 3) 01282 goto not_a_word; 01283 } 01284 } else { 01285 /* Allow "'s" in NON hyphenated lower case words */ 01286 if (lengths[i] == 1 && (s[offset] == '\'') && 01287 lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) { 01288 offset += lengths[i++]; 01289 offset += lengths[i++]; 01290 } 01291 } 01292 if (upper_count > 0) 01293 word_type = AC_INITIAL_CAP; 01294 else 01295 word_type = AC_LOWER_CASE; 01296 } 01297 01298 /* Up to two different, constrained trailing punctuation chars */ 01299 if (lengths[i] == 1 && s[offset] != '\0' && 01300 STRING(chs_trailing_punct1).contains(s[offset])) 01301 offset += lengths[i++]; 01302 if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && 01303 s[offset - lengths[i - 1]] != s[offset] && 01304 STRING(chs_trailing_punct2).contains (s[offset])) 01305 offset += lengths[i++]; 01306 01307 if (s[offset] != '\0') 01308 word_type = AC_UNACCEPTABLE; 01309 01310 not_a_word: 01311 01312 if (word_type == AC_UNACCEPTABLE) { 01313 /* Look for abbreviation string */ 01314 i = 0; 01315 offset = 0; 01316 if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) { 01317 word_type = AC_UC_ABBREV; 01318 while (s[offset] != '\0' && 01319 char_set.get_isupper(s + offset, lengths[i]) && 01320 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { 01321 offset += lengths[i++]; 01322 offset += lengths[i++]; 01323 } 01324 } 01325 else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) { 01326 word_type = AC_LC_ABBREV; 01327 while (s[offset] != '\0' && 01328 char_set.get_islower(s + offset, lengths[i]) && 01329 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') { 01330 offset += lengths[i++]; 01331 offset += lengths[i++]; 01332 } 01333 } 01334 if (s[offset] != '\0') 01335 word_type = AC_UNACCEPTABLE; 01336 } 01337 01338 return word_type; 01339 } 01340 01341 BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) { 01342 BOOL8 show_map_detail = FALSE; 01343 inT16 i; 01344 01345 #ifndef SECURE_NAMES 01346 if (!test_pt) 01347 return FALSE; 01348 01349 tessedit_rejection_debug.set_value (FALSE); 01350 debug_x_ht_level.set_value (0); 01351 01352 if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) { 01353 if (location < 0) 01354 return TRUE; // For breakpoint use 01355 tessedit_rejection_debug.set_value (TRUE); 01356 debug_x_ht_level.set_value (20); 01357 tprintf ("\n\nTESTWD::"); 01358 switch (location) { 01359 case 0: 01360 tprintf ("classify_word_pass1 start\n"); 01361 word->word->print(); 01362 break; 01363 case 10: 01364 tprintf ("make_reject_map: initial map"); 01365 break; 01366 case 20: 01367 tprintf ("make_reject_map: after NN"); 01368 break; 01369 case 30: 01370 tprintf ("classify_word_pass2 - START"); 01371 break; 01372 case 40: 01373 tprintf ("classify_word_pass2 - Pre Xht"); 01374 break; 01375 case 50: 01376 tprintf ("classify_word_pass2 - END"); 01377 show_map_detail = TRUE; 01378 break; 01379 case 60: 01380 tprintf ("fixspace"); 01381 break; 01382 case 70: 01383 tprintf ("MM pass START"); 01384 break; 01385 case 80: 01386 tprintf ("MM pass END"); 01387 break; 01388 case 90: 01389 tprintf ("After Poor quality rejection"); 01390 break; 01391 case 100: 01392 tprintf ("unrej_good_quality_words - START"); 01393 break; 01394 case 110: 01395 tprintf ("unrej_good_quality_words - END"); 01396 break; 01397 case 120: 01398 tprintf ("Write results pass"); 01399 show_map_detail = TRUE; 01400 break; 01401 } 01402 tprintf(" \"%s\" ", 01403 word->best_choice->unichar_string().string()); 01404 word->reject_map.print (debug_fp); 01405 tprintf ("\n"); 01406 if (show_map_detail) { 01407 tprintf ("\"%s\"\n", word->best_choice->unichar_string().string()); 01408 for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { 01409 tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); 01410 word->reject_map[i].full_print(debug_fp); 01411 } 01412 } 01413 tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); 01414 tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); 01415 return TRUE; 01416 } 01417 else 01418 #endif 01419 return FALSE; 01420 } 01421 01427 static void find_modal_font( //good chars in word 01428 STATS *fonts, //font stats 01429 inT16 *font_out, //output font 01430 inT8 *font_count //output count 01431 ) { 01432 inT16 font; //font index 01433 inT32 count; //pile couat 01434 01435 if (fonts->get_total () > 0) { 01436 font = (inT16) fonts->mode (); 01437 *font_out = font; 01438 count = fonts->pile_count (font); 01439 *font_count = count < MAX_INT8 ? count : MAX_INT8; 01440 fonts->add (font, -*font_count); 01441 } 01442 else { 01443 *font_out = -1; 01444 *font_count = 0; 01445 } 01446 } 01447 01453 void Tesseract::set_word_fonts(WERD_RES *word) { 01454 // Don't try to set the word fonts for a cube word, as the configs 01455 // will be meaningless. 01456 if (word->chopped_word == NULL) return; 01457 ASSERT_HOST(word->best_choice != NULL); 01458 01459 inT32 index; // char id index 01460 // character iterator 01461 BLOB_CHOICE_IT choice_it; // choice iterator 01462 int fontinfo_size = get_fontinfo_table().size(); 01463 int fontset_size = get_fontset_table().size(); 01464 if (fontinfo_size == 0 || fontset_size == 0) return; 01465 STATS fonts(0, fontinfo_size); // font counters 01466 01467 word->italic = 0; 01468 word->bold = 0; 01469 if (!word->best_choice_fontinfo_ids.empty()) { 01470 word->best_choice_fontinfo_ids.clear(); 01471 } 01472 // Compute the modal font for the word 01473 for (index = 0; index < word->best_choice->length(); ++index) { 01474 UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index); 01475 choice_it.set_to_list(word->GetBlobChoices(index)); 01476 if (tessedit_debug_fonts) { 01477 tprintf("Examining fonts in %s\n", 01478 word->best_choice->debug_string().string()); 01479 } 01480 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); 01481 choice_it.forward()) { 01482 UNICHAR_ID blob_ch_id = choice_it.data()->unichar_id(); 01483 if (blob_ch_id == word_ch_id) { 01484 if (tessedit_debug_fonts) { 01485 tprintf("%s font %s (%d) font2 %s (%d)\n", 01486 word->uch_set->id_to_unichar(blob_ch_id), 01487 choice_it.data()->fontinfo_id() < 0 ? "unknown" : 01488 fontinfo_table_.get(choice_it.data()->fontinfo_id()).name, 01489 choice_it.data()->fontinfo_id(), 01490 choice_it.data()->fontinfo_id2() < 0 ? "unknown" : 01491 fontinfo_table_.get(choice_it.data()->fontinfo_id2()).name, 01492 choice_it.data()->fontinfo_id2()); 01493 } 01494 // 1st choice font gets 2 pts, 2nd choice 1 pt. 01495 if (choice_it.data()->fontinfo_id() >= 0) { 01496 fonts.add(choice_it.data()->fontinfo_id(), 2); 01497 } 01498 if (choice_it.data()->fontinfo_id2() >= 0) { 01499 fonts.add(choice_it.data()->fontinfo_id2(), 1); 01500 } 01501 break; 01502 } 01503 } 01504 } 01505 inT16 font_id1, font_id2; 01506 find_modal_font(&fonts, &font_id1, &word->fontinfo_id_count); 01507 find_modal_font(&fonts, &font_id2, &word->fontinfo_id2_count); 01508 word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL; 01509 word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL; 01510 // All the blobs get the word's best choice font. 01511 for (int i = 0; i < word->best_choice->length(); ++i) { 01512 word->best_choice_fontinfo_ids.push_back(font_id1); 01513 } 01514 if (word->fontinfo_id_count > 0) { 01515 FontInfo fi = fontinfo_table_.get(font_id1); 01516 if (tessedit_debug_fonts) { 01517 if (word->fontinfo_id2_count > 0) { 01518 tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", 01519 fi.name, word->fontinfo_id_count, 01520 fontinfo_table_.get(font_id2).name, 01521 word->fontinfo_id2_count); 01522 } else { 01523 tprintf("Word modal font=%s, score=%d. No 2nd choice\n", 01524 fi.name, word->fontinfo_id_count); 01525 } 01526 } 01527 // 1st choices got 2 pts, so we need to halve the score for the mode. 01528 word->italic = (fi.is_italic() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2; 01529 word->bold = (fi.is_bold() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2; 01530 } 01531 } 01532 01533 01540 void Tesseract::font_recognition_pass(PAGE_RES* page_res) { 01541 PAGE_RES_IT page_res_it(page_res); 01542 WERD_RES *word; // current word 01543 STATS doc_fonts(0, font_table_size_); // font counters 01544 01545 // Gather font id statistics. 01546 for (page_res_it.restart_page(); page_res_it.word() != NULL; 01547 page_res_it.forward()) { 01548 word = page_res_it.word(); 01549 if (word->fontinfo != NULL) { 01550 doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count); 01551 } 01552 if (word->fontinfo2 != NULL) { 01553 doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count); 01554 } 01555 } 01556 inT16 doc_font; // modal font 01557 inT8 doc_font_count; // modal font 01558 find_modal_font(&doc_fonts, &doc_font, &doc_font_count); 01559 if (doc_font_count == 0) 01560 return; 01561 // Get the modal font pointer. 01562 const FontInfo* modal_font = NULL; 01563 for (page_res_it.restart_page(); page_res_it.word() != NULL; 01564 page_res_it.forward()) { 01565 word = page_res_it.word(); 01566 if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) { 01567 modal_font = word->fontinfo; 01568 break; 01569 } 01570 if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) { 01571 modal_font = word->fontinfo2; 01572 break; 01573 } 01574 } 01575 ASSERT_HOST(modal_font != NULL); 01576 01577 // Assign modal font to weak words. 01578 for (page_res_it.restart_page(); page_res_it.word() != NULL; 01579 page_res_it.forward()) { 01580 word = page_res_it.word(); 01581 int length = word->best_choice->length(); 01582 01583 // 1st choices got 2 pts, so we need to halve the score for the mode. 01584 int count = (word->fontinfo_id_count + 1) / 2; 01585 if (!(count == length || (length > 3 && count >= length * 3 / 4))) { 01586 word->fontinfo = modal_font; 01587 // Counts only get 1 as it came from the doc. 01588 word->fontinfo_id_count = 1; 01589 word->italic = modal_font->is_italic() ? 1 : -1; 01590 word->bold = modal_font->is_bold() ? 1 : -1; 01591 } 01592 } 01593 } 01594 01595 } // namespace tesseract