tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/control.cpp
Go to the documentation of this file.
00001 /******************************************************************
00002  * File:        control.cpp  (Formerly control.c)
00003  * Description: Module-independent matcher controller.
00004  * Author:                                      Ray Smith
00005  * Created:                                     Thu Apr 23 11:09:58 BST 1992
00006  * ReHacked:    Tue Sep 22 08:42:49 BST 1992 Phil Cheatle
00007  *
00008  * (C) Copyright 1992, Hewlett-Packard Ltd.
00009  ** Licensed under the Apache License, Version 2.0 (the "License");
00010  ** you may not use this file except in compliance with the License.
00011  ** You may obtain a copy of the License at
00012  ** http://www.apache.org/licenses/LICENSE-2.0
00013  ** Unless required by applicable law or agreed to in writing, software
00014  ** distributed under the License is distributed on an "AS IS" BASIS,
00015  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016  ** See the License for the specific language governing permissions and
00017  ** limitations under the License.
00018  *
00019  **********************************************************************/
00020 
00021 #include <string.h>
00022 #include <math.h>
00023 #ifdef __UNIX__
00024 #include <assert.h>
00025 #include <unistd.h>
00026 #include <errno.h>
00027 #endif
00028 #include <ctype.h>
00029 #include "ocrclass.h"
00030 #include "werdit.h"
00031 #include "drawfx.h"
00032 #include "tessbox.h"
00033 #include "tessvars.h"
00034 #include "pgedit.h"
00035 #include "reject.h"
00036 #include "fixspace.h"
00037 #include "docqual.h"
00038 #include "control.h"
00039 #include "secname.h"
00040 #include "output.h"
00041 #include "callcpp.h"
00042 #include "globals.h"
00043 #include "sorthelper.h"
00044 #include "tesseractclass.h"
00045 
00046 // Include automatically generated configuration file if running autoconf.
00047 #ifdef HAVE_CONFIG_H
00048 #include "config_auto.h"
00049 #endif
00050 
// NOTE(review): MIN_FONT_ROW_COUNT and MAX_XHEIGHT_DIFF are not referenced in
// the part of this file visible here; presumably they are limits used by the
// font/x-height code further down -- confirm against their users before
// changing them.
00051 #define MIN_FONT_ROW_COUNT  8
00052 #define MAX_XHEIGHT_DIFF  3
00053 
// Fixed name of the temporary file in which ProcessTargetWord saves the
// current parameter values before a per-word config file is applied.
00054 const char* const kBackUpConfigFile = "tempconfigdata.config";
00055 // Multiple of x-height to make a repeated word have spaces in it.
00056 const double kRepcharGapThreshold = 0.5;
00057 // Min believable x-height for any text when refitting, as a fraction of
00058 // the original x-height.
00059 const double kMinRefitXHeightFraction = 0.5;
00060 
00061 
00070 namespace tesseract {
00071 void Tesseract::recog_pseudo_word(PAGE_RES* page_res,
00072                                   TBOX &selection_box) {
00073   WERD *word;
00074   ROW *pseudo_row;               // row of word
00075   BLOCK *pseudo_block;           // block of word
00076 
00077   word = make_pseudo_word(page_res, selection_box,
00078                           pseudo_block, pseudo_row);
00079   if (word != NULL) {
00080     WERD_RES word_res(word);
00081     recog_interactive(pseudo_block, pseudo_row, &word_res);
00082     delete word;
00083   }
00084 }
00085 
00086 
00096 BOOL8 Tesseract::recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res) {
00097   inT16 char_qual;
00098   inT16 good_char_qual;
00099 
00100   WordData word_data(block, row, word_res);
00101   SetupWordPassN(2, &word_data);
00102   classify_word_and_language(&Tesseract::classify_word_pass2, &word_data);
00103   if (tessedit_debug_quality_metrics) {
00104     word_char_quality(word_res, row, &char_qual, &good_char_qual);
00105     tprintf
00106       ("\n%d chars;  word_blob_quality: %d;  outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
00107       word_res->reject_map.length(), word_blob_quality(word_res, row),
00108       word_outline_errs(word_res), char_qual, good_char_qual);
00109   }
00110   return TRUE;
00111 }
00112 
00113 // Helper function to check for a target word and handle it appropriately.
00114 // Inspired by Jetsoft's requirement to process only single words on pass2
00115 // and beyond.
00116 // If word_config is not null:
00117 //   If the word_box and target_word_box overlap, read the word_config file
00118 //   else reset to previous config data.
00119 //   return true.
00120 // else
00121 //   If the word_box and target_word_box overlap or pass <= 1, return true.
00122 // Note that this function uses a fixed temporary file for storing the previous
00123 // configs, so it is neither thread-safe, nor process-safe, but the assumption
00124 // is that it will only be used for one debug window at a time.
00125 //
00126 // Since this function is used for debugging (and not to change OCR results)
00127 // set only debug params from the word config file.
00128 bool Tesseract::ProcessTargetWord(const TBOX& word_box,
00129                                   const TBOX& target_word_box,
00130                                   const char* word_config,
00131                                   int pass) {
00132   if (word_config != NULL) {
00133     if (word_box.major_overlap(target_word_box)) {
00134       if (backup_config_file_ == NULL) {
00135         backup_config_file_ = kBackUpConfigFile;
00136         FILE* config_fp = fopen(backup_config_file_, "wb");
00137         ParamUtils::PrintParams(config_fp, params());
00138         fclose(config_fp);
00139         ParamUtils::ReadParamsFile(word_config,
00140                                    SET_PARAM_CONSTRAINT_DEBUG_ONLY,
00141                                    params());
00142       }
00143     } else {
00144       if (backup_config_file_ != NULL) {
00145         ParamUtils::ReadParamsFile(backup_config_file_,
00146                                    SET_PARAM_CONSTRAINT_DEBUG_ONLY,
00147                                    params());
00148         backup_config_file_ = NULL;
00149       }
00150     }
00151   } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
00152     return false;
00153   }
00154   return true;
00155 }
00156 
00157 // If tesseract is to be run, sets the words up ready for it.
00158 void Tesseract::SetupAllWordsPassN(int pass_n,
00159                                    const TBOX* target_word_box,
00160                                    const char* word_config,
00161                                    PAGE_RES* page_res,
00162                                    GenericVector<WordData>* words) {
00163   // Prepare all the words.
00164   PAGE_RES_IT page_res_it(page_res);
00165   for (page_res_it.restart_page(); page_res_it.word() != NULL;
00166        page_res_it.forward()) {
00167     if (pass_n == 1)
00168       page_res_it.word()->SetupFake(unicharset);
00169     if (target_word_box == NULL ||
00170         ProcessTargetWord(page_res_it.word()->word->bounding_box(),
00171                           *target_word_box, word_config, 1)) {
00172       words->push_back(WordData(page_res_it));
00173     }
00174   }
00175   // Setup all the words for recognition with polygonal approximation.
00176   for (int w = 0; w < words->size(); ++w) {
00177     SetupWordPassN(pass_n, &(*words)[w]);
00178     if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
00179   }
00180 }
00181 
00182 // Sets up the single word ready for whichever engine is to be run.
00183 void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
00184   if (pass_n == 1 || !word->word->done || tessedit_training_tess) {
00185     if (pass_n == 2) {
00186       // TODO(rays) Should we do this on pass1 too?
00187       word->word->caps_height = 0.0;
00188       if (word->word->x_height == 0.0f)
00189         word->word->x_height = word->row->x_height();
00190     }
00191     // Cube doesn't get setup for pass2.
00192     if (pass_n != 2 || tessedit_ocr_engine_mode != OEM_CUBE_ONLY) {
00193       word->word->SetupForRecognition(
00194             unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL,
00195             classify_bln_numeric_mode, textord_use_cjk_fp_model,
00196             poly_allow_detailed_fx, word->row, word->block);
00197     }
00198   }
00199   if (!sub_langs_.empty()) {
00200     if (word->lang_words.size() != sub_langs_.size()) {
00201       // Setup the words for all the sub-languages now.
00202       WERD_RES empty;
00203       word->lang_words.init_to_size(sub_langs_.size(), empty);
00204     }
00205     for (int s = 0; s < sub_langs_.size(); ++s) {
00206       Tesseract* lang_t = sub_langs_[s];
00207       if (pass_n == 1 || (lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY &&
00208           (!word->lang_words[s].done || lang_t->tessedit_training_tess))) {
00209         word->lang_words[s].InitForRetryRecognition(*word->word);
00210         word->lang_words[s].SetupForRecognition(
00211               lang_t->unicharset, lang_t, BestPix(),
00212               lang_t->tessedit_ocr_engine_mode, NULL,
00213               lang_t->classify_bln_numeric_mode,
00214               lang_t->textord_use_cjk_fp_model,
00215               lang_t->poly_allow_detailed_fx, word->row, word->block);
00216       }
00217     }
00218   }
00219 }
00220 
00221 
00222 // Runs word recognition on all the words.
00223 bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
00224                                    GenericVector<WordData>* words) {
00225   // TODO(rays) Before this loop can be parallelized (it would yield a massive
00226   // speed-up) all remaining member globals need to be converted to local/heap
00227   // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
00228   // added. The results will be significantly different with adaption on, and
00229   // deterioration will need investigation.
00230   for (int w = 0; w < words->size(); ++w) {
00231     WordData* word = &(*words)[w];
00232     if (monitor != NULL) {
00233       monitor->ocr_alive = TRUE;
00234       if (pass_n == 1)
00235         monitor->progress = 30 + 50 * w / words->size();
00236       else
00237         monitor->progress = 80 + 10 * w / words->size();
00238       if (monitor->deadline_exceeded() ||
00239           (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
00240                                                          words->size()))) {
00241         // Timeout. Fake out the rest of the words.
00242         for (; w < words->size(); ++w) {
00243           (*words)[w].word->SetupFake(unicharset);
00244         }
00245         return false;
00246       }
00247     }
00248     if (word->word->tess_failed) continue;
00249     WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
00250                                             : &Tesseract::classify_word_pass2;
00251     classify_word_and_language(recognizer, word);
00252     if (tessedit_dump_choices) {
00253       word_dumper(NULL, word->row, word->word);
00254       tprintf("Pass%d: %s [%s]\n", pass_n,
00255               word->word->best_choice->unichar_string().string(),
00256               word->word->best_choice->debug_string().string());
00257     }
00258   }
00259   return true;
00260 }
00261 
00283 bool Tesseract::recog_all_words(PAGE_RES* page_res,
00284                                 ETEXT_DESC* monitor,
00285                                 const TBOX* target_word_box,
00286                                 const char* word_config,
00287                                 int dopasses) {
00288   PAGE_RES_IT page_res_it(page_res);
00289 
00290   if (tessedit_minimal_rej_pass1) {
00291     tessedit_test_adaption.set_value (TRUE);
00292     tessedit_minimal_rejection.set_value (TRUE);
00293   }
00294 
00295   if (dopasses==0 || dopasses==1) {
00296     page_res_it.restart_page();
00297     // ****************** Pass 1 *******************
00298 
00299     // Clear adaptive classifier at the beginning of the page if it is full.
00300     // This is done only at the beginning of the page to ensure that the
00301     // classifier is not reset at an arbitrary point while processing the page,
00302     // which would cripple Passes 2+ if the reset happens towards the end of
00303     // Pass 1 on a page with very difficult text.
00304     // TODO(daria): preemptively clear the classifier if it is almost full.
00305     if (AdaptiveClassifierIsFull()) ResetAdaptiveClassifierInternal();
00306     // Now check the sub-langs as well.
00307     for (int i = 0; i < sub_langs_.size(); ++i) {
00308       if (sub_langs_[i]->AdaptiveClassifierIsFull())
00309         sub_langs_[i]->ResetAdaptiveClassifierInternal();
00310     }
00311     // Set up all words ready for recognition, so that if parallelism is on
00312     // all the input and output classes are ready to run the classifier.
00313     GenericVector<WordData> words;
00314     SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
00315     if (tessedit_parallelize) {
00316       PrerecAllWordsPar(words);
00317     }
00318 
00319     stats_.word_count = words.size();
00320 
00321     stats_.dict_words = 0;
00322     stats_.doc_blob_quality = 0;
00323     stats_.doc_outline_errs = 0;
00324     stats_.doc_char_quality = 0;
00325     stats_.good_char_count = 0;
00326     stats_.doc_good_char_quality = 0;
00327 
00328     most_recently_used_ = this;
00329     // Run pass 1 word recognition.
00330     if (!RecogAllWordsPassN(1, monitor, &words)) return false;
00331     // Pass 1 post-processing.
00332     while (page_res_it.word() != NULL) {
00333       if (page_res_it.word()->word->flag(W_REP_CHAR)) {
00334         fix_rep_char(&page_res_it);
00335         page_res_it.forward();
00336         continue;
00337       }
00338 
00339       // Count dict words.
00340       if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
00341         ++(stats_.dict_words);
00342 
00343       // Update misadaption log (we only need to do it on pass 1, since
00344       // adaption only happens on this pass).
00345       if (page_res_it.word()->blamer_bundle != NULL &&
00346           page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
00347         page_res->misadaption_log.push_back(
00348             page_res_it.word()->blamer_bundle->misadaption_debug());
00349       }
00350 
00351       page_res_it.forward();
00352     }
00353   }
00354 
00355   if (dopasses == 1) return true;
00356 
00357   // ****************** Pass 2 *******************
00358   if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption) {
00359     page_res_it.restart_page();
00360     GenericVector<WordData> words;
00361     SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
00362     if (tessedit_parallelize) {
00363       PrerecAllWordsPar(words);
00364     }
00365     most_recently_used_ = this;
00366     // Run pass 2 word recognition.
00367     if (!RecogAllWordsPassN(2, monitor, &words)) return false;
00368     // Pass 2 post-processing.
00369     while (page_res_it.word() != NULL) {
00370       WERD_RES* word = page_res_it.word();
00371        if (word->word->flag(W_REP_CHAR) && !word->done) {
00372         fix_rep_char(&page_res_it);
00373         page_res_it.forward();
00374         continue;
00375       }
00376       page_res_it.forward();
00377     }
00378   }
00379 
00380   // The next passes can only be run if tesseract has been used, as cube
00381   // doesn't set all the necessary outputs in WERD_RES.
00382   if (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY ||
00383       tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
00384     // ****************** Pass 3 *******************
00385     // Fix fuzzy spaces.
00386     set_global_loc_code(LOC_FUZZY_SPACE);
00387 
00388     if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
00389         && !tessedit_word_for_word && !right_to_left())
00390       fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
00391 
00392     // ****************** Pass 4 *******************
00393     if (tessedit_enable_bigram_correction) bigram_correction_pass(page_res);
00394 
00395     // ****************** Pass 5,6 *******************
00396     rejection_passes(page_res, monitor, target_word_box, word_config);
00397 
00398     // ****************** Pass 7 *******************
00399     // Cube combiner.
00400     // If cube is loaded and its combiner is present, run it.
00401     if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
00402       run_cube_combiner(page_res);
00403     }
00404 
00405     // ****************** Pass 8 *******************
00406     font_recognition_pass(page_res);
00407 
00408     // ****************** Pass 9 *******************
00409     // Check the correctness of the final results.
00410     blamer_pass(page_res);
00411   }
00412   script_pos_pass(page_res);
00413 
00414   // Write results pass.
00415   set_global_loc_code(LOC_WRITE_RESULTS);
00416   // This is now redundant, but retained commented so show how to obtain
00417   // bounding boxes and style information.
00418 
00419   // changed by jetsoft
00420   // needed for dll to output memory structure
00421   if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
00422     output_pass(page_res_it, target_word_box);
00423   // end jetsoft
00424   PageSegMode pageseg_mode = static_cast<PageSegMode>(
00425       static_cast<int>(tessedit_pageseg_mode));
00426   textord_.CleanupSingleRowResult(pageseg_mode, page_res);
00427 
00428   if (monitor != NULL) {
00429     monitor->progress = 100;
00430   }
00431   return true;
00432 }
00433 
00434 void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
00435   PAGE_RES_IT word_it(page_res);
00436 
00437   WERD_RES *w_prev = NULL;
00438   WERD_RES *w = word_it.word();
00439   while (1) {
00440     w_prev = w;
00441     while (word_it.forward() != NULL &&
00442            (!word_it.word() || word_it.word()->part_of_combo)) {
00443       // advance word_it, skipping over parts of combos
00444     }
00445     if (!word_it.word()) break;
00446     w = word_it.word();
00447     if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
00448       continue;
00449     }
00450     if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
00451       if (tessedit_bigram_debug) {
00452         tprintf("Skipping because one of the words is W_REP_CHAR\n");
00453       }
00454       continue;
00455     }
00456     // Two words sharing the same language model, excellent!
00457     GenericVector<WERD_CHOICE *> overrides_word1;
00458     GenericVector<WERD_CHOICE *> overrides_word2;
00459 
00460     STRING orig_w1_str = w_prev->best_choice->unichar_string();
00461     STRING orig_w2_str = w->best_choice->unichar_string();
00462     WERD_CHOICE prev_best(w->uch_set);
00463     {
00464       int w1start, w1end;
00465       w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
00466       prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
00467     }
00468     WERD_CHOICE this_best(w->uch_set);
00469     {
00470       int w2start, w2end;
00471       w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
00472       this_best = w->best_choice->shallow_copy(w2start, w2end);
00473     }
00474 
00475     if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
00476       if (tessedit_bigram_debug) {
00477         tprintf("Top choice \"%s %s\" verified by bigram model.\n",
00478                 orig_w1_str.string(), orig_w2_str.string());
00479       }
00480       continue;
00481     }
00482     if (tessedit_bigram_debug > 2) {
00483       tprintf("Examining alt choices for \"%s %s\".\n",
00484               orig_w1_str.string(), orig_w2_str.string());
00485     }
00486     if (tessedit_bigram_debug > 1) {
00487       if (!w_prev->best_choices.singleton()) {
00488         w_prev->PrintBestChoices();
00489       }
00490       if (!w->best_choices.singleton()) {
00491         w->PrintBestChoices();
00492       }
00493     }
00494     float best_rating = 0.0;
00495     int best_idx = 0;
00496     WERD_CHOICE_IT prev_it(&w_prev->best_choices);
00497     for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
00498       WERD_CHOICE *p1 = prev_it.data();
00499       WERD_CHOICE strip1(w->uch_set);
00500       {
00501         int p1start, p1end;
00502         p1->GetNonSuperscriptSpan(&p1start, &p1end);
00503         strip1 = p1->shallow_copy(p1start, p1end);
00504       }
00505       WERD_CHOICE_IT w_it(&w->best_choices);
00506       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
00507         WERD_CHOICE *p2 = w_it.data();
00508         WERD_CHOICE strip2(w->uch_set);
00509         {
00510           int p2start, p2end;
00511           p2->GetNonSuperscriptSpan(&p2start, &p2end);
00512           strip2 = p2->shallow_copy(p2start, p2end);
00513         }
00514         if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
00515           overrides_word1.push_back(p1);
00516           overrides_word2.push_back(p2);
00517           if (overrides_word1.size() == 1 ||
00518               p1->rating() + p2->rating() < best_rating) {
00519             best_rating = p1->rating() + p2->rating();
00520             best_idx = overrides_word1.size() - 1;
00521           }
00522         }
00523       }
00524     }
00525     if (overrides_word1.size() >= 1) {
00526       // Excellent, we have some bigram matches.
00527       if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice,
00528                                             *overrides_word1[best_idx]) &&
00529           EqualIgnoringCaseAndTerminalPunct(*w->best_choice,
00530                                             *overrides_word2[best_idx])) {
00531         if (tessedit_bigram_debug > 1) {
00532           tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
00533                   "model.\n", orig_w1_str.string(), orig_w2_str.string());
00534         }
00535         continue;
00536       }
00537       STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
00538       STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
00539       if (new_w1_str != orig_w1_str) {
00540         w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
00541       }
00542       if (new_w2_str != orig_w2_str) {
00543         w->ReplaceBestChoice(overrides_word2[best_idx]);
00544       }
00545       if (tessedit_bigram_debug > 0) {
00546         STRING choices_description;
00547         int num_bigram_choices
00548             = overrides_word1.size() * overrides_word2.size();
00549         if (num_bigram_choices == 1) {
00550           choices_description = "This was the unique bigram choice.";
00551         } else {
00552           if (tessedit_bigram_debug > 1) {
00553             STRING bigrams_list;
00554             const int kMaxChoicesToPrint = 20;
00555             for (int i = 0; i < overrides_word1.size() &&
00556                  i < kMaxChoicesToPrint; i++) {
00557               if (i > 0) { bigrams_list += ", "; }
00558               WERD_CHOICE *p1 = overrides_word1[i];
00559               WERD_CHOICE *p2 = overrides_word2[i];
00560               bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
00561               if (i == kMaxChoicesToPrint) {
00562                 bigrams_list += " ...";
00563               }
00564             }
00565             choices_description = "There were many choices: {";
00566             choices_description += bigrams_list;
00567             choices_description += "}";
00568           } else {
00569             choices_description.add_str_int("There were ", num_bigram_choices);
00570             choices_description += " compatible bigrams.";
00571           }
00572         }
00573         tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
00574                 orig_w1_str.string(), orig_w2_str.string(),
00575                 new_w1_str.string(), new_w2_str.string(),
00576                 choices_description.string());
00577       }
00578     }
00579   }
00580 }
00581 
00582 void Tesseract::rejection_passes(PAGE_RES* page_res,
00583                                  ETEXT_DESC* monitor,
00584                                  const TBOX* target_word_box,
00585                                  const char* word_config) {
00586   PAGE_RES_IT page_res_it(page_res);
00587   // ****************** Pass 5 *******************
00588   // Gather statistics on rejects.
00589   int word_index = 0;
00590   while (!tessedit_test_adaption && page_res_it.word() != NULL) {
00591     set_global_loc_code(LOC_MM_ADAPT);
00592     WERD_RES* word = page_res_it.word();
00593     word_index++;
00594     if (monitor != NULL) {
00595       monitor->ocr_alive = TRUE;
00596       monitor->progress = 95 + 5 * word_index / stats_.word_count;
00597     }
00598     if (word->rebuild_word == NULL) {
00599       // Word was not processed by tesseract.
00600       page_res_it.forward();
00601       continue;
00602     }
00603     check_debug_pt(word, 70);
00604 
00605     // changed by jetsoft
00606     // specific to its needs to extract one word when need
00607     if (target_word_box &&
00608         !ProcessTargetWord(word->word->bounding_box(),
00609                            *target_word_box, word_config, 4)) {
00610       page_res_it.forward();
00611       continue;
00612     }
00613     // end jetsoft
00614 
00615     page_res_it.rej_stat_word();
00616     int chars_in_word = word->reject_map.length();
00617     int rejects_in_word = word->reject_map.reject_count();
00618 
00619     int blob_quality = word_blob_quality(word, page_res_it.row()->row);
00620     stats_.doc_blob_quality += blob_quality;
00621     int outline_errs = word_outline_errs(word);
00622     stats_.doc_outline_errs += outline_errs;
00623     inT16 all_char_quality;
00624     inT16 accepted_all_char_quality;
00625     word_char_quality(word, page_res_it.row()->row,
00626                       &all_char_quality, &accepted_all_char_quality);
00627     stats_.doc_char_quality += all_char_quality;
00628     uinT8 permuter_type = word->best_choice->permuter();
00629     if ((permuter_type == SYSTEM_DAWG_PERM) ||
00630         (permuter_type == FREQ_DAWG_PERM) ||
00631         (permuter_type == USER_DAWG_PERM)) {
00632       stats_.good_char_count += chars_in_word - rejects_in_word;
00633       stats_.doc_good_char_quality += accepted_all_char_quality;
00634     }
00635     check_debug_pt(word, 80);
00636     if (tessedit_reject_bad_qual_wds &&
00637         (blob_quality == 0) && (outline_errs >= chars_in_word))
00638       word->reject_map.rej_word_bad_quality();
00639     check_debug_pt(word, 90);
00640     page_res_it.forward();
00641   }
00642 
00643   if (tessedit_debug_quality_metrics) {
00644     tprintf
00645       ("QUALITY: num_chs= %d  num_rejs= %d %5.3f blob_qual= %d %5.3f"
00646        " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
00647       page_res->char_count, page_res->rej_count,
00648       page_res->rej_count / static_cast<float>(page_res->char_count),
00649       stats_.doc_blob_quality,
00650       stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
00651       stats_.doc_outline_errs,
00652       stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
00653       stats_.doc_char_quality,
00654       stats_.doc_char_quality / static_cast<float>(page_res->char_count),
00655       stats_.doc_good_char_quality,
00656       (stats_.good_char_count > 0) ?
00657       (stats_.doc_good_char_quality /
00658        static_cast<float>(stats_.good_char_count)) : 0.0);
00659   }
00660   BOOL8 good_quality_doc =
00661     ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
00662      quality_rej_pc) &&
00663     (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
00664      quality_blob_pc) &&
00665     (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
00666      quality_outline_pc) &&
00667     (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
00668      quality_char_pc);
00669 
00670   // ****************** Pass 6 *******************
00671   // Do whole document or whole block rejection pass
00672   if (!tessedit_test_adaption) {
00673     set_global_loc_code(LOC_DOC_BLK_REJ);
00674     quality_based_rejection(page_res_it, good_quality_doc);
00675   }
00676 }
00677 
00678 void Tesseract::blamer_pass(PAGE_RES* page_res) {
00679   if (!wordrec_run_blamer) return;
00680   PAGE_RES_IT page_res_it(page_res);
00681   for (page_res_it.restart_page(); page_res_it.word() != NULL;
00682       page_res_it.forward()) {
00683     WERD_RES *word = page_res_it.word();
00684     BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word);
00685     page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++;
00686   }
00687   tprintf("Blame reasons:\n");
00688   for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
00689     tprintf("%s %d\n", BlamerBundle::IncorrectReasonName(
00690         static_cast<IncorrectResultReason>(bl)),
00691         page_res->blame_reasons[bl]);
00692   }
00693   if (page_res->misadaption_log.length() > 0) {
00694     tprintf("Misadaption log:\n");
00695     for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
00696       tprintf("%s\n", page_res->misadaption_log[i].string());
00697     }
00698   }
00699 }
00700 
00701 // Sets script positions and detects smallcaps on all output words.
00702 void Tesseract::script_pos_pass(PAGE_RES* page_res) {
00703   PAGE_RES_IT page_res_it(page_res);
00704   for (page_res_it.restart_page(); page_res_it.word() != NULL;
00705       page_res_it.forward()) {
00706     WERD_RES* word = page_res_it.word();
00707      if (word->word->flag(W_REP_CHAR)) {
00708       page_res_it.forward();
00709       continue;
00710     }
00711     float x_height = page_res_it.block()->block->x_height();
00712     float word_x_height = word->x_height;
00713     if (word_x_height < word->best_choice->min_x_height() ||
00714         word_x_height > word->best_choice->max_x_height()) {
00715       word_x_height = (word->best_choice->min_x_height() +
00716           word->best_choice->max_x_height()) / 2.0f;
00717     }
00718     // Test for small caps. Word capheight must be close to block xheight,
00719     // and word must contain no lower case letters, and at least one upper case.
00720     double small_cap_xheight = x_height * kXHeightCapRatio;
00721     double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
00722     if (word->uch_set->script_has_xheight() &&
00723         small_cap_xheight - small_cap_delta <= word_x_height &&
00724         word_x_height <= small_cap_xheight + small_cap_delta) {
00725       // Scan for upper/lower.
00726       int num_upper = 0;
00727       int num_lower = 0;
00728       for (int i = 0; i < word->best_choice->length(); ++i) {
00729         if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
00730           ++num_upper;
00731         else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
00732           ++num_lower;
00733       }
00734       if (num_upper > 0 && num_lower == 0)
00735         word->small_caps = true;
00736     }
00737     word->SetScriptPositions();
00738   }
00739 }
00740 
00741 // Helper returns true if the new_word is better than the word, using a
00742 // simple test of better certainty AND rating (to reduce false positives
00743 // from cube) or a dictionary vs non-dictionary word.
00744 static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word,
00745                           double rating_ratio,
00746                           double certainty_margin) {
00747   if (new_word.best_choice == NULL) {
00748     return false;  // New one no good.
00749   }
00750   if (word.best_choice == NULL) {
00751     return true;  // Old one no good.
00752   }
00753   if (new_word.best_choice->certainty() > word.best_choice->certainty() &&
00754       new_word.best_choice->rating() < word.best_choice->rating()) {
00755     return true;  // New word has better confidence.
00756   }
00757   if (!Dict::valid_word_permuter(word.best_choice->permuter(), false) &&
00758       Dict::valid_word_permuter(new_word.best_choice->permuter(), false) &&
00759       new_word.best_choice->rating() <
00760           word.best_choice->rating() * rating_ratio &&
00761       new_word.best_choice->certainty() >
00762           word.best_choice->certainty() - certainty_margin) {
00763     return true;  // New word is from a dictionary.
00764   }
00765   return false;  // New word is no better.
00766 }
00767 
00768 // Helper to recognize the word using the given (language-specific) tesseract.
00769 // Returns true if the result was better than previously.
00770 bool Tesseract::RetryWithLanguage(const WERD_RES& best_word,
00771                                   WordData* word_data, WERD_RES* word,
00772                                   WordRecognizer recognizer) {
00773   if (classify_debug_level || cube_debug_level) {
00774     tprintf("Retrying word using lang %s, oem %d\n",
00775             lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
00776   }
00777   // Run the recognizer on the word.
00778   // Initial version is a bit of a hack based on better certainty and rating
00779   // (to reduce false positives from cube) or a dictionary vs non-dictionary
00780   // word.
00781   (this->*recognizer)(word_data, word);
00782   bool new_is_better = NewWordBetter(best_word, *word,
00783                                      classify_max_rating_ratio,
00784                                      classify_max_certainty_margin);
00785   if (classify_debug_level || cube_debug_level) {
00786     if (word->best_choice == NULL) {
00787       tprintf("NULL result %s better!\n",
00788               new_is_better ? "IS" : "NOT");
00789     } else {
00790       tprintf("New result %s better:%s, r=%g, c=%g\n",
00791               new_is_better ? "IS" : "NOT",
00792               word->best_choice->unichar_string().string(),
00793               word->best_choice->rating(),
00794               word->best_choice->certainty());
00795     }
00796   }
00797   return new_is_better;
00798 }
00799 
00800 // Generic function for classifying a word. Can be used either for pass1 or
00801 // pass2 according to the function passed to recognizer.
00802 // word block and row are the current location in the document's PAGE_RES.
00803 // Recognizes in the current language, and if successful that is all.
00804 // If recognition was not successful, tries all available languages until
00805 // it gets a successful result or runs out of languages. Keeps the best result.
00806 void Tesseract::classify_word_and_language(WordRecognizer recognizer,
00807                                            WordData* word_data) {
00808   // Points to the best result. May be word or in lang_words.
00809   WERD_RES* word = word_data->word;
00810   clock_t start_t = clock();
00811   if (classify_debug_level || cube_debug_level) {
00812     tprintf("Processing word with lang %s at:",
00813             most_recently_used_->lang.string());
00814     word->word->bounding_box().print();
00815   }
00816   const char* result_type = "Initial";
00817   bool initially_done = !word->tess_failed && word->done;
00818   if (initially_done) {
00819     // If done on pass1, leave it as-is.
00820     most_recently_used_ = word->tesseract;
00821     result_type = "Already done";
00822   } else {
00823     if (most_recently_used_ != this) {
00824       // Point to the word for most_recently_used_.
00825       for (int s = 0; s < sub_langs_.size(); ++s) {
00826         if (most_recently_used_ == sub_langs_[s]) {
00827           word = &word_data->lang_words[s];
00828           break;
00829         }
00830       }
00831     }
00832     (most_recently_used_->*recognizer)(word_data, word);
00833     if (!word->tess_failed && word->tess_accepted)
00834       result_type = "Accepted";
00835   }
00836   if (classify_debug_level || cube_debug_level) {
00837     tprintf("%s result: %s r=%.4g, c=%.4g, accepted=%d, adaptable=%d"
00838             " xht=[%g,%g]\n",
00839             result_type,
00840             word->best_choice->unichar_string().string(),
00841             word->best_choice->rating(),
00842             word->best_choice->certainty(),
00843             word->tess_accepted, word->tess_would_adapt,
00844             word->best_choice->min_x_height(),
00845             word->best_choice->max_x_height());
00846   }
00847   if (word->tess_failed || !word->tess_accepted) {
00848     // Try all the other languages to see if they are any better.
00849     Tesseract* previous_used = most_recently_used_;
00850     if (most_recently_used_ != this) {
00851       if (classify_debug_level) {
00852         tprintf("Retrying with main-Tesseract, lang: %s\n", lang.string());
00853       }
00854       if (word_data->word->tesseract == this) {
00855         // This is pass1, and we are trying the main language.
00856         if (RetryWithLanguage(*word, word_data, word_data->word, recognizer)) {
00857           most_recently_used_ = this;
00858           word = word_data->word;
00859         }
00860       } else {
00861         // This is pass2, and we are trying the main language again, but it
00862         // has no word allocated to it, so we must re-initialize it.
00863         WERD_RES main_word(*word_data->word);
00864         main_word.InitForRetryRecognition(*word_data->word);
00865         main_word.SetupForRecognition(unicharset, this, BestPix(),
00866                                       tessedit_ocr_engine_mode, NULL,
00867                                       classify_bln_numeric_mode,
00868                                       textord_use_cjk_fp_model,
00869                                       poly_allow_detailed_fx,
00870                                       word_data->row, word_data->block);
00871         if (RetryWithLanguage(*word, word_data, &main_word, recognizer)) {
00872           most_recently_used_ = this;
00873           word_data->word->ConsumeWordResults(&main_word);
00874           word = word_data->word;
00875         }
00876       }
00877       if (!word->tess_failed && word->tess_accepted)
00878         return;  // No need to look at the others.
00879     }
00880 
00881     for (int i = 0; i < sub_langs_.size(); ++i) {
00882       if (sub_langs_[i] != previous_used) {
00883         if (classify_debug_level) {
00884           tprintf("Retrying with sub-Tesseract[%d] lang: %s\n",
00885                   i, sub_langs_[i]->lang.string());
00886         }
00887         if (sub_langs_[i]->RetryWithLanguage(*word, word_data,
00888                                              &word_data->lang_words[i],
00889                                              recognizer)) {
00890           most_recently_used_ = sub_langs_[i];
00891           word = &word_data->lang_words[i];
00892           if (!word->tess_failed && word->tess_accepted)
00893             break;  // No need to look at the others.
00894         }
00895       }
00896     }
00897   }
00898   if (word != word_data->word) {
00899     // Move the result for the best language to the main word.
00900     word_data->word->ConsumeWordResults(word);
00901   }
00902   clock_t ocr_t = clock();
00903   if (tessedit_timing_debug) {
00904     tprintf("%s (ocr took %.2f sec)\n",
00905             word->best_choice->unichar_string().string(),
00906             static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
00907   }
00908 }
00909 
00916 void Tesseract::classify_word_pass1(WordData* word_data, WERD_RES* word) {
00917   ROW* row = word_data->row;
00918   BLOCK* block = word_data->block;
00919   prev_word_best_choice_ = word_data->prev_word != NULL
00920       ? word_data->prev_word->word->best_choice : NULL;
00921   // If we only intend to run cube - run it and return.
00922   if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
00923     cube_word_pass1(block, row, word);
00924     return;
00925   }
00926   match_word_pass_n(1, word, row, block);
00927   if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
00928     word->tess_would_adapt = AdaptableWord(word);
00929     bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
00930 
00931     if (adapt_ok) {
00932       // Send word to adaptive classifier for training.
00933       word->BestChoiceToCorrectText();
00934       LearnWord(NULL, word);
00935       // Mark misadaptions if running blamer.
00936       if (word->blamer_bundle != NULL) {
00937         word->blamer_bundle->SetMisAdaptionDebug(word->best_choice,
00938                                                  wordrec_debug_blamer);
00939       }
00940     }
00941 
00942     if (tessedit_enable_doc_dict && !word->IsAmbiguous())
00943       tess_add_doc_word(word->best_choice);
00944   }
00945 }
00946 
00947 // Helper to report the result of the xheight fix.
00948 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht,
00949                                    WERD_RES* word, WERD_RES* new_word) {
00950   tprintf("New XHT Match:%s = %s ",
00951           word->best_choice->unichar_string().string(),
00952           word->best_choice->debug_string().string());
00953   word->reject_map.print(debug_fp);
00954   tprintf(" -> %s = %s ",
00955           new_word->best_choice->unichar_string().string(),
00956           new_word->best_choice->debug_string().string());
00957   new_word->reject_map.print(debug_fp);
00958   tprintf(" %s->%s %s %s\n",
00959           word->guessed_x_ht ? "GUESS" : "CERT",
00960           new_word->guessed_x_ht ? "GUESS" : "CERT",
00961           new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
00962           accept_new_word ? "ACCEPTED" : "");
00963 }
00964 
00965 // Run the x-height fix-up, based on min/max top/bottom information in
00966 // unicharset.
00967 // Returns true if the word was changed.
00968 // See the comment in fixxht.cpp for a description of the overall process.
00969 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
00970   bool accept_new_x_ht = false;
00971   int original_misfits = CountMisfitTops(word);
00972   if (original_misfits == 0)
00973     return false;
00974   float new_x_ht = ComputeCompatibleXheight(word);
00975   if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
00976     WERD_RES new_x_ht_word(word->word);
00977     if (word->blamer_bundle != NULL) {
00978       new_x_ht_word.blamer_bundle = new BlamerBundle();
00979       new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
00980     }
00981     new_x_ht_word.x_height = new_x_ht;
00982     new_x_ht_word.caps_height = 0.0;
00983     new_x_ht_word.SetupForRecognition(
00984           unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL,
00985           classify_bln_numeric_mode, textord_use_cjk_fp_model,
00986           poly_allow_detailed_fx, row, block);
00987     match_word_pass_n(2, &new_x_ht_word, row, block);
00988     if (!new_x_ht_word.tess_failed) {
00989       int new_misfits = CountMisfitTops(&new_x_ht_word);
00990       if (debug_x_ht_level >= 1) {
00991         tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
00992                 original_misfits, word->x_height,
00993                 new_misfits, new_x_ht);
00994         tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
00995                 word->best_choice->rating(), word->best_choice->certainty(),
00996                 new_x_ht_word.best_choice->rating(),
00997                 new_x_ht_word.best_choice->certainty());
00998       }
00999       // The misfits must improve and either the rating or certainty.
01000       accept_new_x_ht = new_misfits < original_misfits &&
01001                         (new_x_ht_word.best_choice->certainty() >
01002                             word->best_choice->certainty() ||
01003                          new_x_ht_word.best_choice->rating() <
01004                             word->best_choice->rating());
01005       if (debug_x_ht_level >= 1) {
01006         ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
01007       }
01008     }
01009     if (accept_new_x_ht) {
01010       word->ConsumeWordResults(&new_x_ht_word);
01011       return true;
01012     }
01013   }
01014   return false;
01015 }
01016 
01023 void Tesseract::classify_word_pass2(WordData* word_data, WERD_RES* word) {
01024   // Return if we do not want to run Tesseract.
01025   if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY &&
01026       tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED)
01027     return;
01028   ROW* row = word_data->row;
01029   BLOCK* block = word_data->block;
01030   prev_word_best_choice_ = word_data->prev_word != NULL
01031       ? word_data->prev_word->word->best_choice : NULL;
01032 
01033   set_global_subloc_code(SUBLOC_NORM);
01034   check_debug_pt(word, 30);
01035   if (!word->done || tessedit_training_tess) {
01036     word->caps_height = 0.0;
01037     if (word->x_height == 0.0f)
01038       word->x_height = row->x_height();
01039     match_word_pass_n(2, word, row, block);
01040     check_debug_pt(word, 40);
01041   }
01042 
01043   SubAndSuperscriptFix(word);
01044 
01045   if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
01046     if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
01047         block->classify_rotation().y() == 0.0f) {
01048       // Use the tops and bottoms since they are available.
01049       TrainedXheightFix(word, block, row);
01050     }
01051 
01052     set_global_subloc_code(SUBLOC_NORM);
01053   }
01054 #ifndef GRAPHICS_DISABLED
01055   if (tessedit_display_outwords) {
01056     if (fx_win == NULL)
01057       create_fx_win();
01058     clear_fx_win();
01059     word->rebuild_word->plot(fx_win);
01060     TBOX wbox = word->rebuild_word->bounding_box();
01061     fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
01062                             wbox.right(), wbox.bottom());
01063     ScrollView::Update();
01064   }
01065 #endif
01066   set_global_subloc_code(SUBLOC_NORM);
01067   check_debug_pt(word, 50);
01068 }
01069 
01070 
01077 void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word,
01078                                   ROW *row, BLOCK* block) {
01079   if (word->tess_failed) return;
01080   tess_segment_pass_n(pass_n, word);
01081 
01082   if (!word->tess_failed) {
01083     if (!word->word->flag (W_REP_CHAR)) {
01084        word->fix_quotes();
01085       if (tessedit_fix_hyphens)
01086         word->fix_hyphens();
01087       /* Dont trust fix_quotes! - though I think I've fixed the bug */
01088       if (word->best_choice->length() != word->box_word->length()) {
01089         tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
01090                 " #Blobs=%d\n",
01091                 word->best_choice->debug_string().string(),
01092                 word->best_choice->length(),
01093                 word->box_word->length());
01094 
01095       }
01096       word->tess_accepted = tess_acceptable_word(word);
01097 
01098       // Also sets word->done flag
01099       make_reject_map(word, row, pass_n);
01100     }
01101   }
01102   set_word_fonts(word);
01103 
01104   ASSERT_HOST(word->raw_choice != NULL);
01105 }
01106 
01107 // Helper to return the best rated BLOB_CHOICE in the whole word that matches
01108 // the given char_id, or NULL if none can be found.
01109 static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
01110                                            WERD_RES* word_res) {
01111   // Find the corresponding best BLOB_CHOICE from any position in the word_res.
01112   BLOB_CHOICE* best_choice = NULL;
01113   for (int i = 0; i < word_res->best_choice->length(); ++i) {
01114     BLOB_CHOICE* choice = FindMatchingChoice(char_id,
01115                                              word_res->GetBlobChoices(i));
01116     if (choice != NULL) {
01117       if (best_choice == NULL || choice->rating() < best_choice->rating())
01118         best_choice = choice;
01119     }
01120   }
01121   return best_choice;
01122 }
01123 
01124 // Helper to insert blob_choice in each location in the leader word if there is
01125 // no matching BLOB_CHOICE there already, and correct any incorrect results
01126 // in the best_choice.
01127 static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
01128                                   WERD_RES* word_res) {
01129   WERD_CHOICE* word = word_res->best_choice;
01130   for (int i = 0; i < word_res->best_choice->length(); ++i) {
01131     BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(),
01132                                              word_res->GetBlobChoices(i));
01133     if (choice == NULL) {
01134       BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
01135       choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
01136     }
01137   }
01138   // Correct any incorrect results in word.
01139   for (int i = 0; i < word->length(); ++i) {
01140     if (word->unichar_id(i) != blob_choice->unichar_id())
01141       word->set_unichar_id(blob_choice->unichar_id(), i);
01142   }
01143 }
01144 
01152 void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
01153   WERD_RES *word_res = page_res_it->word();
01154   const WERD_CHOICE &word = *(word_res->best_choice);
01155 
01156   // Find the frequency of each unique character in the word.
01157   UNICHAR_ID space = word_res->uch_set->unichar_to_id(" ");
01158   SortHelper<UNICHAR_ID> rep_ch(word.length());
01159   for (int i = 0; i < word.length(); ++i) {
01160     if (word.unichar_id(i) != space)
01161       rep_ch.Add(word.unichar_id(i), 1);
01162   }
01163 
01164   // Find the most frequent result.
01165   UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
01166   int max_count = rep_ch.MaxCount(&maxch_id);
01167   // Find the best exemplar of a classifier result for maxch_id.
01168   BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
01169   if (best_choice == NULL) {
01170     tprintf("Failed to find a choice for %s, occurring %d times\n",
01171             word_res->uch_set->debug_str(maxch_id).string(), max_count);
01172     return;
01173   }
01174   word_res->done = TRUE;
01175 
01176   // Measure the mean space.
01177   int total_gap = 0;
01178   int gap_count = 0;
01179   WERD* werd = word_res->word;
01180   C_BLOB_IT blob_it(werd->cblob_list());
01181   C_BLOB* prev_blob = blob_it.data();
01182   for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
01183     C_BLOB* blob = blob_it.data();
01184     int gap = blob->bounding_box().left();
01185     gap -= prev_blob->bounding_box().right();
01186     total_gap += gap;
01187     ++gap_count;
01188     prev_blob = blob;
01189   }
01190   if (total_gap > word_res->x_height * gap_count * kRepcharGapThreshold) {
01191     // Needs spaces between.
01192     ExplodeRepeatedWord(best_choice, page_res_it);
01193   } else {
01194     // Just correct existing classification.
01195     CorrectRepcharChoices(best_choice, word_res);
01196     word_res->reject_map.initialise(word.length());
01197   }
01198 }
01199 
01200 // Explode the word at the given iterator location into individual words
01201 // of a single given unichar_id defined by best_choice.
01202 // The original word is deleted, and the replacements copy most of their
01203 // fields from the original.
01204 void Tesseract::ExplodeRepeatedWord(BLOB_CHOICE* best_choice,
01205                                     PAGE_RES_IT* page_res_it) {
01206   WERD_RES *word_res = page_res_it->word();
01207   ASSERT_HOST(best_choice != NULL);
01208 
01209   // Make a new word for each blob in the original.
01210   WERD* werd = word_res->word;
01211   C_BLOB_IT blob_it(werd->cblob_list());
01212   for (; !blob_it.empty(); blob_it.forward()) {
01213     bool first_blob = blob_it.at_first();
01214     bool last_blob = blob_it.at_last();
01215     WERD* blob_word = werd->ConstructFromSingleBlob(first_blob, last_blob,
01216                                                     blob_it.extract());
01217     // Note that blamer_bundle (truth information) is not copied, which is
01218     // desirable, since the newly inserted words would not have the original
01219     // bounding box corresponding to the one recorded in truth fields.
01220     WERD_RES* rep_word =
01221         page_res_it->InsertSimpleCloneWord(*word_res, blob_word);
01222     // Setup the single char WERD_RES
01223     if (rep_word->SetupForRecognition(*word_res->uch_set, this, BestPix(),
01224                                       tessedit_ocr_engine_mode, NULL, false,
01225                                       textord_use_cjk_fp_model,
01226                                       poly_allow_detailed_fx,
01227                                       page_res_it->row()->row,
01228                                       page_res_it->block()->block)) {
01229       rep_word->CloneChoppedToRebuild();
01230       BLOB_CHOICE* blob_choice = new BLOB_CHOICE(*best_choice);
01231       rep_word->FakeClassifyWord(1, &blob_choice);
01232     }
01233   }
01234   page_res_it->DeleteCurrentWord();
01235 }
01236 
01237 ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(
01238     const UNICHARSET& char_set, const char *s, const char *lengths) {
01239   int i = 0;
01240   int offset = 0;
01241   int leading_punct_count;
01242   int upper_count = 0;
01243   int hyphen_pos = -1;
01244   ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
01245 
01246   if (strlen (lengths) > 20)
01247     return word_type;
01248 
01249   /* Single Leading punctuation char*/
01250 
01251   if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
01252     offset += lengths[i++];
01253   leading_punct_count = i;
01254 
01255   /* Initial cap */
01256   while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
01257     offset += lengths[i++];
01258     upper_count++;
01259   }
01260   if (upper_count > 1) {
01261     word_type = AC_UPPER_CASE;
01262   } else {
01263     /* Lower case word, possibly with an initial cap */
01264     while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
01265       offset += lengths[i++];
01266     }
01267     if (i - leading_punct_count < quality_min_initial_alphas_reqd)
01268       goto not_a_word;
01269     /*
01270     Allow a single hyphen in a lower case word
01271     - dont trust upper case - I've seen several cases of "H" -> "I-I"
01272     */
01273     if (lengths[i] == 1 && s[offset] == '-') {
01274       hyphen_pos = i;
01275       offset += lengths[i++];
01276       if (s[offset] != '\0') {
01277         while ((s[offset] != '\0') &&
01278                char_set.get_islower(s + offset, lengths[i])) {
01279           offset += lengths[i++];
01280         }
01281         if (i < hyphen_pos + 3)
01282           goto not_a_word;
01283       }
01284     } else {
01285       /* Allow "'s" in NON hyphenated lower case words */
01286       if (lengths[i] == 1 && (s[offset] == '\'') &&
01287           lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
01288         offset += lengths[i++];
01289         offset += lengths[i++];
01290       }
01291     }
01292     if (upper_count > 0)
01293       word_type = AC_INITIAL_CAP;
01294     else
01295       word_type = AC_LOWER_CASE;
01296   }
01297 
01298   /* Up to two different, constrained trailing punctuation chars */
01299   if (lengths[i] == 1 && s[offset] != '\0' &&
01300       STRING(chs_trailing_punct1).contains(s[offset]))
01301     offset += lengths[i++];
01302   if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
01303       s[offset - lengths[i - 1]] != s[offset] &&
01304       STRING(chs_trailing_punct2).contains (s[offset]))
01305     offset += lengths[i++];
01306 
01307   if (s[offset] != '\0')
01308     word_type = AC_UNACCEPTABLE;
01309 
01310   not_a_word:
01311 
01312   if (word_type == AC_UNACCEPTABLE) {
01313     /* Look for abbreviation string */
01314     i = 0;
01315     offset = 0;
01316     if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
01317       word_type = AC_UC_ABBREV;
01318       while (s[offset] != '\0' &&
01319              char_set.get_isupper(s + offset, lengths[i]) &&
01320              lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
01321         offset += lengths[i++];
01322         offset += lengths[i++];
01323       }
01324     }
01325     else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
01326       word_type = AC_LC_ABBREV;
01327       while (s[offset] != '\0' &&
01328              char_set.get_islower(s + offset, lengths[i]) &&
01329              lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
01330         offset += lengths[i++];
01331         offset += lengths[i++];
01332       }
01333     }
01334     if (s[offset] != '\0')
01335       word_type = AC_UNACCEPTABLE;
01336   }
01337 
01338   return word_type;
01339 }
01340 
01341 BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) {
01342   BOOL8 show_map_detail = FALSE;
01343   inT16 i;
01344 
01345   #ifndef SECURE_NAMES
01346   if (!test_pt)
01347     return FALSE;
01348 
01349   tessedit_rejection_debug.set_value (FALSE);
01350   debug_x_ht_level.set_value (0);
01351 
01352   if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
01353     if (location < 0)
01354       return TRUE;               // For breakpoint use
01355     tessedit_rejection_debug.set_value (TRUE);
01356     debug_x_ht_level.set_value (20);
01357     tprintf ("\n\nTESTWD::");
01358     switch (location) {
01359       case 0:
01360         tprintf ("classify_word_pass1 start\n");
01361         word->word->print();
01362         break;
01363       case 10:
01364         tprintf ("make_reject_map: initial map");
01365         break;
01366       case 20:
01367         tprintf ("make_reject_map: after NN");
01368         break;
01369       case 30:
01370         tprintf ("classify_word_pass2 - START");
01371         break;
01372       case 40:
01373         tprintf ("classify_word_pass2 - Pre Xht");
01374         break;
01375       case 50:
01376         tprintf ("classify_word_pass2 - END");
01377         show_map_detail = TRUE;
01378         break;
01379       case 60:
01380         tprintf ("fixspace");
01381         break;
01382       case 70:
01383         tprintf ("MM pass START");
01384         break;
01385       case 80:
01386         tprintf ("MM pass END");
01387         break;
01388       case 90:
01389         tprintf ("After Poor quality rejection");
01390         break;
01391       case 100:
01392         tprintf ("unrej_good_quality_words - START");
01393         break;
01394       case 110:
01395         tprintf ("unrej_good_quality_words - END");
01396         break;
01397       case 120:
01398         tprintf ("Write results pass");
01399         show_map_detail = TRUE;
01400         break;
01401     }
01402     tprintf(" \"%s\" ",
01403             word->best_choice->unichar_string().string());
01404     word->reject_map.print (debug_fp);
01405     tprintf ("\n");
01406     if (show_map_detail) {
01407       tprintf ("\"%s\"\n", word->best_choice->unichar_string().string());
01408       for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
01409         tprintf ("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
01410         word->reject_map[i].full_print(debug_fp);
01411       }
01412     }
01413     tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
01414     tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
01415     return TRUE;
01416   }
01417   else
01418   #endif
01419     return FALSE;
01420 }
01421 
01427 static void find_modal_font(           //good chars in word
01428                      STATS *fonts,     //font stats
01429                      inT16 *font_out,   //output font
01430                      inT8 *font_count  //output count
01431                     ) {
01432   inT16 font;                     //font index
01433   inT32 count;                   //pile couat
01434 
01435   if (fonts->get_total () > 0) {
01436     font = (inT16) fonts->mode ();
01437     *font_out = font;
01438     count = fonts->pile_count (font);
01439     *font_count = count < MAX_INT8 ? count : MAX_INT8;
01440     fonts->add (font, -*font_count);
01441   }
01442   else {
01443     *font_out = -1;
01444     *font_count = 0;
01445   }
01446 }
01447 
01453 void Tesseract::set_word_fonts(WERD_RES *word) {
01454   // Don't try to set the word fonts for a cube word, as the configs
01455   // will be meaningless.
01456   if (word->chopped_word == NULL) return;
01457   ASSERT_HOST(word->best_choice != NULL);
01458 
01459   inT32 index;                   // char id index
01460                                  // character iterator
01461   BLOB_CHOICE_IT choice_it;      // choice iterator
01462   int fontinfo_size = get_fontinfo_table().size();
01463   int fontset_size = get_fontset_table().size();
01464   if (fontinfo_size == 0 || fontset_size == 0) return;
01465   STATS fonts(0, fontinfo_size);  // font counters
01466 
01467   word->italic = 0;
01468   word->bold = 0;
01469   if (!word->best_choice_fontinfo_ids.empty()) {
01470     word->best_choice_fontinfo_ids.clear();
01471   }
01472   // Compute the modal font for the word
01473   for (index = 0; index < word->best_choice->length(); ++index) {
01474     UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index);
01475     choice_it.set_to_list(word->GetBlobChoices(index));
01476     if (tessedit_debug_fonts) {
01477       tprintf("Examining fonts in %s\n",
01478               word->best_choice->debug_string().string());
01479     }
01480     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
01481          choice_it.forward()) {
01482       UNICHAR_ID blob_ch_id = choice_it.data()->unichar_id();
01483       if (blob_ch_id == word_ch_id) {
01484         if (tessedit_debug_fonts) {
01485           tprintf("%s font %s (%d) font2 %s (%d)\n",
01486                   word->uch_set->id_to_unichar(blob_ch_id),
01487                   choice_it.data()->fontinfo_id() < 0 ? "unknown" :
01488                   fontinfo_table_.get(choice_it.data()->fontinfo_id()).name,
01489                   choice_it.data()->fontinfo_id(),
01490                   choice_it.data()->fontinfo_id2() < 0 ? "unknown" :
01491                   fontinfo_table_.get(choice_it.data()->fontinfo_id2()).name,
01492                   choice_it.data()->fontinfo_id2());
01493         }
01494         // 1st choice font gets 2 pts, 2nd choice 1 pt.
01495         if (choice_it.data()->fontinfo_id() >= 0) {
01496           fonts.add(choice_it.data()->fontinfo_id(), 2);
01497         }
01498         if (choice_it.data()->fontinfo_id2() >= 0) {
01499           fonts.add(choice_it.data()->fontinfo_id2(), 1);
01500         }
01501         break;
01502       }
01503     }
01504   }
01505   inT16 font_id1, font_id2;
01506   find_modal_font(&fonts, &font_id1, &word->fontinfo_id_count);
01507   find_modal_font(&fonts, &font_id2, &word->fontinfo_id2_count);
01508   word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL;
01509   word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL;
01510   // All the blobs get the word's best choice font.
01511   for (int i = 0; i < word->best_choice->length(); ++i) {
01512     word->best_choice_fontinfo_ids.push_back(font_id1);
01513   }
01514   if (word->fontinfo_id_count > 0) {
01515     FontInfo fi = fontinfo_table_.get(font_id1);
01516     if (tessedit_debug_fonts) {
01517       if (word->fontinfo_id2_count > 0) {
01518         tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
01519                 fi.name, word->fontinfo_id_count,
01520                 fontinfo_table_.get(font_id2).name,
01521                 word->fontinfo_id2_count);
01522       } else {
01523         tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
01524                 fi.name, word->fontinfo_id_count);
01525       }
01526     }
01527     // 1st choices got 2 pts, so we need to halve the score for the mode.
01528     word->italic = (fi.is_italic() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
01529     word->bold = (fi.is_bold() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
01530   }
01531 }
01532 
01533 
01540 void Tesseract::font_recognition_pass(PAGE_RES* page_res) {
01541   PAGE_RES_IT page_res_it(page_res);
01542   WERD_RES *word;                // current word
01543   STATS doc_fonts(0, font_table_size_);           // font counters
01544 
01545   // Gather font id statistics.
01546   for (page_res_it.restart_page(); page_res_it.word() != NULL;
01547        page_res_it.forward()) {
01548     word = page_res_it.word();
01549     if (word->fontinfo != NULL) {
01550       doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
01551     }
01552     if (word->fontinfo2 != NULL) {
01553       doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
01554     }
01555   }
01556   inT16 doc_font;                 // modal font
01557   inT8 doc_font_count;           // modal font
01558   find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
01559   if (doc_font_count == 0)
01560     return;
01561   // Get the modal font pointer.
01562   const FontInfo* modal_font = NULL;
01563   for (page_res_it.restart_page(); page_res_it.word() != NULL;
01564        page_res_it.forward()) {
01565     word = page_res_it.word();
01566     if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) {
01567       modal_font = word->fontinfo;
01568       break;
01569     }
01570     if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) {
01571       modal_font = word->fontinfo2;
01572       break;
01573     }
01574   }
01575   ASSERT_HOST(modal_font != NULL);
01576 
01577   // Assign modal font to weak words.
01578   for (page_res_it.restart_page(); page_res_it.word() != NULL;
01579        page_res_it.forward()) {
01580     word = page_res_it.word();
01581     int length = word->best_choice->length();
01582 
01583     // 1st choices got 2 pts, so we need to halve the score for the mode.
01584     int count = (word->fontinfo_id_count + 1) / 2;
01585     if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
01586       word->fontinfo = modal_font;
01587       // Counts only get 1 as it came from the doc.
01588       word->fontinfo_id_count = 1;
01589       word->italic = modal_font->is_italic() ? 1 : -1;
01590       word->bold = modal_font->is_bold() ? 1 : -1;
01591     }
01592   }
01593 }
01594 
01595 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines