tesseract
3.03
|
00001 00002 // File: tesseractclass.h 00003 // Description: An instance of Tesseract. For thread safety, *every* 00004 // global variable goes in here, directly, or indirectly. 00005 // Author: Ray Smith 00006 // Created: Fri Mar 07 08:17:01 PST 2008 00007 // 00008 // (C) Copyright 2008, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H__ 00022 #define TESSERACT_CCMAIN_TESSERACTCLASS_H__ 00023 00024 #include "allheaders.h" 00025 #include "control.h" 00026 #include "docqual.h" 00027 #include "devanagari_processing.h" 00028 #include "genericvector.h" 00029 #include "params.h" 00030 #include "ocrclass.h" 00031 #include "textord.h" 00032 #include "wordrec.h" 00033 00034 class PAGE_RES; 00035 class PAGE_RES_IT; 00036 class BLOCK_LIST; 00037 class CharSamp; 00038 class TO_BLOCK_LIST; 00039 class IMAGE; 00040 class WERD_RES; 00041 class ROW; 00042 class TBOX; 00043 class SVMenuNode; 00044 struct Pix; 00045 class WERD_CHOICE; 00046 class WERD; 00047 class BLOB_CHOICE_LIST_CLIST; 00048 struct OSResults; 00049 00050 00051 // Top-level class for all tesseract global instance data. 00052 // This class either holds or points to all data used by an instance 00053 // of Tesseract, including the memory allocator. When this is 00054 // complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT! 
00055 // 00056 // NOTE to developers: Do not create cyclic dependencies through this class! 00057 // The directory dependency tree must remain a tree! To keep this clean, 00058 // lower-level code (eg in ccutil, the bottom level) must never need to 00059 // know about the content of a higher-level directory. 00060 // The following scheme will grant the easiest access to lower-level 00061 // global members without creating a cyclic dependency: 00062 // 00063 // Class Hierarchy (^ = inheritance): 00064 // 00065 // CCUtil (ccutil/ccutil.h) 00066 // ^ Members include: UNICHARSET 00067 // CUtil (cutil/cutil_class.h) 00068 // ^ Members include: TBLOB*, TEXTBLOCK* 00069 // CCStruct (ccstruct/ccstruct.h) 00070 // ^ Members include: Image 00071 // Classify (classify/classify.h) 00072 // ^ Members include: Dict 00073 // WordRec (wordrec/wordrec.h) 00074 // ^ Members include: WERD*, DENORM* 00075 // Tesseract (ccmain/tesseractclass.h) 00076 // Members include: Pix*, CubeRecoContext*, 00077 // TesseractCubeCombiner* 00078 // 00079 // Other important classes: 00080 // 00081 // TessBaseAPI (api/baseapi.h) 00082 // Members include: BLOCK_LIST*, PAGE_RES*, 00083 // Tesseract*, ImageThresholder* 00084 // Dict (dict/dict.h) 00085 // Members include: Image* (private) 00086 // 00087 // NOTE that each level contains members that correspond to global 00088 // data that is defined (and used) at that level, not necessarily where 00089 // the type is defined, so for instance: 00090 // BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs"); 00091 // goes inside the Textord class, not the cc_util class. 00092 00093 namespace tesseract { 00094 00095 class ColumnFinder; 00096 class CubeLineObject; 00097 class CubeObject; 00098 class CubeRecoContext; 00099 class EquationDetect; 00100 class Tesseract; 00101 class TesseractCubeCombiner; 00102 00103 // A collection of various variables for statistics and debugging. 
00104 struct TesseractStats { 00105 TesseractStats() 00106 : adaption_word_number(0), 00107 doc_blob_quality(0), 00108 doc_outline_errs(0), 00109 doc_char_quality(0), 00110 good_char_count(0), 00111 doc_good_char_quality(0), 00112 word_count(0), 00113 dict_words(0), 00114 tilde_crunch_written(false), 00115 last_char_was_newline(true), 00116 last_char_was_tilde(false), 00117 write_results_empty_block(true) {} 00118 00119 inT32 adaption_word_number; 00120 inT16 doc_blob_quality; 00121 inT16 doc_outline_errs; 00122 inT16 doc_char_quality; 00123 inT16 good_char_count; 00124 inT16 doc_good_char_quality; 00125 inT32 word_count; // count of words in the document 00126 inT32 dict_words; // number of dictionary words in the document 00127 STRING dump_words_str; // accumulator used by dump_words() 00128 // Flags used by write_results() 00129 bool tilde_crunch_written; 00130 bool last_char_was_newline; 00131 bool last_char_was_tilde; 00132 bool write_results_empty_block; 00133 }; 00134 00135 // Struct to hold all the pointers to relevant data for processing a word. 00136 struct WordData { 00137 WordData() : word(NULL), row(NULL), block(NULL), prev_word(NULL) {} 00138 explicit WordData(const PAGE_RES_IT& page_res_it) 00139 : word(page_res_it.word()), row(page_res_it.row()->row), 00140 block(page_res_it.block()->block), prev_word(NULL) {} 00141 WordData(BLOCK* block_in, ROW* row_in, WERD_RES* word_res) 00142 : word(word_res), row(row_in), block(block_in), prev_word(NULL) {} 00143 00144 WERD_RES* word; 00145 ROW* row; 00146 BLOCK* block; 00147 WordData* prev_word; 00148 GenericVector<WERD_RES> lang_words; 00149 }; 00150 00151 typedef void (Tesseract::*WordRecognizer)(WordData* word_data, WERD_RES* word); 00152 00153 class Tesseract : public Wordrec { 00154 public: 00155 Tesseract(); 00156 ~Tesseract(); 00157 00158 // Clear as much used memory as possible without resetting the adaptive 00159 // classifier or losing any other classifier data. 
00160 void Clear(); 00161 // Clear all memory of adaption for this and all subclassifiers. 00162 void ResetAdaptiveClassifier(); 00163 // Clear the document dictionary for this and all subclassifiers. 00164 void ResetDocumentDictionary(); 00165 00166 // Set the equation detector. 00167 void SetEquationDetect(EquationDetect* detector); 00168 00169 // Simple accessors. 00170 const FCOORD& reskew() const { 00171 return reskew_; 00172 } 00173 // Destroy any existing pix and return a pointer to the pointer. 00174 Pix** mutable_pix_binary() { 00175 Clear(); 00176 return &pix_binary_; 00177 } 00178 Pix* pix_binary() const { 00179 return pix_binary_; 00180 } 00181 Pix* pix_grey() const { 00182 return pix_grey_; 00183 } 00184 void set_pix_grey(Pix* grey_pix) { 00185 pixDestroy(&pix_grey_); 00186 pix_grey_ = grey_pix; 00187 } 00188 // Returns a pointer to a Pix representing the best available image of the 00189 // page. The image will be 8-bit grey if the input was grey or color. Note 00190 // that in grey 0 is black and 255 is white. If the input was binary, then 00191 // the returned Pix will be binary. Note that here black is 1 and white is 0. 00192 // To tell the difference pixGetDepth() will return 8 or 1. 00193 // In either case, the return value is a borrowed Pix, and should not be 00194 // deleted or pixDestroyed. 00195 Pix* BestPix() const { 00196 return pix_grey_ != NULL ? 
pix_grey_ : pix_binary_; 00197 } 00198 void set_pix_thresholds(Pix* thresholds) { 00199 pixDestroy(&pix_thresholds_); 00200 pix_thresholds_ = thresholds; 00201 } 00202 int source_resolution() const { 00203 return source_resolution_; 00204 } 00205 void set_source_resolution(int ppi) { 00206 source_resolution_ = ppi; 00207 } 00208 int ImageWidth() const { 00209 return pixGetWidth(pix_binary_); 00210 } 00211 int ImageHeight() const { 00212 return pixGetHeight(pix_binary_); 00213 } 00214 Pix* scaled_color() const { 00215 return scaled_color_; 00216 } 00217 int scaled_factor() const { 00218 return scaled_factor_; 00219 } 00220 void SetScaledColor(int factor, Pix* color) { 00221 scaled_factor_ = factor; 00222 scaled_color_ = color; 00223 } 00224 const Textord& textord() const { 00225 return textord_; 00226 } 00227 Textord* mutable_textord() { 00228 return &textord_; 00229 } 00230 00231 bool right_to_left() const { 00232 return right_to_left_; 00233 } 00234 int num_sub_langs() const { 00235 return sub_langs_.size(); 00236 } 00237 Tesseract* get_sub_lang(int index) const { 00238 return sub_langs_[index]; 00239 } 00240 00241 void SetBlackAndWhitelist(); 00242 00243 // Perform steps to prepare underlying binary image/other data structures for 00244 // page segmentation. Uses the strategy specified in the global variable 00245 // pageseg_devanagari_split_strategy for perform splitting while preparing for 00246 // page segmentation. 00247 void PrepareForPageseg(); 00248 00249 // Perform steps to prepare underlying binary image/other data structures for 00250 // Tesseract OCR. The current segmentation is required by this method. 00251 // Uses the strategy specified in the global variable 00252 // ocr_devanagari_split_strategy for performing splitting while preparing for 00253 // Tesseract ocr. 
00254 void PrepareForTessOCR(BLOCK_LIST* block_list, 00255 Tesseract* osd_tess, OSResults* osr); 00256 00257 int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, 00258 Tesseract* osd_tess, OSResults* osr); 00259 void SetupWordScripts(BLOCK_LIST* blocks); 00260 int AutoPageSeg(PageSegMode pageseg_mode, 00261 BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, 00262 Tesseract* osd_tess, OSResults* osr); 00263 ColumnFinder* SetupPageSegAndDetectOrientation( 00264 bool single_column, bool osd, bool only_osd, 00265 BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr, 00266 TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, Pix** music_mask_pix); 00267 // par_control.cpp 00268 void PrerecAllWordsPar(const GenericVector<WordData>& words); 00269 00271 bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box, 00272 const char* word_config, int pass); 00273 // Sets up the words ready for whichever engine is to be run 00274 void SetupAllWordsPassN(int pass_n, 00275 const TBOX* target_word_box, 00276 const char* word_config, 00277 PAGE_RES* page_res, 00278 GenericVector<WordData>* words); 00279 // Sets up the single word ready for whichever engine is to be run. 00280 void SetupWordPassN(int pass_n, WordData* word); 00281 // Runs word recognition on all the words. 00282 bool RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor, 00283 GenericVector<WordData>* words); 00284 bool recog_all_words(PAGE_RES* page_res, 00285 ETEXT_DESC* monitor, 00286 const TBOX* target_word_box, 00287 const char* word_config, 00288 int dopasses); 00289 void rejection_passes(PAGE_RES* page_res, 00290 ETEXT_DESC* monitor, 00291 const TBOX* target_word_box, 00292 const char* word_config); 00293 void bigram_correction_pass(PAGE_RES *page_res); 00294 void blamer_pass(PAGE_RES* page_res); 00295 // Sets script positions and detects smallcaps on all output words. 
00296 void script_pos_pass(PAGE_RES* page_res); 00297 // Helper to recognize the word using the given (language-specific) tesseract. 00298 // Returns true if the result was better than previously. 00299 bool RetryWithLanguage(const WERD_RES& best_word, WordData* word_data, 00300 WERD_RES* word, WordRecognizer recognizer); 00301 void classify_word_and_language(WordRecognizer recognizer, 00302 WordData* word_data); 00303 void classify_word_pass1(WordData* word_data, WERD_RES* word); 00304 void recog_pseudo_word(PAGE_RES* page_res, // blocks to check 00305 TBOX &selection_box); 00306 00307 void fix_rep_char(PAGE_RES_IT* page_res_it); 00308 void ExplodeRepeatedWord(BLOB_CHOICE* best_choice, PAGE_RES_IT* page_res_it); 00309 00310 ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set, 00311 const char *s, 00312 const char *lengths); 00313 void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK* block); 00314 void classify_word_pass2(WordData* word_data, WERD_RES* word); 00315 void ReportXhtFixResult(bool accept_new_word, float new_x_ht, 00316 WERD_RES* word, WERD_RES* new_word); 00317 bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row); 00318 bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row); 00319 BOOL8 recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res); 00320 00321 // Set fonts of this word. 
00322 void set_word_fonts(WERD_RES *word); 00323 void font_recognition_pass(PAGE_RES* page_res); 00324 BOOL8 check_debug_pt(WERD_RES *word, int location); 00325 00327 bool SubAndSuperscriptFix(WERD_RES *word_res); 00328 void GetSubAndSuperscriptCandidates(const WERD_RES *word, 00329 int *num_rebuilt_leading, 00330 ScriptPos *leading_pos, 00331 float *leading_certainty, 00332 int *num_rebuilt_trailing, 00333 ScriptPos *trailing_pos, 00334 float *trailing_certainty, 00335 float *avg_certainty, 00336 float *unlikely_threshold); 00337 WERD_RES *TrySuperscriptSplits(int num_chopped_leading, 00338 float leading_certainty, 00339 ScriptPos leading_pos, 00340 int num_chopped_trailing, 00341 float trailing_certainty, 00342 ScriptPos trailing_pos, 00343 WERD_RES *word, 00344 bool *is_good, 00345 int *retry_leading, 00346 int *retry_trailing); 00347 bool BelievableSuperscript(bool debug, 00348 const WERD_RES &word, 00349 float certainty_threshold, 00350 int *left_ok, 00351 int *right_ok) const; 00352 00354 bool init_cube_objects(bool load_combiner, 00355 TessdataManager *tessdata_manager); 00356 // Iterates through tesseract's results and calls cube on each word, 00357 // combining the results with the existing tesseract result. 00358 void run_cube_combiner(PAGE_RES *page_res); 00359 // Recognizes a single word using (only) cube. Compatible with 00360 // Tesseract's classify_word_pass1/classify_word_pass2. 00361 void cube_word_pass1(BLOCK* block, ROW *row, WERD_RES *word); 00362 // Cube recognizer to recognize a single word as with classify_word_pass1 00363 // but also returns the cube object in case the combiner is needed. 00364 CubeObject* cube_recognize_word(BLOCK* block, WERD_RES* word); 00365 // Combines the cube and tesseract results for a single word, leaving the 00366 // result in tess_word. 00367 void cube_combine_word(CubeObject* cube_obj, WERD_RES* cube_word, 00368 WERD_RES* tess_word); 00369 // Call cube on the current word, and write the result to word. 
00370 // Sets up a fake result and returns false if something goes wrong. 00371 bool cube_recognize(CubeObject *cube_obj, BLOCK* block, WERD_RES *word); 00372 void fill_werd_res(const BoxWord& cube_box_word, 00373 const char* cube_best_str, 00374 WERD_RES* tess_werd_res); 00375 bool extract_cube_state(CubeObject* cube_obj, int* num_chars, 00376 Boxa** char_boxes, CharSamp*** char_samples); 00377 bool create_cube_box_word(Boxa *char_boxes, int num_chars, 00378 TBOX word_box, BoxWord* box_word); 00380 00381 void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box); 00382 void write_results(PAGE_RES_IT &page_res_it, // full info 00383 char newline_type, // type of newline 00384 BOOL8 force_eol // override tilde crunch? 00385 ); 00386 void set_unlv_suspects(WERD_RES *word); 00387 UNICHAR_ID get_rep_char(WERD_RES *word); // what char is repeated? 00388 BOOL8 acceptable_number_string(const char *s, 00389 const char *lengths); 00390 inT16 count_alphanums(const WERD_CHOICE &word); 00391 inT16 count_alphas(const WERD_CHOICE &word); 00393 void read_config_file(const char *filename, SetParamConstraint constraint); 00394 // Initialize for potentially a set of languages defined by the language 00395 // string and recursively any additional languages required by any language 00396 // traineddata file (via tessedit_load_sublangs in its config) that is loaded. 00397 // See init_tesseract_internal for args. 00398 int init_tesseract(const char *arg0, 00399 const char *textbase, 00400 const char *language, 00401 OcrEngineMode oem, 00402 char **configs, 00403 int configs_size, 00404 const GenericVector<STRING> *vars_vec, 00405 const GenericVector<STRING> *vars_values, 00406 bool set_only_init_params); 00407 int init_tesseract(const char *datapath, 00408 const char *language, 00409 OcrEngineMode oem) { 00410 return init_tesseract(datapath, NULL, language, oem, 00411 NULL, 0, NULL, NULL, false); 00412 } 00413 // Common initialization for a single language. 
00414 // arg0 is the datapath for the tessdata directory, which could be the 00415 // path of the tessdata directory with no trailing /, or (if tessdata 00416 // lives in the same directory as the executable) the path of the executable, 00417 // hence the name arg0. 00418 // textbase is an optional output file basename (used only for training). 00419 // language is the language code to load. 00420 // oem controls which engine(s) will operate on the image. 00421 // configs (argv) is an array of config filenames to load variables from. 00422 // May be NULL. 00423 // configs_size (argc) is the number of elements in configs. 00424 // vars_vec is an optional vector of variables to set. 00425 // vars_values is an optional corresponding vector of values for the variables 00426 // in vars_vec. 00427 // If set_only_init_params is true, then only the initialization variables 00428 // will be set. 00429 int init_tesseract_internal(const char *arg0, 00430 const char *textbase, 00431 const char *language, 00432 OcrEngineMode oem, 00433 char **configs, 00434 int configs_size, 00435 const GenericVector<STRING> *vars_vec, 00436 const GenericVector<STRING> *vars_values, 00437 bool set_only_init_params); 00438 00439 // Set the universal_id member of each font to be unique among all 00440 // instances of the same font loaded. 
00441 void SetupUniversalFontIds(); 00442 00443 int init_tesseract_lm(const char *arg0, 00444 const char *textbase, 00445 const char *language); 00446 00447 void recognize_page(STRING& image_name); 00448 void end_tesseract(); 00449 00450 bool init_tesseract_lang_data(const char *arg0, 00451 const char *textbase, 00452 const char *language, 00453 OcrEngineMode oem, 00454 char **configs, 00455 int configs_size, 00456 const GenericVector<STRING> *vars_vec, 00457 const GenericVector<STRING> *vars_values, 00458 bool set_only_init_params); 00459 00460 void ParseLanguageString(const char* lang_str, 00461 GenericVector<STRING>* to_load, 00462 GenericVector<STRING>* not_to_load); 00463 00465 SVMenuNode *build_menu_new(); 00466 #ifndef GRAPHICS_DISABLED 00467 void pgeditor_main(int width, int height, PAGE_RES* page_res); 00468 #endif // GRAPHICS_DISABLED 00469 void process_image_event( // action in image win 00470 const SVEvent &event); 00471 BOOL8 process_cmd_win_event( // UI command semantics 00472 inT32 cmd_event, // which menu item? 
00473 char *new_value // any prompt data 00474 ); 00475 void debug_word(PAGE_RES* page_res, const TBOX &selection_box); 00476 void do_re_display( 00477 BOOL8 (tesseract::Tesseract::*word_painter)(BLOCK* block, 00478 ROW* row, 00479 WERD_RES* word_res)); 00480 BOOL8 word_display(BLOCK* block, ROW* row, WERD_RES* word_res); 00481 BOOL8 word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res); 00482 BOOL8 word_blank_and_set_display(BLOCK* block, ROW* row, WERD_RES* word_res); 00483 BOOL8 word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res); 00484 // #ifndef GRAPHICS_DISABLED 00485 BOOL8 word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res); 00486 // #endif // GRAPHICS_DISABLED 00487 void blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box); 00489 // make rej map for word 00490 void make_reject_map(WERD_RES *word, ROW *row, inT16 pass); 00491 BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map); 00492 inT16 first_alphanum_index(const char *word, 00493 const char *word_lengths); 00494 inT16 first_alphanum_offset(const char *word, 00495 const char *word_lengths); 00496 inT16 alpha_count(const char *word, 00497 const char *word_lengths); 00498 BOOL8 word_contains_non_1_digit(const char *word, 00499 const char *word_lengths); 00500 void dont_allow_1Il(WERD_RES *word); 00501 inT16 count_alphanums( //how many alphanums 00502 WERD_RES *word); 00503 void flip_0O(WERD_RES *word); 00504 BOOL8 non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id); 00505 BOOL8 non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id); 00506 BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row); 00507 void nn_match_word( //Match a word 00508 WERD_RES *word, 00509 ROW *row); 00510 void nn_recover_rejects(WERD_RES *word, ROW *row); 00511 void set_done( //set done flag 00512 WERD_RES *word, 00513 inT16 pass); 00514 inT16 safe_dict_word(const WERD_RES *werd_res); // is best_choice in dict? 
00515 void flip_hyphens(WERD_RES *word); 00516 void reject_I_1_L(WERD_RES *word); 00517 void reject_edge_blobs(WERD_RES *word); 00518 void reject_mostly_rejects(WERD_RES *word); 00520 BOOL8 word_adaptable( //should we adapt? 00521 WERD_RES *word, 00522 uinT16 mode); 00523 00525 void recog_word_recursive(WERD_RES* word); 00526 void recog_word(WERD_RES *word); 00527 void split_and_recog_word(WERD_RES* word); 00528 void split_word(WERD_RES *word, 00529 int split_pt, 00530 WERD_RES **right_piece, 00531 BlamerBundle **orig_blamer_bundle) const; 00532 void join_words(WERD_RES *word, 00533 WERD_RES *word2, 00534 BlamerBundle *orig_bb) const; 00536 BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position); 00537 inT16 eval_word_spacing(WERD_RES_LIST &word_res_list); 00538 void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block); 00539 inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list); 00540 void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block); 00541 void fix_fuzzy_space_list( //space explorer 00542 WERD_RES_LIST &best_perm, 00543 ROW *row, 00544 BLOCK* block); 00545 void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block); 00546 void fix_fuzzy_spaces( //find fuzzy words 00547 ETEXT_DESC *monitor, //progress monitor 00548 inT32 word_count, //count of words in doc 00549 PAGE_RES *page_res); 00550 void dump_words(WERD_RES_LIST &perm, inT16 score, 00551 inT16 mode, BOOL8 improved); 00552 BOOL8 fixspace_thinks_word_done(WERD_RES *word); 00553 inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score); 00554 float blob_noise_score(TBLOB *blob); 00555 void break_noisiest_blob_word(WERD_RES_LIST &words); 00557 GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word); 00558 BOOL8 potential_word_crunch(WERD_RES *word, 00559 GARBAGE_LEVEL garbage_level, 00560 BOOL8 ok_dict_word); 00561 void tilde_crunch(PAGE_RES_IT &page_res_it); 00562 void unrej_good_quality_words( //unreject potential 00563 PAGE_RES_IT 
&page_res_it); 00564 void doc_and_block_rejection( //reject big chunks 00565 PAGE_RES_IT &page_res_it, 00566 BOOL8 good_quality_doc); 00567 void quality_based_rejection(PAGE_RES_IT &page_res_it, 00568 BOOL8 good_quality_doc); 00569 void convert_bad_unlv_chs(WERD_RES *word_res); 00570 void tilde_delete(PAGE_RES_IT &page_res_it); 00571 inT16 word_blob_quality(WERD_RES *word, ROW *row); 00572 void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, 00573 inT16 *accepted_match_count); 00574 void unrej_good_chs(WERD_RES *word, ROW *row); 00575 inT16 count_outline_errs(char c, inT16 outline_count); 00576 inT16 word_outline_errs(WERD_RES *word); 00577 BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level); 00578 CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode); 00579 inT16 failure_count(WERD_RES *word); 00580 BOOL8 noise_outlines(TWERD *word); 00582 void 00583 process_selected_words ( 00584 PAGE_RES* page_res, // blocks to check 00585 //function to call 00586 TBOX & selection_box, 00587 BOOL8 (tesseract::Tesseract::*word_processor) (BLOCK* block, 00588 ROW* row, 00589 WERD_RES* word_res)); 00591 void tess_add_doc_word( //test acceptability 00592 WERD_CHOICE *word_choice //after context 00593 ); 00594 void tess_segment_pass_n(int pass_n, WERD_RES *word); 00595 bool tess_acceptable_word(WERD_RES *word); 00596 00598 // Applies the box file based on the image name fname, and resegments 00599 // the words in the block_list (page), with: 00600 // blob-mode: one blob per line in the box file, words as input. 00601 // word/line-mode: one blob per space-delimited unit after the #, and one word 00602 // per line in the box file. (See comment above for box file format.) 00603 // If find_segmentation is true, (word/line mode) then the classifier is used 00604 // to re-segment words/lines to match the space-delimited truth string for 00605 // each box. 
In this case, the input box may be for a word or even a whole 00606 // text line, and the output words will contain multiple blobs corresponding 00607 // to the space-delimited input string. 00608 // With find_segmentation false, no classifier is needed, but the chopper 00609 // can still be used to correctly segment touching characters with the help 00610 // of the input boxes. 00611 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned 00612 // from normal classification, ie. with a word, chopped_word, rebuild_word, 00613 // seam_array, denorm, box_word, and best_state, but NO best_choice or 00614 // raw_choice, as they would require a UNICHARSET, which we aim to avoid. 00615 // Instead, the correct_text member of WERD_RES is set, and this may be later 00616 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords 00617 // is not required before calling ApplyBoxTraining. 00618 PAGE_RES* ApplyBoxes(const STRING& fname, bool find_segmentation, 00619 BLOCK_LIST *block_list); 00620 00621 // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: 00622 // All fuzzy spaces are removed, and all the words are maximally chopped. 00623 PAGE_RES* SetupApplyBoxes(const GenericVector<TBOX>& boxes, 00624 BLOCK_LIST *block_list); 00625 // Tests the chopper by exhaustively running chop_one_blob. 00626 // The word_res will contain filled chopped_word, seam_array, denorm, 00627 // box_word and best_state for the maximally chopped word. 00628 void MaximallyChopWord(const GenericVector<TBOX>& boxes, 00629 BLOCK* block, ROW* row, WERD_RES* word_res); 00630 // Gather consecutive blobs that match the given box into the best_state 00631 // and corresponding correct_text. 00632 // Fights over which box owns which blobs are settled by pre-chopping and 00633 // applying the blobs to box or next_box with the least non-overlap. 
00634 // Returns false if the box was in error, which can only be caused by 00635 // failing to find an appropriate blob for a box. 00636 // This means that occasionally, blobs may be incorrectly segmented if the 00637 // chopper fails to find a suitable chop point. 00638 bool ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box, 00639 const TBOX& box, const TBOX& next_box, 00640 const char* correct_text); 00641 // Consume all source blobs that strongly overlap the given box, 00642 // putting them into a new word, with the correct_text label. 00643 // Fights over which box owns which blobs are settled by 00644 // applying the blobs to box or next_box with the least non-overlap. 00645 // Returns false if the box was in error, which can only be caused by 00646 // failing to find an overlapping blob for a box. 00647 bool ResegmentWordBox(BLOCK_LIST *block_list, 00648 const TBOX& box, const TBOX& next_box, 00649 const char* correct_text); 00650 // Resegments the words by running the classifier in an attempt to find the 00651 // correct segmentation that produces the required string. 00652 void ReSegmentByClassification(PAGE_RES* page_res); 00653 // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID. 00654 // Returns false if an invalid UNICHAR_ID is encountered. 00655 bool ConvertStringToUnichars(const char* utf8, 00656 GenericVector<UNICHAR_ID>* class_ids); 00657 // Resegments the word to achieve the target_text from the classifier. 00658 // Returns false if the re-segmentation fails. 00659 // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and 00660 // applies a full search on the classifier results to find the best classified 00661 // segmentation. As a compromise to obtain better recall, 1-1 ambiguity 00662 // substitutions ARE used. 
00663 bool FindSegmentation(const GenericVector<UNICHAR_ID>& target_text, 00664 WERD_RES* word_res); 00665 // Recursive helper to find a match to the target_text (from text_index 00666 // position) in the choices (from choices_pos position). 00667 // Choices is an array of GenericVectors, of length choices_length, with each 00668 // element representing a starting position in the word, and the 00669 // GenericVector holding classification results for a sequence of consecutive 00670 // blobs, with index 0 being a single blob, index 1 being 2 blobs etc. 00671 void SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices, 00672 int choices_pos, int choices_length, 00673 const GenericVector<UNICHAR_ID>& target_text, 00674 int text_index, 00675 float rating, GenericVector<int>* segmentation, 00676 float* best_rating, GenericVector<int>* best_segmentation); 00677 // Counts up the labelled words and the blobs within. 00678 // Deletes all unused or emptied words, counting the unused ones. 00679 // Resets W_BOL and W_EOL flags correctly. 00680 // Builds the rebuild_word and rebuilds the box_word. 00681 void TidyUp(PAGE_RES* page_res); 00682 // Logs a bad box by line in the box file and box coords. 00683 void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, 00684 const char *err_msg); 00685 // Creates a fake best_choice entry in each WERD_RES with the correct text. 00686 void CorrectClassifyWords(PAGE_RES* page_res); 00687 // Call LearnWord to extract features for labelled blobs within each word. 00688 // Features are written to the given filename. 00689 void ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res); 00690 00692 // Returns the number of misfit blob tops in this word. 00693 int CountMisfitTops(WERD_RES *word_res); 00694 // Returns a new x-height in pixels (original image coords) that is 00695 // maximally compatible with the result in word_res. 
00696 // Returns 0.0f if no x-height is found that is better than the current 00697 // estimate. 00698 float ComputeCompatibleXheight(WERD_RES *word_res); 00700 // TODO(ocr-team): Find and remove obsolete parameters. 00701 BOOL_VAR_H(tessedit_resegment_from_boxes, false, 00702 "Take segmentation and labeling from box file"); 00703 BOOL_VAR_H(tessedit_resegment_from_line_boxes, false, 00704 "Conversion of word/line box file to char box file"); 00705 BOOL_VAR_H(tessedit_train_from_boxes, false, 00706 "Generate training data from boxed chars"); 00707 BOOL_VAR_H(tessedit_make_boxes_from_boxes, false, 00708 "Generate more boxes from boxed chars"); 00709 BOOL_VAR_H(tessedit_dump_pageseg_images, false, 00710 "Dump intermediate images made during page segmentation"); 00711 INT_VAR_H(tessedit_pageseg_mode, PSM_SINGLE_BLOCK, 00712 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," 00713 " 5=line, 6=word, 7=char" 00714 " (Values from PageSegMode enum in publictypes.h)"); 00715 INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY, 00716 "Which OCR engine(s) to run (Tesseract, Cube, both). Defaults" 00717 " to loading and running only Tesseract (no Cube, no combiner)." 
00718 " (Values from OcrEngineMode enum in tesseractclass.h)"); 00719 STRING_VAR_H(tessedit_char_blacklist, "", 00720 "Blacklist of chars not to recognize"); 00721 STRING_VAR_H(tessedit_char_whitelist, "", 00722 "Whitelist of chars to recognize"); 00723 BOOL_VAR_H(tessedit_ambigs_training, false, 00724 "Perform training for ambiguities"); 00725 INT_VAR_H(pageseg_devanagari_split_strategy, 00726 tesseract::ShiroRekhaSplitter::NO_SPLIT, 00727 "Whether to use the top-line splitting process for Devanagari " 00728 "documents while performing page-segmentation."); 00729 INT_VAR_H(ocr_devanagari_split_strategy, 00730 tesseract::ShiroRekhaSplitter::NO_SPLIT, 00731 "Whether to use the top-line splitting process for Devanagari " 00732 "documents while performing ocr."); 00733 STRING_VAR_H(tessedit_write_params_to_file, "", 00734 "Write all parameters to the given file."); 00735 BOOL_VAR_H(tessedit_adaption_debug, false, 00736 "Generate and print debug information for adaption"); 00737 INT_VAR_H(bidi_debug, 0, "Debug level for BiDi"); 00738 INT_VAR_H(applybox_debug, 1, "Debug level"); 00739 INT_VAR_H(applybox_page, 0, "Page number to apply boxes from"); 00740 STRING_VAR_H(applybox_exposure_pattern, ".exp", 00741 "Exposure value follows this pattern in the image" 00742 " filename. The name of the image files are expected" 00743 " to be in the form [lang].[fontname].exp[num].tif"); 00744 BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode, false, 00745 "Learn both character fragments (as is done in the" 00746 " special low exposure mode) as well as unfragmented" 00747 " characters."); 00748 BOOL_VAR_H(applybox_learn_ngrams_mode, false, 00749 "Each bounding box is assumed to contain ngrams. 
Only" 00750 " learn the ngrams whose outlines overlap horizontally."); 00751 BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words"); 00752 BOOL_VAR_H(tessedit_training_tess, false, "Call Tess to learn blobs"); 00753 BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices"); 00754 BOOL_VAR_H(tessedit_timing_debug, false, "Print timing stats"); 00755 BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true, 00756 "Try to improve fuzzy spaces"); 00757 BOOL_VAR_H(tessedit_unrej_any_wd, false, 00758 "Dont bother with word plausibility"); 00759 BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?"); 00760 BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height"); 00761 BOOL_VAR_H(tessedit_enable_doc_dict, true, 00762 "Add words to the document dictionary"); 00763 BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char"); 00764 BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats"); 00765 BOOL_VAR_H(tessedit_enable_bigram_correction, true, 00766 "Enable correction based on the word bigram dictionary."); 00767 INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram " 00768 "correction."); 00769 INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug"); 00770 BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk"); 00771 STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation"); 00772 STRING_VAR_H(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation"); 00773 STRING_VAR_H(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation"); 00774 double_VAR_H(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit"); 00775 double_VAR_H(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit"); 00776 double_VAR_H(quality_outline_pc, 1.0, 00777 "good_quality_doc lte outline error limit"); 00778 double_VAR_H(quality_char_pc, 0.95, "good_quality_doc gte good char limit"); 00779 INT_VAR_H(quality_min_initial_alphas_reqd, 2, "alphas in a good word"); 00780 INT_VAR_H(tessedit_tess_adaption_mode, 0x27, 00781 
"Adaptation decision algorithm for tess"); 00782 BOOL_VAR_H(tessedit_minimal_rej_pass1, false, 00783 "Do minimal rejection on pass 1 output"); 00784 BOOL_VAR_H(tessedit_test_adaption, false, "Test adaption criteria"); 00785 BOOL_VAR_H(tessedit_matcher_log, false, "Log matcher activity"); 00786 INT_VAR_H(tessedit_test_adaption_mode, 3, 00787 "Adaptation decision algorithm for tess"); 00788 BOOL_VAR_H(test_pt, false, "Test for point"); 00789 double_VAR_H(test_pt_x, 99999.99, "xcoord"); 00790 double_VAR_H(test_pt_y, 99999.99, "ycoord"); 00791 INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info."); 00792 BOOL_VAR_H(paragraph_text_based, true, 00793 "Run paragraph detection on the post-text-recognition " 00794 "(more accurate)"); 00795 INT_VAR_H(cube_debug_level, 1, "Print cube debug info."); 00796 STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines"); 00797 STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines"); 00798 BOOL_VAR_H(docqual_excuse_outline_errs, false, 00799 "Allow outline errs in unrejection?"); 00800 BOOL_VAR_H(tessedit_good_quality_unrej, true, 00801 "Reduce rejection on good docs"); 00802 BOOL_VAR_H(tessedit_use_reject_spaces, true, "Reject spaces?"); 00803 double_VAR_H(tessedit_reject_doc_percent, 65.00, 00804 "%rej allowed before rej whole doc"); 00805 double_VAR_H(tessedit_reject_block_percent, 45.00, 00806 "%rej allowed before rej whole block"); 00807 double_VAR_H(tessedit_reject_row_percent, 40.00, 00808 "%rej allowed before rej whole row"); 00809 double_VAR_H(tessedit_whole_wd_rej_row_percent, 70.00, 00810 "Number of row rejects in whole word rejects" 00811 "which prevents whole row rejection"); 00812 BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds, true, 00813 "Only rej partially rejected words in block rejection"); 00814 BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds, true, 00815 "Only rej partially rejected words in row rejection"); 00816 BOOL_VAR_H(tessedit_dont_blkrej_good_wds, false, 00817 "Use 
word segmentation quality metric"); 00818 BOOL_VAR_H(tessedit_dont_rowrej_good_wds, false, 00819 "Use word segmentation quality metric"); 00820 INT_VAR_H(tessedit_preserve_min_wd_len, 2, 00821 "Only preserve wds longer than this"); 00822 BOOL_VAR_H(tessedit_row_rej_good_docs, true, 00823 "Apply row rejection to good docs"); 00824 double_VAR_H(tessedit_good_doc_still_rowrej_wd, 1.1, 00825 "rej good doc wd if more than this fraction rejected"); 00826 BOOL_VAR_H(tessedit_reject_bad_qual_wds, true, 00827 "Reject all bad quality wds"); 00828 BOOL_VAR_H(tessedit_debug_doc_rejection, false, "Page stats"); 00829 BOOL_VAR_H(tessedit_debug_quality_metrics, false, 00830 "Output data to debug file"); 00831 BOOL_VAR_H(bland_unrej, false, "unrej potential with no chekcs"); 00832 double_VAR_H(quality_rowrej_pc, 1.1, 00833 "good_quality_doc gte good char limit"); 00834 BOOL_VAR_H(unlv_tilde_crunching, true, 00835 "Mark v.bad words for tilde crunch"); 00836 BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?"); 00837 BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?"); 00838 double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this"); 00839 BOOL_VAR_H(crunch_terrible_garbage, true, "As it says"); 00840 double_VAR_H(crunch_poor_garbage_cert, -9.0, 00841 "crunch garbage cert lt this"); 00842 double_VAR_H(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this"); 00843 double_VAR_H(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this"); 00844 double_VAR_H(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this"); 00845 BOOL_VAR_H(crunch_pot_garbage, true, "POTENTIAL crunch garbage"); 00846 double_VAR_H(crunch_del_rating, 60, "POTENTIAL crunch rating lt this"); 00847 double_VAR_H(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this"); 00848 double_VAR_H(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this"); 00849 double_VAR_H(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this"); 00850 
double_VAR_H(crunch_del_min_width, 3.0, "Del if word width lt xht x this"); 00851 double_VAR_H(crunch_del_high_word, 1.5, 00852 "Del if word gt xht x this above bl"); 00853 double_VAR_H(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl"); 00854 double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this"); 00855 INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch"); 00856 INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed"); 00857 BOOL_VAR_H(crunch_leave_ok_strings, true, "Dont touch sensible strings"); 00858 BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring"); 00859 BOOL_VAR_H(crunch_leave_accept_strings, false, 00860 "Dont pot crunch sensible strings"); 00861 BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures"); 00862 INT_VAR_H(crunch_leave_lc_strings, 4, 00863 "Dont crunch words with long lower case strings"); 00864 INT_VAR_H(crunch_leave_uc_strings, 4, 00865 "Dont crunch words with long lower case strings"); 00866 INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions"); 00867 INT_VAR_H(crunch_debug, 0, "As it says"); 00868 INT_VAR_H(fixsp_non_noise_limit, 1, 00869 "How many non-noise blbs either side?"); 00870 double_VAR_H(fixsp_small_outlines_size, 0.28, "Small if lt xht x this"); 00871 BOOL_VAR_H(tessedit_prefer_joined_punct, false, "Reward punctation joins"); 00872 INT_VAR_H(fixsp_done_mode, 1, "What constitues done for spacing"); 00873 INT_VAR_H(debug_fix_space_level, 0, "Contextual fixspace debug"); 00874 STRING_VAR_H(numeric_punctuation, ".,", 00875 "Punct. 
chs expected WITHIN numbers"); 00876 INT_VAR_H(x_ht_acceptance_tolerance, 8, 00877 "Max allowed deviation of blob top outside of font data"); 00878 INT_VAR_H(x_ht_min_change, 8, "Min change in xht before actually trying it"); 00879 INT_VAR_H(superscript_debug, 0, "Debug level for sub & superscript fixer"); 00880 double_VAR_H(superscript_worse_certainty, 2.0, "How many times worse " 00881 "certainty does a superscript position glyph need to be for us " 00882 "to try classifying it as a char with a different baseline?"); 00883 double_VAR_H(superscript_bettered_certainty, 0.97, "What reduction in " 00884 "badness do we think sufficient to choose a superscript over " 00885 "what we'd thought. For example, a value of 0.6 means we want " 00886 "to reduce badness of certainty by 40%"); 00887 double_VAR_H(superscript_scaledown_ratio, 0.4, 00888 "A superscript scaled down more than this is unbelievably " 00889 "small. For example, 0.3 means we expect the font size to " 00890 "be no smaller than 30% of the text line font size."); 00891 double_VAR_H(subscript_max_y_top, 0.5, 00892 "Maximum top of a character measured as a multiple of x-height " 00893 "above the baseline for us to reconsider whether it's a " 00894 "subscript."); 00895 double_VAR_H(superscript_min_y_bottom, 0.3, 00896 "Minimum bottom of a character measured as a multiple of " 00897 "x-height above the baseline for us to reconsider whether it's " 00898 "a superscript."); 00899 BOOL_VAR_H(tessedit_write_block_separators, false, 00900 "Write block separators in output"); 00901 BOOL_VAR_H(tessedit_write_rep_codes, false, 00902 "Write repetition char code"); 00903 BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file"); 00904 BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file"); 00905 BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file"); 00906 STRING_VAR_H(unrecognised_char, "|", 00907 "Output char for unidentified blobs"); 00908 INT_VAR_H(suspect_level, 99, "Suspect marker 
level"); 00909 INT_VAR_H(suspect_space_level, 100, 00910 "Min suspect level for rejecting spaces"); 00911 INT_VAR_H(suspect_short_words, 2, 00912 "Dont Suspect dict wds longer than this"); 00913 BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected"); 00914 double_VAR_H(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit"); 00915 double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit"); 00916 BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures"); 00917 BOOL_VAR_H(tessedit_zero_rejection, false, "Dont reject ANYTHING"); 00918 BOOL_VAR_H(tessedit_word_for_word, false, 00919 "Make output have exactly one word per WERD"); 00920 BOOL_VAR_H(tessedit_zero_kelvin_rejection, false, 00921 "Dont reject ANYTHING AT ALL"); 00922 BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same"); 00923 INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm"); 00924 BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug"); 00925 BOOL_VAR_H(tessedit_flip_0O, true, "Contextual 0O O0 flips"); 00926 double_VAR_H(tessedit_lower_flip_hyphen, 1.5, 00927 "Aspect ratio dot/hyphen test"); 00928 double_VAR_H(tessedit_upper_flip_hyphen, 1.8, 00929 "Aspect ratio dot/hyphen test"); 00930 BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. 
detector"); 00931 BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test"); 00932 BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Dont double check"); 00933 BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control"); 00934 BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control"); 00935 BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control"); 00936 BOOL_VAR_H(rej_use_sensible_wd, false, "Extend permuter check"); 00937 BOOL_VAR_H(rej_alphas_in_number_perm, false, "Extend permuter check"); 00938 double_VAR_H(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract"); 00939 INT_VAR_H(tessedit_image_border, 2, "Rej blbs near image edge limit"); 00940 STRING_VAR_H(ok_repeated_ch_non_alphanum_wds, "-?*\075", 00941 "Allow NN to unrej"); 00942 STRING_VAR_H(conflict_set_I_l_1, "Il1[]", "Il1 conflict set"); 00943 INT_VAR_H(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this"); 00944 BOOL_VAR_H(tessedit_create_boxfile, false, "Output text with boxes"); 00945 INT_VAR_H(tessedit_page_number, -1, 00946 "-1 -> All pages, else specifc page to process"); 00947 BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE"); 00948 BOOL_VAR_H(interactive_display_mode, false, "Run interactively?"); 00949 STRING_VAR_H(file_type, ".tif", "Filename extension"); 00950 BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word"); 00951 INT_VAR_H(tessdata_manager_debug_level, 0, 00952 "Debug level for TessdataManager functions."); 00953 STRING_VAR_H(tessedit_load_sublangs, "", 00954 "List of languages to load with this one"); 00955 BOOL_VAR_H(tessedit_use_primary_params_model, false, 00956 "In multilingual mode use params model of the primary language"); 00957 // Min acceptable orientation margin (difference in scores between top and 2nd 00958 // choice in OSResults::orientations) to believe the page orientation. 
00959 double_VAR_H(min_orientation_margin, 7.0, 00960 "Min acceptable orientation margin"); 00961 BOOL_VAR_H(textord_tabfind_show_vlines, false, "Debug line finding"); 00962 BOOL_VAR_H(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model"); 00963 BOOL_VAR_H(poly_allow_detailed_fx, false, 00964 "Allow feature extractors to see the original outline"); 00965 BOOL_VAR_H(tessedit_init_config_only, false, 00966 "Only initialize with the config file. Useful if the instance is " 00967 "not going to be used for OCR but say only for layout analysis."); 00968 BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector"); 00969 INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible"); 00970 00971 // The following parameters were deprecated and removed from their original 00972 // locations. The parameters are temporarily kept here to give Tesseract 00973 // users a chance to updated their [lang].traineddata and config files 00974 // without introducing failures during Tesseract initialization. 00975 // TODO(ocr-team): remove these parameters from the code once we are 00976 // reasonably sure that Tesseract users have updated their data files. 00977 // 00978 // BEGIN DEPRECATED PARAMETERS 00979 INT_VAR_H(tessedit_ok_mode, 5, "Acceptance decision algorithm"); 00980 BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length" 00981 " dawgs (e.g. for non-space delimited languages)"); 00982 INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process"); 00983 BOOL_VAR_H(permute_debug, 0, "char permutation debug"); 00984 double_VAR_H(bestrate_pruning_factor, 2.0, "Multiplying factor of" 00985 " current best rate to prune other hypotheses"); 00986 BOOL_VAR_H(permute_script_word, 0, 00987 "Turn on word script consistency permuter"); 00988 BOOL_VAR_H(segment_segcost_rating, 0, 00989 "incorporate segmentation cost in word rating?"); 00990 double_VAR_H(segment_reward_script, 0.95, 00991 "Score multipler for script consistency within a word. 
" 00992 "Being a 'reward' factor, it should be <= 1. " 00993 "Smaller value implies bigger reward."); 00994 BOOL_VAR_H(permute_fixed_length_dawg, 0, 00995 "Turn on fixed-length phrasebook search permuter"); 00996 BOOL_VAR_H(permute_chartype_word, 0, 00997 "Turn on character type (property) consistency permuter"); 00998 double_VAR_H(segment_reward_chartype, 0.97, 00999 "Score multipler for char type consistency within a word. "); 01000 double_VAR_H(segment_reward_ngram_best_choice, 0.99, 01001 "Score multipler for ngram permuter's best choice" 01002 " (only used in the Han script path)."); 01003 BOOL_VAR_H(ngram_permuter_activated, false, 01004 "Activate character-level n-gram-based permuter"); 01005 BOOL_VAR_H(permute_only_top, false, "Run only the top choice permuter"); 01006 INT_VAR_H(language_model_fixed_length_choices_depth, 3, 01007 "Depth of blob choice lists to explore" 01008 " when fixed length dawgs are on"); 01009 BOOL_VAR_H(use_new_state_cost, FALSE, 01010 "use new state cost heuristics for segmentation state evaluation"); 01011 double_VAR_H(heuristic_segcost_rating_base, 1.25, 01012 "base factor for adding segmentation cost into word rating." 
01013 "It's a multiplying factor, the larger the value above 1, " 01014 "the bigger the effect of segmentation cost."); 01015 double_VAR_H(heuristic_weight_rating, 1, 01016 "weight associated with char rating in combined cost of state"); 01017 double_VAR_H(heuristic_weight_width, 1000.0, 01018 "weight associated with width evidence in combined cost of" 01019 " state"); 01020 double_VAR_H(heuristic_weight_seamcut, 0, 01021 "weight associated with seam cut in combined cost of state"); 01022 double_VAR_H(heuristic_max_char_wh_ratio, 2.0, 01023 "max char width-to-height ratio allowed in segmentation"); 01024 BOOL_VAR_H(enable_new_segsearch, false, 01025 "Enable new segmentation search path."); 01026 double_VAR_H(segsearch_max_fixed_pitch_char_wh_ratio, 2.0, 01027 "Maximum character width-to-height ratio for" 01028 "fixed pitch fonts"); 01029 // END DEPRECATED PARAMETERS 01030 01032 FILE *init_recog_training(const STRING &fname); 01033 void recog_training_segmented(const STRING &fname, 01034 PAGE_RES *page_res, 01035 volatile ETEXT_DESC *monitor, 01036 FILE *output_file); 01037 void ambigs_classify_and_output(WERD_RES *werd_res, 01038 ROW_RES *row_res, 01039 BLOCK_RES *block_res, 01040 const char *label, 01041 FILE *output_file); 01042 01043 inline CubeRecoContext *GetCubeRecoContext() { return cube_cntxt_; } 01044 01045 private: 01046 // The filename of a backup config file. If not null, then we currently 01047 // have a temporary debug config file loaded, and backup_config_file_ 01048 // will be loaded, and set to null when debug is complete. 01049 const char* backup_config_file_; 01050 // The filename of a config file to read when processing a debug word. 01051 STRING word_config_; 01052 // Image used for input to layout analysis and tesseract recognition. 01053 // May be modified by the ShiroRekhaSplitter to eliminate the top-line. 01054 Pix* pix_binary_; 01055 // Unmodified image used for input to cube. Always valid. 
01056 Pix* cube_binary_; 01057 // Grey-level input image if the input was not binary, otherwise NULL. 01058 Pix* pix_grey_; 01059 // Thresholds that were used to generate the thresholded image from grey. 01060 Pix* pix_thresholds_; 01061 // Input image resolution after any scaling. The resolution is not well 01062 // transmitted by operations on Pix, so we keep an independent record here. 01063 int source_resolution_; 01064 // The shiro-rekha splitter object which is used to split top-lines in 01065 // Devanagari words to provide a better word and grapheme segmentation. 01066 ShiroRekhaSplitter splitter_; 01067 // Page segmentation/layout 01068 Textord textord_; 01069 // True if the primary language uses right_to_left reading order. 01070 bool right_to_left_; 01071 Pix* scaled_color_; 01072 int scaled_factor_; 01073 FCOORD deskew_; 01074 FCOORD reskew_; 01075 TesseractStats stats_; 01076 // Sub-languages to be tried in addition to this. 01077 GenericVector<Tesseract*> sub_langs_; 01078 // Most recently used Tesseract out of this and sub_langs_. The default 01079 // language for the next word. 01080 Tesseract* most_recently_used_; 01081 // The size of the font table, ie max possible font id + 1. 01082 int font_table_size_; 01083 // Cube objects. 01084 CubeRecoContext* cube_cntxt_; 01085 TesseractCubeCombiner *tess_cube_combiner_; 01086 // Equation detector. Note: this pointer is NOT owned by the class. 01087 EquationDetect* equ_detect_; 01088 }; 01089 01090 } // namespace tesseract 01091 01092 01093 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H__