tesseract
3.03
|
00001 /********************************************************************** 00002 * File: pageres.h (Formerly page_res.h) 00003 * Description: Results classes used by control.c 00004 * Author: Phil Cheatle 00005 * Created: Tue Sep 22 08:42:49 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 #ifndef PAGERES_H 00020 #define PAGERES_H 00021 00022 #include "blamer.h" 00023 #include "blobs.h" 00024 #include "boxword.h" 00025 #include "elst.h" 00026 #include "genericvector.h" 00027 #include "normalis.h" 00028 #include "ocrblock.h" 00029 #include "ocrrow.h" 00030 #include "params_training_featdef.h" 00031 #include "ratngs.h" 00032 #include "rejctmap.h" 00033 #include "seam.h" 00034 #include "werd.h" 00035 00036 namespace tesseract { 00037 struct FontInfo; 00038 class Tesseract; 00039 } 00040 using tesseract::FontInfo; 00041 00042 /* Forward declarations */ 00043 00044 class BLOCK_RES; 00045 00046 ELISTIZEH (BLOCK_RES) CLISTIZEH (BLOCK_RES) 00047 class 00048 ROW_RES; 00049 00050 ELISTIZEH (ROW_RES) 00051 class WERD_RES; 00052 00053 ELISTIZEH (WERD_RES) 00054 00055 /************************************************************************* 00056 * PAGE_RES - Page results 00057 *************************************************************************/ 00058 class PAGE_RES { // page result 00059 public: 00060 inT32 char_count; 00061 inT32 rej_count; 00062 BLOCK_RES_LIST block_res_list; 00063 BOOL8 rejected; 00064 // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to 00065 // the next word. This pointer is not owned by PAGE_RES class. 00066 WERD_CHOICE **prev_word_best_choice; 00067 // Sums of blame reasons computed by the blamer. 00068 GenericVector<int> blame_reasons; 00069 // Debug information about all the misadaptions on this page. 00070 // Each BlamerBundle contains an index into this vector, so that words that 00071 // caused misadaption could be marked. However, since words could be 00072 // deleted/split/merged, the log is stored on the PAGE_RES level. 00073 GenericVector<STRING> misadaption_log; 00074 00075 inline void Init() { 00076 char_count = 0; 00077 rej_count = 0; 00078 rejected = FALSE; 00079 prev_word_best_choice = NULL; 00080 blame_reasons.init_to_size(IRR_NUM_REASONS, 0); 00081 } 00082 00083 PAGE_RES() { Init(); } // empty constructor 00084 00085 PAGE_RES(BLOCK_LIST *block_list, // real blocks 00086 WERD_CHOICE **prev_word_best_choice_ptr); 00087 00088 ~PAGE_RES () { // destructor 00089 } 00090 }; 00091 00092 /************************************************************************* 00093 * BLOCK_RES - Block results 00094 *************************************************************************/ 00095 00096 class BLOCK_RES:public ELIST_LINK { 00097 public: 00098 BLOCK * block; // real block 00099 inT32 char_count; // chars in block 00100 inT32 rej_count; // rejected chars 00101 inT16 font_class; // 00102 inT16 row_count; 00103 float x_height; 00104 BOOL8 font_assigned; // block already 00105 // processed 00106 BOOL8 bold; // all bold 00107 BOOL8 italic; // all italic 00108 00109 ROW_RES_LIST row_res_list; 00110 00111 BLOCK_RES() { 00112 } // empty constructor 00113 00114 BLOCK_RES(BLOCK *the_block); // real block 00115 00116 ~BLOCK_RES () { // destructor 00117 } 00118 }; 00119 00120 /************************************************************************* 00121 * ROW_RES - Row results 00122 *************************************************************************/ 00123 00124 class ROW_RES:public ELIST_LINK { 00125 public: 00126 ROW * row; // real row 00127 inT32 char_count; // chars in block 00128 inT32 rej_count; // rejected chars 00129 inT32 whole_word_rej_count; // rejs in total rej wds 00130 WERD_RES_LIST word_res_list; 00131 00132 ROW_RES() { 00133 } // empty constructor 00134 00135 ROW_RES(ROW *the_row); // real row 00136 00137 ~ROW_RES() { // destructor 00138 } 00139 }; 00140 00141 /************************************************************************* 00142 * WERD_RES - Word results 00143 *************************************************************************/ 00144 enum CRUNCH_MODE 00145 { 00146 CR_NONE, 00147 CR_KEEP_SPACE, 00148 CR_LOOSE_SPACE, 00149 CR_DELETE 00150 }; 00151 00152 // WERD_RES is a collection of publicly accessible members that gathers 00153 // information about a word result. 00154 class WERD_RES : public ELIST_LINK { 00155 public: 00156 // Which word is which? 00157 // There are 3 coordinate spaces in use here: a possibly rotated pixel space, 00158 // the original image coordinate space, and the BLN space in which the 00159 // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight, 00160 // and the x-middle of the word is at 0. 00161 // In the rotated pixel space, coordinates correspond to the input image, 00162 // but may be rotated about the origin by a multiple of 90 degrees, 00163 // and may therefore be negative. 00164 // In any case a rotation by denorm.block()->re_rotation() will take them 00165 // back to the original image. 00166 // The other differences between words all represent different stages of 00167 // processing during recognition. 00168 00169 // ---------------------------INPUT------------------------------------- 00170 00171 // The word is the input C_BLOBs in the rotated pixel space. 00172 // word is NOT owned by the WERD_RES unless combination is true. 00173 // All the other word pointers ARE owned by the WERD_RES. 00174 WERD* word; // Input C_BLOB word. 00175 00176 // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------ 00177 00178 // The bln_boxes contains the bounding boxes (only) of the input word, in the 00179 // BLN space. The lengths of word and bln_boxes 00180 // match as they are both before any chopping. 00181 // TODO(rays) determine if docqual does anything useful and delete bln_boxes 00182 // if it doesn't. 00183 tesseract::BoxWord* bln_boxes; // BLN input bounding boxes. 00184 // The ROW that this word sits in. NOT owned by the WERD_RES. 00185 ROW* blob_row; 00186 // The denorm provides the transformation to get back to the rotated image 00187 // coords from the chopped_word/rebuild_word BLN coords, but each blob also 00188 // has its own denorm. 00189 DENORM denorm; // For use on chopped_word. 00190 // Unicharset used by the classifier output in best_choice and raw_choice. 00191 const UNICHARSET* uch_set; // For converting back to utf8. 00192 00193 // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION---- 00194 // ----Setup to a (different!) state expected by the various classifiers---- 00195 // TODO(rays) Tidy and make more consistent. 00196 00197 // The chopped_word is also in BLN space, and represents the fully chopped 00198 // character fragments that make up the word. 00199 // The length of chopped_word matches length of seam_array + 1 (if set). 00200 TWERD* chopped_word; // BLN chopped fragments output. 00201 // Vector of SEAM* holding chopping points matching chopped_word. 00202 GenericVector<SEAM*> seam_array; 00203 // Widths of blobs in chopped_word. 00204 GenericVector<int> blob_widths; 00205 // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between 00206 // blob i and blob i+1. 00207 GenericVector<int> blob_gaps; 00208 // Ratings matrix contains classifier choices for each classified combination 00209 // of blobs. The dimension is the same as the number of blobs in chopped_word 00210 // and the leading diagonal corresponds to classifier results of the blobs 00211 // in chopped_word. The state_ members of best_choice, raw_choice and 00212 // best_choices all correspond to this ratings matrix and allow extraction 00213 // of the blob choices for any given WERD_CHOICE. 00214 MATRIX* ratings; // Owned pointer. 00215 // Pointer to the first WERD_CHOICE in best_choices. This is the result that 00216 // will be output from Tesseract. Note that this is now a borrowed pointer 00217 // and should NOT be deleted. 00218 WERD_CHOICE* best_choice; // Borrowed pointer. 00219 // The best raw_choice found during segmentation search. Differs from the 00220 // best_choice by being the best result according to just the character 00221 // classifier, not taking any language model information into account. 00222 // Unlike best_choice, the pointer IS owned by this WERD_RES. 00223 WERD_CHOICE* raw_choice; // Owned pointer. 00224 // Alternative results found during chopping/segmentation search stages. 00225 // Note that being an ELIST, best_choices owns the WERD_CHOICEs. 00226 WERD_CHOICE_LIST best_choices; 00227 00228 // Truth bounding boxes, text and incorrect choice reason. 00229 BlamerBundle *blamer_bundle; 00230 00231 // --------------OUTPUT FROM RECOGNITION------------------------------- 00232 // --------------Not all fields are necessarily set.------------------- 00233 // ---best_choice, raw_choice *must* end up set, with a box_word------- 00234 // ---In complete output, the number of blobs in rebuild_word matches--- 00235 // ---the number of boxes in box_word, the number of unichar_ids in--- 00236 // ---best_choice, the number of ints in best_state, and the number--- 00237 // ---of strings in correct_text-------------------------------------- 00238 // ---SetupFake Sets everything to appropriate values if the word is--- 00239 // ---known to be bad before recognition.------------------------------ 00240 00241 // The rebuild_word is also in BLN space, but represents the final best 00242 // segmentation of the word. Its length is therefore the same as box_word. 00243 TWERD* rebuild_word; // BLN best segmented word. 00244 // The box_word is in the original image coordinate space. It is the 00245 // bounding boxes of the rebuild_word, after denormalization. 00246 // The length of box_word matches rebuild_word, best_state (if set) and 00247 // correct_text (if set), as well as best_choice and represents the 00248 // number of classified units in the output. 00249 tesseract::BoxWord* box_word; // Denormalized output boxes. 00250 // The best_state stores the relationship between chopped_word and 00251 // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i] 00252 // adjacent blobs in chopped_word. The seams in seam_array are hidden 00253 // within a rebuild_word blob and revealed between them. 00254 GenericVector<int> best_state; // Number of blobs in each best blob. 00255 // The correct_text is used during training and adaption to carry the 00256 // text to the training system without the need for a unicharset. There 00257 // is one entry in the vector for each blob in rebuild_word and box_word. 00258 GenericVector<STRING> correct_text; 00259 // The Tesseract that was used to recognize this word. Just a borrowed 00260 // pointer. Note: Tesseract's class definition is in a higher-level library. 00261 // We avoid introducing a cyclic dependency by not using the Tesseract 00262 // within WERD_RES. We are just storing it to provide access to it 00263 // for the top-level multi-language controller, and maybe for output of 00264 // the recognized language. 00265 tesseract::Tesseract* tesseract; 00266 00267 // Less-well documented members. 00268 // TODO(rays) Add more documentation here. 00269 WERD_CHOICE *ep_choice; // ep text TODO(rays) delete this. 00270 REJMAP reject_map; // best_choice rejects 00271 BOOL8 tess_failed; 00272 /* 00273 If tess_failed is TRUE, one of the following tests failed when Tess 00274 returned: 00275 - The outword blob list was not the same length as the best_choice string; 00276 - The best_choice string contained ALL blanks; 00277 - The best_choice string was zero length 00278 */ 00279 BOOL8 tess_accepted; // Tess thinks its ok? 00280 BOOL8 tess_would_adapt; // Tess would adapt? 00281 BOOL8 done; // ready for output? 00282 bool small_caps; // word appears to be small caps 00283 inT8 italic; 00284 inT8 bold; 00285 // The fontinfos are pointers to data owned by the classifier. 00286 const FontInfo* fontinfo; 00287 const FontInfo* fontinfo2; 00288 inT8 fontinfo_id_count; // number of votes 00289 inT8 fontinfo_id2_count; // number of votes 00290 BOOL8 guessed_x_ht; 00291 BOOL8 guessed_caps_ht; 00292 CRUNCH_MODE unlv_crunch_mode; 00293 float x_height; // post match estimate 00294 float caps_height; // post match estimate 00295 00296 /* 00297 To deal with fuzzy spaces we need to be able to combine "words" to form 00298 combinations when we suspect that the gap is a non-space. The (new) text 00299 ord code generates separate words for EVERY fuzzy gap - flags in the word 00300 indicate whether the gap is below the threshold (fuzzy kern) and is thus 00301 NOT a real word break by default, or above the threshold (fuzzy space) and 00302 this is a real word break by default. 00303 00304 The WERD_RES list contains all these words PLUS "combination" words built 00305 out of (copies of) the words split by fuzzy kerns. The separate parts have 00306 their "part_of_combo" flag set true and should be IGNORED on a default 00307 reading of the list. 00308 00309 Combination words are FOLLOWED by the sequence of part_of_combo words 00310 which they combine. 00311 */ 00312 BOOL8 combination; //of two fuzzy gap wds 00313 BOOL8 part_of_combo; //part of a combo 00314 BOOL8 reject_spaces; //Reject spacing? 00315 // FontInfo ids for each unichar in best_choice. 00316 GenericVector<inT8> best_choice_fontinfo_ids; 00317 00318 WERD_RES() { 00319 InitNonPointers(); 00320 InitPointers(); 00321 } 00322 WERD_RES(WERD *the_word) { 00323 InitNonPointers(); 00324 InitPointers(); 00325 word = the_word; 00326 } 00327 // Deep copies everything except the ratings MATRIX. 00328 // To get that use deep_copy below. 00329 WERD_RES(const WERD_RES &source) { 00330 InitPointers(); 00331 *this = source; // see operator= 00332 } 00333 00334 ~WERD_RES(); 00335 00336 // Returns the UTF-8 string for the given blob index in the best_choice word, 00337 // given that we know whether we are in a right-to-left reading context. 00338 // This matters for mirrorable characters such as parentheses. We recognize 00339 // characters purely based on their shape on the page, and by default produce 00340 // the corresponding unicode for a left-to-right context. 00341 const char* const BestUTF8(int blob_index, bool in_rtl_context) const { 00342 if (blob_index < 0 || best_choice == NULL || 00343 blob_index >= best_choice->length()) 00344 return NULL; 00345 UNICHAR_ID id = best_choice->unichar_id(blob_index); 00346 if (id < 0 || id >= uch_set->size() || id == INVALID_UNICHAR_ID) 00347 return NULL; 00348 UNICHAR_ID mirrored = uch_set->get_mirror(id); 00349 if (in_rtl_context && mirrored > 0 && mirrored != INVALID_UNICHAR_ID) 00350 id = mirrored; 00351 return uch_set->id_to_unichar_ext(id); 00352 } 00353 // Returns the UTF-8 string for the given blob index in the raw_choice word. 00354 const char* const RawUTF8(int blob_index) const { 00355 if (blob_index < 0 || blob_index >= raw_choice->length()) 00356 return NULL; 00357 UNICHAR_ID id = raw_choice->unichar_id(blob_index); 00358 if (id < 0 || id >= uch_set->size() || id == INVALID_UNICHAR_ID) 00359 return NULL; 00360 return uch_set->id_to_unichar(id); 00361 } 00362 00363 UNICHARSET::Direction SymbolDirection(int blob_index) const { 00364 if (best_choice == NULL || 00365 blob_index >= best_choice->length() || 00366 blob_index < 0) 00367 return UNICHARSET::U_OTHER_NEUTRAL; 00368 return uch_set->get_direction(best_choice->unichar_id(blob_index)); 00369 } 00370 00371 bool AnyRtlCharsInWord() const { 00372 if (uch_set == NULL || best_choice == NULL || best_choice->length() < 1) 00373 return false; 00374 for (int id = 0; id < best_choice->length(); id++) { 00375 int unichar_id = best_choice->unichar_id(id); 00376 if (unichar_id < 0 || unichar_id >= uch_set->size()) 00377 continue; // Ignore illegal chars. 00378 UNICHARSET::Direction dir = 00379 uch_set->get_direction(unichar_id); 00380 if (dir == UNICHARSET::U_RIGHT_TO_LEFT || 00381 dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC || 00382 dir == UNICHARSET::U_ARABIC_NUMBER) 00383 return true; 00384 } 00385 return false; 00386 } 00387 00388 bool AnyLtrCharsInWord() const { 00389 if (uch_set == NULL || best_choice == NULL || best_choice->length() < 1) 00390 return false; 00391 for (int id = 0; id < best_choice->length(); id++) { 00392 int unichar_id = best_choice->unichar_id(id); 00393 if (unichar_id < 0 || unichar_id >= uch_set->size()) 00394 continue; // Ignore illegal chars. 00395 UNICHARSET::Direction dir = uch_set->get_direction(unichar_id); 00396 if (dir == UNICHARSET::U_LEFT_TO_RIGHT) 00397 return true; 00398 } 00399 return false; 00400 } 00401 00402 // Return whether the blobs in this WERD_RES 0, 1,... come from an engine 00403 // that gave us the unichars in reading order (as opposed to strict left 00404 // to right). 00405 bool UnicharsInReadingOrder() const { 00406 return best_choice->unichars_in_script_order(); 00407 } 00408 00409 void InitNonPointers(); 00410 void InitPointers(); 00411 void Clear(); 00412 void ClearResults(); 00413 void ClearWordChoices(); 00414 void ClearRatings(); 00415 00416 // Deep copies everything except the ratings MATRIX. 00417 // To get that use deep_copy below. 00418 WERD_RES& operator=(const WERD_RES& source); //from this 00419 00420 void CopySimpleFields(const WERD_RES& source); 00421 00422 // Initializes a blank (default constructed) WERD_RES from one that has 00423 // already been recognized. 00424 // Use SetupFor*Recognition afterwards to complete the setup and make 00425 // it ready for a retry recognition. 00426 void InitForRetryRecognition(const WERD_RES& source); 00427 00428 // Sets up the members used in recognition: bln_boxes, chopped_word, 00429 // seam_array, denorm. Returns false if 00430 // the word is empty and sets up fake results. If use_body_size is 00431 // true and row->body_size is set, then body_size will be used for 00432 // blob normalization instead of xheight + ascrise. This flag is for 00433 // those languages that are using CJK pitch model and thus it has to 00434 // be true if and only if tesseract->textord_use_cjk_fp_model is 00435 // true. 00436 // If allow_detailed_fx is true, the feature extractor will receive fine 00437 // precision outline information, allowing smoother features and better 00438 // features on low resolution images. 00439 // The norm_mode sets the default mode for normalization in absence 00440 // of any of the above flags. It should really be a tesseract::OcrEngineMode 00441 // but is declared as int for ease of use with tessedit_ocr_engine_mode. 00442 // Returns false if the word is empty and sets up fake results. 00443 bool SetupForRecognition(const UNICHARSET& unicharset_in, 00444 tesseract::Tesseract* tesseract, Pix* pix, 00445 int norm_mode, 00446 const TBOX* norm_box, bool numeric_mode, 00447 bool use_body_size, bool allow_detailed_fx, 00448 ROW *row, const BLOCK* block); 00449 00450 // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty 00451 // accumulators from a made chopped word. We presume the fields are already 00452 // empty. 00453 void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in); 00454 00455 // Sets up the members used in recognition for an empty recognition result: 00456 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. 00457 void SetupFake(const UNICHARSET& uch); 00458 00459 // Set the word as having the script of the input unicharset. 00460 void SetupWordScript(const UNICHARSET& unicharset_in); 00461 00462 // Sets up the blamer_bundle if it is not null, using the initialized denorm. 00463 void SetupBlamerBundle(); 00464 00465 // Computes the blob_widths and blob_gaps from the chopped_word. 00466 void SetupBlobWidthsAndGaps(); 00467 00468 // Updates internal data to account for a new SEAM (chop) at the given 00469 // blob_number. Fixes the ratings matrix and states in the choices, as well 00470 // as the blob widths and gaps. 00471 void InsertSeam(int blob_number, SEAM* seam); 00472 00473 // Returns true if all the word choices except the first have adjust_factors 00474 // worse than the given threshold. 00475 bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const; 00476 00477 // Returns true if the current word is ambiguous (by number of answers or 00478 // by dangerous ambigs.) 00479 bool IsAmbiguous(); 00480 00481 // Returns true if the ratings matrix size matches the sum of each of the 00482 // segmentation states. 00483 bool StatesAllValid(); 00484 00485 // Prints a list of words found if debug is true or the word result matches 00486 // the word_to_debug. 00487 void DebugWordChoices(bool debug, const char* word_to_debug); 00488 00489 // Removes from best_choices all choices which are not within a reasonable 00490 // range of the best choice. 00491 void FilterWordChoices(int debug_level); 00492 00493 // Computes a set of distance thresholds used to control adaption. 00494 // Compares the best choice for the current word to the best raw choice 00495 // to determine which characters were classified incorrectly by the 00496 // classifier. Then places a separate threshold into thresholds for each 00497 // character in the word. If the classifier was correct, max_rating is placed 00498 // into thresholds. If the classifier was incorrect, the mean match rating 00499 // (error percentage) of the classifier's incorrect choice minus some margin 00500 // is placed into thresholds. This can then be used by the caller to try to 00501 // create a new template for the desired class that will classify the 00502 // character with a rating better than the threshold value. The match rating 00503 // placed into thresholds is never allowed to be below min_rating in order to 00504 // prevent trying to make overly tight templates. 00505 // min_rating limits how tight to make a template. 00506 // max_rating limits how loose to make a template. 00507 // rating_margin denotes the amount of margin to put in template. 00508 void ComputeAdaptionThresholds(float certainty_scale, 00509 float min_rating, 00510 float max_rating, 00511 float rating_margin, 00512 float* thresholds); 00513 00514 // Saves a copy of the word_choice if it has the best unadjusted rating. 00515 // Returns true if the word_choice was the new best. 00516 bool LogNewRawChoice(WERD_CHOICE* word_choice); 00517 // Consumes word_choice by adding it to best_choices, (taking ownership) if 00518 // the certainty for word_choice is some distance of the best choice in 00519 // best_choices, or by deleting the word_choice and returning false. 00520 // The best_choices list is kept in sorted order by rating. Duplicates are 00521 // removed, and the list is kept no longer than max_num_choices in length. 00522 // Returns true if the word_choice is still a valid pointer. 00523 bool LogNewCookedChoice(int max_num_choices, bool debug, 00524 WERD_CHOICE* word_choice); 00525 00526 // Prints a brief list of all the best choices. 00527 void PrintBestChoices() const; 00528 00529 // Returns the sum of the widths of the blob between start_blob and last_blob 00530 // inclusive. 00531 int GetBlobsWidth(int start_blob, int last_blob); 00532 // Returns the width of a gap between the specified blob and the next one. 00533 int GetBlobsGap(int blob_index); 00534 00535 // Returns the BLOB_CHOICE corresponding to the given index in the 00536 // best choice word taken from the appropriate cell in the ratings MATRIX. 00537 // Borrowed pointer, so do not delete. May return NULL if there is no 00538 // BLOB_CHOICE matching the unichar_id at the given index. 00539 BLOB_CHOICE* GetBlobChoice(int index) const; 00540 00541 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the 00542 // best choice word taken from the appropriate cell in the ratings MATRIX. 00543 // Borrowed pointer, so do not delete. 00544 BLOB_CHOICE_LIST* GetBlobChoices(int index) const; 00545 00546 // Moves the results fields from word to this. This takes ownership of all 00547 // the data, so src can be destructed. 00548 // word1.ConsumeWordResult(word); 00549 // delete word; 00550 // is simpler and faster than: 00551 // word1 = *word; 00552 // delete word; 00553 // as it doesn't need to copy and reallocate anything. 00554 void ConsumeWordResults(WERD_RES* word); 00555 00556 // Replace the best choice and rebuild box word. 00557 // choice must be from the current best_choices list. 00558 void ReplaceBestChoice(WERD_CHOICE* choice); 00559 00560 // Builds the rebuild_word and sets the best_state from the chopped_word and 00561 // the best_choice->state. 00562 void RebuildBestState(); 00563 00564 // Copies the chopped_word to the rebuild_word, faking a best_state as well. 00565 // Also sets up the output box_word. 00566 void CloneChoppedToRebuild(); 00567 00568 // Sets/replaces the box_word with one made from the rebuild_word. 00569 void SetupBoxWord(); 00570 00571 // Sets up the script positions in the best_choice using the best_choice 00572 // to get the unichars, and the unicharset to get the target positions. 00573 void SetScriptPositions(); 00574 // Sets all the blobs in all the words (best choice and alternates) to be 00575 // the given position. (When a sub/superscript is recognized as a separate 00576 // word, it falls victim to the rule that a whole word cannot be sub or 00577 // superscript, so this function overrides that problem.) 00578 void SetAllScriptPositions(tesseract::ScriptPos position); 00579 00580 // Classifies the word with some already-calculated BLOB_CHOICEs. 00581 // The choices are an array of blob_count pointers to BLOB_CHOICE, 00582 // providing a single classifier result for each blob. 00583 // The BLOB_CHOICEs are consumed and the word takes ownership. 00584 // The number of blobs in the box_word must match blob_count. 00585 void FakeClassifyWord(int blob_count, BLOB_CHOICE** choices); 00586 00587 // Creates a WERD_CHOICE for the word using the top choices from the leading 00588 // diagonal of the ratings matrix. 00589 void FakeWordFromRatings(); 00590 00591 // Copies the best_choice strings to the correct_text for adaption/training. 00592 void BestChoiceToCorrectText(); 00593 00594 // Merges 2 adjacent blobs in the result if the permanent callback 00595 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent 00596 // callback box_cb is NULL or returns true, setting the merged blob 00597 // result to the class returned from class_cb. 00598 // Returns true if anything was merged. 00599 bool ConditionalBlobMerge( 00600 TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb, 00601 TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb); 00602 00603 // Merges 2 adjacent blobs in the result (index and index+1) and corrects 00604 // all the data to account for the change. 00605 void MergeAdjacentBlobs(int index); 00606 00607 // Callback helper for fix_quotes returns a double quote if both 00608 // arguments are quote, otherwise INVALID_UNICHAR_ID. 00609 UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2); 00610 void fix_quotes(); 00611 00612 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both 00613 // arguments are hyphen, otherwise INVALID_UNICHAR_ID. 00614 UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2); 00615 // Callback helper for fix_hyphens returns true if box1 and box2 overlap 00616 // (assuming both on the same textline, are in order and a chopped em dash.) 00617 bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2); 00618 void fix_hyphens(); 00619 00620 // Callback helper for merge_tess_fails returns a space if both 00621 // arguments are space, otherwise INVALID_UNICHAR_ID. 00622 UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2); 00623 void merge_tess_fails(); 00624 00625 // Returns a really deep copy of *src, including the ratings MATRIX. 00626 static WERD_RES* deep_copy(const WERD_RES* src) { 00627 WERD_RES* result = new WERD_RES(*src); 00628 // That didn't copy the ratings, but we want a copy if there is one to 00629 // begin width. 00630 if (src->ratings != NULL) 00631 result->ratings = src->ratings->DeepCopy(); 00632 return result; 00633 } 00634 00635 // Copy blobs from word_res onto this word (eliminating spaces between). 00636 // Since this may be called bidirectionally OR both the BOL and EOL flags. 00637 void copy_on(WERD_RES *word_res) { //from this word 00638 word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL)); 00639 word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL)); 00640 word->copy_on(word_res->word); 00641 } 00642 00643 // Returns true if the collection of count pieces, starting at start, are all 00644 // natural connected components, ie there are no real chops involved. 00645 bool PiecesAllNatural(int start, int count) const; 00646 }; 00647 00648 /************************************************************************* 00649 * PAGE_RES_IT - Page results iterator 00650 *************************************************************************/ 00651 00652 class PAGE_RES_IT { 00653 public: 00654 PAGE_RES * page_res; // page being iterated 00655 00656 PAGE_RES_IT() { 00657 } // empty contructor 00658 00659 PAGE_RES_IT(PAGE_RES *the_page_res) { // page result 00660 page_res = the_page_res; 00661 restart_page(); // ready to scan 00662 } 00663 00664 // Do two PAGE_RES_ITs point at the same word? 00665 // This is much cheaper than cmp(). 00666 bool operator ==(const PAGE_RES_IT &other) const; 00667 00668 bool operator !=(const PAGE_RES_IT &other) const {return !(*this == other); } 00669 00670 // Given another PAGE_RES_IT to the same page, 00671 // this before other: -1 00672 // this equal to other: 0 00673 // this later than other: 1 00674 int cmp(const PAGE_RES_IT &other) const; 00675 00676 WERD_RES *restart_page() { 00677 return start_page(false); // Skip empty blocks. 00678 } 00679 WERD_RES *restart_page_with_empties() { 00680 return start_page(true); // Allow empty blocks. 00681 } 00682 WERD_RES *start_page(bool empty_ok); 00683 00684 WERD_RES *restart_row(); 00685 00686 // ============ Methods that mutate the underling structures =========== 00687 // Note that these methods will potentially invalidate other PAGE_RES_ITs 00688 // and are intended to be used only while a single PAGE_RES_IT is active. 00689 // This problem needs to be taken into account if these mutation operators 00690 // are ever provided to PageIterator or its subclasses. 00691 00692 // Inserts the new_word and a corresponding WERD_RES before the current 00693 // position. The simple fields of the WERD_RES are copied from clone_res and 00694 // the resulting WERD_RES is returned for further setup with best_choice etc. 00695 WERD_RES* InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word); 00696 00697 // Deletes the current WERD_RES and its underlying WERD. 00698 void DeleteCurrentWord(); 00699 00700 WERD_RES *forward() { // Get next word. 00701 return internal_forward(false, false); 00702 } 00703 // Move forward, but allow empty blocks to show as single NULL words. 00704 WERD_RES *forward_with_empties() { 00705 return internal_forward(false, true); 00706 } 00707 00708 WERD_RES *forward_paragraph(); // get first word in next non-empty paragraph 00709 WERD_RES *forward_block(); // get first word in next non-empty block 00710 00711 WERD_RES *prev_word() const { // previous word 00712 return prev_word_res; 00713 } 00714 ROW_RES *prev_row() const { // row of prev word 00715 return prev_row_res; 00716 } 00717 BLOCK_RES *prev_block() const { // block of prev word 00718 return prev_block_res; 00719 } 00720 WERD_RES *word() const { // current word 00721 return word_res; 00722 } 00723 ROW_RES *row() const { // row of current word 00724 return row_res; 00725 } 00726 BLOCK_RES *block() const { // block of cur. word 00727 return block_res; 00728 } 00729 WERD_RES *next_word() const { // next word 00730 return next_word_res; 00731 } 00732 ROW_RES *next_row() const { // row of next word 00733 return next_row_res; 00734 } 00735 BLOCK_RES *next_block() const { // block of next word 00736 return next_block_res; 00737 } 00738 void rej_stat_word(); // for page/block/row 00739 00740 private: 00741 void ResetWordIterator(); 00742 WERD_RES *internal_forward(bool new_block, bool empty_ok); 00743 00744 WERD_RES * prev_word_res; // previous word 00745 ROW_RES *prev_row_res; // row of prev word 00746 BLOCK_RES *prev_block_res; // block of prev word 00747 00748 WERD_RES *word_res; // current word 00749 ROW_RES *row_res; // row of current word 00750 BLOCK_RES *block_res; // block of cur. word 00751 00752 WERD_RES *next_word_res; // next word 00753 ROW_RES *next_row_res; // row of next word 00754 BLOCK_RES *next_block_res; // block of next word 00755 00756 BLOCK_RES_IT block_res_it; // iterators 00757 ROW_RES_IT row_res_it; 00758 WERD_RES_IT word_res_it; 00759 }; 00760 #endif