tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccstruct/pageres.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        pageres.h  (Formerly page_res.h)
00003  * Description: Results classes used by control.c
00004  * Author:              Phil Cheatle
00005  * Created:     Tue Sep 22 08:42:49 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 #ifndef           PAGERES_H
00020 #define           PAGERES_H
00021 
00022 #include "blamer.h"
00023 #include "blobs.h"
00024 #include "boxword.h"
00025 #include "elst.h"
00026 #include "genericvector.h"
00027 #include "normalis.h"
00028 #include "ocrblock.h"
00029 #include "ocrrow.h"
00030 #include "params_training_featdef.h"
00031 #include "ratngs.h"
00032 #include "rejctmap.h"
00033 #include "seam.h"
00034 #include "werd.h"
00035 
00036 namespace tesseract {
00037 struct FontInfo;
00038 class Tesseract;
00039 }
00040 using tesseract::FontInfo;
00041 
00042 /* Forward declarations */
00043 
00044 class BLOCK_RES;
00045 
00046 ELISTIZEH (BLOCK_RES) CLISTIZEH (BLOCK_RES)
00047 class
00048 ROW_RES;
00049 
00050 ELISTIZEH (ROW_RES)
00051 class WERD_RES;
00052 
00053 ELISTIZEH (WERD_RES)
00054 
00055 /*************************************************************************
00056  * PAGE_RES - Page results
00057  *************************************************************************/
00058 class PAGE_RES {                 // page result
00059  public:
00060   inT32 char_count;
00061   inT32 rej_count;
00062   BLOCK_RES_LIST block_res_list;
00063   BOOL8 rejected;
00064   // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to
00065   // the next word. This pointer is not owned by PAGE_RES class.
00066   WERD_CHOICE **prev_word_best_choice;
00067   // Sums of blame reasons computed by the blamer.
00068   GenericVector<int> blame_reasons;
00069   // Debug information about all the misadaptions on this page.
00070   // Each BlamerBundle contains an index into this vector, so that words that
00071   // caused misadaption could be marked. However, since words could be
00072   // deleted/split/merged, the log is stored on the PAGE_RES level.
00073   GenericVector<STRING> misadaption_log;
00074 
00075   inline void Init() {
00076     char_count = 0;
00077     rej_count = 0;
00078     rejected = FALSE;
00079     prev_word_best_choice = NULL;
00080     blame_reasons.init_to_size(IRR_NUM_REASONS, 0);
00081   }
00082 
00083   PAGE_RES() { Init(); }  // empty constructor
00084 
00085   PAGE_RES(BLOCK_LIST *block_list,   // real blocks
00086            WERD_CHOICE **prev_word_best_choice_ptr);
00087 
00088   ~PAGE_RES () {               // destructor
00089   }
00090 };
00091 
00092 /*************************************************************************
00093  * BLOCK_RES - Block results
00094  *************************************************************************/
00095 
00096 class BLOCK_RES:public ELIST_LINK {
00097  public:
00098   BLOCK * block;               // real block
00099   inT32 char_count;            // chars in block
00100   inT32 rej_count;             // rejected chars
00101   inT16 font_class;            //
00102   inT16 row_count;
00103   float x_height;
00104   BOOL8 font_assigned;         // block already
00105   //      processed
00106   BOOL8 bold;                  // all bold
00107   BOOL8 italic;                // all italic
00108 
00109   ROW_RES_LIST row_res_list;
00110 
00111   BLOCK_RES() {
00112   }                            // empty constructor
00113 
00114   BLOCK_RES(BLOCK *the_block);  // real block
00115 
00116   ~BLOCK_RES () {              // destructor
00117   }
00118 };
00119 
00120 /*************************************************************************
00121  * ROW_RES - Row results
00122  *************************************************************************/
00123 
00124 class ROW_RES:public ELIST_LINK {
00125  public:
00126   ROW * row;                   // real row
00127   inT32 char_count;            // chars in block
00128   inT32 rej_count;             // rejected chars
00129   inT32 whole_word_rej_count;  // rejs in total rej wds
00130   WERD_RES_LIST word_res_list;
00131 
00132   ROW_RES() {
00133   }                            // empty constructor
00134 
00135   ROW_RES(ROW *the_row);  // real row
00136 
00137   ~ROW_RES() {                // destructor
00138   }
00139 };
00140 
00141 /*************************************************************************
00142  * WERD_RES - Word results
00143  *************************************************************************/
// Value type of WERD_RES::unlv_crunch_mode.
// NOTE(review): the meaning of each mode is not visible in this header;
// confirm against the code that sets and reads unlv_crunch_mode.
enum CRUNCH_MODE {
  CR_NONE,
  CR_KEEP_SPACE,
  CR_LOOSE_SPACE,
  CR_DELETE
};
00151 
00152 // WERD_RES is a collection of publicly accessible members that gathers
00153 // information about a word result.
00154 class WERD_RES : public ELIST_LINK {
00155  public:
00156   // Which word is which?
00157   // There are 3 coordinate spaces in use here: a possibly rotated pixel space,
00158   // the original image coordinate space, and the BLN space in which the
00159   // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight,
00160   // and the x-middle of the word is at 0.
00161   // In the rotated pixel space, coordinates correspond to the input image,
00162   // but may be rotated about the origin by a multiple of 90 degrees,
00163   // and may therefore be negative.
00164   // In any case a rotation by denorm.block()->re_rotation() will take them
00165   // back to the original image.
00166   // The other differences between words all represent different stages of
00167   // processing during recognition.
00168 
00169   // ---------------------------INPUT-------------------------------------
00170 
00171   // The word is the input C_BLOBs in the rotated pixel space.
00172   // word is NOT owned by the WERD_RES unless combination is true.
00173   // All the other word pointers ARE owned by the WERD_RES.
00174   WERD* word;                     // Input C_BLOB word.
00175 
00176   // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------
00177 
00178   // The bln_boxes contains the bounding boxes (only) of the input word, in the
00179   // BLN space. The lengths of word and bln_boxes
00180   // match as they are both before any chopping.
00181   // TODO(rays) determine if docqual does anything useful and delete bln_boxes
00182   // if it doesn't.
00183   tesseract::BoxWord* bln_boxes;  // BLN input bounding boxes.
00184   // The ROW that this word sits in. NOT owned by the WERD_RES.
00185   ROW* blob_row;
00186   // The denorm provides the transformation to get back to the rotated image
00187   // coords from the chopped_word/rebuild_word BLN coords, but each blob also
00188   // has its own denorm.
00189   DENORM denorm;                  // For use on chopped_word.
00190   // Unicharset used by the classifier output in best_choice and raw_choice.
00191   const UNICHARSET* uch_set;  // For converting back to utf8.
00192 
00193   // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION----
00194   // ----Setup to a (different!) state expected by the various classifiers----
00195   // TODO(rays) Tidy and make more consistent.
00196 
00197   // The chopped_word is also in BLN space, and represents the fully chopped
00198   // character fragments that make up the word.
00199   // The length of chopped_word matches length of seam_array + 1 (if set).
00200   TWERD* chopped_word;            // BLN chopped fragments output.
00201   // Vector of SEAM* holding chopping points matching chopped_word.
00202   GenericVector<SEAM*> seam_array;
00203   // Widths of blobs in chopped_word.
00204   GenericVector<int> blob_widths;
00205   // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
00206   // blob i and blob i+1.
00207   GenericVector<int> blob_gaps;
00208   // Ratings matrix contains classifier choices for each classified combination
00209   // of blobs. The dimension is the same as the number of blobs in chopped_word
00210   // and the leading diagonal corresponds to classifier results of the blobs
00211   // in chopped_word. The state_ members of best_choice, raw_choice and
00212   // best_choices all correspond to this ratings matrix and allow extraction
00213   // of the blob choices for any given WERD_CHOICE.
00214   MATRIX* ratings;                // Owned pointer.
00215   // Pointer to the first WERD_CHOICE in best_choices. This is the result that
00216   // will be output from Tesseract. Note that this is now a borrowed pointer
00217   // and should NOT be deleted.
00218   WERD_CHOICE* best_choice;       // Borrowed pointer.
00219   // The best raw_choice found during segmentation search. Differs from the
00220   // best_choice by being the best result according to just the character
00221   // classifier, not taking any language model information into account.
00222   // Unlike best_choice, the pointer IS owned by this WERD_RES.
00223   WERD_CHOICE* raw_choice;        // Owned pointer.
00224   // Alternative results found during chopping/segmentation search stages.
00225   // Note that being an ELIST, best_choices owns the WERD_CHOICEs.
00226   WERD_CHOICE_LIST best_choices;
00227 
00228   // Truth bounding boxes, text and incorrect choice reason.
00229   BlamerBundle *blamer_bundle;
00230 
00231   // --------------OUTPUT FROM RECOGNITION-------------------------------
00232   // --------------Not all fields are necessarily set.-------------------
00233   // ---best_choice, raw_choice *must* end up set, with a box_word-------
00234   // ---In complete output, the number of blobs in rebuild_word matches---
00235   // ---the number of boxes in box_word, the number of unichar_ids in---
00236   // ---best_choice, the number of ints in best_state, and the number---
00237   // ---of strings in correct_text--------------------------------------
00238   // ---SetupFake Sets everything to appropriate values if the word is---
00239   // ---known to be bad before recognition.------------------------------
00240 
00241   // The rebuild_word is also in BLN space, but represents the final best
00242   // segmentation of the word. Its length is therefore the same as box_word.
00243   TWERD* rebuild_word;            // BLN best segmented word.
00244   // The box_word is in the original image coordinate space. It is the
00245   // bounding boxes of the rebuild_word, after denormalization.
00246   // The length of box_word matches rebuild_word, best_state (if set) and
00247   // correct_text (if set), as well as best_choice and represents the
00248   // number of classified units in the output.
00249   tesseract::BoxWord* box_word;   // Denormalized output boxes.
00250   // The best_state stores the relationship between chopped_word and
00251   // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i]
00252   // adjacent blobs in chopped_word. The seams in seam_array are hidden
00253   // within a rebuild_word blob and revealed between them.
00254   GenericVector<int> best_state;  // Number of blobs in each best blob.
00255   // The correct_text is used during training and adaption to carry the
00256   // text to the training system without the need for a unicharset. There
00257   // is one entry in the vector for each blob in rebuild_word and box_word.
00258   GenericVector<STRING> correct_text;
00259   // The Tesseract that was used to recognize this word. Just a borrowed
00260   // pointer. Note: Tesseract's class definition is in a higher-level library.
00261   // We avoid introducing a cyclic dependency by not using the Tesseract
00262   // within WERD_RES. We are just storing it to provide access to it
00263   // for the top-level multi-language controller, and maybe for output of
00264   // the recognized language.
00265   tesseract::Tesseract* tesseract;
00266 
00267   // Less-well documented members.
00268   // TODO(rays) Add more documentation here.
00269   WERD_CHOICE *ep_choice;      // ep text TODO(rays) delete this.
00270   REJMAP reject_map;           // best_choice rejects
00271   BOOL8 tess_failed;
00272   /*
00273     If tess_failed is TRUE, one of the following tests failed when Tess
00274     returned:
00275     - The outword blob list was not the same length as the best_choice string;
00276     - The best_choice string contained ALL blanks;
00277     - The best_choice string was zero length
00278   */
00279   BOOL8 tess_accepted;          // Tess thinks its ok?
00280   BOOL8 tess_would_adapt;       // Tess would adapt?
00281   BOOL8 done;                   // ready for output?
00282   bool small_caps;             // word appears to be small caps
00283   inT8 italic;
00284   inT8 bold;
00285   // The fontinfos are pointers to data owned by the classifier.
00286   const FontInfo* fontinfo;
00287   const FontInfo* fontinfo2;
00288   inT8 fontinfo_id_count;       // number of votes
00289   inT8 fontinfo_id2_count;      // number of votes
00290   BOOL8 guessed_x_ht;
00291   BOOL8 guessed_caps_ht;
00292   CRUNCH_MODE unlv_crunch_mode;
00293   float x_height;              // post match estimate
00294   float caps_height;           // post match estimate
00295 
00296   /*
00297     To deal with fuzzy spaces we need to be able to combine "words" to form
00298     combinations when we suspect that the gap is a non-space. The (new) text
00299     ord code generates separate words for EVERY fuzzy gap - flags in the word
00300     indicate whether the gap is below the threshold (fuzzy kern) and is thus
00301     NOT a real word break by default, or above the threshold (fuzzy space) and
00302     this is a real word break by default.
00303 
00304     The WERD_RES list contains all these words PLUS "combination" words built
00305     out of (copies of) the words split by fuzzy kerns. The separate parts have
00306     their "part_of_combo" flag set true and should be IGNORED on a default
00307     reading of the list.
00308 
00309     Combination words are FOLLOWED by the sequence of part_of_combo words
00310     which they combine.
00311   */
00312   BOOL8 combination;           //of two fuzzy gap wds
00313   BOOL8 part_of_combo;         //part of a combo
00314   BOOL8 reject_spaces;         //Reject spacing?
00315   // FontInfo ids for each unichar in best_choice.
00316   GenericVector<inT8> best_choice_fontinfo_ids;
00317 
00318   WERD_RES() {
00319     InitNonPointers();
00320     InitPointers();
00321   }
00322   WERD_RES(WERD *the_word) {
00323     InitNonPointers();
00324     InitPointers();
00325     word = the_word;
00326   }
00327   // Deep copies everything except the ratings MATRIX.
00328   // To get that use deep_copy below.
00329   WERD_RES(const WERD_RES &source) {
00330     InitPointers();
00331     *this = source;            // see operator=
00332   }
00333 
00334   ~WERD_RES();
00335 
00336   // Returns the UTF-8 string for the given blob index in the best_choice word,
00337   // given that we know whether we are in a right-to-left reading context.
00338   // This matters for mirrorable characters such as parentheses.  We recognize
00339   // characters purely based on their shape on the page, and by default produce
00340   // the corresponding unicode for a left-to-right context.
00341   const char* const BestUTF8(int blob_index, bool in_rtl_context) const {
00342     if (blob_index < 0 || best_choice == NULL ||
00343         blob_index >= best_choice->length())
00344       return NULL;
00345     UNICHAR_ID id = best_choice->unichar_id(blob_index);
00346     if (id < 0 || id >= uch_set->size() || id == INVALID_UNICHAR_ID)
00347       return NULL;
00348     UNICHAR_ID mirrored = uch_set->get_mirror(id);
00349     if (in_rtl_context && mirrored > 0 && mirrored != INVALID_UNICHAR_ID)
00350       id = mirrored;
00351     return uch_set->id_to_unichar_ext(id);
00352   }
00353   // Returns the UTF-8 string for the given blob index in the raw_choice word.
00354   const char* const RawUTF8(int blob_index) const {
00355     if (blob_index < 0 || blob_index >= raw_choice->length())
00356       return NULL;
00357     UNICHAR_ID id = raw_choice->unichar_id(blob_index);
00358     if (id < 0 || id >= uch_set->size() || id == INVALID_UNICHAR_ID)
00359       return NULL;
00360     return uch_set->id_to_unichar(id);
00361   }
00362 
00363   UNICHARSET::Direction SymbolDirection(int blob_index) const {
00364     if (best_choice == NULL ||
00365         blob_index >= best_choice->length() ||
00366         blob_index < 0)
00367       return UNICHARSET::U_OTHER_NEUTRAL;
00368     return uch_set->get_direction(best_choice->unichar_id(blob_index));
00369   }
00370 
00371   bool AnyRtlCharsInWord() const {
00372     if (uch_set == NULL || best_choice == NULL || best_choice->length() < 1)
00373       return false;
00374     for (int id = 0; id < best_choice->length(); id++) {
00375       int unichar_id = best_choice->unichar_id(id);
00376       if (unichar_id < 0 || unichar_id >= uch_set->size())
00377         continue;  // Ignore illegal chars.
00378       UNICHARSET::Direction dir =
00379           uch_set->get_direction(unichar_id);
00380       if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
00381           dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
00382           dir == UNICHARSET::U_ARABIC_NUMBER)
00383         return true;
00384     }
00385     return false;
00386   }
00387 
00388   bool AnyLtrCharsInWord() const {
00389     if (uch_set == NULL || best_choice == NULL || best_choice->length() < 1)
00390       return false;
00391     for (int id = 0; id < best_choice->length(); id++) {
00392       int unichar_id = best_choice->unichar_id(id);
00393       if (unichar_id < 0 || unichar_id >= uch_set->size())
00394         continue;  // Ignore illegal chars.
00395       UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
00396       if (dir == UNICHARSET::U_LEFT_TO_RIGHT)
00397         return true;
00398     }
00399     return false;
00400   }
00401 
00402   // Return whether the blobs in this WERD_RES 0, 1,... come from an engine
00403   // that gave us the unichars in reading order (as opposed to strict left
00404   // to right).
00405   bool UnicharsInReadingOrder() const {
00406     return best_choice->unichars_in_script_order();
00407   }
00408 
00409   void InitNonPointers();
00410   void InitPointers();
00411   void Clear();
00412   void ClearResults();
00413   void ClearWordChoices();
00414   void ClearRatings();
00415 
00416   // Deep copies everything except the ratings MATRIX.
00417   // To get that use deep_copy below.
00418   WERD_RES& operator=(const WERD_RES& source);  //from this
00419 
00420   void CopySimpleFields(const WERD_RES& source);
00421 
00422   // Initializes a blank (default constructed) WERD_RES from one that has
00423   // already been recognized.
00424   // Use SetupFor*Recognition afterwards to complete the setup and make
00425   // it ready for a retry recognition.
00426   void InitForRetryRecognition(const WERD_RES& source);
00427 
00428   // Sets up the members used in recognition: bln_boxes, chopped_word,
00429   // seam_array, denorm.  Returns false if
00430   // the word is empty and sets up fake results.  If use_body_size is
00431   // true and row->body_size is set, then body_size will be used for
00432   // blob normalization instead of xheight + ascrise. This flag is for
00433   // those languages that are using CJK pitch model and thus it has to
00434   // be true if and only if tesseract->textord_use_cjk_fp_model is
00435   // true.
00436   // If allow_detailed_fx is true, the feature extractor will receive fine
00437   // precision outline information, allowing smoother features and better
00438   // features on low resolution images.
00439   // The norm_mode sets the default mode for normalization in absence
00440   // of any of the above flags. It should really be a tesseract::OcrEngineMode
00441   // but is declared as int for ease of use with tessedit_ocr_engine_mode.
00442   // Returns false if the word is empty and sets up fake results.
00443   bool SetupForRecognition(const UNICHARSET& unicharset_in,
00444                            tesseract::Tesseract* tesseract, Pix* pix,
00445                            int norm_mode,
00446                            const TBOX* norm_box, bool numeric_mode,
00447                            bool use_body_size, bool allow_detailed_fx,
00448                            ROW *row, const BLOCK* block);
00449 
00450   // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
00451   // accumulators from a made chopped word.  We presume the fields are already
00452   // empty.
00453   void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in);
00454 
00455   // Sets up the members used in recognition for an empty recognition result:
00456   // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
00457   void SetupFake(const UNICHARSET& uch);
00458 
00459   // Set the word as having the script of the input unicharset.
00460   void SetupWordScript(const UNICHARSET& unicharset_in);
00461 
00462   // Sets up the blamer_bundle if it is not null, using the initialized denorm.
00463   void SetupBlamerBundle();
00464 
00465   // Computes the blob_widths and blob_gaps from the chopped_word.
00466   void SetupBlobWidthsAndGaps();
00467 
00468   // Updates internal data to account for a new SEAM (chop) at the given
00469   // blob_number. Fixes the ratings matrix and states in the choices, as well
00470   // as the blob widths and gaps.
00471   void InsertSeam(int blob_number, SEAM* seam);
00472 
00473   // Returns true if all the word choices except the first have adjust_factors
00474   // worse than the given threshold.
00475   bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const;
00476 
00477   // Returns true if the current word is ambiguous (by number of answers or
00478   // by dangerous ambigs.)
00479   bool IsAmbiguous();
00480 
00481   // Returns true if the ratings matrix size matches the sum of each of the
00482   // segmentation states.
00483   bool StatesAllValid();
00484 
00485   // Prints a list of words found if debug is true or the word result matches
00486   // the word_to_debug.
00487   void DebugWordChoices(bool debug, const char* word_to_debug);
00488 
00489   // Removes from best_choices all choices which are not within a reasonable
00490   // range of the best choice.
00491   void FilterWordChoices(int debug_level);
00492 
00493   // Computes a set of distance thresholds used to control adaption.
00494   // Compares the best choice for the current word to the best raw choice
00495   // to determine which characters were classified incorrectly by the
00496   // classifier. Then places a separate threshold into thresholds for each
00497   // character in the word. If the classifier was correct, max_rating is placed
00498   // into thresholds. If the classifier was incorrect, the mean match rating
00499   // (error percentage) of the classifier's incorrect choice minus some margin
00500   // is placed into thresholds. This can then be used by the caller to try to
00501   // create a new template for the desired class that will classify the
00502   // character with a rating better than the threshold value. The match rating
00503   // placed into thresholds is never allowed to be below min_rating in order to
00504   // prevent trying to make overly tight templates.
00505   // min_rating limits how tight to make a template.
00506   // max_rating limits how loose to make a template.
00507   // rating_margin denotes the amount of margin to put in template.
00508   void ComputeAdaptionThresholds(float certainty_scale,
00509                                  float min_rating,
00510                                  float max_rating,
00511                                  float rating_margin,
00512                                  float* thresholds);
00513 
00514   // Saves a copy of the word_choice if it has the best unadjusted rating.
00515   // Returns true if the word_choice was the new best.
00516   bool LogNewRawChoice(WERD_CHOICE* word_choice);
00517   // Consumes word_choice by adding it to best_choices, (taking ownership) if
00518   // the certainty for word_choice is some distance of the best choice in
00519   // best_choices, or by deleting the word_choice and returning false.
00520   // The best_choices list is kept in sorted order by rating. Duplicates are
00521   // removed, and the list is kept no longer than max_num_choices in length.
00522   // Returns true if the word_choice is still a valid pointer.
00523   bool LogNewCookedChoice(int max_num_choices, bool debug,
00524                           WERD_CHOICE* word_choice);
00525 
00526   // Prints a brief list of all the best choices.
00527   void PrintBestChoices() const;
00528 
00529   // Returns the sum of the widths of the blob between start_blob and last_blob
00530   // inclusive.
00531   int GetBlobsWidth(int start_blob, int last_blob);
00532   // Returns the width of a gap between the specified blob and the next one.
00533   int GetBlobsGap(int blob_index);
00534 
00535   // Returns the BLOB_CHOICE corresponding to the given index in the
00536   // best choice word taken from the appropriate cell in the ratings MATRIX.
00537   // Borrowed pointer, so do not delete. May return NULL if there is no
00538   // BLOB_CHOICE matching the unichar_id at the given index.
00539   BLOB_CHOICE* GetBlobChoice(int index) const;
00540 
00541   // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
00542   // best choice word taken from the appropriate cell in the ratings MATRIX.
00543   // Borrowed pointer, so do not delete.
00544   BLOB_CHOICE_LIST* GetBlobChoices(int index) const;
00545 
00546   // Moves the results fields from word to this. This takes ownership of all
00547   // the data, so src can be destructed.
00548   // word1.ConsumeWordResult(word);
00549   // delete word;
00550   // is simpler and faster than:
00551   // word1 = *word;
00552   // delete word;
00553   // as it doesn't need to copy and reallocate anything.
00554   void ConsumeWordResults(WERD_RES* word);
00555 
00556   // Replace the best choice and rebuild box word.
00557   // choice must be from the current best_choices list.
00558   void ReplaceBestChoice(WERD_CHOICE* choice);
00559 
00560   // Builds the rebuild_word and sets the best_state from the chopped_word and
00561   // the best_choice->state.
00562   void RebuildBestState();
00563 
00564   // Copies the chopped_word to the rebuild_word, faking a best_state as well.
00565   // Also sets up the output box_word.
00566   void CloneChoppedToRebuild();
00567 
00568   // Sets/replaces the box_word with one made from the rebuild_word.
00569   void SetupBoxWord();
00570 
00571   // Sets up the script positions in the best_choice using the best_choice
00572   // to get the unichars, and the unicharset to get the target positions.
00573   void SetScriptPositions();
00574   // Sets all the blobs in all the words (best choice and alternates) to be
00575   // the given position. (When a sub/superscript is recognized as a separate
00576   // word, it falls victim to the rule that a whole word cannot be sub or
00577   // superscript, so this function overrides that problem.)
00578   void SetAllScriptPositions(tesseract::ScriptPos position);
00579 
00580   // Classifies the word with some already-calculated BLOB_CHOICEs.
00581   // The choices are an array of blob_count pointers to BLOB_CHOICE,
00582   // providing a single classifier result for each blob.
00583   // The BLOB_CHOICEs are consumed and the word takes ownership.
00584   // The number of blobs in the box_word must match blob_count.
00585   void FakeClassifyWord(int blob_count, BLOB_CHOICE** choices);
00586 
00587   // Creates a WERD_CHOICE for the word using the top choices from the leading
00588   // diagonal of the ratings matrix.
00589   void FakeWordFromRatings();
00590 
00591   // Copies the best_choice strings to the correct_text for adaption/training.
00592   void BestChoiceToCorrectText();
00593 
00594   // Merges 2 adjacent blobs in the result if the permanent callback
00595   // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
00596   // callback box_cb is NULL or returns true, setting the merged blob
00597   // result to the class returned from class_cb.
00598   // Returns true if anything was merged.
00599   bool ConditionalBlobMerge(
00600       TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb,
00601       TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb);
00602 
00603   // Merges 2 adjacent blobs in the result (index and index+1) and corrects
00604   // all the data to account for the change.
00605   void MergeAdjacentBlobs(int index);
00606 
00607   // Callback helper for fix_quotes returns a double quote if both
00608   // arguments are quote, otherwise INVALID_UNICHAR_ID.
00609   UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2);
00610   void fix_quotes();
00611 
00612   // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
00613   // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
00614   UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2);
00615   // Callback helper for fix_hyphens returns true if box1 and box2 overlap
00616   // (assuming both on the same textline, are in order and a chopped em dash.)
00617   bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2);
00618   void fix_hyphens();
00619 
00620   // Callback helper for merge_tess_fails returns a space if both
00621   // arguments are space, otherwise INVALID_UNICHAR_ID.
00622   UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2);
00623   void merge_tess_fails();
00624 
00625   // Returns a really deep copy of *src, including the ratings MATRIX.
00626   static WERD_RES* deep_copy(const WERD_RES* src) {
00627     WERD_RES* result = new WERD_RES(*src);
00628     // That didn't copy the ratings, but we want a copy if there is one to
00629     // begin width.
00630     if (src->ratings != NULL)
00631       result->ratings = src->ratings->DeepCopy();
00632     return result;
00633   }
00634 
00635   // Copy blobs from word_res onto this word (eliminating spaces between).
00636   // Since this may be called bidirectionally OR both the BOL and EOL flags.
00637   void copy_on(WERD_RES *word_res) {  //from this word
00638     word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL));
00639     word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL));
00640     word->copy_on(word_res->word);
00641   }
00642 
00643   // Returns true if the collection of count pieces, starting at start, are all
00644   // natural connected components, ie there are no real chops involved.
00645   bool PiecesAllNatural(int start, int count) const;
00646 };
00647 
00648 /*************************************************************************
00649  * PAGE_RES_IT - Page results iterator
00650  *************************************************************************/
00651 
00652 class PAGE_RES_IT {
00653  public:
00654   PAGE_RES * page_res;         // page being iterated
00655 
00656   // Creates a detached iterator. All raw pointer members, including the
00657   // public page_res, are set to NULL so that the accessors below return NULL
00658   // instead of indeterminate values while no page has been attached.
00659   PAGE_RES_IT()
00660       : page_res(NULL),
00661         prev_word_res(NULL), prev_row_res(NULL), prev_block_res(NULL),
00662         word_res(NULL), row_res(NULL), block_res(NULL),
00663         next_word_res(NULL), next_row_res(NULL), next_block_res(NULL) {
00664   }
00665 
00666   // Attaches the iterator to the_page_res and immediately positions it with
00667   // restart_page(), i.e. start_page(false), so empty blocks are skipped.
00668   // Left non-explicit so that any existing implicit conversions keep compiling.
00669   PAGE_RES_IT(PAGE_RES *the_page_res)
00670       : page_res(the_page_res),
00671         prev_word_res(NULL), prev_row_res(NULL), prev_block_res(NULL),
00672         word_res(NULL), row_res(NULL), block_res(NULL),
00673         next_word_res(NULL), next_row_res(NULL), next_block_res(NULL) {
00674     restart_page();  // ready to scan
00675   }
00676 
00677   // Do two PAGE_RES_ITs point at the same word?
00678   // This is much cheaper than cmp().
00679   bool operator ==(const PAGE_RES_IT &other) const;
00680 
00681   // Exact negation of operator==.
00682   bool operator !=(const PAGE_RES_IT &other) const { return !(*this == other); }
00683 
00684   // Given another PAGE_RES_IT to the same page,
00685   //  this before other:     -1
00686   //  this equal to other:    0
00687   //  this later than other:  1
00688   int cmp(const PAGE_RES_IT &other) const;
00689 
00690   // Restart helpers: both forward to start_page(), whose empty_ok argument
00691   // selects whether empty blocks are allowed (true) or skipped (false).
00692   WERD_RES *restart_page() { return start_page(false); }
00693   WERD_RES *restart_page_with_empties() { return start_page(true); }
00694   WERD_RES *start_page(bool empty_ok);
00695 
00696   // NOTE(review): presumably repositions the iterator within the current
00697   // row; the definition is not in this header, so confirm before relying on it.
00698   WERD_RES *restart_row();
00699 
00700   // ============ Methods that mutate the underlying structures ===========
00701   // Note that these methods will potentially invalidate other PAGE_RES_ITs
00702   // and are intended to be used only while a single PAGE_RES_IT is active.
00703   // This problem needs to be taken into account if these mutation operators
00704   // are ever provided to PageIterator or its subclasses.
00705 
00706   // Inserts the new_word and a corresponding WERD_RES before the current
00707   // position. The simple fields of the WERD_RES are copied from clone_res and
00708   // the resulting WERD_RES is returned for further setup with best_choice etc.
00709   WERD_RES* InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word);
00710 
00711   // Deletes the current WERD_RES and its underlying WERD.
00712   void DeleteCurrentWord();
00713 
00714   // Get next word.
00715   WERD_RES *forward() { return internal_forward(false, false); }
00716   // Move forward, but allow empty blocks to show as single NULL words.
00717   WERD_RES *forward_with_empties() { return internal_forward(false, true); }
00718 
00719   WERD_RES *forward_paragraph();  // get first word in next non-empty paragraph
00720   WERD_RES *forward_block();  // get first word in next non-empty block
00721 
00722   // Read-only accessors for the cached previous/current/next positions.
00723   // Each simply returns the matching private member; none moves the iterator.
00724   WERD_RES *prev_word() const { return prev_word_res; }      // previous word
00725   ROW_RES *prev_row() const { return prev_row_res; }         // row of prev word
00726   BLOCK_RES *prev_block() const { return prev_block_res; }   // block of prev word
00727   WERD_RES *word() const { return word_res; }                // current word
00728   ROW_RES *row() const { return row_res; }                   // row of current word
00729   BLOCK_RES *block() const { return block_res; }             // block of cur. word
00730   WERD_RES *next_word() const { return next_word_res; }      // next word
00731   ROW_RES *next_row() const { return next_row_res; }         // row of next word
00732   BLOCK_RES *next_block() const { return next_block_res; }   // block of next word
00733 
00734   void rej_stat_word();  // for page/block/row
00735 
00736  private:
00737   void ResetWordIterator();
00738   // Shared implementation behind forward() and forward_with_empties(); both
00739   // pass new_block = false and differ only in the empty_ok argument.
00740   WERD_RES *internal_forward(bool new_block, bool empty_ok);
00741 
00742   // Cached positions returned by the accessors above.
00743   WERD_RES * prev_word_res;    // previous word
00744   ROW_RES *prev_row_res;       // row of prev word
00745   BLOCK_RES *prev_block_res;   // block of prev word
00746 
00747   WERD_RES *word_res;          // current word
00748   ROW_RES *row_res;            // row of current word
00749   BLOCK_RES *block_res;        // block of cur. word
00750 
00751   WERD_RES *next_word_res;     // next word
00752   ROW_RES *next_row_res;       // row of next word
00753   BLOCK_RES *next_block_res;   // block of next word
00754 
00755   // Iterators for the block, row and word levels respectively.
00756   BLOCK_RES_IT block_res_it;
00757   ROW_RES_IT row_res_it;
00758   WERD_RES_IT word_res_it;
00759 };
00760 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines