tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccstruct/blamer.h
Go to the documentation of this file.
00001 
00002 // File:        blamer.h
00003 // Description: Module allowing precise error causes to be allocated.
00004 // Author:      Rike Antonova
00005 // Refactored:  Ray Smith
00006 // Created:     Mon Feb 04 14:37:01 PST 2013
00007 //
00008 // (C) Copyright 2013, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #ifndef TESSERACT_CCSTRUCT_BLAMER_H_
00022 #define TESSERACT_CCSTRUCT_BLAMER_H_
00023 
00024 #include <stdio.h>
00025 #include "boxword.h"
00026 #include "genericvector.h"
00027 #include "matrix.h"
00028 #include "params_training_featdef.h"
00029 #include "ratngs.h"
00030 #include "strngs.h"
00031 #include "tesscallback.h"
00032 
00033 static const inT16 kBlamerBoxTolerance = 5;  // NOTE(review): presumably the box-comparison tolerance used by the blamer (units not visible in this header) - confirm against its use in blamer.cpp.
00034 
// Identifies which stage of the recognition pipeline is blamed for a wrong
// result on a word.
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
enum IncorrectResultReason {
  // Best choice text matches the truth text: nothing to blame.
  IRR_CORRECT = 0,

  // The classifier is at fault. Either the top choice is wrong but is a
  // dictionary word (so the language model is unlikely to correct it), or
  // the correct unichar never made it into the classifier's shortlist.
  IRR_CLASSIFIER,

  // The chopper missed one or more of the splits needed to reproduce the
  // character bounding boxes recorded in BlamerBundle::truth_word.
  IRR_CHOPPER,

  // Classifier / language model tradeoff error. The classifier offered the
  // correct unichar for every blob of the correct segmentation, but either
  // its ratings were too poor for the language model to pull out the correct
  // choice, or the language model was too weak to favor the correct answer.
  IRR_CLASS_LM_TRADEOFF,

  // Page layout produced a wrong word bounding box: no truth was found for
  // the word, i.e. no truth word had a similar bounding box.
  IRR_PAGE_LAYOUT,

  // A SegSearch heuristic stopped one or more blobs of the correct
  // segmentation state from being classified (e.g. the blob was too wide).
  IRR_SEGSEARCH_HEUR,

  // SegSearch pain point prioritization kept the correct segmentation state
  // from being explored. Blamed when the best rating of a choice built from
  // the correct segmentation beats the rating of the best choice, i.e. the
  // language model would have picked the correct choice had the correct
  // segmentation state been explored.
  IRR_SEGSEARCH_PP,

  // Like IRR_CLASS_LM_TRADEOFF, but for words on which only the chopper was
  // run, so the old language model (permuters) was used.
  // TODO(antonova): integrate the new language model with chopper
  IRR_CLASS_OLD_LM_TRADEOFF,

  // Adaption error: an incorrect adaptive template match scored better than
  // a correct one (either pre-trained or adapted).
  IRR_ADAPTION,

  // split_and_recog_word() failed to find a suitable split in truth.
  IRR_NO_TRUTH_SPLIT,

  // No truth is available for this word (e.g. when words in the corrected
  // content file are turned into ~~~~ because an appropriate alignment was
  // not found).
  IRR_NO_TRUTH,

  // Best choice text differs from the truth text, but none of the reasons
  // above has been set.
  IRR_UNKNOWN,

  // Number of reasons; keep as the last enumerator.
  IRR_NUM_REASONS
};
00086 
00087 // Blamer-related information to determine the source of errors.
00088 struct BlamerBundle {
00089   static const char *IncorrectReasonName(IncorrectResultReason irr);
00090   BlamerBundle() : truth_has_char_boxes_(false),
00091       incorrect_result_reason_(IRR_CORRECT),
00092       lattice_data_(NULL) { ClearResults(); }
00093   BlamerBundle(const BlamerBundle &other) {
00094     this->CopyTruth(other);
00095     this->CopyResults(other);
00096   }
00097   ~BlamerBundle() { delete[] lattice_data_; }
00098 
00099   // Accessors.
00100   STRING TruthString() const {
00101     STRING truth_str;
00102     for (int i = 0; i < truth_text_.length(); ++i)
00103       truth_str += truth_text_[i];
00104     return truth_str;
00105   }
00106   IncorrectResultReason incorrect_result_reason() const {
00107     return incorrect_result_reason_;
00108   }
00109   bool NoTruth() const {
00110     return incorrect_result_reason_ == IRR_NO_TRUTH ||
00111            incorrect_result_reason_ == IRR_PAGE_LAYOUT;
00112   }
00113   bool HasDebugInfo() const {
00114     return debug_.length() > 0 || misadaption_debug_.length() > 0;
00115   }
00116   const STRING& debug() const {
00117     return debug_;
00118   }
00119   const STRING& misadaption_debug() const {
00120     return misadaption_debug_;
00121   }
00122   void UpdateBestRating(float rating) {
00123     if (rating < best_correctly_segmented_rating_)
00124       best_correctly_segmented_rating_ = rating;
00125   }
00126   int correct_segmentation_length() const {
00127     return correct_segmentation_cols_.length();
00128   }
00129   // Returns true if the given ratings matrix col,row position is included
00130   // in the correct segmentation path at the given index.
00131   bool MatrixPositionCorrect(int index, const MATRIX_COORD& coord) {
00132     return correct_segmentation_cols_[index] == coord.col &&
00133         correct_segmentation_rows_[index] == coord.row;
00134   }
00135   void set_best_choice_is_dict_and_top_choice(bool value) {
00136     best_choice_is_dict_and_top_choice_ = value;
00137   }
00138   const char* lattice_data() const {
00139     return lattice_data_;
00140   }
00141   int lattice_size() const {
00142     return lattice_size_;  // size of lattice_data in bytes
00143   }
00144   void set_lattice_data(const char* data, int size) {
00145     lattice_size_ = size;
00146     delete [] lattice_data_;
00147     lattice_data_ = new char[lattice_size_];
00148     memcpy(lattice_data_, data, lattice_size_);
00149   }
00150   const tesseract::ParamsTrainingBundle& params_training_bundle() const {
00151     return params_training_bundle_;
00152   }
00153   // Adds a new ParamsTrainingHypothesis to the current hypothesis list.
00154   void AddHypothesis(const tesseract::ParamsTrainingHypothesis& hypo) {
00155     params_training_bundle_.AddHypothesis(hypo);
00156   }
00157 
00158   // Functions to setup the blamer.
00159   // Whole word string, whole word bounding box.
00160   void SetWordTruth(const UNICHARSET& unicharset,
00161                     const char* truth_str, const TBOX& word_box);
00162   // Single "character" string, "character" bounding box.
00163   // May be called multiple times to indicate the characters in a word.
00164   void SetSymbolTruth(const UNICHARSET& unicharset,
00165                       const char* char_str, const TBOX& char_box);
00166   // Marks that there is something wrong with the truth text, like it contains
00167   // reject characters.
00168   void SetRejectedTruth();
00169 
00170   // Returns true if the provided word_choice is correct.
00171   bool ChoiceIsCorrect(const WERD_CHOICE* word_choice) const;
00172 
00173   void ClearResults() {
00174     norm_truth_word_.DeleteAllBoxes();
00175     norm_box_tolerance_ = 0;
00176     if (!NoTruth()) incorrect_result_reason_ = IRR_CORRECT;
00177     debug_ = "";
00178     segsearch_is_looking_for_blame_ = false;
00179     best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
00180     correct_segmentation_cols_.clear();
00181     correct_segmentation_rows_.clear();
00182     best_choice_is_dict_and_top_choice_ = false;
00183     delete[] lattice_data_;
00184     lattice_data_ = NULL;
00185     lattice_size_ = 0;
00186   }
00187   void CopyTruth(const BlamerBundle &other) {
00188     truth_has_char_boxes_ = other.truth_has_char_boxes_;
00189     truth_word_ = other.truth_word_;
00190     truth_text_ = other.truth_text_;
00191     incorrect_result_reason_ =
00192         (other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT);
00193   }
00194   void CopyResults(const BlamerBundle &other) {
00195     norm_truth_word_ = other.norm_truth_word_;
00196     norm_box_tolerance_ = other.norm_box_tolerance_;
00197     incorrect_result_reason_ = other.incorrect_result_reason_;
00198     segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
00199     best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
00200     correct_segmentation_cols_ = other.correct_segmentation_cols_;
00201     correct_segmentation_rows_ = other.correct_segmentation_rows_;
00202     best_choice_is_dict_and_top_choice_ =
00203         other.best_choice_is_dict_and_top_choice_;
00204     if (other.lattice_data_ != NULL) {
00205       lattice_data_ = new char[other.lattice_size_];
00206       memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
00207       lattice_size_ = other.lattice_size_;
00208     } else {
00209       lattice_data_ = NULL;
00210     }
00211   }
00212   const char *IncorrectReason() const;
00213 
00214   // Appends choice and truth details to the given debug string.
00215   void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
00216                        STRING *debug);
00217 
00218   // Sets up the norm_truth_word from truth_word using the given DENORM.
00219   void SetupNormTruthWord(const DENORM& denorm);
00220 
00221   // Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
00222   // bundles) where the right edge/ of the left-hand word is word1_right,
00223   // and the left edge of the right-hand word is word2_left.
00224   void SplitBundle(int word1_right, int word2_left, bool debug,
00225                    BlamerBundle* bundle1, BlamerBundle* bundle2) const;
00226   // "Joins" the blames from bundle1 and bundle2 into *this.
00227   void JoinBlames(const BlamerBundle& bundle1, const BlamerBundle& bundle2,
00228                   bool debug);
00229 
00230   // If a blob with the same bounding box as one of the truth character
00231   // bounding boxes is not classified as the corresponding truth character
00232   // blames character classifier for incorrect answer.
00233   void BlameClassifier(const UNICHARSET& unicharset,
00234                        const TBOX& blob_box,
00235                        const BLOB_CHOICE_LIST& choices,
00236                        bool debug);
00237 
00238 
00239   // Checks whether chops were made at all the character bounding box
00240   // boundaries in word->truth_word. If not - blames the chopper for an
00241   // incorrect answer.
00242   void SetChopperBlame(const WERD_RES* word, bool debug);
00243   // Blames the classifier or the language model if, after running only the
00244   // chopper, best_choice is incorrect and no blame has been yet set.
00245   // Blames the classifier if best_choice is classifier's top choice and is a
00246   // dictionary word (i.e. language model could not have helped).
00247   // Otherwise, blames the language model (formerly permuter word adjustment).
00248   void BlameClassifierOrLangModel(
00249       const WERD_RES* word,
00250       const UNICHARSET& unicharset, bool valid_permuter, bool debug);
00251   // Sets up the correct_segmentation_* to mark the correct bounding boxes.
00252   void SetupCorrectSegmentation(const TWERD* word, bool debug);
00253 
00254   // Returns true if a guided segmentation search is needed.
00255   bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
00256   // Setup ready to guide the segmentation search to the correct segmentation.
00257   // The callback pp_cb is used to avoid a cyclic dependency.
00258   // It calls into LMPainPoints::GenerateForBlamer by pre-binding the
00259   // WERD_RES, and the LMPainPoints itself.
00260   // pp_cb must be a permanent callback, and should be deleted by the caller.
00261   void InitForSegSearch(const WERD_CHOICE *best_choice,
00262                         MATRIX* ratings, UNICHAR_ID wildcard_id,
00263                         bool debug, STRING *debug_str,
00264                         TessResultCallback2<bool, int, int>* pp_cb);
00265   // Returns true if the guided segsearch is in progress.
00266   bool GuidedSegsearchStillGoing() const;
00267   // The segmentation search has ended. Sets the blame appropriately.
00268   void FinishSegSearch(const WERD_CHOICE *best_choice,
00269                        bool debug, STRING *debug_str);
00270 
00271   // If the bundle is null or still does not indicate the correct result,
00272   // fix it and use some backup reason for the blame.
00273   static void LastChanceBlame(bool debug, WERD_RES* word);
00274 
00275   // Sets the misadaption debug if this word is incorrect, as this word is
00276   // being adapted to.
00277   void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
00278 
00279  private:
00280   void SetBlame(IncorrectResultReason irr, const STRING &msg,
00281                 const WERD_CHOICE *choice, bool debug) {
00282     incorrect_result_reason_ = irr;
00283     debug_ = IncorrectReason();
00284     debug_ += " to blame: ";
00285     FillDebugString(msg, choice, &debug_);
00286     if (debug) tprintf("SetBlame(): %s", debug_.string());
00287   }
00288 
00289  private:
00290   // Set to true when bounding boxes for individual unichars are recorded.
00291   bool truth_has_char_boxes_;
00292   // The true_word (in the original image coordinate space) contains ground
00293   // truth bounding boxes for this WERD_RES.
00294   tesseract::BoxWord truth_word_;
00295   // Same as above, but in normalized coordinates
00296   // (filled in by WERD_RES::SetupForRecognition()).
00297   tesseract::BoxWord norm_truth_word_;
00298   // Tolerance for bounding box comparisons in normalized space.
00299   int norm_box_tolerance_;
00300   // Contains ground truth unichar for each of the bounding boxes in truth_word.
00301   GenericVector<STRING> truth_text_;
00302   // The reason for incorrect OCR result.
00303   IncorrectResultReason incorrect_result_reason_;
00304   // Debug text associated with the blame.
00305   STRING debug_;
00306   // Misadaption debug information (filled in if this word was misadapted to).
00307   STRING misadaption_debug_;
00308   // Variables used by the segmentation search when looking for the blame.
00309   // Set to true while segmentation search is continued after the usual
00310   // termination condition in order to look for the blame.
00311   bool segsearch_is_looking_for_blame_;
00312   // Best rating for correctly segmented path
00313   // (set and used by SegSearch when looking for blame).
00314   float best_correctly_segmented_rating_;
00315   // Vectors populated by SegSearch to indicate column and row indices that
00316   // correspond to blobs with correct bounding boxes.
00317   GenericVector<int> correct_segmentation_cols_;
00318   GenericVector<int> correct_segmentation_rows_;
00319   // Set to true if best choice is a dictionary word and
00320   // classifier's top choice.
00321   bool best_choice_is_dict_and_top_choice_;
00322   // Serialized segmentation search lattice.
00323   char *lattice_data_;
00324   int lattice_size_;  // size of lattice_data in bytes
00325   // Information about hypotheses (paths) explored by the segmentation search.
00326   tesseract::ParamsTrainingBundle params_training_bundle_;
00327 };
00328 
00329 
00330 #endif  // TESSERACT_CCSTRUCT_BLAMER_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines