tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/wordrec/wordrec.h
Go to the documentation of this file.
00001 
00002 // File:        wordrec.h
00003 // Description: wordrec class.
00004 // Author:      Samuel Charron
00005 //
00006 // (C) Copyright 2006, Google Inc.
00007 // Licensed under the Apache License, Version 2.0 (the "License");
00008 // you may not use this file except in compliance with the License.
00009 // You may obtain a copy of the License at
00010 // http://www.apache.org/licenses/LICENSE-2.0
00011 // Unless required by applicable law or agreed to in writing, software
00012 // distributed under the License is distributed on an "AS IS" BASIS,
00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014 // See the License for the specific language governing permissions and
00015 // limitations under the License.
00016 //
00018 
00019 #ifndef TESSERACT_WORDREC_WORDREC_H__
00020 #define TESSERACT_WORDREC_WORDREC_H__
00021 
00022 #include "associate.h"
00023 #include "classify.h"
00024 #include "dict.h"
00025 #include "language_model.h"
00026 #include "ratngs.h"
00027 #include "matrix.h"
00028 #include "gradechop.h"
00029 #include "seam.h"
00030 #include "findseam.h"
00031 #include "callcpp.h"
00032 
00033 class WERD_RES;
00034 
00035 namespace tesseract {
00036 
// Records which nodes of one ratings-matrix column still have to be processed
// by the segmentation search. There is a single SegSearchPending for each
// column in the ratings matrix. It indicates whether the segsearch should
// combine all BLOB_CHOICEs in the column, or just the given row, with the
// parents corresponding to *this SegSearchPending, and whether only updated
// parent ViterbiStateEntries should be combined, or all of them.
//
// The three pieces of state are independent flags; the accessors below apply
// the precedence column_classified_ > revisit_whole_column_ > classified_row_.
class SegSearchPending {
 public:
  // Creates an object with no work to do (same state as after Clear()).
  SegSearchPending()
      : classified_row_(-1),
        revisit_whole_column_(false),
        column_classified_(false) {}

  // Marks the whole column as just classified. Used to start a search on
  // a newly initialized ratings matrix.
  void SetColumnClassified() { column_classified_ = true; }

  // Marks the matrix entry at the given row as just classified.
  // Used after classifying a new matrix cell.
  // Additional to, not overriding, a previous RevisitWholeColumn.
  // Only one row is remembered: a later call replaces the earlier row.
  void SetBlobClassified(int row) { classified_row_ = row; }

  // Marks the whole column as needing work, but not just classified.
  // Used when the parent vse list is updated.
  // Additional to, not overriding, a previous SetBlobClassified.
  void RevisitWholeColumn() { revisit_whole_column_ = true; }

  // Clears *this to indicate no work to do.
  void Clear() {
    classified_row_ = -1;
    revisit_whole_column_ = false;
    column_classified_ = false;
  }

  // Returns true if there are updates to do in the column that *this
  // represents, i.e. if any of the three flags is set.
  bool WorkToDo() const {
    return revisit_whole_column_ || column_classified_ || classified_row_ >= 0;
  }

  // Returns true if the given row was just classified, either individually
  // or because the whole column was.
  // Note: -1 is the "no row" sentinel, so passing row == -1 on a cleared
  // object compares equal to it and yields true; pass real row indices only.
  bool IsRowJustClassified(int row) const {
    return row == classified_row_ || column_classified_;
  }

  // Returns the single row to process if there is only one, otherwise -1.
  // -1 is returned both when the whole column must be processed and when
  // no row has been recorded.
  int SingleRow() const {
    return (revisit_whole_column_ || column_classified_) ? -1
                                                         : classified_row_;
  }

 private:
  // If non-negative, indicates the single row in the ratings matrix that has
  // just been classified, and so should be combined with all the parents in
  // the column that this SegSearchPending represents.
  // Operates independently of revisit_whole_column_.
  int classified_row_;
  // If revisit_whole_column_ is true, then all BLOB_CHOICEs in this column
  // will be processed, but classified_row_ can indicate a row that is newly
  // classified. Overridden if column_classified_ is true.
  bool revisit_whole_column_;
  // If column_classified_ is true, parent vses are processed with all rows
  // regardless of whether they are just updated, overriding
  // revisit_whole_column_ and classified_row_.
  bool column_classified_;
};
00104 
00105 
00106 /* ccmain/tstruct.cpp *********************************************************/
00107 class FRAGMENT:public ELIST_LINK
00108 {
00109   public:
00110     FRAGMENT() {  //constructor
00111     }
00112     FRAGMENT(EDGEPT *head_pt,   //start
00113              EDGEPT *tail_pt);  //end
00114 
00115     ICOORD head;                 //coords of start
00116     ICOORD tail;                 //coords of end
00117     EDGEPT *headpt;              //start point
00118     EDGEPT *tailpt;              //end point
00119 };
00120 ELISTIZEH(FRAGMENT)
00121 
00122 
00123 class Wordrec : public Classify {
00124  public:
00125   // config parameters *******************************************************
00126   BOOL_VAR_H(merge_fragments_in_matrix, TRUE,
00127              "Merge the fragments in the ratings matrix and delete them "
00128              "after merging");
00129   BOOL_VAR_H(wordrec_no_block, FALSE, "Don't output block information");
00130   BOOL_VAR_H(wordrec_enable_assoc, TRUE, "Associator Enable");
00131   BOOL_VAR_H(force_word_assoc, FALSE,
00132              "force associator to run regardless of what enable_assoc is."
00133              "This is used for CJK where component grouping is necessary.");
00134   double_VAR_H(wordrec_worst_state, 1, "Worst segmentation state");
00135   BOOL_VAR_H(fragments_guide_chopper, FALSE,
00136              "Use information from fragments to guide chopping process");
00137   INT_VAR_H(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped");
00138   double_VAR_H(tessedit_certainty_threshold, -2.25, "Good blob limit");
00139   INT_VAR_H(chop_debug, 0, "Chop debug");
00140   BOOL_VAR_H(chop_enable, 1, "Chop enable");
00141   BOOL_VAR_H(chop_vertical_creep, 0, "Vertical creep");
00142   INT_VAR_H(chop_split_length, 10000, "Split Length");
00143   INT_VAR_H(chop_same_distance, 2, "Same distance");
00144   INT_VAR_H(chop_min_outline_points, 6, "Min Number of Points on Outline");
00145   INT_VAR_H(chop_seam_pile_size, 150, "Max number of seams in seam_pile");
00146   BOOL_VAR_H(chop_new_seam_pile, 1, "Use new seam_pile");
00147   INT_VAR_H(chop_inside_angle, -50, "Min Inside Angle Bend");
00148   INT_VAR_H(chop_min_outline_area, 2000, "Min Outline Area");
00149   double_VAR_H(chop_split_dist_knob, 0.5, "Split length adjustment");
00150   double_VAR_H(chop_overlap_knob, 0.9, "Split overlap adjustment");
00151   double_VAR_H(chop_center_knob, 0.15, "Split center adjustment");
00152   INT_VAR_H(chop_centered_maxwidth, 90, "Width of (smaller) chopped blobs "
00153             "above which we don't care that a chop is not near the center.");
00154   double_VAR_H(chop_sharpness_knob, 0.06, "Split sharpness adjustment");
00155   double_VAR_H(chop_width_change_knob, 5.0, "Width change adjustment");
00156   double_VAR_H(chop_ok_split, 100.0, "OK split limit");
00157   double_VAR_H(chop_good_split, 50.0, "Good split limit");
00158   INT_VAR_H(chop_x_y_weight, 3, "X / Y  length weight");
00159   INT_VAR_H(segment_adjust_debug, 0, "Segmentation adjustment debug");
00160   BOOL_VAR_H(assume_fixed_pitch_char_segment, FALSE,
00161              "include fixed-pitch heuristics in char segmentation");
00162   INT_VAR_H(wordrec_debug_level, 0, "Debug level for wordrec");
00163   INT_VAR_H(wordrec_max_join_chunks, 4,
00164             "Max number of broken pieces to associate");
00165   BOOL_VAR_H(wordrec_skip_no_truth_words, false,
00166              "Only run OCR for words that had truth recorded in BlamerBundle");
00167   BOOL_VAR_H(wordrec_debug_blamer, false, "Print blamer debug messages");
00168   BOOL_VAR_H(wordrec_run_blamer, false, "Try to set the blame for errors");
00169   INT_VAR_H(segsearch_debug_level, 0, "SegSearch debug level");
00170   INT_VAR_H(segsearch_max_pain_points, 2000,
00171             "Maximum number of pain points stored in the queue");
00172   INT_VAR_H(segsearch_max_futile_classifications, 10,
00173             "Maximum number of pain point classifications per word.");
00174   double_VAR_H(segsearch_max_char_wh_ratio, 2.0,
00175                "Maximum character width-to-height ratio");
00176   BOOL_VAR_H(save_alt_choices, true,
00177              "Save alternative paths found during chopping "
00178              "and segmentation search");
00179 
00180   // methods from wordrec/*.cpp ***********************************************
00181   Wordrec();
00182   virtual ~Wordrec();
00183 
00184   // Fills word->alt_choices with alternative paths found during
00185   // chopping/segmentation search that are kept in best_choices.
00186   void SaveAltChoices(const LIST &best_choices, WERD_RES *word);
00187 
00188   // Fills character choice lattice in the given BlamerBundle
00189   // using the given ratings matrix and best choice list.
00190   void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
00191                    const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);
00192 
00193   // Calls fill_lattice_ member function
00194   // (assumes that fill_lattice_ is not NULL).
00195   void CallFillLattice(const MATRIX &ratings,
00196                        const WERD_CHOICE_LIST &best_choices,
00197                        const UNICHARSET &unicharset,
00198                        BlamerBundle *blamer_bundle) {
00199     (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
00200   }
00201 
00202   // tface.cpp
00203   void program_editup(const char *textbase,
00204                       bool init_classifier,
00205                       bool init_permute);
00206   void cc_recog(WERD_RES *word);
00207   void program_editdown(inT32 elasped_time);
00208   void set_pass1();
00209   void set_pass2();
00210   int end_recog();
00211   BLOB_CHOICE_LIST *call_matcher(TBLOB* blob);
00212   int dict_word(const WERD_CHOICE &word);
00213   // wordclass.cpp
00214   BLOB_CHOICE_LIST *classify_blob(TBLOB *blob,
00215                                   const char *string,
00216                                   C_COL color,
00217                                   BlamerBundle *blamer_bundle);
00218 
00219   // segsearch.cpp
00220   // SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs.
00221   // Each entry in the matrix represents the classification choice
00222   // for a chunk, i.e. an entry in row 2, column 1 represents the list
00223   // of ratings for the chunks 1 and 2 classified as a single blob.
00224   // The entries on the diagonal of the matrix are classifier choice lists
00225   // for a single chunk from the maximal segmentation.
00226   //
00227   // The ratings matrix given to SegSearch represents the segmentation
00228   // graph / trellis for the current word. The nodes in the graph are the
00229   // individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings
00230   // matrix. The children of each node (nodes connected by outgoing links)
00231   // are the entries in the column that is equal to node's row+1. The parents
00232   // (nodes connected by the incoming links) are the entries in the row that
00233   // is equal to the node's column-1. Here is an example ratings matrix:
00234   //
00235   //    0    1    2   3   4
00236   //  -------------------------
00237   // 0| c,(                   |
00238   // 1| d    l,1              |
00239   // 2|           o           |
00240   // 3|              c,(      |
00241   // 4|              g,y  l,1 |
00242   //  -------------------------
00243   //
00244   // In the example above node "o" has children (outgoing connection to nodes)
00245   // "c","(","g","y" and parents (incoming connections from nodes) "l","1","d".
00246   //
00247   // The objective of the search is to find the least cost path, where the cost
00248   // is determined by the language model components and the properties of the
00249   // cut between the blobs on the path. SegSearch starts by populating the
00250   // matrix with the all the entries that were classified by the chopper and
00251   // finding the initial best path. Based on the classifier ratings, language
00252   // model scores and the properties of each cut, a list of "pain points" is
00253   // constructed - those are the points on the path where the choices do not
00254   // look consistent with the neighboring choices, the cuts look particularly
00255   // problematic, or the certainties of the blobs are low. The most troublesome
00256   // "pain point" is picked from the list and the new entry in the ratings
00257   // matrix corresponding to this "pain point" is filled in. Then the language
00258   // model state is updated to reflect the new classification and the new
00259   // "pain points" are added to the list and the next most troublesome
00260   // "pain point" is determined. This continues until either the word choice
00261   // composed from the best paths in the segmentation graph is "good enough"
00262   // (e.g. above a certain certainty threshold, is an unambiguous dictionary
00263   // word, etc) or there are no more "pain points" to explore.
00264   //
00265   // If associate_blobs is set to false no new classifications will be done
00266   // to combine blobs. Segmentation search will run only one "iteration"
00267   // on the classifications already recorded in chunks_record.ratings.
00268   //
00269   // Note: this function assumes that word, output_best_state,
00270   // best_char_choices and fixpt arguments are not NULL.
00271   void SegSearch(WERD_RES* word_res,
00272                  BestChoiceBundle* best_choice_bundle,
00273                  BlamerBundle* blamer_bundle);
00274 
00275   // Runs SegSearch() function (above) without needing a best_choice_bundle
00276   // or blamer_bundle. Used for testing.
00277   void DoSegSearch(WERD_RES* word_res);
00278 
00279   // chop.cpp
00280   PRIORITY point_priority(EDGEPT *point);
00281   void add_point_to_list(PointHeap* point_heap, EDGEPT *point);
00282   int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3);
00283   int is_little_chunk(EDGEPT *point1, EDGEPT *point2);
00284   int is_small_area(EDGEPT *point1, EDGEPT *point2);
00285   EDGEPT *pick_close_point(EDGEPT *critical_point,
00286                            EDGEPT *vertical_point,
00287                            int *best_dist);
00288   void prioritize_points(TESSLINE *outline, PointHeap* points);
00289   void new_min_point(EDGEPT *local_min, PointHeap* points);
00290   void new_max_point(EDGEPT *local_max, PointHeap* points);
00291   void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point,
00292                                  EDGEPT** best_point,
00293                                  EDGEPT_CLIST *new_points);
00294 
00295   // chopper.cpp
00296   SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number,
00297                           bool italic_blob, const GenericVector<SEAM*>& seams);
00298   SEAM *chop_numbered_blob(TWERD *word, inT32 blob_number,
00299                            bool italic_blob, const GenericVector<SEAM*>& seams);
00300   SEAM *chop_overlapping_blob(const GenericVector<TBOX>& boxes,
00301                               bool italic_blob,
00302                               WERD_RES *word_res, int *blob_number);
00303   SEAM *improve_one_blob(const GenericVector<BLOB_CHOICE*> &blob_choices,
00304                          DANGERR *fixpt,
00305                          bool split_next_to_fragment,
00306                          bool italic_blob,
00307                          WERD_RES *word,
00308                          int *blob_number);
00309   SEAM *chop_one_blob(const GenericVector<TBOX> &boxes,
00310                       const GenericVector<BLOB_CHOICE*> &blob_choices,
00311                       WERD_RES *word_res,
00312                       int *blob_number);
00313   void chop_word_main(WERD_RES *word);
00314   void improve_by_chopping(float rating_cert_scale,
00315                            WERD_RES *word,
00316                            BestChoiceBundle *best_choice_bundle,
00317                            BlamerBundle *blamer_bundle,
00318                            LMPainPoints *pain_points,
00319                            GenericVector<SegSearchPending>* pending);
00320   int select_blob_to_split(const GenericVector<BLOB_CHOICE*> &blob_choices,
00321                            float rating_ceiling,
00322                            bool split_next_to_fragment);
00323   int select_blob_to_split_from_fixpt(DANGERR *fixpt);
00324 
00325   // findseam.cpp
00326   void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue* seams);
00327   void choose_best_seam(SeamQueue* seam_queue,
00328                         SPLIT *split,
00329                         PRIORITY priority,
00330                         SEAM **seam_result,
00331                         TBLOB *blob,
00332                         SeamPile* seam_pile);
00333   void combine_seam(const SeamPile& seam_pile,
00334                     const SEAM* seam, SeamQueue* seam_queue);
00335   inT16 constrained_split(SPLIT *split, TBLOB *blob);
00336   SEAM *pick_good_seam(TBLOB *blob);
00337   PRIORITY seam_priority(SEAM *seam, inT16 xmin, inT16 xmax);
00338   void try_point_pairs (EDGEPT * points[MAX_NUM_POINTS],
00339                         inT16 num_points,
00340                         SeamQueue* seam_queue,
00341                         SeamPile* seam_pile,
00342                         SEAM ** seam, TBLOB * blob);
00343   void try_vertical_splits(EDGEPT * points[MAX_NUM_POINTS],
00344                            inT16 num_points,
00345                            EDGEPT_CLIST *new_points,
00346                            SeamQueue* seam_queue,
00347                            SeamPile* seam_pile,
00348                            SEAM ** seam, TBLOB * blob);
00349 
00350   // gradechop.cpp
00351   PRIORITY full_split_priority(SPLIT *split, inT16 xmin, inT16 xmax);
00352   PRIORITY grade_center_of_blob(register BOUNDS_RECT rect);
00353   PRIORITY grade_overlap(register BOUNDS_RECT rect);
00354   PRIORITY grade_split_length(register SPLIT *split);
00355   PRIORITY grade_sharpness(register SPLIT *split);
00356   PRIORITY grade_width_change(register BOUNDS_RECT rect);
00357   void set_outline_bounds(register EDGEPT *point1,
00358                           register EDGEPT *point2,
00359                           BOUNDS_RECT rect);
00360 
00361   // outlines.cpp
00362   int crosses_outline(EDGEPT *p0, EDGEPT *p1, EDGEPT *outline);
00363   int is_crossed(TPOINT a0, TPOINT a1, TPOINT b0, TPOINT b1);
00364   int is_same_edgept(EDGEPT *p1, EDGEPT *p2);
00365   bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1,
00366                   EDGEPT **near_pt);
00367   void reverse_outline(EDGEPT *outline);
00368 
00369   // pieces.cpp
00370   virtual BLOB_CHOICE_LIST *classify_piece(const GenericVector<SEAM*>& seams,
00371                                            inT16 start,
00372                                            inT16 end,
00373                                            const char* description,
00374                                            TWERD *word,
00375                                            BlamerBundle *blamer_bundle);
00376   // Try to merge fragments in the ratings matrix and put the result in
00377   // the corresponding row and column
00378   void merge_fragments(MATRIX *ratings,
00379                        inT16 num_blobs);
00380   // Recursively go through the ratings matrix to find lists of fragments
00381   // to be merged in the function merge_and_put_fragment_lists.
00382   // current_frag is the postion of the piece we are looking for.
00383   // current_row is the row in the rating matrix we are currently at.
00384   // start is the row we started initially, so that we can know where
00385   // to append the results to the matrix. num_frag_parts is the total
00386   // number of pieces we are looking for and num_blobs is the size of the
00387   // ratings matrix.
00388   void get_fragment_lists(inT16 current_frag,
00389                           inT16 current_row,
00390                           inT16 start,
00391                           inT16 num_frag_parts,
00392                           inT16 num_blobs,
00393                           MATRIX *ratings,
00394                           BLOB_CHOICE_LIST *choice_lists);
00395   // Merge the fragment lists in choice_lists and append it to the
00396   // ratings matrix
00397   void merge_and_put_fragment_lists(inT16 row,
00398                                     inT16 column,
00399                                     inT16 num_frag_parts,
00400                                     BLOB_CHOICE_LIST *choice_lists,
00401                                     MATRIX *ratings);
00402   // Filter the fragment list so that the filtered_choices only contain
00403   // fragments that are in the correct position. choices is the list
00404   // that we are going to filter. fragment_pos is the position in the
00405   // fragment that we are looking for and num_frag_parts is the the
00406   // total number of pieces. The result will be appended to
00407   // filtered_choices.
00408   void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices,
00409                                    int fragment_pos,
00410                                    int num_frag_parts,
00411                                    BLOB_CHOICE_LIST *filtered_choices);
00412 
00413   // Member variables.
00414 
00415   LanguageModel *language_model_;
00416   PRIORITY pass2_ok_split;
00417   // Stores the best choice for the previous word in the paragraph.
00418   // This variable is modified by PAGE_RES_IT when iterating over
00419   // words to OCR on the page.
00420   WERD_CHOICE *prev_word_best_choice_;
00421   // Sums of blame reasons computed by the blamer.
00422   GenericVector<int> blame_reasons_;
00423   // Function used to fill char choice lattices.
00424   void (Wordrec::*fill_lattice_)(const MATRIX &ratings,
00425                                  const WERD_CHOICE_LIST &best_choices,
00426                                  const UNICHARSET &unicharset,
00427                                  BlamerBundle *blamer_bundle);
00428 
00429  protected:
00430   inline bool SegSearchDone(int num_futile_classifications) {
00431     return (language_model_->AcceptableChoiceFound() ||
00432             num_futile_classifications >=
00433             segsearch_max_futile_classifications);
00434   }
00435 
00436   // Updates the language model state recorded for the child entries specified
00437   // in pending[starting_col]. Enqueues the children of the updated entries
00438   // into pending and proceeds to update (and remove from pending) all the
00439   // remaining entries in pending[col] (col >= starting_col). Upon termination
00440   // of this function all the pending[col] lists will be empty.
00441   //
00442   // The arguments:
00443   //
00444   // starting_col: index of the column in chunks_record->ratings from
00445   // which the update should be started
00446   //
00447   // pending: list of entries listing chunks_record->ratings entries
00448   // that should be updated
00449   //
00450   // pain_points: priority heap listing the pain points generated by
00451   // the language model
00452   //
00453   // temp_pain_points: temporary storage for tentative pain points generated
00454   // by the language model after a single call to LanguageModel::UpdateState()
00455   // (the argument is passed in rather than created before each
00456   // LanguageModel::UpdateState() call to avoid dynamic memory re-allocation)
00457   //
00458   // best_choice_bundle: a collection of variables that should be updated
00459   // if a new best choice is found
00460   //
00461   void UpdateSegSearchNodes(
00462       float rating_cert_scale,
00463       int starting_col,
00464       GenericVector<SegSearchPending>* pending,
00465       WERD_RES *word_res,
00466       LMPainPoints *pain_points,
00467       BestChoiceBundle *best_choice_bundle,
00468       BlamerBundle *blamer_bundle);
00469 
00470   // Process the given pain point: classify the corresponding blob, enqueue
00471   // new pain points to join the newly classified blob with its neighbors.
00472   void ProcessSegSearchPainPoint(float pain_point_priority,
00473                                  const MATRIX_COORD &pain_point,
00474                                  const char* pain_point_type,
00475                                  GenericVector<SegSearchPending>* pending,
00476                                  WERD_RES *word_res,
00477                                  LMPainPoints *pain_points,
00478                                  BlamerBundle *blamer_bundle);
00479   // Resets enough of the results so that the Viterbi search is re-run.
00480   // Needed when the n-gram model is enabled, as the multi-length comparison
00481   // implementation will re-value existing paths to worse values.
00482   void ResetNGramSearch(WERD_RES* word_res,
00483                         BestChoiceBundle* best_choice_bundle,
00484                         GenericVector<SegSearchPending>* pending);
00485 
00486   // Add pain points for classifying blobs on the correct segmentation path
00487   // (so that we can evaluate correct segmentation path and discover the reason
00488   // for incorrect result).
00489   void InitBlamerForSegSearch(WERD_RES *word_res,
00490                               LMPainPoints *pain_points,
00491                               BlamerBundle *blamer_bundle,
00492                               STRING *blamer_debug);
00493 };
00494 
00495 
00496 }  // namespace tesseract
00497 
00498 #endif  // TESSERACT_WORDREC_WORDREC_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines