tesseract
3.03
|
00001 00002 // File: wordrec.h 00003 // Description: wordrec class. 00004 // Author: Samuel Charron 00005 // 00006 // (C) Copyright 2006, Google Inc. 00007 // Licensed under the Apache License, Version 2.0 (the "License"); 00008 // you may not use this file except in compliance with the License. 00009 // You may obtain a copy of the License at 00010 // http://www.apache.org/licenses/LICENSE-2.0 00011 // Unless required by applicable law or agreed to in writing, software 00012 // distributed under the License is distributed on an "AS IS" BASIS, 00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 // See the License for the specific language governing permissions and 00015 // limitations under the License. 00016 // 00018 00019 #ifndef TESSERACT_WORDREC_WORDREC_H__ 00020 #define TESSERACT_WORDREC_WORDREC_H__ 00021 00022 #include "associate.h" 00023 #include "classify.h" 00024 #include "dict.h" 00025 #include "language_model.h" 00026 #include "ratngs.h" 00027 #include "matrix.h" 00028 #include "gradechop.h" 00029 #include "seam.h" 00030 #include "findseam.h" 00031 #include "callcpp.h" 00032 00033 class WERD_RES; 00034 00035 namespace tesseract { 00036 00037 // A class for storing which nodes are to be processed by the segmentation 00038 // search. There is a single SegSearchPending for each column in the ratings 00039 // matrix, and it indicates whether the segsearch should combine all 00040 // BLOB_CHOICES in the column, or just the given row with the parents 00041 // corresponding to *this SegSearchPending, and whether only updated parent 00042 // ViterbiStateEntries should be combined, or all, with the BLOB_CHOICEs. 00043 class SegSearchPending { 00044 public: 00045 SegSearchPending() 00046 : classified_row_(-1), 00047 revisit_whole_column_(false), 00048 column_classified_(false) {} 00049 00050 // Marks the whole column as just classified. Used to start a search on 00051 // a newly initialized ratings matrix. 00052 void SetColumnClassified() { 00053 column_classified_ = true; 00054 } 00055 // Marks the matrix entry at the given row as just classified. 00056 // Used after classifying a new matrix cell. 00057 // Additional to, not overriding a previous RevisitWholeColumn. 00058 void SetBlobClassified(int row) { 00059 classified_row_ = row; 00060 } 00061 // Marks the whole column as needing work, but not just classified. 00062 // Used when the parent vse list is updated. 00063 // Additional to, not overriding a previous SetBlobClassified. 00064 void RevisitWholeColumn() { 00065 revisit_whole_column_ = true; 00066 } 00067 00068 // Clears *this to indicate no work to do. 00069 void Clear() { 00070 classified_row_ = -1; 00071 revisit_whole_column_ = false; 00072 column_classified_ = false; 00073 } 00074 00075 // Returns true if there are updates to do in the column that *this 00076 // represents. 00077 bool WorkToDo() const { 00078 return revisit_whole_column_ || column_classified_ || classified_row_ >= 0; 00079 } 00080 // Returns true if the given row was just classified. 00081 bool IsRowJustClassified(int row) const { 00082 return row == classified_row_ || column_classified_; 00083 } 00084 // Returns the single row to process if there is only one, otherwise -1. 00085 int SingleRow() const { 00086 return revisit_whole_column_ || column_classified_ ? -1 : classified_row_; 00087 } 00088 00089 private: 00090 // If non-negative, indicates the single row in the ratings matrix that has 00091 // just been classified, and so should be combined with all the parents in the 00092 // column that this SegSearchPending represents. 00093 // Operates independently of revisit_whole_column. 00094 int classified_row_; 00095 // If revisit_whole_column is true, then all BLOB_CHOICEs in this column will 00096 // be processed, but classified_row can indicate a row that is newly 00097 // classified. Overridden if column_classified is true. 00098 bool revisit_whole_column_; 00099 // If column_classified is true, parent vses are processed with all rows 00100 // regardless of whether they are just updated, overriding 00101 // revisit_whole_column and classified_row. 00102 bool column_classified_; 00103 }; 00104 00105 00106 /* ccmain/tstruct.cpp *********************************************************/ 00107 class FRAGMENT:public ELIST_LINK 00108 { 00109 public: 00110 FRAGMENT() { //constructor 00111 } 00112 FRAGMENT(EDGEPT *head_pt, //start 00113 EDGEPT *tail_pt); //end 00114 00115 ICOORD head; //coords of start 00116 ICOORD tail; //coords of end 00117 EDGEPT *headpt; //start point 00118 EDGEPT *tailpt; //end point 00119 }; 00120 ELISTIZEH(FRAGMENT) 00121 00122 00123 class Wordrec : public Classify { 00124 public: 00125 // config parameters ******************************************************* 00126 BOOL_VAR_H(merge_fragments_in_matrix, TRUE, 00127 "Merge the fragments in the ratings matrix and delete them " 00128 "after merging"); 00129 BOOL_VAR_H(wordrec_no_block, FALSE, "Don't output block information"); 00130 BOOL_VAR_H(wordrec_enable_assoc, TRUE, "Associator Enable"); 00131 BOOL_VAR_H(force_word_assoc, FALSE, 00132 "force associator to run regardless of what enable_assoc is." 00133 "This is used for CJK where component grouping is necessary."); 00134 double_VAR_H(wordrec_worst_state, 1, "Worst segmentation state"); 00135 BOOL_VAR_H(fragments_guide_chopper, FALSE, 00136 "Use information from fragments to guide chopping process"); 00137 INT_VAR_H(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped"); 00138 double_VAR_H(tessedit_certainty_threshold, -2.25, "Good blob limit"); 00139 INT_VAR_H(chop_debug, 0, "Chop debug"); 00140 BOOL_VAR_H(chop_enable, 1, "Chop enable"); 00141 BOOL_VAR_H(chop_vertical_creep, 0, "Vertical creep"); 00142 INT_VAR_H(chop_split_length, 10000, "Split Length"); 00143 INT_VAR_H(chop_same_distance, 2, "Same distance"); 00144 INT_VAR_H(chop_min_outline_points, 6, "Min Number of Points on Outline"); 00145 INT_VAR_H(chop_seam_pile_size, 150, "Max number of seams in seam_pile"); 00146 BOOL_VAR_H(chop_new_seam_pile, 1, "Use new seam_pile"); 00147 INT_VAR_H(chop_inside_angle, -50, "Min Inside Angle Bend"); 00148 INT_VAR_H(chop_min_outline_area, 2000, "Min Outline Area"); 00149 double_VAR_H(chop_split_dist_knob, 0.5, "Split length adjustment"); 00150 double_VAR_H(chop_overlap_knob, 0.9, "Split overlap adjustment"); 00151 double_VAR_H(chop_center_knob, 0.15, "Split center adjustment"); 00152 INT_VAR_H(chop_centered_maxwidth, 90, "Width of (smaller) chopped blobs " 00153 "above which we don't care that a chop is not near the center."); 00154 double_VAR_H(chop_sharpness_knob, 0.06, "Split sharpness adjustment"); 00155 double_VAR_H(chop_width_change_knob, 5.0, "Width change adjustment"); 00156 double_VAR_H(chop_ok_split, 100.0, "OK split limit"); 00157 double_VAR_H(chop_good_split, 50.0, "Good split limit"); 00158 INT_VAR_H(chop_x_y_weight, 3, "X / Y length weight"); 00159 INT_VAR_H(segment_adjust_debug, 0, "Segmentation adjustment debug"); 00160 BOOL_VAR_H(assume_fixed_pitch_char_segment, FALSE, 00161 "include fixed-pitch heuristics in char segmentation"); 00162 INT_VAR_H(wordrec_debug_level, 0, "Debug level for wordrec"); 00163 INT_VAR_H(wordrec_max_join_chunks, 4, 00164 "Max number of broken pieces to associate"); 00165 BOOL_VAR_H(wordrec_skip_no_truth_words, false, 00166 "Only run OCR for words that had truth recorded in BlamerBundle"); 00167 BOOL_VAR_H(wordrec_debug_blamer, false, "Print blamer debug messages"); 00168 BOOL_VAR_H(wordrec_run_blamer, false, "Try to set the blame for errors"); 00169 INT_VAR_H(segsearch_debug_level, 0, "SegSearch debug level"); 00170 INT_VAR_H(segsearch_max_pain_points, 2000, 00171 "Maximum number of pain points stored in the queue"); 00172 INT_VAR_H(segsearch_max_futile_classifications, 10, 00173 "Maximum number of pain point classifications per word."); 00174 double_VAR_H(segsearch_max_char_wh_ratio, 2.0, 00175 "Maximum character width-to-height ratio"); 00176 BOOL_VAR_H(save_alt_choices, true, 00177 "Save alternative paths found during chopping " 00178 "and segmentation search"); 00179 00180 // methods from wordrec/*.cpp *********************************************** 00181 Wordrec(); 00182 virtual ~Wordrec(); 00183 00184 // Fills word->alt_choices with alternative paths found during 00185 // chopping/segmentation search that are kept in best_choices. 00186 void SaveAltChoices(const LIST &best_choices, WERD_RES *word); 00187 00188 // Fills character choice lattice in the given BlamerBundle 00189 // using the given ratings matrix and best choice list. 00190 void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, 00191 const UNICHARSET &unicharset, BlamerBundle *blamer_bundle); 00192 00193 // Calls fill_lattice_ member function 00194 // (assumes that fill_lattice_ is not NULL). 00195 void CallFillLattice(const MATRIX &ratings, 00196 const WERD_CHOICE_LIST &best_choices, 00197 const UNICHARSET &unicharset, 00198 BlamerBundle *blamer_bundle) { 00199 (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle); 00200 } 00201 00202 // tface.cpp 00203 void program_editup(const char *textbase, 00204 bool init_classifier, 00205 bool init_permute); 00206 void cc_recog(WERD_RES *word); 00207 void program_editdown(inT32 elasped_time); 00208 void set_pass1(); 00209 void set_pass2(); 00210 int end_recog(); 00211 BLOB_CHOICE_LIST *call_matcher(TBLOB* blob); 00212 int dict_word(const WERD_CHOICE &word); 00213 // wordclass.cpp 00214 BLOB_CHOICE_LIST *classify_blob(TBLOB *blob, 00215 const char *string, 00216 C_COL color, 00217 BlamerBundle *blamer_bundle); 00218 00219 // segsearch.cpp 00220 // SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs. 00221 // Each entry in the matrix represents the classification choice 00222 // for a chunk, i.e. an entry in row 2, column 1 represents the list 00223 // of ratings for the chunks 1 and 2 classified as a single blob. 00224 // The entries on the diagonal of the matrix are classifier choice lists 00225 // for a single chunk from the maximal segmentation. 00226 // 00227 // The ratings matrix given to SegSearch represents the segmentation 00228 // graph / trellis for the current word. The nodes in the graph are the 00229 // individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings 00230 // matrix. The children of each node (nodes connected by outgoing links) 00231 // are the entries in the column that is equal to node's row+1. The parents 00232 // (nodes connected by the incoming links) are the entries in the row that 00233 // is equal to the node's column-1. Here is an example ratings matrix: 00234 // 00235 // 0 1 2 3 4 00236 // ------------------------- 00237 // 0| c,( | 00238 // 1| d l,1 | 00239 // 2| o | 00240 // 3| c,( | 00241 // 4| g,y l,1 | 00242 // ------------------------- 00243 // 00244 // In the example above node "o" has children (outgoing connection to nodes) 00245 // "c","(","g","y" and parents (incoming connections from nodes) "l","1","d". 00246 // 00247 // The objective of the search is to find the least cost path, where the cost 00248 // is determined by the language model components and the properties of the 00249 // cut between the blobs on the path. SegSearch starts by populating the 00250 // matrix with the all the entries that were classified by the chopper and 00251 // finding the initial best path. Based on the classifier ratings, language 00252 // model scores and the properties of each cut, a list of "pain points" is 00253 // constructed - those are the points on the path where the choices do not 00254 // look consistent with the neighboring choices, the cuts look particularly 00255 // problematic, or the certainties of the blobs are low. The most troublesome 00256 // "pain point" is picked from the list and the new entry in the ratings 00257 // matrix corresponding to this "pain point" is filled in. Then the language 00258 // model state is updated to reflect the new classification and the new 00259 // "pain points" are added to the list and the next most troublesome 00260 // "pain point" is determined. This continues until either the word choice 00261 // composed from the best paths in the segmentation graph is "good enough" 00262 // (e.g. above a certain certainty threshold, is an unambiguous dictionary 00263 // word, etc) or there are no more "pain points" to explore. 00264 // 00265 // If associate_blobs is set to false no new classifications will be done 00266 // to combine blobs. Segmentation search will run only one "iteration" 00267 // on the classifications already recorded in chunks_record.ratings. 00268 // 00269 // Note: this function assumes that word, output_best_state, 00270 // best_char_choices and fixpt arguments are not NULL. 00271 void SegSearch(WERD_RES* word_res, 00272 BestChoiceBundle* best_choice_bundle, 00273 BlamerBundle* blamer_bundle); 00274 00275 // Runs SegSearch() function (above) without needing a best_choice_bundle 00276 // or blamer_bundle. Used for testing. 00277 void DoSegSearch(WERD_RES* word_res); 00278 00279 // chop.cpp 00280 PRIORITY point_priority(EDGEPT *point); 00281 void add_point_to_list(PointHeap* point_heap, EDGEPT *point); 00282 int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3); 00283 int is_little_chunk(EDGEPT *point1, EDGEPT *point2); 00284 int is_small_area(EDGEPT *point1, EDGEPT *point2); 00285 EDGEPT *pick_close_point(EDGEPT *critical_point, 00286 EDGEPT *vertical_point, 00287 int *best_dist); 00288 void prioritize_points(TESSLINE *outline, PointHeap* points); 00289 void new_min_point(EDGEPT *local_min, PointHeap* points); 00290 void new_max_point(EDGEPT *local_max, PointHeap* points); 00291 void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, 00292 EDGEPT** best_point, 00293 EDGEPT_CLIST *new_points); 00294 00295 // chopper.cpp 00296 SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number, 00297 bool italic_blob, const GenericVector<SEAM*>& seams); 00298 SEAM *chop_numbered_blob(TWERD *word, inT32 blob_number, 00299 bool italic_blob, const GenericVector<SEAM*>& seams); 00300 SEAM *chop_overlapping_blob(const GenericVector<TBOX>& boxes, 00301 bool italic_blob, 00302 WERD_RES *word_res, int *blob_number); 00303 SEAM *improve_one_blob(const GenericVector<BLOB_CHOICE*> &blob_choices, 00304 DANGERR *fixpt, 00305 bool split_next_to_fragment, 00306 bool italic_blob, 00307 WERD_RES *word, 00308 int *blob_number); 00309 SEAM *chop_one_blob(const GenericVector<TBOX> &boxes, 00310 const GenericVector<BLOB_CHOICE*> &blob_choices, 00311 WERD_RES *word_res, 00312 int *blob_number); 00313 void chop_word_main(WERD_RES *word); 00314 void improve_by_chopping(float rating_cert_scale, 00315 WERD_RES *word, 00316 BestChoiceBundle *best_choice_bundle, 00317 BlamerBundle *blamer_bundle, 00318 LMPainPoints *pain_points, 00319 GenericVector<SegSearchPending>* pending); 00320 int select_blob_to_split(const GenericVector<BLOB_CHOICE*> &blob_choices, 00321 float rating_ceiling, 00322 bool split_next_to_fragment); 00323 int select_blob_to_split_from_fixpt(DANGERR *fixpt); 00324 00325 // findseam.cpp 00326 void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue* seams); 00327 void choose_best_seam(SeamQueue* seam_queue, 00328 SPLIT *split, 00329 PRIORITY priority, 00330 SEAM **seam_result, 00331 TBLOB *blob, 00332 SeamPile* seam_pile); 00333 void combine_seam(const SeamPile& seam_pile, 00334 const SEAM* seam, SeamQueue* seam_queue); 00335 inT16 constrained_split(SPLIT *split, TBLOB *blob); 00336 SEAM *pick_good_seam(TBLOB *blob); 00337 PRIORITY seam_priority(SEAM *seam, inT16 xmin, inT16 xmax); 00338 void try_point_pairs (EDGEPT * points[MAX_NUM_POINTS], 00339 inT16 num_points, 00340 SeamQueue* seam_queue, 00341 SeamPile* seam_pile, 00342 SEAM ** seam, TBLOB * blob); 00343 void try_vertical_splits(EDGEPT * points[MAX_NUM_POINTS], 00344 inT16 num_points, 00345 EDGEPT_CLIST *new_points, 00346 SeamQueue* seam_queue, 00347 SeamPile* seam_pile, 00348 SEAM ** seam, TBLOB * blob); 00349 00350 // gradechop.cpp 00351 PRIORITY full_split_priority(SPLIT *split, inT16 xmin, inT16 xmax); 00352 PRIORITY grade_center_of_blob(register BOUNDS_RECT rect); 00353 PRIORITY grade_overlap(register BOUNDS_RECT rect); 00354 PRIORITY grade_split_length(register SPLIT *split); 00355 PRIORITY grade_sharpness(register SPLIT *split); 00356 PRIORITY grade_width_change(register BOUNDS_RECT rect); 00357 void set_outline_bounds(register EDGEPT *point1, 00358 register EDGEPT *point2, 00359 BOUNDS_RECT rect); 00360 00361 // outlines.cpp 00362 int crosses_outline(EDGEPT *p0, EDGEPT *p1, EDGEPT *outline); 00363 int is_crossed(TPOINT a0, TPOINT a1, TPOINT b0, TPOINT b1); 00364 int is_same_edgept(EDGEPT *p1, EDGEPT *p2); 00365 bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, 00366 EDGEPT **near_pt); 00367 void reverse_outline(EDGEPT *outline); 00368 00369 // pieces.cpp 00370 virtual BLOB_CHOICE_LIST *classify_piece(const GenericVector<SEAM*>& seams, 00371 inT16 start, 00372 inT16 end, 00373 const char* description, 00374 TWERD *word, 00375 BlamerBundle *blamer_bundle); 00376 // Try to merge fragments in the ratings matrix and put the result in 00377 // the corresponding row and column 00378 void merge_fragments(MATRIX *ratings, 00379 inT16 num_blobs); 00380 // Recursively go through the ratings matrix to find lists of fragments 00381 // to be merged in the function merge_and_put_fragment_lists. 00382 // current_frag is the postion of the piece we are looking for. 00383 // current_row is the row in the rating matrix we are currently at. 00384 // start is the row we started initially, so that we can know where 00385 // to append the results to the matrix. num_frag_parts is the total 00386 // number of pieces we are looking for and num_blobs is the size of the 00387 // ratings matrix. 00388 void get_fragment_lists(inT16 current_frag, 00389 inT16 current_row, 00390 inT16 start, 00391 inT16 num_frag_parts, 00392 inT16 num_blobs, 00393 MATRIX *ratings, 00394 BLOB_CHOICE_LIST *choice_lists); 00395 // Merge the fragment lists in choice_lists and append it to the 00396 // ratings matrix 00397 void merge_and_put_fragment_lists(inT16 row, 00398 inT16 column, 00399 inT16 num_frag_parts, 00400 BLOB_CHOICE_LIST *choice_lists, 00401 MATRIX *ratings); 00402 // Filter the fragment list so that the filtered_choices only contain 00403 // fragments that are in the correct position. choices is the list 00404 // that we are going to filter. fragment_pos is the position in the 00405 // fragment that we are looking for and num_frag_parts is the the 00406 // total number of pieces. The result will be appended to 00407 // filtered_choices. 00408 void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, 00409 int fragment_pos, 00410 int num_frag_parts, 00411 BLOB_CHOICE_LIST *filtered_choices); 00412 00413 // Member variables. 00414 00415 LanguageModel *language_model_; 00416 PRIORITY pass2_ok_split; 00417 // Stores the best choice for the previous word in the paragraph. 00418 // This variable is modified by PAGE_RES_IT when iterating over 00419 // words to OCR on the page. 00420 WERD_CHOICE *prev_word_best_choice_; 00421 // Sums of blame reasons computed by the blamer. 00422 GenericVector<int> blame_reasons_; 00423 // Function used to fill char choice lattices. 00424 void (Wordrec::*fill_lattice_)(const MATRIX &ratings, 00425 const WERD_CHOICE_LIST &best_choices, 00426 const UNICHARSET &unicharset, 00427 BlamerBundle *blamer_bundle); 00428 00429 protected: 00430 inline bool SegSearchDone(int num_futile_classifications) { 00431 return (language_model_->AcceptableChoiceFound() || 00432 num_futile_classifications >= 00433 segsearch_max_futile_classifications); 00434 } 00435 00436 // Updates the language model state recorded for the child entries specified 00437 // in pending[starting_col]. Enqueues the children of the updated entries 00438 // into pending and proceeds to update (and remove from pending) all the 00439 // remaining entries in pending[col] (col >= starting_col). Upon termination 00440 // of this function all the pending[col] lists will be empty. 00441 // 00442 // The arguments: 00443 // 00444 // starting_col: index of the column in chunks_record->ratings from 00445 // which the update should be started 00446 // 00447 // pending: list of entries listing chunks_record->ratings entries 00448 // that should be updated 00449 // 00450 // pain_points: priority heap listing the pain points generated by 00451 // the language model 00452 // 00453 // temp_pain_points: temporary storage for tentative pain points generated 00454 // by the language model after a single call to LanguageModel::UpdateState() 00455 // (the argument is passed in rather than created before each 00456 // LanguageModel::UpdateState() call to avoid dynamic memory re-allocation) 00457 // 00458 // best_choice_bundle: a collection of variables that should be updated 00459 // if a new best choice is found 00460 // 00461 void UpdateSegSearchNodes( 00462 float rating_cert_scale, 00463 int starting_col, 00464 GenericVector<SegSearchPending>* pending, 00465 WERD_RES *word_res, 00466 LMPainPoints *pain_points, 00467 BestChoiceBundle *best_choice_bundle, 00468 BlamerBundle *blamer_bundle); 00469 00470 // Process the given pain point: classify the corresponding blob, enqueue 00471 // new pain points to join the newly classified blob with its neighbors. 00472 void ProcessSegSearchPainPoint(float pain_point_priority, 00473 const MATRIX_COORD &pain_point, 00474 const char* pain_point_type, 00475 GenericVector<SegSearchPending>* pending, 00476 WERD_RES *word_res, 00477 LMPainPoints *pain_points, 00478 BlamerBundle *blamer_bundle); 00479 // Resets enough of the results so that the Viterbi search is re-run. 00480 // Needed when the n-gram model is enabled, as the multi-length comparison 00481 // implementation will re-value existing paths to worse values. 00482 void ResetNGramSearch(WERD_RES* word_res, 00483 BestChoiceBundle* best_choice_bundle, 00484 GenericVector<SegSearchPending>* pending); 00485 00486 // Add pain points for classifying blobs on the correct segmentation path 00487 // (so that we can evaluate correct segmentation path and discover the reason 00488 // for incorrect result). 00489 void InitBlamerForSegSearch(WERD_RES *word_res, 00490 LMPainPoints *pain_points, 00491 BlamerBundle *blamer_bundle, 00492 STRING *blamer_debug); 00493 }; 00494 00495 00496 } // namespace tesseract 00497 00498 #endif // TESSERACT_WORDREC_WORDREC_H__