tesseract
3.03
|
00001 00002 // File: recogtraining.cpp 00003 // Description: Functions for ambiguity and parameter training. 00004 // Author: Daria Antonova 00005 // Created: Mon Aug 13 11:26:43 PDT 2009 00006 // 00007 // (C) Copyright 2009, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #include "tesseractclass.h" 00021 00022 #include "boxread.h" 00023 #include "control.h" 00024 #include "cutil.h" 00025 #include "host.h" 00026 #include "ratngs.h" 00027 #include "reject.h" 00028 #include "stopper.h" 00029 00030 namespace tesseract { 00031 00032 const inT16 kMaxBoxEdgeDiff = 2; 00033 00034 // Sets flags necessary for recognition in the training mode. 00035 // Opens and returns the pointer to the output file. 00036 FILE *Tesseract::init_recog_training(const STRING &fname) { 00037 if (tessedit_ambigs_training) { 00038 tessedit_tess_adaption_mode.set_value(0); // turn off adaption 00039 tessedit_enable_doc_dict.set_value(0); // turn off document dictionary 00040 // Explore all segmentations. 00041 getDict().stopper_no_acceptable_choices.set_value(1); 00042 } 00043 00044 STRING output_fname = fname; 00045 const char *lastdot = strrchr(output_fname.string(), '.'); 00046 if (lastdot != NULL) output_fname[lastdot - output_fname.string()] = '\0'; 00047 output_fname += ".txt"; 00048 FILE *output_file = open_file(output_fname.string(), "a+"); 00049 return output_file; 00050 } 00051 00052 // Copies the bounding box from page_res_it->word() to the given TBOX. 00053 bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) { 00054 while (page_res_it->block() != NULL) { 00055 if (page_res_it->word() != NULL) 00056 break; 00057 page_res_it->forward(); 00058 } 00059 00060 if (page_res_it->word() != NULL) { 00061 *tbox = page_res_it->word()->word->bounding_box(); 00062 page_res_it->forward(); 00063 00064 // If tbox->left() is negative, the training image has vertical text and 00065 // all the coordinates of bounding boxes of page_res are rotated by 90 00066 // degrees in a counterclockwise direction. We need to rotate the TBOX back 00067 // in order to compare with the TBOXes of box files. 00068 if (tbox->left() < 0) { 00069 tbox->rotate(FCOORD(0.0, -1.0)); 00070 } 00071 00072 return true; 00073 } else { 00074 return false; 00075 } 00076 } 00077 00078 // This function takes tif/box pair of files and runs recognition on the image, 00079 // while making sure that the word bounds that tesseract identified roughly 00080 // match to those specified by the input box file. For each word (ngram in a 00081 // single bounding box from the input box file) it outputs the ocred result, 00082 // the correct label, rating and certainty. 00083 void Tesseract::recog_training_segmented(const STRING &fname, 00084 PAGE_RES *page_res, 00085 volatile ETEXT_DESC *monitor, 00086 FILE *output_file) { 00087 STRING box_fname = fname; 00088 const char *lastdot = strrchr(box_fname.string(), '.'); 00089 if (lastdot != NULL) box_fname[lastdot - box_fname.string()] = '\0'; 00090 box_fname += ".box"; 00091 // read_next_box() will close box_file 00092 FILE *box_file = open_file(box_fname.string(), "r"); 00093 00094 PAGE_RES_IT page_res_it; 00095 page_res_it.page_res = page_res; 00096 page_res_it.restart_page(); 00097 STRING label; 00098 00099 // Process all the words on this page. 00100 TBOX tbox; // tesseract-identified box 00101 TBOX bbox; // box from the box file 00102 bool keep_going; 00103 int line_number = 0; 00104 int examined_words = 0; 00105 do { 00106 keep_going = read_t(&page_res_it, &tbox); 00107 keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label, 00108 &bbox); 00109 // Align bottom left points of the TBOXes. 00110 while (keep_going && 00111 !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) { 00112 keep_going = (bbox.bottom() < tbox.bottom()) ? 00113 read_t(&page_res_it, &tbox) : 00114 ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox); 00115 } 00116 while (keep_going && 00117 !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) { 00118 keep_going = (bbox.left() > tbox.left()) ? read_t(&page_res_it, &tbox) : 00119 ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox); 00120 } 00121 // OCR the word if top right points of the TBOXes are similar. 00122 if (keep_going && 00123 NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) && 00124 NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) { 00125 ambigs_classify_and_output(page_res_it.prev_word(), 00126 page_res_it.prev_row(), 00127 page_res_it.prev_block(), 00128 label.string(), output_file); 00129 examined_words++; 00130 } 00131 } while (keep_going); 00132 00133 // Set up scripts on all of the words that did not get sent to 00134 // ambigs_classify_and_output. They all should have, but if all the 00135 // werd_res's don't get uch_sets, tesseract will crash when you try 00136 // to iterate over them. :-( 00137 int total_words = 0; 00138 for (page_res_it.restart_page(); page_res_it.block() != NULL; 00139 page_res_it.forward()) { 00140 if (page_res_it.word()) { 00141 if (page_res_it.word()->uch_set == NULL) 00142 page_res_it.word()->SetupFake(unicharset); 00143 total_words++; 00144 } 00145 } 00146 if (examined_words < 0.85 * total_words) { 00147 tprintf("TODO(antonova): clean up recog_training_segmented; " 00148 " It examined only a small fraction of the ambigs image.\n"); 00149 } 00150 tprintf("recog_training_segmented: examined %d / %d words.\n", 00151 examined_words, total_words); 00152 } 00153 00154 // Helper prints the given set of blob choices. 00155 static void PrintPath(int length, const BLOB_CHOICE** blob_choices, 00156 const UNICHARSET& unicharset, 00157 const char *label, FILE *output_file) { 00158 float rating = 0.0f; 00159 float certainty = 0.0f; 00160 for (int i = 0; i < length; ++i) { 00161 const BLOB_CHOICE* blob_choice = blob_choices[i]; 00162 fprintf(output_file, "%s", 00163 unicharset.id_to_unichar(blob_choice->unichar_id())); 00164 rating += blob_choice->rating(); 00165 if (certainty > blob_choice->certainty()) 00166 certainty = blob_choice->certainty(); 00167 } 00168 fprintf(output_file, "\t%s\t%.4f\t%.4f\n", 00169 label, rating, certainty); 00170 } 00171 00172 // Helper recursively prints all paths through the ratings matrix, starting 00173 // at column col. 00174 static void PrintMatrixPaths(int col, int dim, 00175 const MATRIX& ratings, 00176 int length, const BLOB_CHOICE** blob_choices, 00177 const UNICHARSET& unicharset, 00178 const char *label, FILE *output_file) { 00179 for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) { 00180 if (ratings.get(col, row) != NOT_CLASSIFIED) { 00181 BLOB_CHOICE_IT bc_it(ratings.get(col, row)); 00182 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) { 00183 blob_choices[length] = bc_it.data(); 00184 if (row + 1 < dim) { 00185 PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices, 00186 unicharset, label, output_file); 00187 } else { 00188 PrintPath(length + 1, blob_choices, unicharset, label, output_file); 00189 } 00190 } 00191 } 00192 } 00193 } 00194 00195 // Runs classify_word_pass1() on the current word. Outputs Tesseract's 00196 // raw choice as a result of the classification. For words labeled with a 00197 // single unichar also outputs all alternatives from blob_choices of the 00198 // best choice. 00199 void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res, 00200 ROW_RES *row_res, 00201 BLOCK_RES *block_res, 00202 const char *label, 00203 FILE *output_file) { 00204 // Classify word. 00205 fflush(stdout); 00206 WordData word_data(block_res->block, row_res->row, werd_res); 00207 SetupWordPassN(1, &word_data); 00208 classify_word_pass1(&word_data, werd_res); 00209 WERD_CHOICE *best_choice = werd_res->best_choice; 00210 ASSERT_HOST(best_choice != NULL); 00211 00212 // Compute the number of unichars in the label. 00213 GenericVector<UNICHAR_ID> encoding; 00214 if (!unicharset.encode_string(label, true, &encoding, NULL, NULL)) { 00215 tprintf("Not outputting illegal unichar %s\n", label); 00216 return; 00217 } 00218 00219 // Dump all paths through the ratings matrix (which is normally small). 00220 int dim = werd_res->ratings->dimension(); 00221 const BLOB_CHOICE** blob_choices = new const BLOB_CHOICE*[dim]; 00222 PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, 00223 unicharset, label, output_file); 00224 delete [] blob_choices; 00225 } 00226 00227 } // namespace tesseract