tesseract 3.03 — source listing of ccmain/applybox.cpp
00001 /********************************************************************** 00002 * File: applybox.cpp (Formerly applybox.c) 00003 * Description: Re segment rows according to box file data 00004 * Author: Phil Cheatle 00005 * Created: Wed Nov 24 09:11:23 GMT 1993 00006 * 00007 * (C) Copyright 1993, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #include <ctype.h> 00025 #include <string.h> 00026 #ifdef __UNIX__ 00027 #include <assert.h> 00028 #include <errno.h> 00029 #endif 00030 #include "allheaders.h" 00031 #include "boxread.h" 00032 #include "chopper.h" 00033 #include "pageres.h" 00034 #include "unichar.h" 00035 #include "unicharset.h" 00036 #include "tesseractclass.h" 00037 #include "genericvector.h" 00038 00039 // Max number of blobs to classify together in FindSegmentation. 00040 const int kMaxGroupSize = 4; 00041 // Max fraction of median allowed as deviation in xheight before switching 00042 // to median. 
00043 const double kMaxXHeightDeviationFraction = 0.125; 00044 00045 /************************************************************************* 00046 * The box file is assumed to contain box definitions, one per line, of the 00047 * following format for blob-level boxes: 00048 * <UTF8 str> <left> <bottom> <right> <top> <page id> 00049 * and for word/line-level boxes: 00050 * WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str> 00051 * NOTES: 00052 * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT. 00053 * 00054 * <page id> is 0-based, and the page number is used for multipage input (tiff). 00055 * 00056 * In the blob-level form, each line represents a recognizable unit, which may 00057 * be several UTF-8 bytes, but there is a bounding box around each recognizable 00058 * unit, and no classifier is needed to train in this mode (bootstrapping.) 00059 * 00060 * In the word/line-level form, the line begins with the literal "WordStr", and 00061 * the bounding box bounds either a whole line or a whole word. The recognizable 00062 * units in the word/line are listed after the # at the end of the line and 00063 * are space delimited, ignoring any original spaces on the line. 00064 * Eg. 00065 * word -> #w o r d 00066 * multi word line -> #m u l t i w o r d l i n e 00067 * The recognizable units must be space-delimited in order to allow multiple 00068 * unicodes to be used for a single recognizable unit, eg Hindi. 00069 * In this mode, the classifier must have been pre-trained with the desired 00070 * character set, or it will not be able to find the character segmentations. 
00071 *************************************************************************/ 00072 00073 namespace tesseract { 00074 00075 static void clear_any_old_text(BLOCK_LIST *block_list) { 00076 BLOCK_IT block_it(block_list); 00077 for (block_it.mark_cycle_pt(); 00078 !block_it.cycled_list(); block_it.forward()) { 00079 ROW_IT row_it(block_it.data()->row_list()); 00080 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00081 WERD_IT word_it(row_it.data()->word_list()); 00082 for (word_it.mark_cycle_pt(); 00083 !word_it.cycled_list(); word_it.forward()) { 00084 word_it.data()->set_text(""); 00085 } 00086 } 00087 } 00088 } 00089 00090 // Applies the box file based on the image name fname, and resegments 00091 // the words in the block_list (page), with: 00092 // blob-mode: one blob per line in the box file, words as input. 00093 // word/line-mode: one blob per space-delimited unit after the #, and one word 00094 // per line in the box file. (See comment above for box file format.) 00095 // If find_segmentation is true, (word/line mode) then the classifier is used 00096 // to re-segment words/lines to match the space-delimited truth string for 00097 // each box. In this case, the input box may be for a word or even a whole 00098 // text line, and the output words will contain multiple blobs corresponding 00099 // to the space-delimited input string. 00100 // With find_segmentation false, no classifier is needed, but the chopper 00101 // can still be used to correctly segment touching characters with the help 00102 // of the input boxes. 00103 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned 00104 // from normal classification, ie. with a word, chopped_word, rebuild_word, 00105 // seam_array, denorm, box_word, and best_state, but NO best_choice or 00106 // raw_choice, as they would require a UNICHARSET, which we aim to avoid. 
00107 // Instead, the correct_text member of WERD_RES is set, and this may be later 00108 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords 00109 // is not required before calling ApplyBoxTraining. 00110 PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname, 00111 bool find_segmentation, 00112 BLOCK_LIST *block_list) { 00113 int box_count = 0; 00114 int box_failures = 0; 00115 00116 FILE* box_file = OpenBoxFile(fname); 00117 TBOX box; 00118 GenericVector<TBOX> boxes; 00119 GenericVector<STRING> texts, full_texts; 00120 00121 bool found_box = true; 00122 while (found_box) { 00123 int line_number = 0; // Line number of the box file. 00124 STRING text, full_text; 00125 found_box = ReadNextBox(applybox_page, &line_number, box_file, &text, &box); 00126 if (found_box) { 00127 ++box_count; 00128 MakeBoxFileStr(text.string(), box, applybox_page, &full_text); 00129 } else { 00130 full_text = ""; 00131 } 00132 boxes.push_back(box); 00133 texts.push_back(text); 00134 full_texts.push_back(full_text); 00135 } 00136 00137 // In word mode, we use the boxes to make a word for each box, but 00138 // in blob mode we use the existing words and maximally chop them first. 00139 PAGE_RES* page_res = find_segmentation ? 00140 NULL : SetupApplyBoxes(boxes, block_list); 00141 clear_any_old_text(block_list); 00142 00143 for (int i = 0; i < boxes.size() - 1; i++) { 00144 bool foundit = false; 00145 if (page_res != NULL) { 00146 if (i == 0) { 00147 foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1], 00148 full_texts[i].string()); 00149 } else { 00150 foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i], 00151 boxes[i + 1], full_texts[i].string()); 00152 } 00153 } else { 00154 foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1], 00155 texts[i].string()); 00156 } 00157 if (!foundit) { 00158 box_failures++; 00159 ReportFailedBox(i, boxes[i], texts[i].string(), 00160 "FAILURE! 
Couldn't find a matching blob"); 00161 } 00162 } 00163 00164 if (page_res == NULL) { 00165 // In word/line mode, we now maximally chop all the words and resegment 00166 // them with the classifier. 00167 page_res = SetupApplyBoxes(boxes, block_list); 00168 ReSegmentByClassification(page_res); 00169 } 00170 if (applybox_debug > 0) { 00171 tprintf("APPLY_BOXES:\n"); 00172 tprintf(" Boxes read from boxfile: %6d\n", box_count); 00173 if (box_failures > 0) 00174 tprintf(" Boxes failed resegmentation: %6d\n", box_failures); 00175 } 00176 TidyUp(page_res); 00177 return page_res; 00178 } 00179 00180 // Helper computes median xheight in the image. 00181 static double MedianXHeight(BLOCK_LIST *block_list) { 00182 BLOCK_IT block_it(block_list); 00183 STATS xheights(0, block_it.data()->bounding_box().height()); 00184 for (block_it.mark_cycle_pt(); 00185 !block_it.cycled_list(); block_it.forward()) { 00186 ROW_IT row_it(block_it.data()->row_list()); 00187 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00188 xheights.add(IntCastRounded(row_it.data()->x_height()), 1); 00189 } 00190 } 00191 return xheights.median(); 00192 } 00193 00194 // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: 00195 // All fuzzy spaces are removed, and all the words are maximally chopped. 00196 PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes, 00197 BLOCK_LIST *block_list) { 00198 double median_xheight = MedianXHeight(block_list); 00199 double max_deviation = kMaxXHeightDeviationFraction * median_xheight; 00200 // Strip all fuzzy space markers to simplify the PAGE_RES. 
00201 BLOCK_IT b_it(block_list); 00202 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 00203 BLOCK* block = b_it.data(); 00204 ROW_IT r_it(block->row_list()); 00205 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) { 00206 ROW* row = r_it.data(); 00207 float diff = fabs(row->x_height() - median_xheight); 00208 if (diff > max_deviation) { 00209 if (applybox_debug) { 00210 tprintf("row xheight=%g, but median xheight = %g\n", 00211 row->x_height(), median_xheight); 00212 } 00213 row->set_x_height(static_cast<float>(median_xheight)); 00214 } 00215 WERD_IT w_it(row->word_list()); 00216 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00217 WERD* word = w_it.data(); 00218 if (word->cblob_list()->empty()) { 00219 delete w_it.extract(); 00220 } else { 00221 word->set_flag(W_FUZZY_SP, false); 00222 word->set_flag(W_FUZZY_NON, false); 00223 } 00224 } 00225 } 00226 } 00227 PAGE_RES* page_res = new PAGE_RES(block_list, NULL); 00228 PAGE_RES_IT pr_it(page_res); 00229 WERD_RES* word_res; 00230 while ((word_res = pr_it.word()) != NULL) { 00231 MaximallyChopWord(boxes, pr_it.block()->block, 00232 pr_it.row()->row, word_res); 00233 pr_it.forward(); 00234 } 00235 return page_res; 00236 } 00237 00238 // Tests the chopper by exhaustively running chop_one_blob. 00239 // The word_res will contain filled chopped_word, seam_array, denorm, 00240 // box_word and best_state for the maximally chopped word. 
00241 void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes, 00242 BLOCK* block, ROW* row, 00243 WERD_RES* word_res) { 00244 if (!word_res->SetupForRecognition(unicharset, this, BestPix(), 00245 tessedit_ocr_engine_mode, NULL, 00246 classify_bln_numeric_mode, 00247 textord_use_cjk_fp_model, 00248 poly_allow_detailed_fx, 00249 row, block)) { 00250 word_res->CloneChoppedToRebuild(); 00251 return; 00252 } 00253 if (chop_debug) { 00254 tprintf("Maximally chopping word at:"); 00255 word_res->word->bounding_box().print(); 00256 } 00257 GenericVector<BLOB_CHOICE*> blob_choices; 00258 ASSERT_HOST(!word_res->chopped_word->blobs.empty()); 00259 float rating = static_cast<float>(MAX_INT8); 00260 for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) { 00261 // The rating and certainty are not quite arbitrary. Since 00262 // select_blob_to_chop uses the worst certainty to choose, they all have 00263 // to be different, so starting with MAX_INT8, subtract 1/8 for each blob 00264 // in here, and then divide by e each time they are chopped, which 00265 // should guarantee a set of unequal values for the whole tree of blobs 00266 // produced, however much chopping is required. The chops are thus only 00267 // limited by the ability of the chopper to find suitable chop points, 00268 // and not by the value of the certainties. 00269 BLOB_CHOICE* choice = 00270 new BLOB_CHOICE(0, rating, -rating, -1, -1, 0, 0, 0, 0, BCC_FAKE); 00271 blob_choices.push_back(choice); 00272 rating -= 0.125f; 00273 } 00274 const double e = exp(1.0); // The base of natural logs. 00275 int blob_number; 00276 int right_chop_index = 0; 00277 if (!assume_fixed_pitch_char_segment) { 00278 // We only chop if the language is not fixed pitch like CJK. 
00279 SEAM* seam = NULL; 00280 while ((seam = chop_one_blob(boxes, blob_choices, word_res, 00281 &blob_number)) != NULL) { 00282 word_res->InsertSeam(blob_number, seam); 00283 BLOB_CHOICE* left_choice = blob_choices[blob_number]; 00284 rating = left_choice->rating() / e; 00285 left_choice->set_rating(rating); 00286 left_choice->set_certainty(-rating); 00287 // combine confidence w/ serial # 00288 BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index, 00289 rating - 0.125f, -rating, 00290 -1, -1, 0, 0, 0, 0, BCC_FAKE); 00291 blob_choices.insert(right_choice, blob_number + 1); 00292 } 00293 } 00294 word_res->CloneChoppedToRebuild(); 00295 word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]); 00296 } 00297 00298 // Helper to compute the dispute resolution metric. 00299 // Disputed blob resolution. The aim is to give the blob to the most 00300 // appropriate boxfile box. Most of the time it is obvious, but if 00301 // two boxfile boxes overlap significantly it is not. If a small boxfile 00302 // box takes most of the blob, and a large boxfile box does too, then 00303 // we want the small boxfile box to get it, but if the small box 00304 // is much smaller than the blob, we don't want it to get it. 00305 // Details of the disputed blob resolution: 00306 // Given a box with area A, and a blob with area B, with overlap area C, 00307 // then the miss metric is (A-C)(B-C)/(AB) and the box with minimum 00308 // miss metric gets the blob. 00309 static double BoxMissMetric(const TBOX& box1, const TBOX& box2) { 00310 int overlap_area = box1.intersection(box2).area(); 00311 double miss_metric = box1.area()- overlap_area; 00312 miss_metric /= box1.area(); 00313 miss_metric *= box2.area() - overlap_area; 00314 miss_metric /= box2.area(); 00315 return miss_metric; 00316 } 00317 00318 // Gather consecutive blobs that match the given box into the best_state 00319 // and corresponding correct_text. 
00320 // Fights over which box owns which blobs are settled by pre-chopping and 00321 // applying the blobs to box or next_box with the least non-overlap. 00322 // Returns false if the box was in error, which can only be caused by 00323 // failing to find an appropriate blob for a box. 00324 // This means that occasionally, blobs may be incorrectly segmented if the 00325 // chopper fails to find a suitable chop point. 00326 bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box, 00327 const TBOX& box, const TBOX& next_box, 00328 const char* correct_text) { 00329 if (applybox_debug > 1) { 00330 tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text); 00331 } 00332 PAGE_RES_IT page_res_it(page_res); 00333 WERD_RES* word_res; 00334 for (word_res = page_res_it.word(); word_res != NULL; 00335 word_res = page_res_it.forward()) { 00336 if (!word_res->box_word->bounding_box().major_overlap(box)) 00337 continue; 00338 if (applybox_debug > 1) { 00339 tprintf("Checking word box:"); 00340 word_res->box_word->bounding_box().print(); 00341 } 00342 int word_len = word_res->box_word->length(); 00343 for (int i = 0; i < word_len; ++i) { 00344 TBOX char_box = TBOX(); 00345 int blob_count = 0; 00346 for (blob_count = 0; i + blob_count < word_len; ++blob_count) { 00347 TBOX blob_box = word_res->box_word->BlobBox(i + blob_count); 00348 if (!blob_box.major_overlap(box)) 00349 break; 00350 if (word_res->correct_text[i + blob_count].length() > 0) 00351 break; // Blob is claimed already. 00352 double current_box_miss_metric = BoxMissMetric(blob_box, box); 00353 double next_box_miss_metric = BoxMissMetric(blob_box, next_box); 00354 if (applybox_debug > 2) { 00355 tprintf("Checking blob:"); 00356 blob_box.print(); 00357 tprintf("Current miss metric = %g, next = %g\n", 00358 current_box_miss_metric, next_box_miss_metric); 00359 } 00360 if (current_box_miss_metric > next_box_miss_metric) 00361 break; // Blob is a better match for next box. 
00362 char_box += blob_box; 00363 } 00364 if (blob_count > 0) { 00365 if (applybox_debug > 1) { 00366 tprintf("Index [%d, %d) seem good.\n", i, i + blob_count); 00367 } 00368 if (!char_box.almost_equal(box, 3) && 00369 (box.x_gap(next_box) < -3 || 00370 (prev_box != NULL && prev_box->x_gap(box) < -3))) { 00371 return false; 00372 } 00373 // We refine just the box_word, best_state and correct_text here. 00374 // The rebuild_word is made in TidyUp. 00375 // blob_count blobs are put together to match the box. Merge the 00376 // box_word boxes, save the blob_count in the state and the text. 00377 word_res->box_word->MergeBoxes(i, i + blob_count); 00378 word_res->best_state[i] = blob_count; 00379 word_res->correct_text[i] = correct_text; 00380 if (applybox_debug > 2) { 00381 tprintf("%d Blobs match: blob box:", blob_count); 00382 word_res->box_word->BlobBox(i).print(); 00383 tprintf("Matches box:"); 00384 box.print(); 00385 tprintf("With next box:"); 00386 next_box.print(); 00387 } 00388 // Eliminated best_state and correct_text entries for the consumed 00389 // blobs. 00390 for (int j = 1; j < blob_count; ++j) { 00391 word_res->best_state.remove(i + 1); 00392 word_res->correct_text.remove(i + 1); 00393 } 00394 // Assume that no box spans multiple source words, so we are done with 00395 // this box. 00396 if (applybox_debug > 1) { 00397 tprintf("Best state = "); 00398 for (int j = 0; j < word_res->best_state.size(); ++j) { 00399 tprintf("%d ", word_res->best_state[j]); 00400 } 00401 tprintf("\n"); 00402 tprintf("Correct text = [[ "); 00403 for (int j = 0; j < word_res->correct_text.size(); ++j) { 00404 tprintf("%s ", word_res->correct_text[j].string()); 00405 } 00406 tprintf("]]\n"); 00407 } 00408 return true; 00409 } 00410 } 00411 } 00412 if (applybox_debug > 0) { 00413 tprintf("FAIL!\n"); 00414 } 00415 return false; // Failure. 
00416 } 00417 00418 // Consume all source blobs that strongly overlap the given box, 00419 // putting them into a new word, with the correct_text label. 00420 // Fights over which box owns which blobs are settled by 00421 // applying the blobs to box or next_box with the least non-overlap. 00422 // Returns false if the box was in error, which can only be caused by 00423 // failing to find an overlapping blob for a box. 00424 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, 00425 const TBOX& box, const TBOX& next_box, 00426 const char* correct_text) { 00427 if (applybox_debug > 1) { 00428 tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text); 00429 } 00430 WERD* new_word = NULL; 00431 BLOCK_IT b_it(block_list); 00432 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { 00433 BLOCK* block = b_it.data(); 00434 if (!box.major_overlap(block->bounding_box())) 00435 continue; 00436 ROW_IT r_it(block->row_list()); 00437 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) { 00438 ROW* row = r_it.data(); 00439 if (!box.major_overlap(row->bounding_box())) 00440 continue; 00441 WERD_IT w_it(row->word_list()); 00442 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00443 WERD* word = w_it.data(); 00444 if (applybox_debug > 2) { 00445 tprintf("Checking word:"); 00446 word->bounding_box().print(); 00447 } 00448 if (word->text() != NULL && word->text()[0] != '\0') 00449 continue; // Ignore words that are already done. 
00450 if (!box.major_overlap(word->bounding_box())) 00451 continue; 00452 C_BLOB_IT blob_it(word->cblob_list()); 00453 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); 00454 blob_it.forward()) { 00455 C_BLOB* blob = blob_it.data(); 00456 TBOX blob_box = blob->bounding_box(); 00457 if (!blob_box.major_overlap(box)) 00458 continue; 00459 double current_box_miss_metric = BoxMissMetric(blob_box, box); 00460 double next_box_miss_metric = BoxMissMetric(blob_box, next_box); 00461 if (applybox_debug > 2) { 00462 tprintf("Checking blob:"); 00463 blob_box.print(); 00464 tprintf("Current miss metric = %g, next = %g\n", 00465 current_box_miss_metric, next_box_miss_metric); 00466 } 00467 if (current_box_miss_metric > next_box_miss_metric) 00468 continue; // Blob is a better match for next box. 00469 if (applybox_debug > 2) { 00470 tprintf("Blob match: blob:"); 00471 blob_box.print(); 00472 tprintf("Matches box:"); 00473 box.print(); 00474 tprintf("With next box:"); 00475 next_box.print(); 00476 } 00477 if (new_word == NULL) { 00478 // Make a new word with a single blob. 00479 new_word = word->shallow_copy(); 00480 new_word->set_text(correct_text); 00481 w_it.add_to_end(new_word); 00482 } 00483 C_BLOB_IT new_blob_it(new_word->cblob_list()); 00484 new_blob_it.add_to_end(blob_it.extract()); 00485 } 00486 } 00487 } 00488 } 00489 if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n"); 00490 return new_word != NULL; 00491 } 00492 00493 // Resegments the words by running the classifier in an attempt to find the 00494 // correct segmentation that produces the required string. 00495 void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) { 00496 PAGE_RES_IT pr_it(page_res); 00497 WERD_RES* word_res; 00498 for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { 00499 WERD* word = word_res->word; 00500 if (word->text() == NULL || word->text()[0] == '\0') 00501 continue; // Ignore words that have no text. 
00502 // Convert the correct text to a vector of UNICHAR_ID 00503 GenericVector<UNICHAR_ID> target_text; 00504 if (!ConvertStringToUnichars(word->text(), &target_text)) { 00505 tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", 00506 word->text()); 00507 pr_it.DeleteCurrentWord(); 00508 continue; 00509 } 00510 if (!FindSegmentation(target_text, word_res)) { 00511 tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", 00512 word->text()); 00513 pr_it.DeleteCurrentWord(); 00514 continue; 00515 } 00516 } 00517 } 00518 00519 // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID. 00520 // Returns false if an invalid UNICHAR_ID is encountered. 00521 bool Tesseract::ConvertStringToUnichars(const char* utf8, 00522 GenericVector<UNICHAR_ID>* class_ids) { 00523 for (int step = 0; *utf8 != '\0'; utf8 += step) { 00524 const char* next_space = strchr(utf8, ' '); 00525 if (next_space == NULL) 00526 next_space = utf8 + strlen(utf8); 00527 step = next_space - utf8; 00528 UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step); 00529 if (class_id == INVALID_UNICHAR_ID) { 00530 return false; 00531 } 00532 while (utf8[step] == ' ') 00533 ++step; 00534 class_ids->push_back(class_id); 00535 } 00536 return true; 00537 } 00538 00539 // Resegments the word to achieve the target_text from the classifier. 00540 // Returns false if the re-segmentation fails. 00541 // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and 00542 // applies a full search on the classifier results to find the best classified 00543 // segmentation. As a compromise to obtain better recall, 1-1 ambiguity 00544 // substitutions ARE used. 00545 bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text, 00546 WERD_RES* word_res) { 00547 // Classify all required combinations of blobs and save results in choices. 
00548 int word_length = word_res->box_word->length(); 00549 GenericVector<BLOB_CHOICE_LIST*>* choices = 00550 new GenericVector<BLOB_CHOICE_LIST*>[word_length]; 00551 for (int i = 0; i < word_length; ++i) { 00552 for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) { 00553 BLOB_CHOICE_LIST* match_result = classify_piece( 00554 word_res->seam_array, i, i + j - 1, "Applybox", 00555 word_res->chopped_word, word_res->blamer_bundle); 00556 if (applybox_debug > 2) { 00557 tprintf("%d+%d:", i, j); 00558 print_ratings_list("Segment:", match_result, unicharset); 00559 } 00560 choices[i].push_back(match_result); 00561 } 00562 } 00563 // Search the segmentation graph for the target text. Must be an exact 00564 // match. Using wildcards makes it difficult to find the correct 00565 // segmentation even when it is there. 00566 word_res->best_state.clear(); 00567 GenericVector<int> search_segmentation; 00568 float best_rating = 0.0f; 00569 SearchForText(choices, 0, word_length, target_text, 0, 0.0f, 00570 &search_segmentation, &best_rating, &word_res->best_state); 00571 for (int i = 0; i < word_length; ++i) 00572 choices[i].delete_data_pointers(); 00573 delete [] choices; 00574 if (word_res->best_state.empty()) { 00575 // Build the original segmentation and if it is the same length as the 00576 // truth, assume it will do. 00577 int blob_count = 1; 00578 for (int s = 0; s < word_res->seam_array.size(); ++s) { 00579 SEAM* seam = word_res->seam_array[s]; 00580 if (seam->split1 == NULL) { 00581 word_res->best_state.push_back(blob_count); 00582 blob_count = 1; 00583 } else { 00584 ++blob_count; 00585 } 00586 } 00587 word_res->best_state.push_back(blob_count); 00588 if (word_res->best_state.size() != target_text.size()) { 00589 word_res->best_state.clear(); // No good. Original segmentation bad size. 
00590 return false; 00591 } 00592 } 00593 word_res->correct_text.clear(); 00594 for (int i = 0; i < target_text.size(); ++i) { 00595 word_res->correct_text.push_back( 00596 STRING(unicharset.id_to_unichar(target_text[i]))); 00597 } 00598 return true; 00599 } 00600 00601 // Recursive helper to find a match to the target_text (from text_index 00602 // position) in the choices (from choices_pos position). 00603 // Choices is an array of GenericVectors, of length choices_length, with each 00604 // element representing a starting position in the word, and the 00605 // GenericVector holding classification results for a sequence of consecutive 00606 // blobs, with index 0 being a single blob, index 1 being 2 blobs etc. 00607 void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices, 00608 int choices_pos, int choices_length, 00609 const GenericVector<UNICHAR_ID>& target_text, 00610 int text_index, 00611 float rating, GenericVector<int>* segmentation, 00612 float* best_rating, 00613 GenericVector<int>* best_segmentation) { 00614 const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs(); 00615 for (int length = 1; length <= choices[choices_pos].size(); ++length) { 00616 // Rating of matching choice or worst choice if no match. 00617 float choice_rating = 0.0f; 00618 // Find the corresponding best BLOB_CHOICE. 00619 BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]); 00620 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); 00621 choice_it.forward()) { 00622 BLOB_CHOICE* choice = choice_it.data(); 00623 choice_rating = choice->rating(); 00624 UNICHAR_ID class_id = choice->unichar_id(); 00625 if (class_id == target_text[text_index]) { 00626 break; 00627 } 00628 // Search ambigs table. 
00629 if (class_id < table.size() && table[class_id] != NULL) { 00630 AmbigSpec_IT spec_it(table[class_id]); 00631 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); 00632 spec_it.forward()) { 00633 const AmbigSpec *ambig_spec = spec_it.data(); 00634 // We'll only do 1-1. 00635 if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID && 00636 ambig_spec->correct_ngram_id == target_text[text_index]) 00637 break; 00638 } 00639 if (!spec_it.cycled_list()) 00640 break; // Found an ambig. 00641 } 00642 } 00643 if (choice_it.cycled_list()) 00644 continue; // No match. 00645 segmentation->push_back(length); 00646 if (choices_pos + length == choices_length && 00647 text_index + 1 == target_text.size()) { 00648 // This is a complete match. If the rating is good record a new best. 00649 if (applybox_debug > 2) { 00650 tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n", 00651 rating + choice_rating, *best_rating, segmentation->size(), 00652 best_segmentation->size()); 00653 } 00654 if (best_segmentation->empty() || rating + choice_rating < *best_rating) { 00655 *best_segmentation = *segmentation; 00656 *best_rating = rating + choice_rating; 00657 } 00658 } else if (choices_pos + length < choices_length && 00659 text_index + 1 < target_text.size()) { 00660 if (applybox_debug > 3) { 00661 tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", 00662 target_text[text_index], 00663 unicharset.id_to_unichar(target_text[text_index]), 00664 choice_it.data()->unichar_id() == target_text[text_index] 00665 ? 
"Match" : "Ambig", 00666 choices_pos, length); 00667 } 00668 SearchForText(choices, choices_pos + length, choices_length, target_text, 00669 text_index + 1, rating + choice_rating, segmentation, 00670 best_rating, best_segmentation); 00671 if (applybox_debug > 3) { 00672 tprintf("End recursion for %d=%s\n", target_text[text_index], 00673 unicharset.id_to_unichar(target_text[text_index])); 00674 } 00675 } 00676 segmentation->truncate(segmentation->size() - 1); 00677 } 00678 } 00679 00680 // Counts up the labelled words and the blobs within. 00681 // Deletes all unused or emptied words, counting the unused ones. 00682 // Resets W_BOL and W_EOL flags correctly. 00683 // Builds the rebuild_word and rebuilds the box_word and the best_choice. 00684 void Tesseract::TidyUp(PAGE_RES* page_res) { 00685 int ok_blob_count = 0; 00686 int bad_blob_count = 0; 00687 int ok_word_count = 0; 00688 int unlabelled_words = 0; 00689 PAGE_RES_IT pr_it(page_res); 00690 WERD_RES* word_res; 00691 for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { 00692 int ok_in_word = 0; 00693 int blob_count = word_res->correct_text.size(); 00694 WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count); 00695 word_choice->set_permuter(TOP_CHOICE_PERM); 00696 for (int c = 0; c < blob_count; ++c) { 00697 if (word_res->correct_text[c].length() > 0) { 00698 ++ok_in_word; 00699 } 00700 // Since we only need a fake word_res->best_choice, the actual 00701 // unichar_ids do not matter. Which is fortunate, since TidyUp() 00702 // can be called while training Tesseract, at the stage where 00703 // unicharset is not meaningful yet. 
00704 word_choice->append_unichar_id_space_allocated( 00705 INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f); 00706 } 00707 if (ok_in_word > 0) { 00708 ok_blob_count += ok_in_word; 00709 bad_blob_count += word_res->correct_text.size() - ok_in_word; 00710 word_res->LogNewRawChoice(word_choice); 00711 word_res->LogNewCookedChoice(1, false, word_choice); 00712 } else { 00713 ++unlabelled_words; 00714 if (applybox_debug > 0) { 00715 tprintf("APPLY_BOXES: Unlabelled word at :"); 00716 word_res->word->bounding_box().print(); 00717 } 00718 pr_it.DeleteCurrentWord(); 00719 } 00720 } 00721 pr_it.restart_page(); 00722 for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { 00723 // Denormalize back to a BoxWord. 00724 word_res->RebuildBestState(); 00725 word_res->SetupBoxWord(); 00726 word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row()); 00727 word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row()); 00728 } 00729 if (applybox_debug > 0) { 00730 tprintf(" Found %d good blobs.\n", ok_blob_count); 00731 if (bad_blob_count > 0) { 00732 tprintf(" Leaving %d unlabelled blobs in %d words.\n", 00733 bad_blob_count, ok_word_count); 00734 } 00735 if (unlabelled_words > 0) 00736 tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words); 00737 } 00738 } 00739 00740 // Logs a bad box by line in the box file and box coords. 00741 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box, 00742 const char *box_ch, const char *err_msg) { 00743 tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", 00744 boxfile_lineno + 1, box_ch, 00745 box.left(), box.bottom(), box.right(), box.top(), err_msg); 00746 } 00747 00748 // Creates a fake best_choice entry in each WERD_RES with the correct text. 
00749 void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) { 00750 PAGE_RES_IT pr_it(page_res); 00751 for (WERD_RES *word_res = pr_it.word(); word_res != NULL; 00752 word_res = pr_it.forward()) { 00753 WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set, 00754 word_res->correct_text.size()); 00755 for (int i = 0; i < word_res->correct_text.size(); ++i) { 00756 // The part before the first space is the real ground truth, and the 00757 // rest is the bounding box location and page number. 00758 GenericVector<STRING> tokens; 00759 word_res->correct_text[i].split(' ', &tokens); 00760 UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string()); 00761 choice->append_unichar_id_space_allocated(char_id, 00762 word_res->best_state[i], 00763 0.0f, 0.0f); 00764 } 00765 word_res->ClearWordChoices(); 00766 word_res->LogNewRawChoice(choice); 00767 word_res->LogNewCookedChoice(1, false, choice); 00768 } 00769 } 00770 00771 // Calls LearnWord to extract features for labelled blobs within each word. 00772 // Features are written to the given filename. 00773 void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) { 00774 PAGE_RES_IT pr_it(page_res); 00775 int word_count = 0; 00776 for (WERD_RES *word_res = pr_it.word(); word_res != NULL; 00777 word_res = pr_it.forward()) { 00778 LearnWord(filename.string(), word_res); 00779 ++word_count; 00780 } 00781 tprintf("Generated training data for %d words\n", word_count); 00782 } 00783 00784 00785 } // namespace tesseract