tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/applybox.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        applybox.cpp  (Formerly applybox.c)
00003  * Description: Re segment rows according to box file data
00004  * Author:      Phil Cheatle
00005  * Created:     Wed Nov 24 09:11:23 GMT 1993
00006  *
00007  * (C) Copyright 1993, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023 
00024 #include <ctype.h>
00025 #include <string.h>
00026 #ifdef __UNIX__
00027 #include <assert.h>
00028 #include <errno.h>
00029 #endif
00030 #include "allheaders.h"
00031 #include "boxread.h"
00032 #include "chopper.h"
00033 #include "pageres.h"
00034 #include "unichar.h"
00035 #include "unicharset.h"
00036 #include "tesseractclass.h"
00037 #include "genericvector.h"
00038 
00039 // Max number of blobs to classify together in FindSegmentation.
00040 const int kMaxGroupSize = 4;
00041 // Max fraction of median allowed as deviation in xheight before switching
00042 // to median.
00043 const double kMaxXHeightDeviationFraction = 0.125;
00044 
00045 /*************************************************************************
00046  * The box file is assumed to contain box definitions, one per line, of the
00047  * following format for blob-level boxes:
00048  *   <UTF8 str> <left> <bottom> <right> <top> <page id>
00049  * and for word/line-level boxes:
00050  *   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
00051  * NOTES:
00052  * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
00053  *
00054  * <page id> is 0-based, and the page number is used for multipage input (tiff).
00055  *
00056  * In the blob-level form, each line represents a recognizable unit, which may
00057  * be several UTF-8 bytes, but there is a bounding box around each recognizable
00058  * unit, and no classifier is needed to train in this mode (bootstrapping.)
00059  *
00060  * In the word/line-level form, the line begins with the literal "WordStr", and
00061  * the bounding box bounds either a whole line or a whole word. The recognizable
00062  * units in the word/line are listed after the # at the end of the line and
00063  * are space delimited, ignoring any original spaces on the line.
00064  * Eg.
00065  * word -> #w o r d
00066  * multi word line -> #m u l t i w o r d l i n e
00067  * The recognizable units must be space-delimited in order to allow multiple
00068  * unicodes to be used for a single recognizable unit, eg Hindi.
00069  * In this mode, the classifier must have been pre-trained with the desired
00070  * character set, or it will not be able to find the character segmentations.
00071  *************************************************************************/
00072 
00073 namespace tesseract {
00074 
00075 static void clear_any_old_text(BLOCK_LIST *block_list) {
00076   BLOCK_IT block_it(block_list);
00077   for (block_it.mark_cycle_pt();
00078        !block_it.cycled_list(); block_it.forward()) {
00079     ROW_IT row_it(block_it.data()->row_list());
00080     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00081       WERD_IT word_it(row_it.data()->word_list());
00082       for (word_it.mark_cycle_pt();
00083            !word_it.cycled_list(); word_it.forward()) {
00084         word_it.data()->set_text("");
00085       }
00086     }
00087   }
00088 }
00089 
00090 // Applies the box file based on the image name fname, and resegments
00091 // the words in the block_list (page), with:
00092 // blob-mode: one blob per line in the box file, words as input.
00093 // word/line-mode: one blob per space-delimited unit after the #, and one word
00094 // per line in the box file. (See comment above for box file format.)
00095 // If find_segmentation is true, (word/line mode) then the classifier is used
00096 // to re-segment words/lines to match the space-delimited truth string for
00097 // each box. In this case, the input box may be for a word or even a whole
00098 // text line, and the output words will contain multiple blobs corresponding
00099 // to the space-delimited input string.
00100 // With find_segmentation false, no classifier is needed, but the chopper
00101 // can still be used to correctly segment touching characters with the help
00102 // of the input boxes.
00103 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
00104 // from normal classification, ie. with a word, chopped_word, rebuild_word,
00105 // seam_array, denorm, box_word, and best_state, but NO best_choice or
00106 // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
00107 // Instead, the correct_text member of WERD_RES is set, and this may be later
00108 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
00109 // is not required before calling ApplyBoxTraining.
00110 PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,
00111                                 bool find_segmentation,
00112                                 BLOCK_LIST *block_list) {
00113   int box_count = 0;
00114   int box_failures = 0;
00115 
00116   FILE* box_file = OpenBoxFile(fname);
00117   TBOX box;
00118   GenericVector<TBOX> boxes;
00119   GenericVector<STRING> texts, full_texts;
00120 
00121   bool found_box = true;
00122   while (found_box) {
00123     int line_number = 0;           // Line number of the box file.
00124     STRING text, full_text;
00125     found_box = ReadNextBox(applybox_page, &line_number, box_file, &text, &box);
00126     if (found_box) {
00127       ++box_count;
00128       MakeBoxFileStr(text.string(), box, applybox_page, &full_text);
00129     } else {
00130       full_text = "";
00131     }
00132     boxes.push_back(box);
00133     texts.push_back(text);
00134     full_texts.push_back(full_text);
00135   }
00136 
00137   // In word mode, we use the boxes to make a word for each box, but
00138   // in blob mode we use the existing words and maximally chop them first.
00139   PAGE_RES* page_res = find_segmentation ?
00140       NULL : SetupApplyBoxes(boxes, block_list);
00141   clear_any_old_text(block_list);
00142 
00143   for (int i = 0; i < boxes.size() - 1; i++) {
00144     bool foundit = false;
00145     if (page_res != NULL) {
00146       if (i == 0) {
00147         foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
00148                                    full_texts[i].string());
00149       } else {
00150         foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
00151                                    boxes[i + 1], full_texts[i].string());
00152       }
00153     } else {
00154       foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
00155                                  texts[i].string());
00156     }
00157     if (!foundit) {
00158       box_failures++;
00159       ReportFailedBox(i, boxes[i], texts[i].string(),
00160                       "FAILURE! Couldn't find a matching blob");
00161     }
00162   }
00163 
00164   if (page_res == NULL) {
00165     // In word/line mode, we now maximally chop all the words and resegment
00166     // them with the classifier.
00167     page_res = SetupApplyBoxes(boxes, block_list);
00168     ReSegmentByClassification(page_res);
00169   }
00170   if (applybox_debug > 0) {
00171     tprintf("APPLY_BOXES:\n");
00172     tprintf("   Boxes read from boxfile:  %6d\n", box_count);
00173     if (box_failures > 0)
00174       tprintf("   Boxes failed resegmentation:  %6d\n", box_failures);
00175   }
00176   TidyUp(page_res);
00177   return page_res;
00178 }
00179 
00180 // Helper computes median xheight in the image.
00181 static double MedianXHeight(BLOCK_LIST *block_list) {
00182   BLOCK_IT block_it(block_list);
00183   STATS xheights(0, block_it.data()->bounding_box().height());
00184   for (block_it.mark_cycle_pt();
00185        !block_it.cycled_list(); block_it.forward()) {
00186     ROW_IT row_it(block_it.data()->row_list());
00187     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00188       xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
00189     }
00190   }
00191   return xheights.median();
00192 }
00193 
// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
// All fuzzy spaces are removed, and all the words are maximally chopped.
PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
                                     BLOCK_LIST *block_list) {
  // Rows whose xheight deviates from the page median by more than
  // kMaxXHeightDeviationFraction of the median are forced to the median
  // (presumably an outlier xheight would distort later processing — the
  // downstream effect is not visible here).
  double median_xheight = MedianXHeight(block_list);
  double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
  // Strip all fuzzy space markers to simplify the PAGE_RES.
  BLOCK_IT b_it(block_list);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    BLOCK* block = b_it.data();
    ROW_IT r_it(block->row_list());
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
      ROW* row = r_it.data();
      float diff = fabs(row->x_height() - median_xheight);
      if (diff > max_deviation) {
        if (applybox_debug) {
          tprintf("row xheight=%g, but median xheight = %g\n",
                  row->x_height(), median_xheight);
        }
        row->set_x_height(static_cast<float>(median_xheight));
      }
      WERD_IT w_it(row->word_list());
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD* word = w_it.data();
        if (word->cblob_list()->empty()) {
          // A word with no blobs has nothing to segment; drop it.
          delete w_it.extract();
        } else {
          word->set_flag(W_FUZZY_SP, false);
          word->set_flag(W_FUZZY_NON, false);
        }
      }
    }
  }
  PAGE_RES* page_res = new PAGE_RES(block_list, NULL);
  PAGE_RES_IT pr_it(page_res);
  WERD_RES* word_res;
  // Chop every word as finely as possible, so box matching can later merge
  // blobs back together to fit each box.
  while ((word_res = pr_it.word()) != NULL) {
    MaximallyChopWord(boxes, pr_it.block()->block,
                      pr_it.row()->row, word_res);
    pr_it.forward();
  }
  return page_res;
}
00237 
// Tests the chopper by exhaustively running chop_one_blob.
// The word_res will contain filled chopped_word, seam_array, denorm,
// box_word and best_state for the maximally chopped word.
void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
                                  BLOCK* block, ROW* row,
                                  WERD_RES* word_res) {
  if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
                                     tessedit_ocr_engine_mode, NULL,
                                     classify_bln_numeric_mode,
                                     textord_use_cjk_fp_model,
                                     poly_allow_detailed_fx,
                                     row, block)) {
    // Setup failed; the word cannot be chopped, so just clone the chopped
    // word into the rebuild structures and bail out.
    word_res->CloneChoppedToRebuild();
    return;
  }
  if (chop_debug) {
    tprintf("Maximally chopping word at:");
    word_res->word->bounding_box().print();
  }
  // Seed one fake BLOB_CHOICE per existing blob.
  GenericVector<BLOB_CHOICE*> blob_choices;
  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
  float rating = static_cast<float>(MAX_INT8);
  for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
    // The rating and certainty are not quite arbitrary. Since
    // select_blob_to_chop uses the worst certainty to choose, they all have
    // to be different, so starting with MAX_INT8, subtract 1/8 for each blob
    // in here, and then divide by e each time they are chopped, which
    // should guarantee a set of unequal values for the whole tree of blobs
    // produced, however much chopping is required. The chops are thus only
    // limited by the ability of the chopper to find suitable chop points,
    // and not by the value of the certainties.
    BLOB_CHOICE* choice =
        new BLOB_CHOICE(0, rating, -rating, -1, -1, 0, 0, 0, 0, BCC_FAKE);
    blob_choices.push_back(choice);
    rating -= 0.125f;
  }
  const double e = exp(1.0);  // The base of natural logs.
  int blob_number;
  int right_chop_index = 0;
  if (!assume_fixed_pitch_char_segment) {
    // We only chop if the language is not fixed pitch like CJK.
    // Repeatedly chop one blob until the chopper can find no more chops,
    // inserting a seam and a fresh fake choice for each new right half.
    SEAM* seam = NULL;
    while ((seam = chop_one_blob(boxes, blob_choices, word_res,
                                 &blob_number)) != NULL) {
      word_res->InsertSeam(blob_number, seam);
      BLOB_CHOICE* left_choice = blob_choices[blob_number];
      rating = left_choice->rating() / e;
      left_choice->set_rating(rating);
      left_choice->set_certainty(-rating);
      // combine confidence w/ serial #
      BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
                                                  rating - 0.125f, -rating,
                                                  -1, -1, 0, 0, 0, 0, BCC_FAKE);
      blob_choices.insert(right_choice, blob_number + 1);
    }
  }
  // Record the final (maximally chopped) state with one fake choice per blob.
  word_res->CloneChoppedToRebuild();
  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
}
00297 
00298 // Helper to compute the dispute resolution metric.
00299 // Disputed blob resolution. The aim is to give the blob to the most
00300 // appropriate boxfile box. Most of the time it is obvious, but if
00301 // two boxfile boxes overlap significantly it is not. If a small boxfile
00302 // box takes most of the blob, and a large boxfile box does too, then
00303 // we want the small boxfile box to get it, but if the small box
00304 // is much smaller than the blob, we don't want it to get it.
00305 // Details of the disputed blob resolution:
00306 // Given a box with area A, and a blob with area B, with overlap area C,
00307 // then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
00308 // miss metric gets the blob.
00309 static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
00310   int overlap_area = box1.intersection(box2).area();
00311   double miss_metric = box1.area()- overlap_area;
00312   miss_metric /= box1.area();
00313   miss_metric *= box2.area() - overlap_area;
00314   miss_metric /= box2.area();
00315   return miss_metric;
00316 }
00317 
// Gather consecutive blobs that match the given box into the best_state
// and corresponding correct_text.
// Fights over which box owns which blobs are settled by pre-chopping and
// applying the blobs to box or next_box with the least non-overlap.
// Returns false if the box was in error, which can only be caused by
// failing to find an appropriate blob for a box.
// This means that occasionally, blobs may be incorrectly segmented if the
// chopper fails to find a suitable chop point.
bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
                                 const TBOX& box, const TBOX& next_box,
                                 const char* correct_text) {
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
  }
  PAGE_RES_IT page_res_it(page_res);
  WERD_RES* word_res;
  // Find a word whose bounding box mostly overlaps the target box.
  for (word_res = page_res_it.word(); word_res != NULL;
       word_res = page_res_it.forward()) {
    if (!word_res->box_word->bounding_box().major_overlap(box))
      continue;
    if (applybox_debug > 1) {
      tprintf("Checking word box:");
      word_res->box_word->bounding_box().print();
    }
    int word_len = word_res->box_word->length();
    for (int i = 0; i < word_len; ++i) {
      TBOX char_box = TBOX();
      int blob_count = 0;
      // Greedily gather consecutive unclaimed blobs starting at i that
      // mostly overlap box and match it better than next_box.
      for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
        TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
        if (!blob_box.major_overlap(box))
          break;
        if (word_res->correct_text[i + blob_count].length() > 0)
          break;  // Blob is claimed already.
        double current_box_miss_metric = BoxMissMetric(blob_box, box);
        double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
        if (applybox_debug > 2) {
          tprintf("Checking blob:");
          blob_box.print();
          tprintf("Current miss metric = %g, next = %g\n",
                  current_box_miss_metric, next_box_miss_metric);
        }
        if (current_box_miss_metric > next_box_miss_metric)
          break;  // Blob is a better match for next box.
        char_box += blob_box;
      }
      if (blob_count > 0) {
        if (applybox_debug > 1) {
          tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
        }
        // Reject if the gathered blobs don't closely fit the box while the
        // box overlaps a neighboring box by more than 3 (a negative x_gap
        // indicates horizontal overlap), as the blobs are then ambiguous.
        if (!char_box.almost_equal(box, 3) &&
            (box.x_gap(next_box) < -3 ||
             (prev_box != NULL && prev_box->x_gap(box) < -3))) {
          return false;
        }
        // We refine just the box_word, best_state and correct_text here.
        // The rebuild_word is made in TidyUp.
        // blob_count blobs are put together to match the box. Merge the
        // box_word boxes, save the blob_count in the state and the text.
        word_res->box_word->MergeBoxes(i, i + blob_count);
        word_res->best_state[i] = blob_count;
        word_res->correct_text[i] = correct_text;
        if (applybox_debug > 2) {
          tprintf("%d Blobs match: blob box:", blob_count);
          word_res->box_word->BlobBox(i).print();
          tprintf("Matches box:");
          box.print();
          tprintf("With next box:");
          next_box.print();
        }
        // Eliminate best_state and correct_text entries for the consumed
        // blobs.
        for (int j = 1; j < blob_count; ++j) {
          word_res->best_state.remove(i + 1);
          word_res->correct_text.remove(i + 1);
        }
        // Assume that no box spans multiple source words, so we are done with
        // this box.
        if (applybox_debug > 1) {
          tprintf("Best state = ");
          for (int j = 0; j < word_res->best_state.size(); ++j) {
            tprintf("%d ", word_res->best_state[j]);
          }
          tprintf("\n");
          tprintf("Correct text = [[ ");
          for (int j = 0; j < word_res->correct_text.size(); ++j) {
            tprintf("%s ", word_res->correct_text[j].string());
          }
          tprintf("]]\n");
        }
        return true;
      }
    }
  }
  if (applybox_debug > 0) {
    tprintf("FAIL!\n");
  }
  return false;  // Failure.
}
00417 
// Consume all source blobs that strongly overlap the given box,
// putting them into a new word, with the correct_text label.
// Fights over which box owns which blobs are settled by
// applying the blobs to box or next_box with the least non-overlap.
// Returns false if the box was in error, which can only be caused by
// failing to find an overlapping blob for a box.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
                                 const TBOX& box, const TBOX& next_box,
                                 const char* correct_text) {
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
  }
  WERD* new_word = NULL;
  BLOCK_IT b_it(block_list);
  // Walk blocks -> rows -> words, pruning by overlap at each level.
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    BLOCK* block = b_it.data();
    if (!box.major_overlap(block->bounding_box()))
      continue;
    ROW_IT r_it(block->row_list());
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
      ROW* row = r_it.data();
      if (!box.major_overlap(row->bounding_box()))
        continue;
      WERD_IT w_it(row->word_list());
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD* word = w_it.data();
        if (applybox_debug > 2) {
          tprintf("Checking word:");
          word->bounding_box().print();
        }
        if (word->text() != NULL && word->text()[0] != '\0')
          continue;  // Ignore words that are already done.
        if (!box.major_overlap(word->bounding_box()))
          continue;
        // Move each sufficiently overlapping blob out of the source word
        // into new_word, unless it fits next_box better.
        C_BLOB_IT blob_it(word->cblob_list());
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
             blob_it.forward()) {
          C_BLOB* blob = blob_it.data();
          TBOX blob_box = blob->bounding_box();
          if (!blob_box.major_overlap(box))
            continue;
          double current_box_miss_metric = BoxMissMetric(blob_box, box);
          double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
          if (applybox_debug > 2) {
            tprintf("Checking blob:");
            blob_box.print();
            tprintf("Current miss metric = %g, next = %g\n",
                    current_box_miss_metric, next_box_miss_metric);
          }
          if (current_box_miss_metric > next_box_miss_metric)
            continue;  // Blob is a better match for next box.
          if (applybox_debug > 2) {
            tprintf("Blob match: blob:");
            blob_box.print();
            tprintf("Matches box:");
            box.print();
            tprintf("With next box:");
            next_box.print();
          }
          if (new_word == NULL) {
            // Make a new word with a single blob.
            new_word = word->shallow_copy();
            new_word->set_text(correct_text);
            w_it.add_to_end(new_word);
          }
          // extract() removes the blob from the source word's list; the
          // blob is re-homed rather than copied.
          C_BLOB_IT new_blob_it(new_word->cblob_list());
          new_blob_it.add_to_end(blob_it.extract());
        }
      }
    }
  }
  if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
  return new_word != NULL;
}
00492 
// Resegments the words by running the classifier in an attempt to find the
// correct segmentation that produces the required string.
void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
  PAGE_RES_IT pr_it(page_res);
  WERD_RES* word_res;
  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
    WERD* word = word_res->word;
    if (word->text() == NULL || word->text()[0] == '\0')
      continue;  // Ignore words that have no text.
    // Convert the correct text to a vector of UNICHAR_ID.
    GenericVector<UNICHAR_ID> target_text;
    if (!ConvertStringToUnichars(word->text(), &target_text)) {
      tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
              word->text());
      // The truth text cannot be encoded in the unicharset, so the word is
      // useless; delete it from the page.
      pr_it.DeleteCurrentWord();
      continue;
    }
    if (!FindSegmentation(target_text, word_res)) {
      tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
              word->text());
      // No blob grouping matches the truth text; discard the word too.
      pr_it.DeleteCurrentWord();
      continue;
    }
  }
}
00518 
00519 // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
00520 // Returns false if an invalid UNICHAR_ID is encountered.
00521 bool Tesseract::ConvertStringToUnichars(const char* utf8,
00522                                         GenericVector<UNICHAR_ID>* class_ids) {
00523   for (int step = 0; *utf8 != '\0'; utf8 += step) {
00524     const char* next_space = strchr(utf8, ' ');
00525     if (next_space == NULL)
00526       next_space = utf8 + strlen(utf8);
00527     step = next_space - utf8;
00528     UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
00529     if (class_id == INVALID_UNICHAR_ID) {
00530       return false;
00531     }
00532     while (utf8[step] == ' ')
00533       ++step;
00534     class_ids->push_back(class_id);
00535   }
00536   return true;
00537 }
00538 
// Resegments the word to achieve the target_text from the classifier.
// Returns false if the re-segmentation fails.
// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
// applies a full search on the classifier results to find the best classified
// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
// substitutions ARE used.
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
                                 WERD_RES* word_res) {
  // Classify all required combinations of blobs and save results in choices.
  // choices[i][j-1] holds the classification of blobs [i, i+j), ie. index 0
  // is a single blob, index 1 two blobs, etc. This array owns the
  // BLOB_CHOICE_LISTs and is freed below before returning.
  int word_length = word_res->box_word->length();
  GenericVector<BLOB_CHOICE_LIST*>* choices =
      new GenericVector<BLOB_CHOICE_LIST*>[word_length];
  for (int i = 0; i < word_length; ++i) {
    for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
      BLOB_CHOICE_LIST* match_result = classify_piece(
          word_res->seam_array, i, i + j - 1, "Applybox",
          word_res->chopped_word, word_res->blamer_bundle);
      if (applybox_debug > 2) {
        tprintf("%d+%d:", i, j);
        print_ratings_list("Segment:", match_result, unicharset);
      }
      choices[i].push_back(match_result);
    }
  }
  // Search the segmentation graph for the target text. Must be an exact
  // match. Using wildcards makes it difficult to find the correct
  // segmentation even when it is there.
  word_res->best_state.clear();
  GenericVector<int> search_segmentation;
  float best_rating = 0.0f;
  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
                &search_segmentation, &best_rating, &word_res->best_state);
  // Release the classification results; only best_state is kept.
  for (int i = 0; i < word_length; ++i)
    choices[i].delete_data_pointers();
  delete [] choices;
  if (word_res->best_state.empty()) {
    // Build the original segmentation and if it is the same length as the
    // truth, assume it will do.
    int blob_count = 1;
    for (int s = 0; s < word_res->seam_array.size(); ++s) {
      SEAM* seam = word_res->seam_array[s];
      // A seam with no split1 marks a boundary between original blobs;
      // other seams are chopper-inserted splits within one blob.
      if (seam->split1 == NULL) {
        word_res->best_state.push_back(blob_count);
        blob_count = 1;
      } else {
        ++blob_count;
      }
    }
    word_res->best_state.push_back(blob_count);
    if (word_res->best_state.size() != target_text.size()) {
      word_res->best_state.clear();  // No good. Original segmentation bad size.
      return false;
    }
  }
  // Record the truth text, one entry per segment, matching best_state.
  word_res->correct_text.clear();
  for (int i = 0; i < target_text.size(); ++i) {
    word_res->correct_text.push_back(
        STRING(unicharset.id_to_unichar(target_text[i])));
  }
  return true;
}
00600 
// Recursive helper to find a match to the target_text (from text_index
// position) in the choices (from choices_pos position).
// Choices is an array of GenericVectors, of length choices_length, with each
// element representing a starting position in the word, and the
// GenericVector holding classification results for a sequence of consecutive
// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
// On success, *best_segmentation holds the blob-count of each segment and
// *best_rating the summed rating of the lowest-rated complete match found.
// The segmentation vector is shared scratch state: entries pushed before
// each recursive call are popped again (truncate) on return.
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
                              int choices_pos, int choices_length,
                              const GenericVector<UNICHAR_ID>& target_text,
                              int text_index,
                              float rating, GenericVector<int>* segmentation,
                              float* best_rating,
                              GenericVector<int>* best_segmentation) {
  const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs();
  // Try every available group size starting at choices_pos.
  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
    // Rating of matching choice or worst choice if no match.
    float choice_rating = 0.0f;
    // Find the corresponding best BLOB_CHOICE.
    BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      BLOB_CHOICE* choice = choice_it.data();
      choice_rating = choice->rating();
      UNICHAR_ID class_id = choice->unichar_id();
      if (class_id == target_text[text_index]) {
        break;
      }
      // Search ambigs table: a classifier result that is a known 1-1
      // ambiguity of the target character also counts as a match.
      if (class_id < table.size() && table[class_id] != NULL) {
        AmbigSpec_IT spec_it(table[class_id]);
        for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
             spec_it.forward()) {
          const AmbigSpec *ambig_spec = spec_it.data();
          // We'll only do 1-1.
          if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
              ambig_spec->correct_ngram_id == target_text[text_index])
            break;
        }
        if (!spec_it.cycled_list())
          break;  // Found an ambig.
      }
    }
    if (choice_it.cycled_list())
      continue;  // No match.
    segmentation->push_back(length);
    if (choices_pos + length == choices_length &&
        text_index + 1 == target_text.size()) {
      // This is a complete match. If the rating is good record a new best.
      if (applybox_debug > 2) {
        tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
                rating + choice_rating, *best_rating, segmentation->size(),
                best_segmentation->size());
      }
      if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
        *best_segmentation = *segmentation;
        *best_rating = rating + choice_rating;
      }
    } else if (choices_pos + length < choices_length &&
               text_index + 1 < target_text.size()) {
      // Partial match: recurse on the remaining blobs and remaining text.
      if (applybox_debug > 3) {
        tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
                target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]),
                choice_it.data()->unichar_id() == target_text[text_index]
                     ? "Match" : "Ambig",
                choices_pos, length);
      }
      SearchForText(choices, choices_pos + length, choices_length, target_text,
                    text_index + 1, rating + choice_rating, segmentation,
                    best_rating, best_segmentation);
      if (applybox_debug > 3) {
        tprintf("End recursion for %d=%s\n", target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]));
      }
    }
    // Undo the push for this group size before trying the next one.
    segmentation->truncate(segmentation->size() - 1);
  }
}
00679 
00680 // Counts up the labelled words and the blobs within.
00681 // Deletes all unused or emptied words, counting the unused ones.
00682 // Resets W_BOL and W_EOL flags correctly.
00683 // Builds the rebuild_word and rebuilds the box_word and the best_choice.
00684 void Tesseract::TidyUp(PAGE_RES* page_res) {
00685   int ok_blob_count = 0;
00686   int bad_blob_count = 0;
00687   int ok_word_count = 0;
00688   int unlabelled_words = 0;
00689   PAGE_RES_IT pr_it(page_res);
00690   WERD_RES* word_res;
00691   for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
00692     int ok_in_word = 0;
00693     int blob_count = word_res->correct_text.size();
00694     WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
00695     word_choice->set_permuter(TOP_CHOICE_PERM);
00696     for (int c = 0; c < blob_count; ++c) {
00697       if (word_res->correct_text[c].length() > 0) {
00698         ++ok_in_word;
00699       }
00700       // Since we only need a fake word_res->best_choice, the actual
00701       // unichar_ids do not matter. Which is fortunate, since TidyUp()
00702       // can be called while training Tesseract, at the stage where
00703       // unicharset is not meaningful yet.
00704       word_choice->append_unichar_id_space_allocated(
00705           INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
00706     }
00707     if (ok_in_word > 0) {
00708       ok_blob_count += ok_in_word;
00709       bad_blob_count += word_res->correct_text.size() - ok_in_word;
00710       word_res->LogNewRawChoice(word_choice);
00711       word_res->LogNewCookedChoice(1, false, word_choice);
00712     } else {
00713       ++unlabelled_words;
00714       if (applybox_debug > 0) {
00715         tprintf("APPLY_BOXES: Unlabelled word at :");
00716         word_res->word->bounding_box().print();
00717       }
00718       pr_it.DeleteCurrentWord();
00719     }
00720   }
00721   pr_it.restart_page();
00722   for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
00723     // Denormalize back to a BoxWord.
00724     word_res->RebuildBestState();
00725     word_res->SetupBoxWord();
00726     word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
00727     word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
00728   }
00729   if (applybox_debug > 0) {
00730     tprintf("   Found %d good blobs.\n", ok_blob_count);
00731     if (bad_blob_count > 0) {
00732       tprintf("   Leaving %d unlabelled blobs in %d words.\n",
00733               bad_blob_count, ok_word_count);
00734     }
00735     if (unlabelled_words > 0)
00736       tprintf("   %d remaining unlabelled words deleted.\n", unlabelled_words);
00737   }
00738 }
00739 
00740 // Logs a bad box by line in the box file and box coords.
00741 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
00742                                 const char *box_ch, const char *err_msg) {
00743   tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
00744           boxfile_lineno + 1, box_ch,
00745           box.left(), box.bottom(), box.right(), box.top(), err_msg);
00746 }
00747 
00748 // Creates a fake best_choice entry in each WERD_RES with the correct text.
00749 void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
00750   PAGE_RES_IT pr_it(page_res);
00751   for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
00752        word_res = pr_it.forward()) {
00753     WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
00754                                           word_res->correct_text.size());
00755     for (int i = 0; i < word_res->correct_text.size(); ++i) {
00756       // The part before the first space is the real ground truth, and the
00757       // rest is the bounding box location and page number.
00758       GenericVector<STRING> tokens;
00759       word_res->correct_text[i].split(' ', &tokens);
00760       UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
00761       choice->append_unichar_id_space_allocated(char_id,
00762                                                 word_res->best_state[i],
00763                                                 0.0f, 0.0f);
00764     }
00765     word_res->ClearWordChoices();
00766     word_res->LogNewRawChoice(choice);
00767     word_res->LogNewCookedChoice(1, false, choice);
00768   }
00769 }
00770 
00771 // Calls LearnWord to extract features for labelled blobs within each word.
00772 // Features are written to the given filename.
00773 void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) {
00774   PAGE_RES_IT pr_it(page_res);
00775   int word_count = 0;
00776   for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
00777        word_res = pr_it.forward()) {
00778     LearnWord(filename.string(), word_res);
00779     ++word_count;
00780   }
00781   tprintf("Generated training data for %d words\n", word_count);
00782 }
00783 
00784 
00785 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines