tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/recogtraining.cpp
Go to the documentation of this file.
00001 
00002 // File:        recogtraining.cpp
00003 // Description: Functions for ambiguity and parameter training.
00004 // Author:      Daria Antonova
00005 // Created:     Mon Aug 13 11:26:43 PDT 2009
00006 //
00007 // (C) Copyright 2009, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #include "tesseractclass.h"
00021 
00022 #include "boxread.h"
00023 #include "control.h"
00024 #include "cutil.h"
00025 #include "host.h"
00026 #include "ratngs.h"
00027 #include "reject.h"
00028 #include "stopper.h"
00029 
00030 namespace tesseract {
00031 
// Tolerance handed to NearlyEqual<int>() when an edge of a box found by
// Tesseract is compared with the corresponding edge of a box read from the
// box file: the two edges are treated as matching when they differ by no more
// than this amount (in box coordinate units).
const inT16 kMaxBoxEdgeDiff = 2;
00033 
00034 // Sets flags necessary for recognition in the training mode.
00035 // Opens and returns the pointer to the output file.
00036 FILE *Tesseract::init_recog_training(const STRING &fname) {
00037   if (tessedit_ambigs_training) {
00038     tessedit_tess_adaption_mode.set_value(0);    // turn off adaption
00039     tessedit_enable_doc_dict.set_value(0);       // turn off document dictionary
00040     // Explore all segmentations.
00041     getDict().stopper_no_acceptable_choices.set_value(1);
00042   }
00043 
00044   STRING output_fname = fname;
00045   const char *lastdot = strrchr(output_fname.string(), '.');
00046   if (lastdot != NULL) output_fname[lastdot - output_fname.string()] = '\0';
00047   output_fname += ".txt";
00048   FILE *output_file = open_file(output_fname.string(), "a+");
00049   return output_file;
00050 }
00051 
00052 // Copies the bounding box from page_res_it->word() to the given TBOX.
00053 bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
00054   while (page_res_it->block() != NULL) {
00055     if (page_res_it->word() != NULL)
00056       break;
00057     page_res_it->forward();
00058   }
00059 
00060   if (page_res_it->word() != NULL) {
00061     *tbox = page_res_it->word()->word->bounding_box();
00062     page_res_it->forward();
00063 
00064     // If tbox->left() is negative, the training image has vertical text and
00065     // all the coordinates of bounding boxes of page_res are rotated by 90
00066     // degrees in a counterclockwise direction. We need to rotate the TBOX back
00067     // in order to compare with the TBOXes of box files.
00068     if (tbox->left() < 0) {
00069       tbox->rotate(FCOORD(0.0, -1.0));
00070     }
00071 
00072     return true;
00073   } else {
00074     return false;
00075   }
00076 }
00077 
00078 // This function takes tif/box pair of files and runs recognition on the image,
00079 // while making sure that the word bounds that tesseract identified roughly
00080 // match to those specified by the input box file. For each word (ngram in a
00081 // single bounding box from the input box file) it outputs the ocred result,
00082 // the correct label, rating and certainty.
00083 void Tesseract::recog_training_segmented(const STRING &fname,
00084                                          PAGE_RES *page_res,
00085                                          volatile ETEXT_DESC *monitor,
00086                                          FILE *output_file) {
00087   STRING box_fname = fname;
00088   const char *lastdot = strrchr(box_fname.string(), '.');
00089   if (lastdot != NULL) box_fname[lastdot - box_fname.string()] = '\0';
00090   box_fname += ".box";
00091   // read_next_box() will close box_file
00092   FILE *box_file = open_file(box_fname.string(), "r");
00093 
00094   PAGE_RES_IT page_res_it;
00095   page_res_it.page_res = page_res;
00096   page_res_it.restart_page();
00097   STRING label;
00098 
00099   // Process all the words on this page.
00100   TBOX tbox;  // tesseract-identified box
00101   TBOX bbox;  // box from the box file
00102   bool keep_going;
00103   int line_number = 0;
00104   int examined_words = 0;
00105   do {
00106     keep_going = read_t(&page_res_it, &tbox);
00107     keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label,
00108                               &bbox);
00109     // Align bottom left points of the TBOXes.
00110     while (keep_going &&
00111            !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
00112       keep_going = (bbox.bottom() < tbox.bottom()) ?
00113           read_t(&page_res_it, &tbox) :
00114             ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
00115     }
00116     while (keep_going &&
00117            !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
00118       keep_going = (bbox.left() > tbox.left()) ? read_t(&page_res_it, &tbox) :
00119           ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
00120     }
00121     // OCR the word if top right points of the TBOXes are similar.
00122     if (keep_going &&
00123         NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
00124         NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
00125         ambigs_classify_and_output(page_res_it.prev_word(),
00126                                    page_res_it.prev_row(),
00127                                    page_res_it.prev_block(),
00128                                    label.string(), output_file);
00129         examined_words++;
00130     }
00131   } while (keep_going);
00132 
00133   // Set up scripts on all of the words that did not get sent to
00134   // ambigs_classify_and_output.  They all should have, but if all the
00135   // werd_res's don't get uch_sets, tesseract will crash when you try
00136   // to iterate over them. :-(
00137   int total_words = 0;
00138   for (page_res_it.restart_page(); page_res_it.block() != NULL;
00139        page_res_it.forward()) {
00140     if (page_res_it.word()) {
00141       if (page_res_it.word()->uch_set == NULL)
00142         page_res_it.word()->SetupFake(unicharset);
00143       total_words++;
00144     }
00145   }
00146   if (examined_words < 0.85 * total_words) {
00147     tprintf("TODO(antonova): clean up recog_training_segmented; "
00148             " It examined only a small fraction of the ambigs image.\n");
00149   }
00150   tprintf("recog_training_segmented: examined %d / %d words.\n",
00151           examined_words, total_words);
00152 }
00153 
00154 // Helper prints the given set of blob choices.
00155 static void PrintPath(int length, const BLOB_CHOICE** blob_choices,
00156                       const UNICHARSET& unicharset,
00157                       const char *label, FILE *output_file) {
00158   float rating = 0.0f;
00159   float certainty = 0.0f;
00160   for (int i = 0; i < length; ++i) {
00161     const BLOB_CHOICE* blob_choice = blob_choices[i];
00162     fprintf(output_file, "%s",
00163            unicharset.id_to_unichar(blob_choice->unichar_id()));
00164     rating += blob_choice->rating();
00165     if (certainty > blob_choice->certainty())
00166       certainty = blob_choice->certainty();
00167   }
00168   fprintf(output_file, "\t%s\t%.4f\t%.4f\n",
00169          label, rating, certainty);
00170 }
00171 
00172 // Helper recursively prints all paths through the ratings matrix, starting
00173 // at column col.
00174 static void PrintMatrixPaths(int col, int dim,
00175                              const MATRIX& ratings,
00176                              int length, const BLOB_CHOICE** blob_choices,
00177                              const UNICHARSET& unicharset,
00178                              const char *label, FILE *output_file) {
00179   for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
00180     if (ratings.get(col, row) != NOT_CLASSIFIED) {
00181       BLOB_CHOICE_IT bc_it(ratings.get(col, row));
00182       for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
00183         blob_choices[length] = bc_it.data();
00184         if (row + 1 < dim) {
00185           PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices,
00186                            unicharset, label, output_file);
00187         } else {
00188           PrintPath(length + 1, blob_choices, unicharset, label, output_file);
00189         }
00190       }
00191     }
00192   }
00193 }
00194 
00195 // Runs classify_word_pass1() on the current word. Outputs Tesseract's
00196 // raw choice as a result of the classification. For words labeled with a
00197 // single unichar also outputs all alternatives from blob_choices of the
00198 // best choice.
00199 void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res,
00200                                            ROW_RES *row_res,
00201                                            BLOCK_RES *block_res,
00202                                            const char *label,
00203                                            FILE *output_file) {
00204   // Classify word.
00205   fflush(stdout);
00206   WordData word_data(block_res->block, row_res->row, werd_res);
00207   SetupWordPassN(1, &word_data);
00208   classify_word_pass1(&word_data, werd_res);
00209   WERD_CHOICE *best_choice = werd_res->best_choice;
00210   ASSERT_HOST(best_choice != NULL);
00211 
00212   // Compute the number of unichars in the label.
00213   GenericVector<UNICHAR_ID> encoding;
00214   if (!unicharset.encode_string(label, true, &encoding, NULL, NULL)) {
00215     tprintf("Not outputting illegal unichar %s\n", label);
00216     return;
00217   }
00218 
00219   // Dump all paths through the ratings matrix (which is normally small).
00220   int dim = werd_res->ratings->dimension();
00221   const BLOB_CHOICE** blob_choices = new const BLOB_CHOICE*[dim];
00222   PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices,
00223                    unicharset, label, output_file);
00224   delete [] blob_choices;
00225 }
00226 
00227 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines