tesseract 3.03
wordrec/segsearch.cpp
// File:        segsearch.cpp
// Description: Segmentation search functions.
// Author:      Daria Antonova
// Created:     Mon Jun 23 11:26:43 PDT 2008
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#include "wordrec.h"

#include "associate.h"
#include "language_model.h"
#include "matrix.h"
#include "params.h"
#include "lm_pain_points.h"
#include "ratngs.h"

namespace tesseract {

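// Convenience wrapper around SegSearch(): builds a BestChoiceBundle sized to
// the ratings matrix and runs the segmentation search without blamer
// information.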
void Wordrec::DoSegSearch(WERD_RES* word_res) {
  BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
  // Run Segmentation Search.
  SegSearch(word_res, &best_choice_bundle, NULL);
}

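// Runs the segmentation search for word_res: seeds the LMPainPoints queue,
// finds an initial best path through the ratings matrix, optionally improves
// it by chopping, and then keeps classifying dequeued pain points and
// re-running UpdateSegSearchNodes() until SegSearchDone() reports that the
// search is finished.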
void Wordrec::SegSearch(WERD_RES* word_res,
                        BestChoiceBundle* best_choice_bundle,
                        BlamerBundle* blamer_bundle) {
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix%s:\n",
            wordrec_enable_assoc ? " (with assoc)" : "");
    word_res->ratings->print(getDict().getUnicharset());
  }
  LMPainPoints pain_points(segsearch_max_pain_points,
                           segsearch_max_char_wh_ratio,
                           assume_fixed_pitch_char_segment,
                           &getDict(), segsearch_debug_level);

  pain_points.GenerateInitial(word_res);

  // Compute the scaling factor that will help us recover blob outline length
  // from the classifier rating and certainty for the blob.
  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;

  language_model_->InitForWord(prev_word_best_choice_,
                               assume_fixed_pitch_char_segment,
                               segsearch_max_char_wh_ratio, rating_cert_scale);

  // Initialize blamer-related information: map character boxes recorded in
  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
  // ratings matrix. We expect this step to succeed, since when running the
  // chopper we checked that the correct chops are present.
  if (blamer_bundle != NULL) {
    blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
                                            wordrec_debug_blamer);
  }

  MATRIX_COORD pain_point;
  float pain_point_priority;

  // pending[col] tells whether there is update work to do to combine
  // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
  // As the language model state is updated, pending entries are modified to
  // minimize duplication of work. It is important that during the update the
  // children are considered in non-decreasing order of their column, since
  // this guarantees that all the parents are up to date before an update
  // of a child is done.
  GenericVector<SegSearchPending> pending;
  pending.init_to_size(word_res->ratings->dimension(), SegSearchPending());

  // Search the ratings matrix for the initial best path.
  pending[0].SetColumnClassified();
  UpdateSegSearchNodes(rating_cert_scale, 0, &pending, word_res,
                       &pain_points, best_choice_bundle, blamer_bundle);

  if (!SegSearchDone(0)) {  // find a better choice
    if (chop_enable && word_res->chopped_word != NULL) {
      improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle,
                          blamer_bundle, &pain_points, &pending);
    }
    if (chop_debug)
      print_seams("Final seam list:", word_res->seam_array);

    if (blamer_bundle != NULL &&
        !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
      blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
    }
  }
  // Keep trying to find a better path by fixing the "pain points".
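  // num_futile_classifications counts pain-point classifications that did not
  // improve the best choice; it is passed to SegSearchDone() to decide whether
  // to keep searching.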
  int num_futile_classifications = 0;
  STRING blamer_debug;
  while (wordrec_enable_assoc &&
      (!SegSearchDone(num_futile_classifications) ||
          (blamer_bundle != NULL &&
              blamer_bundle->GuidedSegsearchStillGoing()))) {
    // Get the next valid "pain point".
    bool found_nothing = true;
    LMPainPointsType pp_type;
    while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) !=
        LM_PPTYPE_NUM) {
      if (!pain_point.Valid(*word_res->ratings)) {
        word_res->ratings->IncreaseBandSize(
            pain_point.row - pain_point.col + 1);
      }
      if (pain_point.Valid(*word_res->ratings) &&
          !word_res->ratings->Classified(pain_point.col, pain_point.row,
                                         getDict().WildcardID())) {
        found_nothing = false;
        break;
      }
    }
    if (found_nothing) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    ProcessSegSearchPainPoint(pain_point_priority, pain_point,
                              LMPainPoints::PainPointDescription(pp_type),
                              &pending, word_res, &pain_points, blamer_bundle);

    UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending,
                         word_res, &pain_points, best_choice_bundle,
                         blamer_bundle);
    if (!best_choice_bundle->updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    best_choice_bundle->updated = false;  // reset updated

    // See if it is time to terminate SegSearch, or to start a guided search
    // for the true path in order to find the blame for an incorrect
    // best_choice.
    if (SegSearchDone(num_futile_classifications) &&
        blamer_bundle != NULL &&
        blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
      InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle,
                             &blamer_debug);
    }
  }  // end while loop exploring alternative paths
  if (blamer_bundle != NULL) {
    blamer_bundle->FinishSegSearch(word_res->best_choice,
                                   wordrec_debug_blamer, &blamer_debug);
  }

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
            language_model_->AcceptableChoiceFound());
  }
}

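// Runs one update pass over the ratings matrix starting at starting_col.
// For every column with pending work, each classified (col, row) cell is
// combined with the language model state of its parent column via
// language_model_->UpdateState(); when a cell's state changes, its child
// column is marked for revisiting. New pain points are then generated from
// the updated best path, and all pending and updated flags are cleared.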
void Wordrec::UpdateSegSearchNodes(
    float rating_cert_scale,
    int starting_col,
    GenericVector<SegSearchPending>* pending,
    WERD_RES *word_res,
    LMPainPoints *pain_points,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  MATRIX *ratings = word_res->ratings;
  ASSERT_HOST(ratings->dimension() == pending->size());
  ASSERT_HOST(ratings->dimension() == best_choice_bundle->beam.size());
  for (int col = starting_col; col < ratings->dimension(); ++col) {
    if (!(*pending)[col].WorkToDo()) continue;
    int first_row = col;
    int last_row = MIN(ratings->dimension() - 1,
                       col + ratings->bandwidth() - 1);
    if ((*pending)[col].SingleRow() >= 0) {
      first_row = last_row = (*pending)[col].SingleRow();
    }
    if (segsearch_debug_level > 0) {
      tprintf("\n\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\n",
              col, first_row, last_row,
              (*pending)[col].IsRowJustClassified(MAX_INT32));
    }
    // Iterate over the pending rows for this column.
    for (int row = first_row; row <= last_row; ++row) {
      // Update the language model state of this child+parent pair.
      BLOB_CHOICE_LIST *current_node = ratings->get(col, row);
      LanguageModelState *parent_node =
          col == 0 ? NULL : best_choice_bundle->beam[col - 1];
      if (current_node != NULL &&
          language_model_->UpdateState((*pending)[col].IsRowJustClassified(row),
                                       col, row, current_node, parent_node,
                                       pain_points, word_res,
                                       best_choice_bundle, blamer_bundle) &&
          row + 1 < ratings->dimension()) {
        // Since the language model state of this entry changed, process the
        // whole child column.
        (*pending)[row + 1].RevisitWholeColumn();
        if (segsearch_debug_level > 0) {
          tprintf("Added child col=%d to pending\n", row + 1);
        }
      }  // end if UpdateState.
    }  // end for row.
  }  // end for col.
  if (best_choice_bundle->best_vse != NULL) {
    ASSERT_HOST(word_res->StatesAllValid());
    if (best_choice_bundle->best_vse->updated) {
      pain_points->GenerateFromPath(rating_cert_scale,
                                    best_choice_bundle->best_vse, word_res);
      if (!best_choice_bundle->fixpt.empty()) {
        pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt,
                                        best_choice_bundle->best_vse, word_res);
      }
    }
  }
  // The update pass is complete. Reset all updated flags on all VSEs and
  // reset all pendings.
  for (int col = 0; col < pending->size(); ++col) {
    (*pending)[col].Clear();
    ViterbiStateEntry_IT
        vse_it(&best_choice_bundle->beam[col]->viterbi_state_entries);
    for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) {
      vse_it.data()->updated = false;
    }
  }
}

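// Classifies the blob spanned by pain_point with classify_piece(), merges the
// resulting BLOB_CHOICEs into the ratings matrix (prepending to any existing
// list), generates follow-up pain points that join the newly classified blob
// with its left and right neighbors, and marks the blob's column as pending.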
void Wordrec::ProcessSegSearchPainPoint(
    float pain_point_priority,
    const MATRIX_COORD &pain_point, const char* pain_point_type,
    GenericVector<SegSearchPending>* pending, WERD_RES *word_res,
    LMPainPoints *pain_points, BlamerBundle *blamer_bundle) {
  if (segsearch_debug_level > 0) {
    tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n",
            pain_point_type, pain_point_priority,
            pain_point.col, pain_point.row);
  }
  ASSERT_HOST(pain_points != NULL);
  MATRIX *ratings = word_res->ratings;
  // Classify blob [pain_point.col pain_point.row]
  if (!pain_point.Valid(*ratings)) {
    ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);
  }
  ASSERT_HOST(pain_point.Valid(*ratings));
  BLOB_CHOICE_LIST *classified = classify_piece(word_res->seam_array,
                                                pain_point.col, pain_point.row,
                                                pain_point_type,
                                                word_res->chopped_word,
                                                blamer_bundle);
  BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);
  if (lst == NULL) {
    ratings->put(pain_point.col, pain_point.row, classified);
  } else {
    // We cannot delete the old BLOB_CHOICEs, since they might contain
    // ViterbiStateEntries that are parents of other "active" entries.
    // Thus, if the matrix cell already contains classifications, we add
    // the new ones to the beginning of the list.
    BLOB_CHOICE_IT it(lst);
    it.add_list_before(classified);
    delete classified;  // safe to delete, since empty after add_list_before()
    classified = NULL;
  }

  if (segsearch_debug_level > 0) {
    print_ratings_list("Updated ratings matrix with a new entry:",
                       ratings->get(pain_point.col, pain_point.row),
                       getDict().getUnicharset());
    ratings->print(getDict().getUnicharset());
  }

  // Insert initial "pain points" to join the newly classified blob
  // with its left and right neighbors.
  if (classified != NULL && !classified->empty()) {
    if (pain_point.col > 0) {
      pain_points->GeneratePainPoint(
          pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0,
          true, segsearch_max_char_wh_ratio, word_res);
    }
    if (pain_point.row + 1 < ratings->dimension()) {
      pain_points->GeneratePainPoint(
          pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0,
          true, segsearch_max_char_wh_ratio, word_res);
    }
  }
  (*pending)[pain_point.col].SetBlobClassified(pain_point.row);
}

// Resets enough of the results so that the Viterbi search is re-run.
// Needed when the n-gram model is enabled, as the multi-length comparison
// implementation will re-value existing paths to worse values.
void Wordrec::ResetNGramSearch(WERD_RES* word_res,
                               BestChoiceBundle* best_choice_bundle,
                               GenericVector<SegSearchPending>* pending) {
  // TODO(rays) More refactoring required here.
  // Delete existing viterbi states.
  for (int col = 0; col < best_choice_bundle->beam.size(); ++col) {
    best_choice_bundle->beam[col]->Clear();
  }
  // Reset best_choice_bundle.
  word_res->ClearWordChoices();
  best_choice_bundle->best_vse = NULL;
  // Clear out all existing pendings and add a new one for the first column.
  (*pending)[0].SetColumnClassified();
  for (int i = 1; i < pending->size(); ++i)
    (*pending)[i].Clear();
}

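// Clears the pain points heap and sets up blamer_bundle for a guided
// segmentation search, passing a callback that lets the blamer generate its
// own pain points via LMPainPoints::GenerateForBlamer().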
void Wordrec::InitBlamerForSegSearch(WERD_RES *word_res,
                                     LMPainPoints *pain_points,
                                     BlamerBundle *blamer_bundle,
                                     STRING *blamer_debug) {
  pain_points->Clear();  // Clear pain points heap.
  TessResultCallback2<bool, int, int>* pp_cb = NewPermanentTessCallback(
      pain_points, &LMPainPoints::GenerateForBlamer,
      static_cast<double>(segsearch_max_char_wh_ratio), word_res);
  blamer_bundle->InitForSegSearch(word_res->best_choice, word_res->ratings,
                                  getDict().WildcardID(), wordrec_debug_blamer,
                                  blamer_debug, pp_cb);
  delete pp_cb;
}

}  // namespace tesseract