tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/tfacepp.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        tfacepp.cpp  (Formerly tface++.c)
00003  * Description: C++ side of the C/C++ Tess/Editor interface.
00004  * Author:                  Ray Smith
00005  * Created:                 Thu Apr 23 15:39:23 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #pragma warning(disable:4305)  // int/float warnings
00023 #pragma warning(disable:4800)  // int/bool warnings
00024 #endif
00025 
00026 #include <math.h>
00027 
00028 #include "blamer.h"
00029 #include "errcode.h"
00030 #include "ratngs.h"
00031 #include "reject.h"
00032 #include "tesseractclass.h"
00033 #include "werd.h"
00034 
00035 #define MAX_UNDIVIDED_LENGTH 24  // Words with more blobs than this are split before recognition (see recog_word_recursive).
00036 
00037 
00038 
00039 /**********************************************************************
00040  * recog_word
00041  *
00042  * Recognize a single word: run recog_word_recursive() on it, verify that
00043  * the result is self-consistent, and record the outcome in word->tess_failed.
00044  **********************************************************************/
00045 namespace tesseract {
00046 void Tesseract::recog_word(WERD_RES *word) {
        // Recognizes one word with recog_word_recursive(), then checks that the
        // result is self-consistent, optionally overrides the permuter from a
        // dictionary lookup, and finally sets word->tess_failed.
        // word is both input and output; its chopped_word must hold >= 1 blob.
        //
        // Optional skip: with wordrec_skip_no_truth_words set, a word that has no
        // blamer bundle, or whose bundle reports IRR_NO_TRUTH, is marked failed
        // without being recognized.
00047   if (wordrec_skip_no_truth_words && (word->blamer_bundle == NULL ||
00048       word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
00049     if (classify_debug_level) tprintf("No truth for word - skipping\n");
00050     word->tess_failed = true;
00051     return;
00052   }
00053   ASSERT_HOST(!word->chopped_word->blobs.empty());
00054   recog_word_recursive(word);
00055   word->SetupBoxWord();
        // best_choice must have exactly as many entries as box_word. The details
        // are printed first so that the assert that follows is easier to diagnose.
00056   if (word->best_choice->length() != word->box_word->length()) {
00057     tprintf("recog_word ASSERT FAIL String:\"%s\"; "
00058             "Strlen=%d; #Blobs=%d\n",
00059             word->best_choice->debug_string().string(),
00060             word->best_choice->length(), word->box_word->length());
00061   }
00062   ASSERT_HOST(word->best_choice->length() == word->box_word->length());
00063   // Check that the ratings matrix size matches the sum of all the
00064   // segmentation states.
00065   if (!word->StatesAllValid()) {
00066     tprintf("Not all words have valid states relative to ratings matrix!!");
00067     word->DebugWordChoices(true, NULL);
          // Re-asserts the condition that has just been seen to fail, after the
          // debug output above has been produced.
00068     ASSERT_HOST(word->StatesAllValid());
00069   }
00070   if (tessedit_override_permuter) {
00071     /* Override the permuter type if a straight dictionary check disagrees. */
00072     uinT8 perm_type = word->best_choice->permuter();
          // Only consult dict_word() when the current permuter is not already one
          // of the three dawg types (system / freq / user).
00073     if ((perm_type != SYSTEM_DAWG_PERM) &&
00074         (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
00075       uinT8 real_dict_perm_type = dict_word(*word->best_choice);
            // Adopt the dictionary's answer only if it is one of the dawg types
            // and alpha_count() for the choice's string is greater than zero.
00076       if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
00077            (real_dict_perm_type == FREQ_DAWG_PERM) ||
00078            (real_dict_perm_type == USER_DAWG_PERM)) &&
00079           (alpha_count(word->best_choice->unichar_string().string(),
00080                        word->best_choice->unichar_lengths().string()) > 0)) {
00081         word->best_choice->set_permuter(real_dict_perm_type);  // use dict perm
00082       }
00083     }
          // Report the change (original value vs. current value) when rejection
          // debugging is enabled.
00084     if (tessedit_rejection_debug &&
00085         perm_type != word->best_choice->permuter()) {
00086       tprintf("Permuter Type Flipped from %d to %d\n",
00087               perm_type, word->best_choice->permuter());
00088     }
00089   }
00090   // Factored out from control.cpp
        // best_choice and raw_choice must be either both NULL or both non-NULL.
        // NOTE(review): best_choice has already been dereferenced several times
        // above, so the NULL tests below cannot be what protects this function
        // from a NULL best_choice - confirm whether they are still needed.
00091   ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));
        // The word counts as failed when the choice is missing, has length 0, or
        // its run of leading spaces is as long as the choice itself (an all-space
        // result). A failed word gets a reject map sized to box_word and is
        // flagged through rej_word_tess_failure().
00092   if (word->best_choice == NULL || word->best_choice->length() == 0 ||
00093       static_cast<int>(strspn(word->best_choice->unichar_string().string(),
00094                               " ")) == word->best_choice->length()) {
00095     word->tess_failed = true;
00096     word->reject_map.initialise(word->box_word->length());
00097     word->reject_map.rej_word_tess_failure();
00098   } else {
00099     word->tess_failed = false;
00100   }
00101 }
00102 
00103 
00104 /**********************************************************************
00105  * recog_word_recursive
00106  *
00107  * Recognize a word, dividing it first when it has too many blobs.
00108  * A word whose chopped_word has more than MAX_UNDIVIDED_LENGTH blobs is
       * handed to split_and_recog_word(), which calls back into this function
       * for each piece. Any other word goes straight to cc_recog(); afterwards
       * best_choice is compared with the number of blobs in rebuild_word:
       *  - a longer best_choice is marked bad and reported via tprintf,
       *  - a shorter best_choice is padded with space unichars.
00109  **********************************************************************/
00110 void Tesseract::recog_word_recursive(WERD_RES *word) {
00111   int word_length = word->chopped_word->NumBlobs();  // no of blobs
00112   if (word_length > MAX_UNDIVIDED_LENGTH) {
          // Too many blobs for one pass; none of the checks below are run for
          // the word as a whole in this case.
00113     return split_and_recog_word(word);
00114   }
00115   cc_recog(word);
00116   word_length = word->rebuild_word->NumBlobs();  // No of blobs in output.
00117 
00118   // Do sanity checks and minor fixes on best_choice.
00119   if (word->best_choice->length() > word_length) {
00120     word->best_choice->make_bad();  // should never happen
00121     tprintf("recog_word: Discarded long string \"%s\""
00122             " (%d characters vs %d blobs)\n",
00123             word->best_choice->unichar_string().string(),
00124             word->best_choice->length(), word_length);
00125     tprintf("Word is at:");
00126     word->word->bounding_box().print();
00127   }
00128   if (word->best_choice->length() < word_length) {
          // Pad with the unichar id of " " until best_choice has one entry per
          // output blob; each appended entry is given the choice's certainty.
00129     UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
00130     while (word->best_choice->length() < word_length) {
00131       word->best_choice->append_unichar_id(space_id, 1, 0.0,
00132                                            word->best_choice->certainty());
00133     }
00134   }
00135 }
00136 
00137 
00138 /**********************************************************************
00139  * split_and_recog_word
00140  *
00141  * Split the word into 2 smaller pieces at the largest gap.
00142  * Recognize the pieces and stick the results back together.
       *
       * The gap is the horizontal distance between the bounding boxes of two
       * consecutive blobs in word->chopped_word. Each piece is recognized with
       * recog_word_recursive(), so a piece that is still too long is split
       * again. On return, word holds the joined result.
00143  **********************************************************************/
00144 void Tesseract::split_and_recog_word(WERD_RES *word) {
00145   // Find the biggest blob gap in the chopped_word.
        // bestgap starts very low so that negative gaps (overlapping boxes) can
        // still be selected; on ties the earliest gap is kept.
00146   int bestgap = -MAX_INT32;
00147   int split_index = 0;
00148   for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
00149     TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
00150     TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
00151     int gap = blob_box.left() - prev_box.right();
00152     if (gap > bestgap) {
00153       bestgap = gap;
00154       split_index = b;
00155     }
00156   }
        // split_index is still 0 only if no gap was selected above.
00157   ASSERT_HOST(split_index > 0);
00158 
        // After split_word(): word keeps the blobs before split_index, word2 is a
        // newly allocated WERD_RES for the remaining blobs, and orig_bb is a copy
        // of the original blamer bundle (NULL if word had none).
00159   WERD_RES *word2 = NULL;
00160   BlamerBundle *orig_bb = NULL;
00161   split_word(word, split_index, &word2, &orig_bb);
00162 
00163   // Recognize the first part of the word.
00164   recog_word_recursive(word);
00165   // Recognize the second part of the word.
00166   recog_word_recursive(word2);
00167 
        // Merges word2's results into word, deletes word2 and, when orig_bb is
        // not NULL, installs orig_bb as word's blamer bundle.
00168   join_words(word, word2, orig_bb);
00169 }
00170 
00171 
00172 /**********************************************************************
00173  * split_word
00174  *
00175  * Split a given WERD_RES in place into two smaller words for recognition.
00176  * split_pt is the index of the first blob to go in the second word.
00177  * The underlying word is left alone, only the TWERD (and subsequent data)
00178  * are split up.  orig_blamer_bundle is set to the original blamer bundle,
00179  * and will now be owned by the caller.  New blamer bundles are forged for the
00180  * two pieces.
       *
       * word                in/out: keeps the blobs before split_pt.
       * split_pt            must satisfy 0 < split_pt < number of chopped blobs.
       * right_piece         out: newly allocated WERD_RES holding the blobs from
       *                     split_pt onwards.
       * orig_blamer_bundle  out: copy of word's blamer bundle taken before the
       *                     split, or NULL if word had none.
00181  **********************************************************************/
00182 void Tesseract::split_word(WERD_RES *word,
00183                            int split_pt,
00184                            WERD_RES **right_piece,
00185                            BlamerBundle **orig_blamer_bundle) const {
00186   ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
00187 
00188   // Save a copy of the blamer bundle so we can try to reconstruct it below.
00189   BlamerBundle *orig_bb =
00190       word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : NULL;
00191 
        // word2 starts out as a copy of the whole word; its copied chopped_word
        // is replaced further down.
00192   WERD_RES *word2 = new WERD_RES(*word);
00193 
00194   // blow away the copied chopped_word, as we want to work with
00195   // the blobs from the input chopped_word so seam_arrays can be merged.
00196   TWERD *chopped = word->chopped_word;
00197   TWERD *chopped2 = new TWERD;
        // Hand the blob pointers from split_pt onwards to chopped2, then shorten
        // the original vector to its first split_pt entries.
00198   chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
00199   for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
00200     chopped2->blobs.push_back(chopped->blobs[i]);
00201   }
00202   chopped->blobs.truncate(split_pt);
        // Detach chopped from word for the duration of ClearResults() below; it
        // is re-attached afterwards. (Presumably ClearResults() would otherwise
        // free it - confirm against WERD_RES.)
00203   word->chopped_word = NULL;
00204   delete word2->chopped_word;
00205   word2->chopped_word = NULL;
00206 
        // Fetched before ClearResults() is called. NOTE(review): this local
        // reference shadows the `unicharset` that other methods in this file
        // use unqualified.
00207   const UNICHARSET &unicharset = *word->uch_set;
00208   word->ClearResults();
00209   word2->ClearResults();
00210   word->chopped_word = chopped;
00211   word2->chopped_word = chopped2;
00212   word->SetupBasicsFromChoppedWord(unicharset);
00213   word2->SetupBasicsFromChoppedWord(unicharset);
00214 
00215   // Try to adjust the blamer bundle.
00216   if (orig_bb != NULL) {
00217     // TODO(rays) Looks like a leak to me.
00218     // orig_bb should take, rather than copy.
          // Each piece gets a fresh bundle; SplitBundle() is given the right edge
          // of the last left-hand blob and the left edge of the first right-hand
          // blob, plus the two bundles to fill.
00219     word->blamer_bundle = new BlamerBundle();
00220     word2->blamer_bundle = new BlamerBundle();
00221     orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
00222                          word2->chopped_word->blobs[0]->bounding_box().left(),
00223                          wordrec_debug_blamer,
00224                          word->blamer_bundle, word2->blamer_bundle);
00225   }
00226 
        // Publish the results through the out-parameters.
00227   *right_piece = word2;
00228   *orig_blamer_bundle = orig_bb;
00229 }
00230 
00231 
00232 /**********************************************************************
00233  * join_words
00234  *
00235  * The opposite of split_word():
00236  *  join word2 (including any recognized data / seam array / etc)
00237  *  onto the right of word and then delete word2.
00238  *  Also, if orig_bb is provided, stitch it back into word.
       *
       * word     receives the combined blobs, seams, widths/gaps, ratings
       *          matrix, best_state, raw_choice and best_choices of both pieces.
       * word2    is deleted before returning and must not be used afterwards.
       * orig_bb  may be NULL; otherwise JoinBlames() is called on it with the
       *          two piece bundles and it then replaces word->blamer_bundle.
00239  **********************************************************************/
00240 void Tesseract::join_words(WERD_RES *word,
00241                            WERD_RES *word2,
00242                            BlamerBundle *orig_bb) const {
        // Boxes on either side of the join, captured before the blob vectors are
        // merged below.
00243   TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
00244   TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
00245   // Tack the word2 outputs onto the end of the word outputs.
00246   word->chopped_word->blobs += word2->chopped_word->blobs;
00247   word->rebuild_word->blobs += word2->rebuild_word->blobs;
        // word2's vectors are emptied once their contents have been appended
        // (presumably so that deleting word2 does not free the blobs - confirm).
00248   word2->chopped_word->blobs.clear();
00249   word2->rebuild_word->blobs.clear();
        // Location for the joining seam: x is halfway between the two facing box
        // edges, y is the mean of both boxes' top and bottom values.
00250   TPOINT split_pt;
00251   split_pt.x = (prev_box.right() + blob_box.left()) / 2;
00252   split_pt.y = (prev_box.top() + prev_box.bottom() +
00253                 blob_box.top() + blob_box.bottom()) / 4;
00254   // Move the word2 seams onto the end of the word1 seam_array.
00255   // Since the seam list is one element short, an empty seam marking the
00256   // end of the last blob in the first word is needed first.
00257   word->seam_array.push_back(new SEAM(0.0f, split_pt, NULL, NULL, NULL));
00258   word->seam_array += word2->seam_array;
00259   word2->seam_array.truncate(0);
00260   // Fix widths and gaps.
00261   word->blob_widths += word2->blob_widths;
00262   word->blob_gaps += word2->blob_gaps;
00263   // Fix the ratings matrix.
        // The dimensions are recorded first so the assert can check that the
        // combined matrix has dimension rat1 + rat2.
00264   int rat1 = word->ratings->dimension();
00265   int rat2 = word2->ratings->dimension();
00266   word->ratings->AttachOnCorner(word2->ratings);
00267   ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
00268   word->best_state += word2->best_state;
00269   // Append the word choices.
00270   *word->raw_choice += *word2->raw_choice;
00271 
00272   // How many alt choices from each should we try to get?
00273   const int kAltsPerPiece = 2;
00274   // When do we start throwing away extra alt choices?
00275   const int kTooManyAltChoices = 100;
00276 
00277   // Construct the cartesian product of the best_choices of word(1) and word2.
00278   WERD_CHOICE_LIST joined_choices;
00279   WERD_CHOICE_IT jc_it(&joined_choices);
00280   WERD_CHOICE_IT bc1_it(&word->best_choices);
00281   WERD_CHOICE_IT bc2_it(&word2->best_choices);
        // The running total starts at the number of word1 choices, which remain
        // in word->best_choices and are extended in place further down.
00282   int num_word1_choices = word->best_choices.length();
00283   int total_joined_choices = num_word1_choices;
00284   // Nota Bene: For the main loop here, we operate only on the 2nd and greater
00285   // word2 choices, and put them in the joined_choices list. The 1st word2
00286   // choice gets added to the original word1 choices in-place after we have
00287   // finished with them.
        // The initial forward() skips the first word2 choice; the loop stops when
        // the iterator is back at the first element.
00288   int bc2_index = 1;
00289   for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
          // Once the total has reached kTooManyAltChoices, stop as soon as the
          // word2 index is beyond kAltsPerPiece.
00290     if (total_joined_choices >= kTooManyAltChoices &&
00291         bc2_index > kAltsPerPiece)
00292       break;
00293     int bc1_index = 0;
00294     for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
00295         ++bc1_index, bc1_it.forward()) {
            // The same limit applies to the word1 index within this pass.
00296       if (total_joined_choices >= kTooManyAltChoices &&
00297           bc1_index > kAltsPerPiece)
00298         break;
            // New combined choice: a copy of the word1 choice with the current
            // word2 choice appended.
00299       WERD_CHOICE *wc = new WERD_CHOICE(*bc1_it.data());
00300       *wc += *bc2_it.data();
00301       jc_it.add_after_then_move(wc);
00302       ++total_joined_choices;
00303     }
00304   }
00305   // Now that we've filled in as many alternates as we want, paste the best
00306   // choice for word2 onto the original word alt_choices.
00307   bc1_it.move_to_first();
00308   bc2_it.move_to_first();
00309   for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
00310     *bc1_it.data() += *bc2_it.data();
00311   }
        // Append the newly built combinations after the last original choice.
00312   bc1_it.move_to_last();
00313   bc1_it.add_list_after(&joined_choices);
00314 
00315   // Restore the pointer to original blamer bundle and combine blamer
00316   // information recorded in the splits.
00317   if (orig_bb != NULL) {
00318     orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
00319                         wordrec_debug_blamer);
          // word's per-piece bundle is freed here before orig_bb takes its place.
00320     delete word->blamer_bundle;
00321     word->blamer_bundle = orig_bb;
00322   }
        // Rebuild box_word for the joined word and size the reject map to match.
00323   word->SetupBoxWord();
00324   word->reject_map.initialise(word->box_word->length());
00325   delete word2;
00326 }
00327 
00328 
00329 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines