tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccstruct/pageres.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        pageres.cpp  (Formerly page_res.c)
00003  * Description: Results classes used by control.c
00004  * Author:              Phil Cheatle
00005  * Created:     Tue Sep 22 08:42:49 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 #include          <stdlib.h>
00020 #ifdef __UNIX__
00021 #include          <assert.h>
00022 #endif
00023 #include          "blamer.h"
00024 #include          "pageres.h"
00025 #include          "blobs.h"
00026 
// Expand the list-support macros for the result classes.
// NOTE(review): the macro definitions are not visible in this file;
// presumably they emit the out-of-line list helper code for each class --
// confirm against the macro definitions.
ELISTIZE (BLOCK_RES)
CLISTIZE (BLOCK_RES) ELISTIZE (ROW_RES) ELISTIZE (WERD_RES)

// Gain factor for computing thresholds that determine the ambiguity of a word.
// Multiplies the difference of adjustment factors in StopperAmbigThreshold.
static const double kStopperAmbiguityThresholdGain = 8.0;
// Constant offset for computing thresholds that determine the ambiguity of a
// word. Subtracted from the scaled difference in StopperAmbigThreshold.
static const double kStopperAmbiguityThresholdOffset = 1.5;
// Max number of broken pieces to associate.
// Passed as the second argument of the ratings MATRIX constructor in
// WERD_RES::SetupForRecognition.
const int kWordrecMaxNumJoinChunks = 4;
00037 
00038 // Computes and returns a threshold of certainty difference used to determine
00039 // which words to keep, based on the adjustment factors of the two words.
00040 // TODO(rays) This is horrible. Replace with an enhance params training model.
00041 static double StopperAmbigThreshold(double f1, double f2) {
00042   return (f2 - f1) * kStopperAmbiguityThresholdGain -
00043       kStopperAmbiguityThresholdOffset;
00044 }
00045 
00046 /*************************************************************************
00047  * PAGE_RES::PAGE_RES
00048  *
00049  * Constructor for page results
00050  *************************************************************************/
00051 PAGE_RES::PAGE_RES(
00052     BLOCK_LIST *the_block_list,
00053     WERD_CHOICE **prev_word_best_choice_ptr) {
00054   Init();
00055   BLOCK_IT block_it(the_block_list);
00056   BLOCK_RES_IT block_res_it(&block_res_list);
00057   for (block_it.mark_cycle_pt();
00058        !block_it.cycled_list(); block_it.forward()) {
00059     block_res_it.add_to_end(new BLOCK_RES(block_it.data()));
00060   }
00061   prev_word_best_choice = prev_word_best_choice_ptr;
00062 }
00063 
00064 /*************************************************************************
00065  * BLOCK_RES::BLOCK_RES
00066  *
00067  * Constructor for BLOCK results
00068  *************************************************************************/
00069 
00070 BLOCK_RES::BLOCK_RES(BLOCK *the_block) {
00071   ROW_IT row_it (the_block->row_list ());
00072   ROW_RES_IT row_res_it(&row_res_list);
00073 
00074   char_count = 0;
00075   rej_count = 0;
00076   font_class = -1;               //not assigned
00077   x_height = -1.0;
00078   font_assigned = FALSE;
00079   bold = FALSE;
00080   italic = FALSE;
00081   row_count = 0;
00082 
00083   block = the_block;
00084 
00085   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00086     row_res_it.add_to_end(new ROW_RES(row_it.data()));
00087   }
00088 }
00089 
00090 
00091 /*************************************************************************
00092  * ROW_RES::ROW_RES
00093  *
00094  * Constructor for ROW results
00095  *************************************************************************/
00096 
00097 ROW_RES::ROW_RES(ROW *the_row) {
00098   WERD_IT word_it(the_row->word_list());
00099   WERD_RES_IT word_res_it(&word_res_list);
00100   WERD_RES *combo = NULL;        // current combination of fuzzies
00101   WERD_RES *word_res;            // current word
00102   WERD *copy_word;
00103 
00104   char_count = 0;
00105   rej_count = 0;
00106   whole_word_rej_count = 0;
00107 
00108   row = the_row;
00109   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
00110     word_res = new WERD_RES(word_it.data());
00111     word_res->x_height = the_row->x_height();
00112 
00113     if (word_res->word->flag(W_FUZZY_NON)) {
00114       ASSERT_HOST(combo != NULL);
00115       word_res->part_of_combo = TRUE;
00116       combo->copy_on(word_res);
00117     }
00118     if (word_it.data_relative(1)->flag(W_FUZZY_NON)) {
00119       if (combo == NULL) {
00120         copy_word = new WERD;
00121                                  //deep copy
00122         *copy_word = *(word_it.data());
00123         combo = new WERD_RES(copy_word);
00124         combo->x_height = the_row->x_height();
00125         combo->combination = TRUE;
00126         word_res_it.add_to_end(combo);
00127       }
00128       word_res->part_of_combo = TRUE;
00129     } else {
00130       combo = NULL;
00131     }
00132     word_res_it.add_to_end(word_res);
00133   }
00134 }
00135 
00136 
00137 WERD_RES& WERD_RES::operator=(const WERD_RES & source) {
00138   this->ELIST_LINK::operator=(source);
00139   Clear();
00140   if (source.combination) {
00141     word = new WERD;
00142     *word = *(source.word);      // deep copy
00143   } else {
00144     word = source.word;          // pt to same word
00145   }
00146   if (source.bln_boxes != NULL)
00147     bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
00148   if (source.chopped_word != NULL)
00149     chopped_word = new TWERD(*source.chopped_word);
00150   if (source.rebuild_word != NULL)
00151     rebuild_word = new TWERD(*source.rebuild_word);
00152   // TODO(rays) Do we ever need to copy the seam_array?
00153   blob_row = source.blob_row;
00154   denorm = source.denorm;
00155   if (source.box_word != NULL)
00156     box_word = new tesseract::BoxWord(*source.box_word);
00157   best_state = source.best_state;
00158   correct_text = source.correct_text;
00159   blob_widths = source.blob_widths;
00160   blob_gaps = source.blob_gaps;
00161   // None of the uses of operator= require the ratings matrix to be copied,
00162   // so don't as it would be really slow.
00163 
00164   // Copy the cooked choices.
00165   WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&source.best_choices));
00166   WERD_CHOICE_IT wc_dest_it(&best_choices);
00167   for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
00168     const WERD_CHOICE *choice = wc_it.data();
00169     wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
00170   }
00171   if (!wc_dest_it.empty()) {
00172     wc_dest_it.move_to_first();
00173     best_choice = wc_dest_it.data();
00174     best_choice_fontinfo_ids = source.best_choice_fontinfo_ids;
00175   } else {
00176     best_choice = NULL;
00177     if (!best_choice_fontinfo_ids.empty()) {
00178       best_choice_fontinfo_ids.clear();
00179     }
00180   }
00181 
00182   if (source.raw_choice != NULL) {
00183     raw_choice = new WERD_CHOICE(*source.raw_choice);
00184   } else {
00185     raw_choice = NULL;
00186   }
00187   if (source.ep_choice != NULL) {
00188     ep_choice = new WERD_CHOICE(*source.ep_choice);
00189   } else {
00190     ep_choice = NULL;
00191   }
00192   reject_map = source.reject_map;
00193   combination = source.combination;
00194   part_of_combo = source.part_of_combo;
00195   CopySimpleFields(source);
00196   if (source.blamer_bundle != NULL) {
00197     blamer_bundle =  new BlamerBundle(*(source.blamer_bundle));
00198   }
00199   return *this;
00200 }
00201 
00202 // Copies basic fields that don't involve pointers that might be useful
00203 // to copy when making one WERD_RES from another.
00204 void WERD_RES::CopySimpleFields(const WERD_RES& source) {
00205   tess_failed = source.tess_failed;
00206   tess_accepted = source.tess_accepted;
00207   tess_would_adapt = source.tess_would_adapt;
00208   done = source.done;
00209   unlv_crunch_mode = source.unlv_crunch_mode;
00210   small_caps = source.small_caps;
00211   italic = source.italic;
00212   bold = source.bold;
00213   fontinfo = source.fontinfo;
00214   fontinfo2 = source.fontinfo2;
00215   fontinfo_id_count = source.fontinfo_id_count;
00216   fontinfo_id2_count = source.fontinfo_id2_count;
00217   x_height = source.x_height;
00218   caps_height = source.caps_height;
00219   guessed_x_ht = source.guessed_x_ht;
00220   guessed_caps_ht = source.guessed_caps_ht;
00221   reject_spaces = source.reject_spaces;
00222   uch_set = source.uch_set;
00223   tesseract = source.tesseract;
00224 }
00225 
00226 // Initializes a blank (default constructed) WERD_RES from one that has
00227 // already been recognized.
00228 // Use SetupFor*Recognition afterwards to complete the setup and make
00229 // it ready for a retry recognition.
00230 void WERD_RES::InitForRetryRecognition(const WERD_RES& source) {
00231   word = source.word;
00232   CopySimpleFields(source);
00233   if (source.blamer_bundle != NULL) {
00234     blamer_bundle = new BlamerBundle();
00235     blamer_bundle->CopyTruth(*source.blamer_bundle);
00236   }
00237 }
00238 
00239 // Sets up the members used in recognition: bln_boxes, chopped_word,
00240 // seam_array, denorm.  Returns false if
00241 // the word is empty and sets up fake results.  If use_body_size is
00242 // true and row->body_size is set, then body_size will be used for
00243 // blob normalization instead of xheight + ascrise. This flag is for
00244 // those languages that are using CJK pitch model and thus it has to
00245 // be true if and only if tesseract->textord_use_cjk_fp_model is
00246 // true.
00247 // If allow_detailed_fx is true, the feature extractor will receive fine
00248 // precision outline information, allowing smoother features and better
00249 // features on low resolution images.
00250 // The norm_mode_hint sets the default mode for normalization in absence
00251 // of any of the above flags.
00252 // norm_box is used to override the word bounding box to determine the
00253 // normalization scale and offset.
00254 // Returns false if the word is empty and sets up fake results.
00255 bool WERD_RES::SetupForRecognition(const UNICHARSET& unicharset_in,
00256                                    tesseract::Tesseract* tess, Pix* pix,
00257                                    int norm_mode,
00258                                    const TBOX* norm_box,
00259                                    bool numeric_mode,
00260                                    bool use_body_size,
00261                                    bool allow_detailed_fx,
00262                                    ROW *row, const BLOCK* block) {
00263   tesseract::OcrEngineMode norm_mode_hint =
00264       static_cast<tesseract::OcrEngineMode>(norm_mode);
00265   tesseract = tess;
00266   POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
00267   if ((norm_mode_hint != tesseract::OEM_CUBE_ONLY &&
00268        word->cblob_list()->empty()) || (pb != NULL && !pb->IsText())) {
00269     // Empty words occur when all the blobs have been moved to the rej_blobs
00270     // list, which seems to occur frequently in junk.
00271     SetupFake(unicharset_in);
00272     word->set_flag(W_REP_CHAR, false);
00273     return false;
00274   }
00275   ClearResults();
00276   SetupWordScript(unicharset_in);
00277   chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
00278   float word_xheight = use_body_size && row != NULL && row->body_size() > 0.0f
00279                      ? row->body_size() : x_height;
00280   chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
00281                             word_xheight, numeric_mode, norm_mode_hint,
00282                             norm_box, &denorm);
00283   blob_row = row;
00284   SetupBasicsFromChoppedWord(unicharset_in);
00285   SetupBlamerBundle();
00286   int num_blobs = chopped_word->NumBlobs();
00287   ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);
00288   tess_failed = false;
00289   return true;
00290 }
00291 
// Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
// accumulators from a made chopped word.  We presume the fields are already
// empty.
// unicharset_in is not used by this function body.
void WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) {
  // Box representation of the (normalized) chopped word.
  bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word);
  // Initial seam list for the chopped word.
  start_seam_list(chopped_word, &seam_array);
  // Recompute blob_widths and blob_gaps from chopped_word.
  SetupBlobWidthsAndGaps();
  ClearWordChoices();
}
00301 
00302 // Sets up the members used in recognition for an empty recognition result:
00303 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
00304 void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
00305   ClearResults();
00306   SetupWordScript(unicharset_in);
00307   chopped_word = new TWERD;
00308   rebuild_word = new TWERD;
00309   bln_boxes = new tesseract::BoxWord;
00310   box_word = new tesseract::BoxWord;
00311   int blob_count = word->cblob_list()->length();
00312   if (blob_count > 0) {
00313     BLOB_CHOICE** fake_choices = new BLOB_CHOICE*[blob_count];
00314     // For non-text blocks, just pass any blobs through to the box_word
00315     // and call the word failed with a fake classification.
00316     C_BLOB_IT b_it(word->cblob_list());
00317     int blob_id = 0;
00318     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00319       TBOX box = b_it.data()->bounding_box();
00320       box_word->InsertBox(box_word->length(), box);
00321       fake_choices[blob_id++] = new BLOB_CHOICE(0, 10.0f, -1.0f,
00322                                                 -1, -1, -1, 0, 0, 0, BCC_FAKE);
00323     }
00324     FakeClassifyWord(blob_count, fake_choices);
00325     delete [] fake_choices;
00326   } else {
00327     WERD_CHOICE* word = new WERD_CHOICE(&unicharset_in);
00328     word->make_bad();
00329     LogNewRawChoice(word);
00330     // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.
00331     LogNewCookedChoice(1, false, word);
00332   }
00333   tess_failed = true;
00334 }
00335 
00336 void WERD_RES::SetupWordScript(const UNICHARSET& uch) {
00337   uch_set = &uch;
00338   int script = uch.default_sid();
00339   word->set_script_id(script);
00340   word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight());
00341   word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
00342 }
00343 
00344 // Sets up the blamer_bundle if it is not null, using the initialized denorm.
00345 void WERD_RES::SetupBlamerBundle() {
00346   if (blamer_bundle != NULL) {
00347     blamer_bundle->SetupNormTruthWord(denorm);
00348   }
00349 }
00350 
00351 // Computes the blob_widths and blob_gaps from the chopped_word.
00352 void WERD_RES::SetupBlobWidthsAndGaps() {
00353   blob_widths.truncate(0);
00354   blob_gaps.truncate(0);
00355   int num_blobs = chopped_word->NumBlobs();
00356   for (int b = 0; b < num_blobs; ++b) {
00357     TBLOB *blob = chopped_word->blobs[b];
00358     TBOX box = blob->bounding_box();
00359     blob_widths.push_back(box.width());
00360     if (b + 1 < num_blobs) {
00361       blob_gaps.push_back(
00362           chopped_word->blobs[b + 1]->bounding_box().left() - box.right());
00363     }
00364   }
00365 }
00366 
00367 // Updates internal data to account for a new SEAM (chop) at the given
00368 // blob_number. Fixes the ratings matrix and states in the choices, as well
00369 // as the blob widths and gaps.
00370 void WERD_RES::InsertSeam(int blob_number, SEAM* seam) {
00371   // Insert the seam into the SEAMS array.
00372   insert_seam(chopped_word, blob_number, seam, &seam_array);
00373   if (ratings != NULL) {
00374     // Expand the ratings matrix.
00375     ratings = ratings->ConsumeAndMakeBigger(blob_number);
00376     // Fix all the segmentation states.
00377     if (raw_choice != NULL)
00378       raw_choice->UpdateStateForSplit(blob_number);
00379     WERD_CHOICE_IT wc_it(&best_choices);
00380     for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
00381       WERD_CHOICE* choice = wc_it.data();
00382       choice->UpdateStateForSplit(blob_number);
00383     }
00384     SetupBlobWidthsAndGaps();
00385   }
00386 }
00387 
00388 // Returns true if all the word choices except the first have adjust_factors
00389 // worse than the given threshold.
00390 bool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const {
00391   // The choices are not changed by this iteration.
00392   WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
00393   for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
00394     WERD_CHOICE* choice = wc_it.data();
00395     if (choice->adjust_factor() <= threshold)
00396       return false;
00397   }
00398   return true;
00399 }
00400 
00401 // Returns true if the current word is ambiguous (by number of answers or
00402 // by dangerous ambigs.)
00403 bool WERD_RES::IsAmbiguous() {
00404   return !best_choices.singleton() || best_choice->dangerous_ambig_found();
00405 }
00406 
00407 // Returns true if the ratings matrix size matches the sum of each of the
00408 // segmentation states.
00409 bool WERD_RES::StatesAllValid() {
00410   int ratings_dim = ratings->dimension();
00411   if (raw_choice->TotalOfStates() != ratings_dim) {
00412     tprintf("raw_choice has total of states = %d vs ratings dim of %d\n",
00413             raw_choice->TotalOfStates(), ratings_dim);
00414     return false;
00415   }
00416   WERD_CHOICE_IT it(&best_choices);
00417   int index = 0;
00418   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
00419     WERD_CHOICE* choice = it.data();
00420     if (choice->TotalOfStates() != ratings_dim) {
00421       tprintf("Cooked #%d has total of states = %d vs ratings dim of %d\n",
00422               choice->TotalOfStates(), ratings_dim);
00423       return false;
00424     }
00425   }
00426   return true;
00427 }
00428 
00429 // Prints a list of words found if debug is true or the word result matches
00430 // the word_to_debug.
00431 void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) {
00432   if (debug ||
00433       (word_to_debug != NULL && *word_to_debug != '\0' && best_choice != NULL &&
00434        best_choice->unichar_string() == STRING(word_to_debug))) {
00435     if (raw_choice != NULL)
00436       raw_choice->print("\nBest Raw Choice");
00437 
00438     WERD_CHOICE_IT it(&best_choices);
00439     int index = 0;
00440     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
00441       WERD_CHOICE* choice = it.data();
00442       STRING label;
00443       label.add_str_int("\nCooked Choice #", index);
00444       choice->print(label.string());
00445     }
00446   }
00447 }
00448 
// Removes from best_choices all choices which are not within a reasonable
// range of the best choice.
// A choice is removed (and deleted) as soon as one of its blobs disagrees
// with the overlapping blob of best_choice and its certainty falls short of
// the best blob's certainty by more than the StopperAmbigThreshold computed
// from the two adjust factors.
// Does nothing if there is no best_choice or only one choice.
// debug_level >= 2 prints the best choice and every discarded choice.
// TODO(rays) incorporate the information used here into the params training
// re-ranker, in place of this heuristic that is based on the previous
// adjustment factor.
void WERD_RES::FilterWordChoices(int debug_level) {
  if (best_choice == NULL || best_choices.singleton())
    return;

  if (debug_level >= 2)
    best_choice->print("\nFiltering against best choice");
  WERD_CHOICE_IT it(&best_choices);
  // index counts the alternatives visited, starting at 0 for the first
  // choice after the head of the list; it is only used in the debug label.
  int index = 0;
  // Skip the first element and stop when the iterator wraps back to it.
  for (it.forward(); !it.at_first(); it.forward(), ++index) {
    WERD_CHOICE* choice = it.data();
    float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
                                            choice->adjust_factor());
    // i, j index the blob choice in choice, best_choice.
    // chunk is an index into the chopped_word blobs (AKA chunks).
    // Since the two words may use different segmentations of the chunks, we
    // iterate over the chunks to find out whether a comparable blob
    // classification is much worse than the best result.
    int i = 0, j = 0, chunk = 0;
    // Each iteration of the while deals with 1 chunk. On entry choice_chunk
    // and best_chunk are the indices of the first chunk in the NEXT blob,
    // i.e. we don't have to increment i, j while chunk < choice_chunk and
    // best_chunk respectively.
    int choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
    while (i < choice->length() && j < best_choice->length()) {
      if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
          choice->certainty(i) - best_choice->certainty(j) < threshold) {
        if (debug_level >= 2) {
          STRING label;
          label.add_str_int("\nDiscarding bad choice #", index);
          choice->print(label.string());
          tprintf("i %d j %d Chunk %d Choice->Blob[i].Certainty %.4g"
              " BestChoice->ChunkCertainty[Chunk] %g Threshold %g\n",
              i, j, chunk, choice->certainty(i),
              best_choice->certainty(j), threshold);
        }
        // Remove the choice from the list and free it; the break leaves the
        // chunk loop so the outer loop moves on to the next choice.
        delete it.extract();
        break;
      }
      ++chunk;
      // If needed, advance choice_chunk to keep up with chunk.
      while (choice_chunk < chunk && ++i < choice->length())
        choice_chunk += choice->state(i);
      // If needed, advance best_chunk to keep up with chunk.
      while (best_chunk < chunk && ++j < best_choice->length())
        best_chunk += best_choice->state(j);
    }
  }
}
00502 
00503 void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
00504                                          float min_rating,
00505                                          float max_rating,
00506                                          float rating_margin,
00507                                          float* thresholds) {
00508   int chunk = 0;
00509   int end_chunk = best_choice->state(0);
00510   int end_raw_chunk = raw_choice->state(0);
00511   int raw_blob = 0;
00512   for (int i = 0; i < best_choice->length(); i++, thresholds++) {
00513     float avg_rating = 0.0f;
00514     int num_error_chunks = 0;
00515 
00516     // For each chunk in best choice blob i, count non-matching raw results.
00517     while (chunk < end_chunk) {
00518       if (chunk >= end_raw_chunk) {
00519         ++raw_blob;
00520         end_raw_chunk += raw_choice->state(raw_blob);
00521       }
00522       if (best_choice->unichar_id(i) !=
00523           raw_choice->unichar_id(raw_blob)) {
00524         avg_rating += raw_choice->certainty(raw_blob);
00525         ++num_error_chunks;
00526       }
00527       ++chunk;
00528     }
00529 
00530     if (num_error_chunks > 0) {
00531       avg_rating /= num_error_chunks;
00532       *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
00533     } else {
00534       *thresholds = max_rating;
00535     }
00536 
00537     if (*thresholds > max_rating)
00538       *thresholds = max_rating;
00539     if (*thresholds < min_rating)
00540       *thresholds = min_rating;
00541   }
00542 }
00543 
00544 // Saves a copy of the word_choice if it has the best unadjusted rating.
00545 // Returns true if the word_choice was the new best.
00546 bool WERD_RES::LogNewRawChoice(WERD_CHOICE* word_choice) {
00547   if (raw_choice == NULL || word_choice->rating() < raw_choice->rating()) {
00548     delete raw_choice;
00549     raw_choice = new WERD_CHOICE(*word_choice);
00550     raw_choice->set_permuter(TOP_CHOICE_PERM);
00551     return true;
00552   }
00553   return false;
00554 }
00555 
// Consumes word_choice by adding it to best_choices, (taking ownership) if
// the certainty for word_choice is some distance of the best choice in
// best_choices, or by deleting the word_choice and returning false.
// The best_choices list is kept in sorted order by rating. Duplicates are
// removed, and the list is kept no longer than max_num_choices in length.
// If debug is true, the outcome (discarded / new best / secondary / poor)
// is printed.
// Returns true if the word_choice is still a valid pointer.
bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
                                  WERD_CHOICE* word_choice) {
  if (best_choice != NULL) {
    // Throw out obviously bad choices to save some work.
    // TODO(rays) Get rid of this! This piece of code produces different
    // results according to the order in which words are found, which is an
    // undesirable behavior. It would be better to keep all the choices and
    // prune them later when more information is available.
    float max_certainty_delta =
        StopperAmbigThreshold(best_choice->adjust_factor(),
                              word_choice->adjust_factor());
    // Cap the allowed delta at -offset.
    if (max_certainty_delta > -kStopperAmbiguityThresholdOffset)
      max_certainty_delta = -kStopperAmbiguityThresholdOffset;
    if (word_choice->certainty() - best_choice->certainty() <
        max_certainty_delta) {
      if (debug) {
        STRING bad_string;
        word_choice->string_and_lengths(&bad_string, NULL);
        tprintf("Discarding choice \"%s\" with an overly low certainty"
                " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
                bad_string.string(), word_choice->certainty(),
                best_choice->certainty(),
                max_certainty_delta + best_choice->certainty());
      }
      delete word_choice;
      return false;
    }
  }

  // Insert in the list in order of increasing rating, but knock out worse
  // string duplicates.
  WERD_CHOICE_IT it(&best_choices);
  const STRING& new_str = word_choice->unichar_string();
  bool inserted = false;
  // Number of choices kept so far (including word_choice once inserted).
  int num_choices = 0;
  if (!it.empty()) {
    do {
      WERD_CHOICE* choice = it.data();
      if (choice->rating() > word_choice->rating() && !inserted) {
        // Time to insert.
        it.add_before_stay_put(word_choice);
        inserted = true;
        if (num_choices == 0)
          best_choice = word_choice;  // This is the new best.
        ++num_choices;
      }
      if (choice->unichar_string() == new_str) {
        if (inserted) {
          // New is better.
          delete it.extract();
        } else {
          // Old is better.
          if (debug) {
            tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
                    new_str.string(), word_choice->rating(), choice->rating());
          }
          delete word_choice;
          return false;
        }
      } else {
        ++num_choices;
        // Drop existing choices that fall beyond the allowed list length.
        if (num_choices > max_num_choices)
          delete it.extract();
      }
      it.forward();
    } while (!it.at_first());
  }
  // Not better than any existing choice: append if there is still room.
  if (!inserted && num_choices < max_num_choices) {
    it.add_to_end(word_choice);
    inserted = true;
    if (num_choices == 0)
      best_choice = word_choice;  // This is the new best.
  }
  if (debug) {
    if (inserted)
      tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
    else
      tprintf("Poor");
    word_choice->print(" Word Choice");
  }
  if (!inserted) {
    delete word_choice;
    return false;
  }
  return true;
}
00648 
00649 
// Transfers ownership of the object pointed to by *src to *dest.
// Whatever *dest owned before is deleted first, and *src is set to NULL
// afterwards so that only *dest refers to the object.
template<class T> static void MovePointerData(T** dest, T**src) {
  T*& target = *dest;
  T*& origin = *src;
  delete target;
  target = origin;
  origin = NULL;
}
00657 
00658 // Prints a brief list of all the best choices.
00659 void WERD_RES::PrintBestChoices() const {
00660   STRING alternates_str;
00661   WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
00662   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00663     if (!it.at_first()) alternates_str += "\", \"";
00664     alternates_str += it.data()->unichar_string();
00665   }
00666   tprintf("Alternates for \"%s\": {\"%s\"}\n",
00667           best_choice->unichar_string().string(), alternates_str.string());
00668 }
00669 
00670 // Returns the sum of the widths of the blob between start_blob and last_blob
00671 // inclusive.
00672 int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) {
00673   int result = 0;
00674   for (int b = start_blob; b <= last_blob; ++b) {
00675     result += blob_widths[b];
00676     if (b < last_blob)
00677       result += blob_gaps[b];
00678   }
00679   return result;
00680 }
00681 // Returns the width of a gap between the specified blob and the next one.
00682 int WERD_RES::GetBlobsGap(int blob_index) {
00683   if (blob_index < 0 || blob_index >= blob_gaps.size())
00684     return 0;
00685   return blob_gaps[blob_index];
00686 }
00687 
00688 // Returns the BLOB_CHOICE corresponding to the given index in the
00689 // best choice word taken from the appropriate cell in the ratings MATRIX.
00690 // Borrowed pointer, so do not delete. May return NULL if there is no
00691 // BLOB_CHOICE matching the unichar_id at the given index.
00692 BLOB_CHOICE* WERD_RES::GetBlobChoice(int index) const {
00693   if (index < 0 || index >= best_choice->length()) return NULL;
00694   BLOB_CHOICE_LIST* choices = GetBlobChoices(index);
00695   return FindMatchingChoice(best_choice->unichar_id(index), choices);
00696 }
00697 
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the
// best choice word taken from the appropriate cell in the ratings MATRIX.
// Borrowed pointer, so do not delete.
// No range check is done here: index is passed straight through to
// best_choice->blob_choices together with the ratings matrix.
BLOB_CHOICE_LIST* WERD_RES::GetBlobChoices(int index) const {
  return best_choice->blob_choices(index, ratings);
}
00704 
// Moves the results fields from word to this. This takes ownership of all
// the data, so src can be destructed.
// Note that the parameter named word hides the member of the same name
// inside this function; every use of word below refers to the parameter.
void WERD_RES::ConsumeWordResults(WERD_RES* word) {
  denorm = word->denorm;
  blob_row = word->blob_row;
  // Owned pointers: delete ours, take theirs, and null theirs.
  MovePointerData(&chopped_word, &word->chopped_word);
  MovePointerData(&rebuild_word, &word->rebuild_word);
  MovePointerData(&box_word, &word->box_word);
  // Release our seams, take over word's, and empty word's array so that
  // the seams are referenced from one place only.
  seam_array.delete_data_pointers();
  seam_array = word->seam_array;
  word->seam_array.clear();
  best_state.move(&word->best_state);
  correct_text.move(&word->correct_text);
  blob_widths.move(&word->blob_widths);
  blob_gaps.move(&word->blob_gaps);
  // Free the contents of our ratings matrix before it is replaced.
  if (ratings != NULL) ratings->delete_matrix_pointers();
  MovePointerData(&ratings, &word->ratings);
  // best_choice points into best_choices, whose elements are moved below.
  best_choice = word->best_choice;
  MovePointerData(&raw_choice, &word->raw_choice);
  best_choices.clear();
  WERD_CHOICE_IT wc_it(&best_choices);
  wc_it.add_list_after(&word->best_choices);
  reject_map = word->reject_map;
  if (word->blamer_bundle != NULL) {
    // The results are copied into our existing bundle, so one must exist.
    assert(blamer_bundle != NULL);
    blamer_bundle->CopyResults(*(word->blamer_bundle));
  }
  CopySimpleFields(*word);
}
00734 
00735 // Replace the best choice and rebuild box word.
00736 // choice must be from the current best_choices list.
00737 void WERD_RES::ReplaceBestChoice(WERD_CHOICE* choice) {
00738   best_choice = choice;
00739   RebuildBestState();
00740   SetupBoxWord();
00741   // Make up a fake reject map of the right length to keep the
00742   // rejection pass happy.
00743   reject_map.initialise(best_state.length());
00744   done = tess_accepted = tess_would_adapt = true;
00745   SetScriptPositions();
00746 }
00747 
00748 // Builds the rebuild_word and sets the best_state from the chopped_word and
00749 // the best_choice->state.
00750 void WERD_RES::RebuildBestState() {
00751   ASSERT_HOST(best_choice != NULL);
00752   if (rebuild_word != NULL)
00753     delete rebuild_word;
00754   rebuild_word = new TWERD;
00755   if (seam_array.empty())
00756     start_seam_list(chopped_word, &seam_array);
00757   best_state.truncate(0);
00758   int start = 0;
00759   for (int i = 0; i < best_choice->length(); ++i) {
00760     int length = best_choice->state(i);
00761     best_state.push_back(length);
00762     if (length > 1)
00763       join_pieces(seam_array, start, start + length - 1, chopped_word);
00764     TBLOB* blob = chopped_word->blobs[start];
00765     rebuild_word->blobs.push_back(new TBLOB(*blob));
00766     if (length > 1)
00767       break_pieces(seam_array, start, start + length - 1, chopped_word);
00768     start += length;
00769   }
00770 }
00771 
00772 // Copies the chopped_word to the rebuild_word, faking a best_state as well.
00773 // Also sets up the output box_word.
00774 void WERD_RES::CloneChoppedToRebuild() {
00775   if (rebuild_word != NULL)
00776     delete rebuild_word;
00777   rebuild_word = new TWERD(*chopped_word);
00778   SetupBoxWord();
00779   int word_len = box_word->length();
00780   best_state.reserve(word_len);
00781   correct_text.reserve(word_len);
00782   for (int i = 0; i < word_len; ++i) {
00783     best_state.push_back(1);
00784     correct_text.push_back(STRING(""));
00785   }
00786 }
00787 
00788 // Sets/replaces the box_word with one made from the rebuild_word.
00789 void WERD_RES::SetupBoxWord() {
00790   if (box_word != NULL)
00791     delete box_word;
00792   rebuild_word->ComputeBoundingBoxes();
00793   box_word = tesseract::BoxWord::CopyFromNormalized(rebuild_word);
00794   box_word->ClipToOriginalWord(denorm.block(), word);
00795 }
00796 
00797 // Sets up the script positions in the output best_choice using the best_choice
00798 // to get the unichars, and the unicharset to get the target positions.
00799 void WERD_RES::SetScriptPositions() {
00800   best_choice->SetScriptPositions(small_caps, chopped_word);
00801 }
00802 // Sets all the blobs in all the words (raw choice and best choices) to be
00803 // the given position. (When a sub/superscript is recognized as a separate
00804 // word, it falls victim to the rule that a whole word cannot be sub or
00805 // superscript, so this function overrides that problem.)
00806 void WERD_RES::SetAllScriptPositions(tesseract::ScriptPos position) {
00807   raw_choice->SetAllScriptPositions(position);
00808   WERD_CHOICE_IT wc_it(&best_choices);
00809   for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward())
00810     wc_it.data()->SetAllScriptPositions(position);
00811 }
00812 
00813 // Classifies the word with some already-calculated BLOB_CHOICEs.
00814 // The choices are an array of blob_count pointers to BLOB_CHOICE,
00815 // providing a single classifier result for each blob.
00816 // The BLOB_CHOICEs are consumed and the word takes ownership.
00817 // The number of blobs in the box_word must match blob_count.
00818 void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
00819   // Setup the WERD_RES.
00820   ASSERT_HOST(box_word != NULL);
00821   ASSERT_HOST(blob_count == box_word->length());
00822   ClearWordChoices();
00823   ClearRatings();
00824   ratings = new MATRIX(blob_count, 1);
00825   for (int c = 0; c < blob_count; ++c) {
00826     BLOB_CHOICE_LIST* choice_list = new BLOB_CHOICE_LIST;
00827     BLOB_CHOICE_IT choice_it(choice_list);
00828     choice_it.add_after_then_move(choices[c]);
00829     ratings->put(c, c, choice_list);
00830   }
00831   FakeWordFromRatings();
00832   reject_map.initialise(blob_count);
00833 }
00834 
00835 // Creates a WERD_CHOICE for the word using the top choices from the leading
00836 // diagonal of the ratings matrix.
00837 void WERD_RES::FakeWordFromRatings() {
00838   int num_blobs = ratings->dimension();
00839   WERD_CHOICE* word_choice = new WERD_CHOICE(uch_set, num_blobs);
00840   word_choice->set_permuter(TOP_CHOICE_PERM);
00841   for (int b = 0; b < num_blobs; ++b) {
00842     UNICHAR_ID unichar_id = UNICHAR_SPACE;
00843     float rating = MAX_INT32;
00844     float certainty = -MAX_INT32;
00845     BLOB_CHOICE_LIST* choices = ratings->get(b, b);
00846     if (choices != NULL && !choices->empty()) {
00847       BLOB_CHOICE_IT bc_it(choices);
00848       BLOB_CHOICE* choice = bc_it.data();
00849       unichar_id = choice->unichar_id();
00850       rating = choice->rating();
00851       certainty = choice->certainty();
00852     }
00853     word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
00854                                                    certainty);
00855   }
00856   LogNewRawChoice(word_choice);
00857   // Ownership of word_choice taken by word here.
00858   LogNewCookedChoice(1, false, word_choice);
00859 }
00860 
00861 // Copies the best_choice strings to the correct_text for adaption/training.
00862 void WERD_RES::BestChoiceToCorrectText() {
00863   correct_text.clear();
00864   ASSERT_HOST(best_choice != NULL);
00865   for (int i = 0; i < best_choice->length(); ++i) {
00866     UNICHAR_ID choice_id = best_choice->unichar_id(i);
00867     const char* blob_choice = uch_set->id_to_unichar(choice_id);
00868     correct_text.push_back(STRING(blob_choice));
00869   }
00870 }
00871 
00872 // Merges 2 adjacent blobs in the result if the permanent callback
00873 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
00874 // callback box_cb is NULL or returns true, setting the merged blob
00875 // result to the class returned from class_cb.
00876 // Returns true if anything was merged.
00877 bool WERD_RES::ConditionalBlobMerge(
00878     TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb,
00879     TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb) {
00880   ASSERT_HOST(best_choice->length() == 0 || ratings != NULL);
00881   bool modified = false;
00882   for (int i = 0; i + 1 < best_choice->length(); ++i) {
00883     UNICHAR_ID new_id = class_cb->Run(best_choice->unichar_id(i),
00884                                       best_choice->unichar_id(i+1));
00885     if (new_id != INVALID_UNICHAR_ID &&
00886         (box_cb == NULL || box_cb->Run(box_word->BlobBox(i),
00887                                        box_word->BlobBox(i + 1)))) {
00888       // Raw choice should not be fixed.
00889       best_choice->set_unichar_id(new_id, i);
00890       modified = true;
00891       MergeAdjacentBlobs(i);
00892       const MATRIX_COORD& coord = best_choice->MatrixCoord(i);
00893       if (!coord.Valid(*ratings)) {
00894         ratings->IncreaseBandSize(coord.row + 1 - coord.col);
00895       }
00896       BLOB_CHOICE_LIST* blob_choices = GetBlobChoices(i);
00897       if (FindMatchingChoice(new_id, blob_choices) == NULL) {
00898         // Insert a fake result.
00899         BLOB_CHOICE* blob_choice = new BLOB_CHOICE;
00900         blob_choice->set_unichar_id(new_id);
00901         BLOB_CHOICE_IT bc_it(blob_choices);
00902         bc_it.add_before_then_move(blob_choice);
00903       }
00904     }
00905   }
00906   delete class_cb;
00907   delete box_cb;
00908   return modified;
00909 }
00910 
00911 // Merges 2 adjacent blobs in the result (index and index+1) and corrects
00912 // all the data to account for the change.
00913 void WERD_RES::MergeAdjacentBlobs(int index) {
00914   if (reject_map.length() == best_choice->length())
00915     reject_map.remove_pos(index);
00916   best_choice->remove_unichar_id(index + 1);
00917   rebuild_word->MergeBlobs(index, index + 2);
00918   box_word->MergeBoxes(index, index + 2);
00919   if (index + 1 < best_state.length()) {
00920     best_state[index] += best_state[index + 1];
00921     best_state.remove(index + 1);
00922   }
00923 }
00924 
00925 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
00926 // training data.
00927 
// Utility function for fix_quotes.
// signed_str points at a character and length is its UTF-8 length in bytes.
// Returns 1 if that character is a quote character, otherwise 0:
//  - 1 byte: apostrophe (') or backtick (`),
//  - 3 bytes: E2 80 98 or E2 80 99 (the curved single quotes).
static int is_simple_quote(const char* signed_str, int length) {
  const unsigned char* bytes =
      reinterpret_cast<const unsigned char*>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 byte curved quotes share the E2 80 prefix and differ only in
    // the final byte. Later bytes are read only if earlier ones matched.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
        (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return 0;
}
00944 
00945 // Callback helper for fix_quotes returns a double quote if both
00946 // arguments are quote, otherwise INVALID_UNICHAR_ID.
00947 UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) {
00948   const char *ch = uch_set->id_to_unichar(id1);
00949   const char *next_ch = uch_set->id_to_unichar(id2);
00950   if (is_simple_quote(ch, strlen(ch)) &&
00951       is_simple_quote(next_ch, strlen(next_ch)))
00952     return uch_set->unichar_to_id("\"");
00953   return INVALID_UNICHAR_ID;
00954 }
00955 
00956 // Change pairs of quotes to double quotes.
00957 void WERD_RES::fix_quotes() {
00958   if (!uch_set->contains_unichar("\"") ||
00959       !uch_set->get_enabled(uch_set->unichar_to_id("\"")))
00960     return;  // Don't create it if it is disallowed.
00961 
00962   ConditionalBlobMerge(
00963       NewPermanentTessCallback(this, &WERD_RES::BothQuotes),
00964       NULL);
00965 }
00966 
00967 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
00968 // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
00969 UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) {
00970   const char *ch = uch_set->id_to_unichar(id1);
00971   const char *next_ch = uch_set->id_to_unichar(id2);
00972   if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
00973       (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~'))
00974     return uch_set->unichar_to_id("-");
00975   return INVALID_UNICHAR_ID;
00976 }
00977 
00978 // Callback helper for fix_hyphens returns true if box1 and box2 overlap
00979 // (assuming both on the same textline, are in order and a chopped em dash.)
00980 bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) {
00981   return box1.right() >= box2.left();
00982 }
00983 
00984 // Change pairs of hyphens to a single hyphen if the bounding boxes touch
00985 // Typically a long dash which has been segmented.
00986 void WERD_RES::fix_hyphens() {
00987   if (!uch_set->contains_unichar("-") ||
00988       !uch_set->get_enabled(uch_set->unichar_to_id("-")))
00989     return;  // Don't create it if it is disallowed.
00990 
00991   ConditionalBlobMerge(
00992       NewPermanentTessCallback(this, &WERD_RES::BothHyphens),
00993       NewPermanentTessCallback(this, &WERD_RES::HyphenBoxesOverlap));
00994 }
00995 
00996 // Callback helper for merge_tess_fails returns a space if both
00997 // arguments are space, otherwise INVALID_UNICHAR_ID.
00998 UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {
00999   if (id1 == id2 && id1 == uch_set->unichar_to_id(" "))
01000     return id1;
01001   else
01002     return INVALID_UNICHAR_ID;
01003 }
01004 
01005 // Change pairs of tess failures to a single one
01006 void WERD_RES::merge_tess_fails() {
01007   if (ConditionalBlobMerge(
01008       NewPermanentTessCallback(this, &WERD_RES::BothSpaces), NULL)) {
01009     int len = best_choice->length();
01010     ASSERT_HOST(reject_map.length() == len);
01011     ASSERT_HOST(box_word->length() == len);
01012   }
01013 }
01014 
01015 // Returns true if the collection of count pieces, starting at start, are all
01016 // natural connected components, ie there are no real chops involved.
01017 bool WERD_RES::PiecesAllNatural(int start, int count) const {
01018   // all seams must have no splits.
01019   for (int index = start; index < start + count - 1; ++index) {
01020     if (index >= 0 && index < seam_array.size()) {
01021       SEAM* seam = seam_array[index];
01022       if (seam != NULL && seam->split1 != NULL)
01023         return false;
01024     }
01025   }
01026   return true;
01027 }
01028 
01029 
01030 WERD_RES::~WERD_RES () {
01031   Clear();
01032 }
01033 
01034 void WERD_RES::InitNonPointers() {
01035   tess_failed = FALSE;
01036   tess_accepted = FALSE;
01037   tess_would_adapt = FALSE;
01038   done = FALSE;
01039   unlv_crunch_mode = CR_NONE;
01040   small_caps = false;
01041   italic = FALSE;
01042   bold = FALSE;
01043   // The fontinfos and tesseract count as non-pointers as they point to
01044   // data owned elsewhere.
01045   fontinfo = NULL;
01046   fontinfo2 = NULL;
01047   tesseract = NULL;
01048   fontinfo_id_count = 0;
01049   fontinfo_id2_count = 0;
01050   x_height = 0.0;
01051   caps_height = 0.0;
01052   guessed_x_ht = TRUE;
01053   guessed_caps_ht = TRUE;
01054   combination = FALSE;
01055   part_of_combo = FALSE;
01056   reject_spaces = FALSE;
01057 }
01058 
01059 void WERD_RES::InitPointers() {
01060   word = NULL;
01061   bln_boxes = NULL;
01062   blob_row = NULL;
01063   uch_set = NULL;
01064   chopped_word = NULL;
01065   rebuild_word = NULL;
01066   box_word = NULL;
01067   ratings = NULL;
01068   best_choice = NULL;
01069   raw_choice = NULL;
01070   ep_choice = NULL;
01071   blamer_bundle = NULL;
01072 }
01073 
01074 void WERD_RES::Clear() {
01075   if (word != NULL && combination) {
01076     delete word;
01077   }
01078   word = NULL;
01079   delete blamer_bundle;
01080   blamer_bundle = NULL;
01081   ClearResults();
01082 }
01083 
01084 void WERD_RES::ClearResults() {
01085   done = false;
01086   fontinfo = NULL;
01087   fontinfo2 = NULL;
01088   fontinfo_id_count = 0;
01089   fontinfo_id2_count = 0;
01090   if (bln_boxes != NULL) {
01091     delete bln_boxes;
01092     bln_boxes = NULL;
01093   }
01094   blob_row = NULL;
01095   if (chopped_word != NULL) {
01096     delete chopped_word;
01097     chopped_word = NULL;
01098   }
01099   if (rebuild_word != NULL) {
01100     delete rebuild_word;
01101     rebuild_word = NULL;
01102   }
01103   if (box_word != NULL) {
01104     delete box_word;
01105     box_word = NULL;
01106   }
01107   best_state.clear();
01108   correct_text.clear();
01109   seam_array.delete_data_pointers();
01110   seam_array.clear();
01111   blob_widths.clear();
01112   blob_gaps.clear();
01113   ClearRatings();
01114   ClearWordChoices();
01115   if (blamer_bundle != NULL) blamer_bundle->ClearResults();
01116 }
01117 void WERD_RES::ClearWordChoices() {
01118   best_choice = NULL;
01119   if (raw_choice != NULL) {
01120     delete raw_choice;
01121     raw_choice = NULL;
01122   }
01123   best_choices.clear();
01124   if (ep_choice != NULL) {
01125     delete ep_choice;
01126     ep_choice = NULL;
01127   }
01128 }
01129 void WERD_RES::ClearRatings() {
01130   if (ratings != NULL) {
01131     ratings->delete_matrix_pointers();
01132     delete ratings;
01133     ratings = NULL;
01134   }
01135 }
01136 
01137 
01138 bool PAGE_RES_IT::operator ==(const PAGE_RES_IT &other) const {
01139   return word_res == other.word_res &&
01140       row_res == other.row_res &&
01141       block_res == other.block_res;
01142 }
01143 
01144 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
01145   ASSERT_HOST(page_res == other.page_res);
01146   if (other.block_res == NULL) {
01147     // other points to the end of the page.
01148     if (block_res == NULL)
01149       return 0;
01150     return -1;
01151   }
01152   if (block_res == NULL) {
01153     return 1; // we point to the end of the page.
01154   }
01155   if (block_res == other.block_res) {
01156     if (other.row_res == NULL || row_res == NULL) {
01157       // this should only happen if we hit an image block.
01158       return 0;
01159     }
01160     if (row_res == other.row_res) {
01161       // we point to the same block and row.
01162       ASSERT_HOST(other.word_res != NULL && word_res != NULL);
01163       if (word_res == other.word_res) {
01164         // we point to the same word!
01165         return 0;
01166       }
01167 
01168       WERD_RES_IT word_res_it(&row_res->word_res_list);
01169       for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
01170            word_res_it.forward()) {
01171         if (word_res_it.data() == word_res) {
01172           return -1;
01173         } else if (word_res_it.data() == other.word_res) {
01174           return 1;
01175         }
01176       }
01177       ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
01178     }
01179 
01180     // we both point to the same block, but different rows.
01181     ROW_RES_IT row_res_it(&block_res->row_res_list);
01182     for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
01183          row_res_it.forward()) {
01184       if (row_res_it.data() == row_res) {
01185         return -1;
01186       } else if (row_res_it.data() == other.row_res) {
01187         return 1;
01188       }
01189     }
01190     ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
01191   }
01192 
01193   // We point to different blocks.
01194   BLOCK_RES_IT block_res_it(&page_res->block_res_list);
01195   for (block_res_it.mark_cycle_pt();
01196        !block_res_it.cycled_list(); block_res_it.forward()) {
01197     if (block_res_it.data() == block_res) {
01198       return -1;
01199     } else if (block_res_it.data() == other.block_res) {
01200       return 1;
01201     }
01202   }
01203   // Shouldn't happen...
01204   ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
01205   return 0;
01206 }
01207 
01208 // Inserts the new_word and a corresponding WERD_RES before the current
01209 // position. The simple fields of the WERD_RES are copied from clone_res and
01210 // the resulting WERD_RES is returned for further setup with best_choice etc.
01211 WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
01212                                              WERD* new_word) {
01213   // Insert new_word into the ROW.
01214   WERD_IT w_it(row()->row->word_list());
01215   for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
01216     WERD* word = w_it.data();
01217     if (word == word_res->word)
01218       break;
01219   }
01220   ASSERT_HOST(!w_it.cycled_list());
01221   w_it.add_before_then_move(new_word);
01222   // Make a WERD_RES for the new_word.
01223   WERD_RES* new_res = new WERD_RES(new_word);
01224   new_res->CopySimpleFields(clone_res);
01225   // Insert into the appropriate place in the ROW_RES.
01226   WERD_RES_IT wr_it(&row()->word_res_list);
01227   for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
01228     WERD_RES* word = wr_it.data();
01229     if (word == word_res)
01230       break;
01231   }
01232   ASSERT_HOST(!wr_it.cycled_list());
01233   wr_it.add_before_then_move(new_res);
01234   if (wr_it.at_first()) {
01235     // This is the new first word, so reset the member iterator so it
01236     // detects the cycled_list state correctly.
01237     ResetWordIterator();
01238   }
01239   return new_res;
01240 }
01241 
01242 // Deletes the current WERD_RES and its underlying WERD.
01243 void PAGE_RES_IT::DeleteCurrentWord() {
01244   // Check that this word is as we expect. part_of_combos are NEVER iterated
01245   // by the normal iterator, so we should never be trying to delete them.
01246   ASSERT_HOST(!word_res->part_of_combo);
01247   if (!word_res->combination) {
01248     // Combinations own their own word, so we won't find the word on the
01249     // row's word_list, but it is legitimate to try to delete them.
01250     // Delete word from the ROW when not a combination.
01251     WERD_IT w_it(row()->row->word_list());
01252     for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
01253       if (w_it.data() == word_res->word) {
01254         break;
01255       }
01256     }
01257     ASSERT_HOST(!w_it.cycled_list());
01258     delete w_it.extract();
01259   }
01260   // Remove the WERD_RES for the new_word.
01261   // Remove the WORD_RES from the ROW_RES.
01262   WERD_RES_IT wr_it(&row()->word_res_list);
01263   for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
01264     if (wr_it.data() == word_res) {
01265       word_res = NULL;
01266       break;
01267     }
01268   }
01269   ASSERT_HOST(!wr_it.cycled_list());
01270   delete wr_it.extract();
01271   ResetWordIterator();
01272 }
01273 
01274 /*************************************************************************
01275  * PAGE_RES_IT::restart_page
01276  *
01277  * Set things up at the start of the page
01278  *************************************************************************/
01279 
01280 WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {
01281   block_res_it.set_to_list(&page_res->block_res_list);
01282   block_res_it.mark_cycle_pt();
01283   prev_block_res = NULL;
01284   prev_row_res = NULL;
01285   prev_word_res = NULL;
01286   block_res = NULL;
01287   row_res = NULL;
01288   word_res = NULL;
01289   next_block_res = NULL;
01290   next_row_res = NULL;
01291   next_word_res = NULL;
01292   internal_forward(true, empty_ok);
01293   return internal_forward(false, empty_ok);
01294 }
01295 
01296 // Recovers from operations on the current word, such as in InsertCloneWord
01297 // and DeleteCurrentWord.
01298 // Resets the word_res_it so that it is one past the next_word_res, as
01299 // it should be after internal_forward. If next_row_res != row_res,
01300 // then the next_word_res is in the next row, so there is no need to do
01301 // anything, since operations on the current word will not have disturbed
01302 // the word_res_it.
01303 void PAGE_RES_IT::ResetWordIterator() {
01304   if (row_res == next_row_res) {
01305     // Reset the member iterator so it can move forward and detect the
01306     // cycled_list state correctly.
01307     word_res_it.move_to_first();
01308     word_res_it.mark_cycle_pt();
01309     while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res)
01310       word_res_it.forward();
01311     ASSERT_HOST(!word_res_it.cycled_list());
01312     word_res_it.forward();
01313   }
01314 }
01315 
01316 /*************************************************************************
01317  * PAGE_RES_IT::internal_forward
01318  *
01319  * Find the next word on the page. If empty_ok is true, then non-text blocks
01320  * and text blocks with no text are visited as if they contain a single
01321  * imaginary word in a single imaginary row. (word() and row() both return NULL
01322  * in such a block and the return value is NULL.)
01323  * If empty_ok is false, the old behaviour is maintained. Each real word
01324  * is visited and empty and non-text blocks and rows are skipped.
01325  * new_block is used to initialize the iterators for a new block.
01326  * The iterator maintains pointers to block, row and word for the previous,
01327  * current and next words.  These are correct, regardless of block/row
01328  * boundaries. NULL values denote start and end of the page.
01329  *************************************************************************/
01330 
01331 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
01332   bool new_row = false;
01333 
01334   prev_block_res = block_res;
01335   prev_row_res = row_res;
01336   prev_word_res = word_res;
01337   block_res = next_block_res;
01338   row_res = next_row_res;
01339   word_res = next_word_res;
01340   next_block_res = NULL;
01341   next_row_res = NULL;
01342   next_word_res = NULL;
01343 
01344   while (!block_res_it.cycled_list()) {
01345     if (new_block) {
01346       new_block = false;
01347       row_res_it.set_to_list(&block_res_it.data()->row_res_list);
01348       row_res_it.mark_cycle_pt();
01349       if (row_res_it.empty() && empty_ok) {
01350         next_block_res = block_res_it.data();
01351         break;
01352       }
01353       new_row = true;
01354     }
01355     while (!row_res_it.cycled_list()) {
01356       if (new_row) {
01357         new_row = false;
01358         word_res_it.set_to_list(&row_res_it.data()->word_res_list);
01359         word_res_it.mark_cycle_pt();
01360       }
01361       // Skip any part_of_combo words.
01362       while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
01363         word_res_it.forward();
01364       if (!word_res_it.cycled_list()) {
01365         next_block_res = block_res_it.data();
01366         next_row_res = row_res_it.data();
01367         next_word_res = word_res_it.data();
01368         word_res_it.forward();
01369         goto foundword;
01370       }
01371       // end of row reached
01372       row_res_it.forward();
01373       new_row = true;
01374     }
01375     // end of block reached
01376     block_res_it.forward();
01377     new_block = true;
01378   }
01379   foundword:
01380   // Update prev_word_best_choice pointer.
01381   if (page_res != NULL && page_res->prev_word_best_choice != NULL) {
01382     *page_res->prev_word_best_choice =
01383       (new_block || prev_word_res == NULL) ? NULL : prev_word_res->best_choice;
01384   }
01385   return word_res;
01386 }
01387 
01388 /*************************************************************************
01389  * PAGE_RES_IT::restart_row()
01390  *
01391  * Move to the beginning (leftmost word) of the current row.
01392  *************************************************************************/
01393 WERD_RES *PAGE_RES_IT::restart_row() {
01394   ROW_RES *row = this->row();
01395   if (!row) return NULL;
01396   for (restart_page(); this->row() != row; forward()) {
01397     // pass
01398   }
01399   return word();
01400 }
01401 
01402 /*************************************************************************
01403  * PAGE_RES_IT::forward_paragraph
01404  *
01405  * Move to the beginning of the next paragraph, allowing empty blocks.
01406  *************************************************************************/
01407 
01408 WERD_RES *PAGE_RES_IT::forward_paragraph() {
01409   while (block_res == next_block_res &&
01410          (next_row_res != NULL && next_row_res->row != NULL &&
01411           row_res->row->para() == next_row_res->row->para())) {
01412     internal_forward(false, true);
01413   }
01414   return internal_forward(false, true);
01415 }
01416 
01417 /*************************************************************************
01418  * PAGE_RES_IT::forward_block
01419  *
01420  * Move to the beginning of the next block, allowing empty blocks.
01421  *************************************************************************/
01422 
01423 WERD_RES *PAGE_RES_IT::forward_block() {
01424   while (block_res == next_block_res) {
01425     internal_forward(false, true);
01426   }
01427   return internal_forward(false, true);
01428 }
01429 
01430 void PAGE_RES_IT::rej_stat_word() {
01431   inT16 chars_in_word;
01432   inT16 rejects_in_word = 0;
01433 
01434   chars_in_word = word_res->reject_map.length ();
01435   page_res->char_count += chars_in_word;
01436   block_res->char_count += chars_in_word;
01437   row_res->char_count += chars_in_word;
01438 
01439   rejects_in_word = word_res->reject_map.reject_count ();
01440 
01441   page_res->rej_count += rejects_in_word;
01442   block_res->rej_count += rejects_in_word;
01443   row_res->rej_count += rejects_in_word;
01444   if (chars_in_word == rejects_in_word)
01445     row_res->whole_word_rej_count += rejects_in_word;
01446 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines