tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/dict/dict.cpp
Go to the documentation of this file.
00001 
00002 // File:        dict.cpp
00003 // Description: dict class.
00004 // Author:      Samuel Charron
00005 //
00006 // (C) Copyright 2006, Google Inc.
00007 // Licensed under the Apache License, Version 2.0 (the "License");
00008 // you may not use this file except in compliance with the License.
00009 // You may obtain a copy of the License at
00010 // http://www.apache.org/licenses/LICENSE-2.0
00011 // Unless required by applicable law or agreed to in writing, software
00012 // distributed under the License is distributed on an "AS IS" BASIS,
00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014 // See the License for the specific language governing permissions and
00015 // limitations under the License.
00016 //
00018 
00019 #include <stdio.h>
00020 
00021 #include "dict.h"
00022 #include "unicodes.h"
00023 
00024 #ifdef _MSC_VER
00025 #pragma warning(disable:4244)  // Conversion warnings
00026 #endif
00027 #include "tprintf.h"
00028 
00029 namespace tesseract {
00030 
00031 class Image;
00032 
00033 Dict::Dict(CCUtil* ccutil)
00034     : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
00035       probability_in_context_(&tesseract::Dict::def_probability_in_context),
00036       params_model_classify_(NULL),
00037       ccutil_(ccutil),
00038       STRING_INIT_MEMBER(user_words_suffix, "",
00039                          "A list of user-provided words.",
00040                          getCCUtil()->params()),
00041       STRING_INIT_MEMBER(user_patterns_suffix, "",
00042                          "A list of user-provided patterns.",
00043                          getCCUtil()->params()),
00044       BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
00045                        getCCUtil()->params()),
00046       BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
00047                        getCCUtil()->params()),
00048       BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
00049                        getCCUtil()->params()),
00050       BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation"
00051                        " patterns.", getCCUtil()->params()),
00052       BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number"
00053                        " patterns.", getCCUtil()->params()),
00054       BOOL_INIT_MEMBER(load_bigram_dawg, true, "Load dawg with special word "
00055                        "bigrams.", getCCUtil()->params()),
00056       double_MEMBER(xheight_penalty_subscripts, 0.125,
00057                     "Score penalty (0.1 = 10%) added if there are subscripts "
00058                     "or superscripts in a word, but it is otherwise OK.",
00059                     getCCUtil()->params()),
00060       double_MEMBER(xheight_penalty_inconsistent, 0.25,
00061                     "Score penalty (0.1 = 10%) added if an xheight is "
00062                     "inconsistent.", getCCUtil()->params()),
00063       double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
00064                     "Score multiplier for word matches which have good case and"
00065                     "are frequent in the given language (lower is better).",
00066                     getCCUtil()->params()),
00067       double_MEMBER(segment_penalty_dict_case_ok, 1.1,
00068                     "Score multiplier for word matches that have good case "
00069                     "(lower is better).", getCCUtil()->params()),
00070       double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
00071                     "Default score multiplier for word matches, which may have "
00072                     "case issues (lower is better).",
00073                     getCCUtil()->params()),
00074       double_MEMBER(segment_penalty_ngram_best_choice, 1.24,
00075                    "Multipler to for the best choice from the ngram model.",
00076                    getCCUtil()->params()),
00077       double_MEMBER(segment_penalty_dict_nonword, 1.25,
00078                     "Score multiplier for glyph fragment segmentations which "
00079                     "do not match a dictionary word (lower is better).",
00080                     getCCUtil()->params()),
00081       double_MEMBER(segment_penalty_garbage, 1.50,
00082                     "Score multiplier for poorly cased strings that are not in"
00083                     " the dictionary and generally look like garbage (lower is"
00084                     " better).", getCCUtil()->params()),
00085       STRING_MEMBER(output_ambig_words_file, "",
00086                     "Output file for ambiguities found in the dictionary",
00087                     getCCUtil()->params()),
00088       INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info"
00089                  ", to 2 for more details, to 3 to see all the debug messages",
00090                  getCCUtil()->params()),
00091       INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
00092                  getCCUtil()->params()),
00093       INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.",
00094                  getCCUtil()->params()),
00095       BOOL_MEMBER(use_only_first_uft8_step, false,
00096                   "Use only the first UTF8 step of the given string"
00097                   " when computing log probabilities.",
00098                   getCCUtil()->params()),
00099       double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
00100                     getCCUtil()->params()),
00101       double_MEMBER(stopper_nondict_certainty_base, -2.50,
00102                     "Certainty threshold for non-dict words",
00103                     getCCUtil()->params()),
00104       double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
00105                     "Reject certainty offset",
00106                     getCCUtil()->params()),
00107       INT_MEMBER(stopper_smallword_size, 2,
00108                  "Size of dict word to be treated as non-dict word",
00109                  getCCUtil()->params()),
00110       double_MEMBER(stopper_certainty_per_char, -0.50, "Certainty to add"
00111                     " for each dict char above small word size.",
00112                     getCCUtil()->params()),
00113       double_MEMBER(stopper_allowable_character_badness, 3.0,
00114                     "Max certaintly variation allowed in a word (in sigma)",
00115                     getCCUtil()->params()),
00116       INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
00117                  getCCUtil()->params()),
00118       BOOL_MEMBER(stopper_no_acceptable_choices, false,
00119                   "Make AcceptableChoice() always return false. Useful"
00120                   " when there is a need to explore all segmentations",
00121                   getCCUtil()->params()),
00122       BOOL_MEMBER(save_raw_choices, false,
00123                   "Deprecated- backward compatablity only",
00124                   getCCUtil()->params()),
00125       INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
00126                  "Max words to keep in list",
00127                  getCCUtil()->params()),
00128       STRING_MEMBER(word_to_debug, "", "Word for which stopper debug"
00129                     " information should be printed to stdout",
00130                     getCCUtil()->params()),
00131       STRING_MEMBER(word_to_debug_lengths, "",
00132                     "Lengths of unichars in word_to_debug",
00133                     getCCUtil()->params()),
00134       INT_MEMBER(fragments_debug, 0, "Debug character fragments",
00135                  getCCUtil()->params()),
00136       BOOL_MEMBER(segment_nonalphabetic_script, false,
00137                  "Don't use any alphabetic-specific tricks."
00138                  "Set to true in the traineddata config file for"
00139                  " scripts that are cursive or inherently fixed-pitch",
00140                  getCCUtil()->params()),
00141       BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
00142                   getCCUtil()->params()),
00143       double_MEMBER(doc_dict_pending_threshold, 0.0,
00144                     "Worst certainty for using pending dictionary",
00145                     getCCUtil()->params()),
00146       double_MEMBER(doc_dict_certainty_threshold, -2.25,
00147                     "Worst certainty for words that can be inserted into the"
00148                     "document dictionary", getCCUtil()->params()),
00149       INT_MEMBER(max_permuter_attempts, 10000, "Maximum number of different"
00150                  " character choices to consider during permutation."
00151                  " This limit is especially useful when user patterns"
00152                  " are specified, since overly generic patterns can result in"
00153                  " dawg search exploring an overly large number of options.",
00154                  getCCUtil()->params()) {
00155   dang_ambigs_table_ = NULL;
00156   replace_ambigs_table_ = NULL;
00157   reject_offset_ = 0.0;
00158   go_deeper_fxn_ = NULL;
00159   hyphen_word_ = NULL;
00160   last_word_on_line_ = false;
00161   hyphen_unichar_id_ = INVALID_UNICHAR_ID;
00162   document_words_ = NULL;
00163   dawg_cache_ = NULL;
00164   dawg_cache_is_ours_ = false;
00165   pending_words_ = NULL;
00166   bigram_dawg_ = NULL;
00167   freq_dawg_ = NULL;
00168   punc_dawg_ = NULL;
00169   unambig_dawg_ = NULL;
00170   wordseg_rating_adjust_factor_ = -1.0f;
00171   output_ambig_words_file_ = NULL;
00172 }
00173 
00174 Dict::~Dict() {
00175   if (hyphen_word_ != NULL) delete hyphen_word_;
00176   if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_);
00177 }
00178 
00179 DawgCache *Dict::GlobalDawgCache() {
00180   // We dynamically allocate this global cache (a singleton) so it will outlive
00181   // every Tesseract instance (even those that someone else might declare as
00182   // global statics).
00183   static DawgCache *cache = new DawgCache();  // evil global singleton
00184   return cache;
00185 }
00186 
00187 void Dict::Load(DawgCache *dawg_cache) {
00188   STRING name;
00189   STRING &lang = getCCUtil()->lang;
00190 
00191   if (dawgs_.length() != 0) this->End();
00192 
00193   apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
00194   question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
00195   slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
00196   hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
00197 
00198   if (dawg_cache != NULL) {
00199     dawg_cache_ = dawg_cache;
00200     dawg_cache_is_ours_ = false;
00201   } else {
00202     dawg_cache_ = new DawgCache();
00203     dawg_cache_is_ours_ = true;
00204   }
00205 
00206   TessdataManager &tessdata_manager = getCCUtil()->tessdata_manager;
00207   const char *data_file_name = tessdata_manager.GetDataFileName().string();
00208 
00209   // Load dawgs_.
00210   if (load_punc_dawg) {
00211     punc_dawg_ = dawg_cache_->GetSquishedDawg(
00212         lang, data_file_name, TESSDATA_PUNC_DAWG, dawg_debug_level);
00213     if (punc_dawg_) dawgs_ += punc_dawg_;
00214   }
00215   if (load_system_dawg) {
00216     Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
00217         lang, data_file_name, TESSDATA_SYSTEM_DAWG, dawg_debug_level);
00218     if (system_dawg) dawgs_ += system_dawg;
00219   }
00220   if (load_number_dawg) {
00221     Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
00222         lang, data_file_name, TESSDATA_NUMBER_DAWG, dawg_debug_level);
00223     if (number_dawg) dawgs_ += number_dawg;
00224   }
00225   if (load_bigram_dawg) {
00226     bigram_dawg_ = dawg_cache_->GetSquishedDawg(
00227         lang, data_file_name, TESSDATA_BIGRAM_DAWG, dawg_debug_level);
00228   }
00229   if (load_freq_dawg) {
00230     freq_dawg_ = dawg_cache_->GetSquishedDawg(
00231         lang, data_file_name, TESSDATA_FREQ_DAWG, dawg_debug_level);
00232     if (freq_dawg_) { dawgs_ += freq_dawg_; }
00233   }
00234   if (load_unambig_dawg) {
00235     unambig_dawg_ = dawg_cache_->GetSquishedDawg(
00236         lang, data_file_name, TESSDATA_UNAMBIG_DAWG, dawg_debug_level);
00237     if (unambig_dawg_) dawgs_ += unambig_dawg_;
00238   }
00239 
00240   if (((STRING &)user_words_suffix).length() > 0) {
00241     Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
00242                               kMaxUserDawgEdges, getUnicharset().size(),
00243                               dawg_debug_level);
00244     name = getCCUtil()->language_data_path_prefix;
00245     name += user_words_suffix;
00246     if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
00247                                           Trie::RRP_REVERSE_IF_HAS_RTL)) {
00248       tprintf("Error: failed to load %s\n", name.string());
00249       delete trie_ptr;
00250     } else {
00251       dawgs_ += trie_ptr;
00252     }
00253   }
00254 
00255   if (((STRING &)user_patterns_suffix).length() > 0) {
00256     Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
00257                               kMaxUserDawgEdges, getUnicharset().size(),
00258                               dawg_debug_level);
00259     trie_ptr->initialize_patterns(&(getUnicharset()));
00260     name = getCCUtil()->language_data_path_prefix;
00261     name += user_patterns_suffix;
00262     if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
00263       tprintf("Error: failed to load %s\n", name.string());
00264       delete trie_ptr;
00265     } else {
00266       dawgs_ += trie_ptr;
00267     }
00268   }
00269 
00270   document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
00271                              kMaxDocDawgEdges, getUnicharset().size(),
00272                              dawg_debug_level);
00273   dawgs_ += document_words_;
00274 
00275   // This dawg is temporary and should not be searched by letter_is_ok.
00276   pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
00277                             kMaxDocDawgEdges, getUnicharset().size(),
00278                             dawg_debug_level);
00279 
00280   // Construct a list of corresponding successors for each dawg. Each entry i
00281   // in the successors_ vector is a vector of integers that represent the
00282   // indices into the dawgs_ vector of the successors for dawg i.
00283   successors_.reserve(dawgs_.length());
00284   for (int i = 0; i < dawgs_.length(); ++i) {
00285     const Dawg *dawg = dawgs_[i];
00286     SuccessorList *lst = new SuccessorList();
00287     for (int j = 0; j < dawgs_.length(); ++j) {
00288       const Dawg *other = dawgs_[j];
00289       if (dawg != NULL && other != NULL &&
00290           (dawg->lang() == other->lang()) &&
00291           kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
00292     }
00293     successors_ += lst;
00294   }
00295 }
00296 
00297 void Dict::End() {
00298   if (dawgs_.length() == 0)
00299     return;  // Not safe to call twice.
00300   for (int i = 0; i < dawgs_.size(); i++) {
00301     if (!dawg_cache_->FreeDawg(dawgs_[i])) {
00302       delete dawgs_[i];
00303     }
00304   }
00305   dawg_cache_->FreeDawg(bigram_dawg_);
00306   if (dawg_cache_is_ours_) {
00307     delete dawg_cache_;
00308     dawg_cache_ = NULL;
00309   }
00310   successors_.delete_data_pointers();
00311   dawgs_.clear();
00312   successors_.clear();
00313   document_words_ = NULL;
00314   if (pending_words_ != NULL) {
00315     delete pending_words_;
00316     pending_words_ = NULL;
00317   }
00318 }
00319 
00320 // Returns true if in light of the current state unichar_id is allowed
00321 // according to at least one of the dawgs in the dawgs_ vector.
00322 // See more extensive comments in dict.h where this function is declared.
00323 int Dict::def_letter_is_okay(void* void_dawg_args,
00324                              UNICHAR_ID unichar_id,
00325                              bool word_end) const {
00326   DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);
00327 
00328   if (dawg_debug_level >= 3) {
00329     tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
00330             " num active dawgs=%d\n",
00331             getUnicharset().debug_str(unichar_id).string(), word_end,
00332             dawg_args->active_dawgs->length());
00333   }
00334 
00335   // Do not accept words that contain kPatternUnicharID.
00336   // (otherwise pattern dawgs would not function correctly).
00337   // Do not accept words containing INVALID_UNICHAR_IDs.
00338   if (unichar_id == Dawg::kPatternUnicharID ||
00339       unichar_id == INVALID_UNICHAR_ID) {
00340     dawg_args->permuter = NO_PERM;
00341     return NO_PERM;
00342   }
00343 
00344   // Initialization.
00345   PermuterType curr_perm = NO_PERM;
00346   dawg_args->updated_dawgs->clear();
00347 
00348   // Go over the active_dawgs vector and insert DawgPosition records
00349   // with the updated ref (an edge with the corresponding unichar id) into
00350   // dawg_args->updated_pos.
00351   for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
00352     const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
00353     const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : NULL;
00354     const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : NULL;
00355 
00356     if (!dawg && !punc_dawg) {
00357       // shouldn't happen.
00358       tprintf("Received DawgPosition with no dawg or punc_dawg.  wth?\n");
00359       continue;
00360     }
00361     if (!dawg) {
00362       // We're in the punctuation dawg.  A core dawg has not been chosen.
00363       NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
00364       EDGE_REF punc_transition_edge = punc_dawg->edge_char_of(
00365           punc_node, Dawg::kPatternUnicharID, word_end);
00366       if (punc_transition_edge != NO_EDGE) {
00367         // Find all successors, and see which can transition.
00368         const SuccessorList &slist = *(successors_[pos.punc_index]);
00369         for (int s = 0; s < slist.length(); ++s) {
00370           int sdawg_index = slist[s];
00371           const Dawg *sdawg = dawgs_[sdawg_index];
00372           UNICHAR_ID ch = char_for_dawg(unichar_id, sdawg);
00373           EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
00374           if (dawg_edge != NO_EDGE) {
00375             if (dawg_debug_level >=3) {
00376               tprintf("Letter found in dawg %d\n", sdawg_index);
00377             }
00378             dawg_args->updated_dawgs->add_unique(
00379                 DawgPosition(sdawg_index, dawg_edge,
00380                              pos.punc_index, punc_transition_edge, false),
00381                 dawg_debug_level > 0,
00382                 "Append transition from punc dawg to current dawgs: ");
00383             if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
00384           }
00385         }
00386       }
00387       EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id,
00388                                                    word_end);
00389       if (punc_edge != NO_EDGE) {
00390         if (dawg_debug_level >=3) {
00391           tprintf("Letter found in punctuation dawg\n");
00392         }
00393         dawg_args->updated_dawgs->add_unique(
00394             DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
00395             dawg_debug_level > 0,
00396             "Extend punctuation dawg: ");
00397         if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
00398       }
00399       continue;
00400     }
00401 
00402     if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
00403       // We can end the main word here.
00404       //  If we can continue on the punc ref, add that possibility.
00405       NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
00406       EDGE_REF punc_edge = punc_node == NO_EDGE ? NO_EDGE
00407           : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
00408       if (punc_edge != NO_EDGE) {
00409         dawg_args->updated_dawgs->add_unique(
00410             DawgPosition(pos.dawg_index, pos.dawg_ref,
00411                          pos.punc_index, punc_edge, true),
00412             dawg_debug_level > 0,
00413             "Return to punctuation dawg: ");
00414         if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
00415       }
00416     }
00417 
00418     if (pos.back_to_punc) continue;
00419 
00420     // If we are dealing with the pattern dawg, look up all the
00421     // possible edges, not only for the exact unichar_id, but also
00422     // for all its character classes (alpha, digit, etc).
00423     if (dawg->type() == DAWG_TYPE_PATTERN) {
00424       ProcessPatternEdges(dawg, pos, unichar_id, word_end,
00425                           dawg_args->updated_dawgs, &curr_perm);
00426       // There can't be any successors to dawg that is of type
00427       // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
00428       continue;
00429     }
00430 
00431     // Find the edge out of the node for the unichar_id.
00432     NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
00433     EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
00434         : dawg->edge_char_of(node, char_for_dawg(unichar_id, dawg), word_end);
00435 
00436     if (dawg_debug_level >= 3) {
00437       tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
00438               pos.dawg_index, node, edge);
00439     }
00440 
00441     if (edge != NO_EDGE) {  // the unichar was found in the current dawg
00442       if (dawg_debug_level >=3) {
00443         tprintf("Letter found in dawg %d\n", pos.dawg_index);
00444       }
00445       if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
00446         if (dawg_debug_level >= 3) {
00447           tprintf("Punctuation constraint not satisfied at end of word.\n");
00448         }
00449         continue;
00450       }
00451       if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
00452       dawg_args->updated_dawgs->add_unique(
00453           DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
00454                        false),
00455           dawg_debug_level > 0,
00456           "Append current dawg to updated active dawgs: ");
00457     }
00458   }  // end for
00459   // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
00460   // or if we found the current letter in a non-punctuation dawg. This
00461   // allows preserving information on which dawg the "core" word came from.
00462   // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
00463   if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
00464       (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
00465     dawg_args->permuter = curr_perm;
00466   }
00467   if (dawg_debug_level >= 2) {
00468     tprintf("Returning %d for permuter code for this character.\n");
00469   }
00470   return dawg_args->permuter;
00471 }
00472 
00473 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos,
00474                                UNICHAR_ID unichar_id, bool word_end,
00475                                DawgPositionVector *updated_dawgs,
00476                                PermuterType *curr_perm) const {
00477   NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
00478   // Try to find the edge corresponding to the exact unichar_id and to all the
00479   // edges corresponding to the character class of unichar_id.
00480   GenericVector<UNICHAR_ID> unichar_id_patterns;
00481   unichar_id_patterns.push_back(unichar_id);
00482   dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
00483                                &unichar_id_patterns);
00484   for (int i = 0; i < unichar_id_patterns.size(); ++i) {
00485     // On the first iteration check all the outgoing edges.
00486     // On the second iteration check all self-loops.
00487     for (int k = 0; k < 2; ++k) {
00488       EDGE_REF edge = (k == 0)
00489       ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
00490       : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_patterns[i], word_end);
00491       if (edge == NO_EDGE) continue;
00492       if (dawg_debug_level >= 3) {
00493         tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
00494                 pos.dawg_index, node, edge);
00495         tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
00496       }
00497       if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
00498       updated_dawgs->add_unique(
00499           DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
00500                        pos.back_to_punc),
00501           dawg_debug_level > 0,
00502           "Append current dawg to updated active dawgs: ");
00503     }
00504   }
00505 }
00506 
00507 // Fill the given active_dawgs vector with dawgs that could contain the
00508 // beginning of the word. If hyphenated() returns true, copy the entries
00509 // from hyphen_active_dawgs_ instead.
00510 void Dict::init_active_dawgs(DawgPositionVector *active_dawgs,
00511                              bool ambigs_mode) const {
00512   int i;
00513   if (hyphenated()) {
00514     *active_dawgs = hyphen_active_dawgs_;
00515     if (dawg_debug_level >= 3) {
00516       for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
00517         tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
00518                 hyphen_active_dawgs_[i].dawg_index,
00519                 hyphen_active_dawgs_[i].dawg_ref);
00520       }
00521     }
00522   } else {
00523     default_dawgs(active_dawgs, ambigs_mode);
00524   }
00525 }
00526 
00527 void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec,
00528                          bool suppress_patterns) const {
00529   bool punc_dawg_available =
00530     (punc_dawg_ != NULL) &&
00531     punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
00532 
00533   for (int i = 0; i < dawgs_.length(); i++) {
00534     if (dawgs_[i] != NULL &&
00535         !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
00536       int dawg_ty = dawgs_[i]->type();
00537       bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
00538       if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
00539         *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
00540         if (dawg_debug_level >= 3) {
00541           tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
00542                   NO_EDGE);
00543         }
00544       } else if (!punc_dawg_available || !subsumed_by_punc) {
00545         *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
00546         if (dawg_debug_level >= 3) {
00547           tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
00548         }
00549       }
00550     }
00551   }
00552 }
00553 
00554 void Dict::add_document_word(const WERD_CHOICE &best_choice) {
00555   // Do not add hyphenated word parts to the document dawg.
00556   // hyphen_word_ will be non-NULL after the set_hyphen_word() is
00557   // called when the first part of the hyphenated word is
00558   // discovered and while the second part of the word is recognized.
00559   // hyphen_word_ is cleared in cc_recg() before the next word on
00560   // the line is recognized.
00561   if (hyphen_word_) return;
00562 
00563   char filename[CHARS_PER_LINE];
00564   FILE *doc_word_file;
00565   int stringlen = best_choice.length();
00566 
00567   if (valid_word(best_choice) || stringlen < 2)
00568     return;
00569 
00570   // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
00571   if (best_choice.length() >= kDocDictMaxRepChars) {
00572     int num_rep_chars = 1;
00573     UNICHAR_ID uch_id = best_choice.unichar_id(0);
00574     for (int i = 1; i < best_choice.length(); ++i) {
00575       if (best_choice.unichar_id(i) != uch_id) {
00576         num_rep_chars = 1;
00577         uch_id = best_choice.unichar_id(i);
00578       } else {
00579         ++num_rep_chars;
00580         if (num_rep_chars == kDocDictMaxRepChars) return;
00581       }
00582     }
00583   }
00584 
00585   if (best_choice.certainty() < doc_dict_certainty_threshold ||
00586       stringlen == 2) {
00587     if (best_choice.certainty() < doc_dict_pending_threshold)
00588       return;
00589 
00590     if (!pending_words_->word_in_dawg(best_choice)) {
00591       if (stringlen > 2 ||
00592           (stringlen == 2 &&
00593            getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
00594            getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
00595         pending_words_->add_word_to_dawg(best_choice);
00596       }
00597       return;
00598     }
00599   }
00600 
00601   if (save_doc_words) {
00602     strcpy(filename, getCCUtil()->imagefile.string());
00603     strcat(filename, ".doc");
00604     doc_word_file = open_file (filename, "a");
00605     fprintf(doc_word_file, "%s\n",
00606             best_choice.debug_string().string());
00607     fclose(doc_word_file);
00608   }
00609   document_words_->add_word_to_dawg(best_choice);
00610 }
00611 
00612 void Dict::adjust_word(WERD_CHOICE *word,
00613                        bool nonword,
00614                        XHeightConsistencyEnum xheight_consistency,
00615                        float additional_adjust,
00616                        bool modify_rating,
00617                        bool debug) {
00618   bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
00619                  word->GetTopScriptID() == getUnicharset().han_sid());
00620   bool case_is_ok = (is_han || case_ok(*word, getUnicharset()));
00621   bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
00622 
00623   float adjust_factor = additional_adjust;
00624   float new_rating = word->rating();
00625   new_rating += kRatingPad;
00626   const char *xheight_triggered = "";
00627   if (word->length() > 1) {
00628     // Calculate x-height and y-offset consistency penalties.
00629     switch (xheight_consistency) {
00630       case XH_INCONSISTENT:
00631         adjust_factor += xheight_penalty_inconsistent;
00632         xheight_triggered = ", xhtBAD";
00633         break;
00634       case XH_SUBNORMAL:
00635         adjust_factor += xheight_penalty_subscripts;
00636         xheight_triggered = ", xhtSUB";
00637         break;
00638       case XH_GOOD:
00639         // leave the factor alone - all good!
00640         break;
00641     }
00642     // TODO(eger): if nonword is true, but there is a "core" thats' a dict
00643     // word, negate nonword status.
00644   } else {
00645     if (debug) {
00646       tprintf("Consistency could not be calculated.\n");
00647     }
00648   }
00649   if (debug) {
00650     tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
00651             word->unichar_string().string(), word->rating(),
00652             xheight_triggered);
00653   }
00654 
00655   if (nonword) {  // non-dictionary word
00656     if (case_is_ok && punc_is_ok) {
00657       adjust_factor += segment_penalty_dict_nonword;
00658       new_rating *= adjust_factor;
00659       if (debug) tprintf(", W");
00660     } else {
00661       adjust_factor += segment_penalty_garbage;
00662       new_rating *= adjust_factor;
00663       if (debug) {
00664         if (!case_is_ok) tprintf(", C");
00665         if (!punc_is_ok) tprintf(", P");
00666       }
00667     }
00668   } else {  // dictionary word
00669     if (case_is_ok) {
00670       if (!is_han && freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) {
00671         word->set_permuter(FREQ_DAWG_PERM);
00672         adjust_factor += segment_penalty_dict_frequent_word;
00673         new_rating *= adjust_factor;
00674         if (debug) tprintf(", F");
00675       } else {
00676         adjust_factor += segment_penalty_dict_case_ok;
00677         new_rating *= adjust_factor;
00678         if (debug) tprintf(", ");
00679       }
00680     } else {
00681       adjust_factor += segment_penalty_dict_case_bad;
00682       new_rating *= adjust_factor;
00683       if (debug) tprintf(", C");
00684     }
00685   }
00686   new_rating -= kRatingPad;
00687   if (modify_rating) word->set_rating(new_rating);
00688   if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
00689   word->set_adjust_factor(adjust_factor);
00690 }
00691 
00692 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
00693   const WERD_CHOICE *word_ptr = &word;
00694   WERD_CHOICE temp_word(word.unicharset());
00695   if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
00696     copy_hyphen_info(&temp_word);
00697     temp_word += word;
00698     word_ptr = &temp_word;
00699   }
00700   if (word_ptr->length() == 0) return NO_PERM;
00701   // Allocate vectors for holding current and updated
00702   // active_dawgs and initialize them.
00703   DawgPositionVector *active_dawgs = new DawgPositionVector[2];
00704   init_active_dawgs(&(active_dawgs[0]), false);
00705   DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
00706   int last_index = word_ptr->length() - 1;
00707   // Call leter_is_okay for each letter in the word.
00708   for (int i = hyphen_base_size(); i <= last_index; ++i) {
00709     if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i),
00710                                    i == last_index))) break;
00711     // Swap active_dawgs, constraints with the corresponding updated vector.
00712     if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
00713       dawg_args.updated_dawgs = &(active_dawgs[0]);
00714       ++(dawg_args.active_dawgs);
00715     } else {
00716       ++(dawg_args.updated_dawgs);
00717       dawg_args.active_dawgs = &(active_dawgs[0]);
00718     }
00719   }
00720   delete[] active_dawgs;
00721   return valid_word_permuter(dawg_args.permuter, numbers_ok) ?
00722     dawg_args.permuter : NO_PERM;
00723 }
00724 
00725 bool Dict::valid_bigram(const WERD_CHOICE &word1,
00726                         const WERD_CHOICE &word2) const {
00727   if (bigram_dawg_ == NULL) return false;
00728 
00729   // Extract the core word from the middle of each word with any digits
00730   //         replaced with question marks.
00731   int w1start, w1end, w2start, w2end;
00732   word1.punct_stripped(&w1start, &w1end);
00733   word2.punct_stripped(&w2start, &w2end);
00734 
00735   // We don't want to penalize a single guillemet, hyphen, etc.
00736   // But our bigram list doesn't have any information about punctuation.
00737   if (w1start >= w1end) return word1.length() < 3;
00738   if (w2start >= w2end) return word2.length() < 3;
00739 
00740   const UNICHARSET& uchset = getUnicharset();
00741   GenericVector<UNICHAR_ID> bigram_string;
00742   bigram_string.reserve(w1end + w2end + 1);
00743   for (int i = w1start; i < w1end; i++) {
00744     const GenericVector<UNICHAR_ID>& normed_ids =
00745         getUnicharset().normed_ids(word1.unichar_id(i));
00746     if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
00747       bigram_string.push_back(question_unichar_id_);
00748     else
00749       bigram_string += normed_ids;
00750   }
00751   bigram_string.push_back(UNICHAR_SPACE);
00752   for (int i = w2start; i < w2end; i++) {
00753     const GenericVector<UNICHAR_ID>& normed_ids =
00754         getUnicharset().normed_ids(word2.unichar_id(i));
00755     if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
00756       bigram_string.push_back(question_unichar_id_);
00757     else
00758       bigram_string += normed_ids;
00759   }
00760   WERD_CHOICE normalized_word(&uchset, bigram_string.size());
00761   for (int i = 0; i < bigram_string.size(); ++i) {
00762     normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1,
00763                                                       0.0f, 0.0f);
00764   }
00765   return bigram_dawg_->word_in_dawg(normalized_word);
00766 }
00767 
00768 bool Dict::valid_punctuation(const WERD_CHOICE &word) {
00769   if (word.length() == 0) return NO_PERM;
00770   int i;
00771   WERD_CHOICE new_word(word.unicharset());
00772   int last_index = word.length() - 1;
00773   int new_len = 0;
00774   for (i = 0; i <= last_index; ++i) {
00775     UNICHAR_ID unichar_id = (word.unichar_id(i));
00776     if (getUnicharset().get_ispunctuation(unichar_id)) {
00777       new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
00778     } else if (!getUnicharset().get_isalpha(unichar_id) &&
00779                !getUnicharset().get_isdigit(unichar_id)) {
00780       return false;  // neither punc, nor alpha, nor digit
00781     } else if ((new_len = new_word.length()) == 0 ||
00782                new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) {
00783       new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
00784     }
00785   }
00786   for (i = 0; i < dawgs_.size(); ++i) {
00787     if (dawgs_[i] != NULL &&
00788         dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
00789         dawgs_[i]->word_in_dawg(new_word)) return true;
00790   }
00791   return false;
00792 }
00793 
00794 
00795 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines