tesseract
3.03
|
00001 00002 // File: dict.cpp 00003 // Description: dict class. 00004 // Author: Samuel Charron 00005 // 00006 // (C) Copyright 2006, Google Inc. 00007 // Licensed under the Apache License, Version 2.0 (the "License"); 00008 // you may not use this file except in compliance with the License. 00009 // You may obtain a copy of the License at 00010 // http://www.apache.org/licenses/LICENSE-2.0 00011 // Unless required by applicable law or agreed to in writing, software 00012 // distributed under the License is distributed on an "AS IS" BASIS, 00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 // See the License for the specific language governing permissions and 00015 // limitations under the License. 00016 // 00018 00019 #include <stdio.h> 00020 00021 #include "dict.h" 00022 #include "unicodes.h" 00023 00024 #ifdef _MSC_VER 00025 #pragma warning(disable:4244) // Conversion warnings 00026 #endif 00027 #include "tprintf.h" 00028 00029 namespace tesseract { 00030 00031 class Image; 00032 00033 Dict::Dict(CCUtil* ccutil) 00034 : letter_is_okay_(&tesseract::Dict::def_letter_is_okay), 00035 probability_in_context_(&tesseract::Dict::def_probability_in_context), 00036 params_model_classify_(NULL), 00037 ccutil_(ccutil), 00038 STRING_INIT_MEMBER(user_words_suffix, "", 00039 "A list of user-provided words.", 00040 getCCUtil()->params()), 00041 STRING_INIT_MEMBER(user_patterns_suffix, "", 00042 "A list of user-provided patterns.", 00043 getCCUtil()->params()), 00044 BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.", 00045 getCCUtil()->params()), 00046 BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", 00047 getCCUtil()->params()), 00048 BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.", 00049 getCCUtil()->params()), 00050 BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation" 00051 " patterns.", getCCUtil()->params()), 00052 BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number" 00053 " patterns.", getCCUtil()->params()), 00054 BOOL_INIT_MEMBER(load_bigram_dawg, true, "Load dawg with special word " 00055 "bigrams.", getCCUtil()->params()), 00056 double_MEMBER(xheight_penalty_subscripts, 0.125, 00057 "Score penalty (0.1 = 10%) added if there are subscripts " 00058 "or superscripts in a word, but it is otherwise OK.", 00059 getCCUtil()->params()), 00060 double_MEMBER(xheight_penalty_inconsistent, 0.25, 00061 "Score penalty (0.1 = 10%) added if an xheight is " 00062 "inconsistent.", getCCUtil()->params()), 00063 double_MEMBER(segment_penalty_dict_frequent_word, 1.0, 00064 "Score multiplier for word matches which have good case and" 00065 "are frequent in the given language (lower is better).", 00066 getCCUtil()->params()), 00067 double_MEMBER(segment_penalty_dict_case_ok, 1.1, 00068 "Score multiplier for word matches that have good case " 00069 "(lower is better).", getCCUtil()->params()), 00070 double_MEMBER(segment_penalty_dict_case_bad, 1.3125, 00071 "Default score multiplier for word matches, which may have " 00072 "case issues (lower is better).", 00073 getCCUtil()->params()), 00074 double_MEMBER(segment_penalty_ngram_best_choice, 1.24, 00075 "Multipler to for the best choice from the ngram model.", 00076 getCCUtil()->params()), 00077 double_MEMBER(segment_penalty_dict_nonword, 1.25, 00078 "Score multiplier for glyph fragment segmentations which " 00079 "do not match a dictionary word (lower is better).", 00080 getCCUtil()->params()), 00081 double_MEMBER(segment_penalty_garbage, 1.50, 00082 "Score multiplier for poorly cased strings that are not in" 00083 " the dictionary and generally look like garbage (lower is" 00084 " better).", getCCUtil()->params()), 00085 STRING_MEMBER(output_ambig_words_file, "", 00086 "Output file for ambiguities found in the dictionary", 00087 getCCUtil()->params()), 00088 INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info" 00089 ", to 2 for more details, to 3 to see all the debug messages", 00090 getCCUtil()->params()), 00091 INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", 00092 getCCUtil()->params()), 00093 INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.", 00094 getCCUtil()->params()), 00095 BOOL_MEMBER(use_only_first_uft8_step, false, 00096 "Use only the first UTF8 step of the given string" 00097 " when computing log probabilities.", 00098 getCCUtil()->params()), 00099 double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", 00100 getCCUtil()->params()), 00101 double_MEMBER(stopper_nondict_certainty_base, -2.50, 00102 "Certainty threshold for non-dict words", 00103 getCCUtil()->params()), 00104 double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, 00105 "Reject certainty offset", 00106 getCCUtil()->params()), 00107 INT_MEMBER(stopper_smallword_size, 2, 00108 "Size of dict word to be treated as non-dict word", 00109 getCCUtil()->params()), 00110 double_MEMBER(stopper_certainty_per_char, -0.50, "Certainty to add" 00111 " for each dict char above small word size.", 00112 getCCUtil()->params()), 00113 double_MEMBER(stopper_allowable_character_badness, 3.0, 00114 "Max certaintly variation allowed in a word (in sigma)", 00115 getCCUtil()->params()), 00116 INT_MEMBER(stopper_debug_level, 0, "Stopper debug level", 00117 getCCUtil()->params()), 00118 BOOL_MEMBER(stopper_no_acceptable_choices, false, 00119 "Make AcceptableChoice() always return false. Useful" 00120 " when there is a need to explore all segmentations", 00121 getCCUtil()->params()), 00122 BOOL_MEMBER(save_raw_choices, false, 00123 "Deprecated- backward compatablity only", 00124 getCCUtil()->params()), 00125 INT_MEMBER(tessedit_truncate_wordchoice_log, 10, 00126 "Max words to keep in list", 00127 getCCUtil()->params()), 00128 STRING_MEMBER(word_to_debug, "", "Word for which stopper debug" 00129 " information should be printed to stdout", 00130 getCCUtil()->params()), 00131 STRING_MEMBER(word_to_debug_lengths, "", 00132 "Lengths of unichars in word_to_debug", 00133 getCCUtil()->params()), 00134 INT_MEMBER(fragments_debug, 0, "Debug character fragments", 00135 getCCUtil()->params()), 00136 BOOL_MEMBER(segment_nonalphabetic_script, false, 00137 "Don't use any alphabetic-specific tricks." 00138 "Set to true in the traineddata config file for" 00139 " scripts that are cursive or inherently fixed-pitch", 00140 getCCUtil()->params()), 00141 BOOL_MEMBER(save_doc_words, 0, "Save Document Words", 00142 getCCUtil()->params()), 00143 double_MEMBER(doc_dict_pending_threshold, 0.0, 00144 "Worst certainty for using pending dictionary", 00145 getCCUtil()->params()), 00146 double_MEMBER(doc_dict_certainty_threshold, -2.25, 00147 "Worst certainty for words that can be inserted into the" 00148 "document dictionary", getCCUtil()->params()), 00149 INT_MEMBER(max_permuter_attempts, 10000, "Maximum number of different" 00150 " character choices to consider during permutation." 00151 " This limit is especially useful when user patterns" 00152 " are specified, since overly generic patterns can result in" 00153 " dawg search exploring an overly large number of options.", 00154 getCCUtil()->params()) { 00155 dang_ambigs_table_ = NULL; 00156 replace_ambigs_table_ = NULL; 00157 reject_offset_ = 0.0; 00158 go_deeper_fxn_ = NULL; 00159 hyphen_word_ = NULL; 00160 last_word_on_line_ = false; 00161 hyphen_unichar_id_ = INVALID_UNICHAR_ID; 00162 document_words_ = NULL; 00163 dawg_cache_ = NULL; 00164 dawg_cache_is_ours_ = false; 00165 pending_words_ = NULL; 00166 bigram_dawg_ = NULL; 00167 freq_dawg_ = NULL; 00168 punc_dawg_ = NULL; 00169 unambig_dawg_ = NULL; 00170 wordseg_rating_adjust_factor_ = -1.0f; 00171 output_ambig_words_file_ = NULL; 00172 } 00173 00174 Dict::~Dict() { 00175 if (hyphen_word_ != NULL) delete hyphen_word_; 00176 if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_); 00177 } 00178 00179 DawgCache *Dict::GlobalDawgCache() { 00180 // We dynamically allocate this global cache (a singleton) so it will outlive 00181 // every Tesseract instance (even those that someone else might declare as 00182 // global statics). 00183 static DawgCache *cache = new DawgCache(); // evil global singleton 00184 return cache; 00185 } 00186 00187 void Dict::Load(DawgCache *dawg_cache) { 00188 STRING name; 00189 STRING &lang = getCCUtil()->lang; 00190 00191 if (dawgs_.length() != 0) this->End(); 00192 00193 apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol); 00194 question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol); 00195 slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol); 00196 hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol); 00197 00198 if (dawg_cache != NULL) { 00199 dawg_cache_ = dawg_cache; 00200 dawg_cache_is_ours_ = false; 00201 } else { 00202 dawg_cache_ = new DawgCache(); 00203 dawg_cache_is_ours_ = true; 00204 } 00205 00206 TessdataManager &tessdata_manager = getCCUtil()->tessdata_manager; 00207 const char *data_file_name = tessdata_manager.GetDataFileName().string(); 00208 00209 // Load dawgs_. 00210 if (load_punc_dawg) { 00211 punc_dawg_ = dawg_cache_->GetSquishedDawg( 00212 lang, data_file_name, TESSDATA_PUNC_DAWG, dawg_debug_level); 00213 if (punc_dawg_) dawgs_ += punc_dawg_; 00214 } 00215 if (load_system_dawg) { 00216 Dawg *system_dawg = dawg_cache_->GetSquishedDawg( 00217 lang, data_file_name, TESSDATA_SYSTEM_DAWG, dawg_debug_level); 00218 if (system_dawg) dawgs_ += system_dawg; 00219 } 00220 if (load_number_dawg) { 00221 Dawg *number_dawg = dawg_cache_->GetSquishedDawg( 00222 lang, data_file_name, TESSDATA_NUMBER_DAWG, dawg_debug_level); 00223 if (number_dawg) dawgs_ += number_dawg; 00224 } 00225 if (load_bigram_dawg) { 00226 bigram_dawg_ = dawg_cache_->GetSquishedDawg( 00227 lang, data_file_name, TESSDATA_BIGRAM_DAWG, dawg_debug_level); 00228 } 00229 if (load_freq_dawg) { 00230 freq_dawg_ = dawg_cache_->GetSquishedDawg( 00231 lang, data_file_name, TESSDATA_FREQ_DAWG, dawg_debug_level); 00232 if (freq_dawg_) { dawgs_ += freq_dawg_; } 00233 } 00234 if (load_unambig_dawg) { 00235 unambig_dawg_ = dawg_cache_->GetSquishedDawg( 00236 lang, data_file_name, TESSDATA_UNAMBIG_DAWG, dawg_debug_level); 00237 if (unambig_dawg_) dawgs_ += unambig_dawg_; 00238 } 00239 00240 if (((STRING &)user_words_suffix).length() > 0) { 00241 Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, 00242 kMaxUserDawgEdges, getUnicharset().size(), 00243 dawg_debug_level); 00244 name = getCCUtil()->language_data_path_prefix; 00245 name += user_words_suffix; 00246 if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(), 00247 Trie::RRP_REVERSE_IF_HAS_RTL)) { 00248 tprintf("Error: failed to load %s\n", name.string()); 00249 delete trie_ptr; 00250 } else { 00251 dawgs_ += trie_ptr; 00252 } 00253 } 00254 00255 if (((STRING &)user_patterns_suffix).length() > 0) { 00256 Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, 00257 kMaxUserDawgEdges, getUnicharset().size(), 00258 dawg_debug_level); 00259 trie_ptr->initialize_patterns(&(getUnicharset())); 00260 name = getCCUtil()->language_data_path_prefix; 00261 name += user_patterns_suffix; 00262 if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) { 00263 tprintf("Error: failed to load %s\n", name.string()); 00264 delete trie_ptr; 00265 } else { 00266 dawgs_ += trie_ptr; 00267 } 00268 } 00269 00270 document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, 00271 kMaxDocDawgEdges, getUnicharset().size(), 00272 dawg_debug_level); 00273 dawgs_ += document_words_; 00274 00275 // This dawg is temporary and should not be searched by letter_is_ok. 00276 pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM, 00277 kMaxDocDawgEdges, getUnicharset().size(), 00278 dawg_debug_level); 00279 00280 // Construct a list of corresponding successors for each dawg. Each entry i 00281 // in the successors_ vector is a vector of integers that represent the 00282 // indices into the dawgs_ vector of the successors for dawg i. 00283 successors_.reserve(dawgs_.length()); 00284 for (int i = 0; i < dawgs_.length(); ++i) { 00285 const Dawg *dawg = dawgs_[i]; 00286 SuccessorList *lst = new SuccessorList(); 00287 for (int j = 0; j < dawgs_.length(); ++j) { 00288 const Dawg *other = dawgs_[j]; 00289 if (dawg != NULL && other != NULL && 00290 (dawg->lang() == other->lang()) && 00291 kDawgSuccessors[dawg->type()][other->type()]) *lst += j; 00292 } 00293 successors_ += lst; 00294 } 00295 } 00296 00297 void Dict::End() { 00298 if (dawgs_.length() == 0) 00299 return; // Not safe to call twice. 00300 for (int i = 0; i < dawgs_.size(); i++) { 00301 if (!dawg_cache_->FreeDawg(dawgs_[i])) { 00302 delete dawgs_[i]; 00303 } 00304 } 00305 dawg_cache_->FreeDawg(bigram_dawg_); 00306 if (dawg_cache_is_ours_) { 00307 delete dawg_cache_; 00308 dawg_cache_ = NULL; 00309 } 00310 successors_.delete_data_pointers(); 00311 dawgs_.clear(); 00312 successors_.clear(); 00313 document_words_ = NULL; 00314 if (pending_words_ != NULL) { 00315 delete pending_words_; 00316 pending_words_ = NULL; 00317 } 00318 } 00319 00320 // Returns true if in light of the current state unichar_id is allowed 00321 // according to at least one of the dawgs in the dawgs_ vector. 00322 // See more extensive comments in dict.h where this function is declared. 00323 int Dict::def_letter_is_okay(void* void_dawg_args, 00324 UNICHAR_ID unichar_id, 00325 bool word_end) const { 00326 DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args); 00327 00328 if (dawg_debug_level >= 3) { 00329 tprintf("def_letter_is_okay: current unichar=%s word_end=%d" 00330 " num active dawgs=%d\n", 00331 getUnicharset().debug_str(unichar_id).string(), word_end, 00332 dawg_args->active_dawgs->length()); 00333 } 00334 00335 // Do not accept words that contain kPatternUnicharID. 00336 // (otherwise pattern dawgs would not function correctly). 00337 // Do not accept words containing INVALID_UNICHAR_IDs. 00338 if (unichar_id == Dawg::kPatternUnicharID || 00339 unichar_id == INVALID_UNICHAR_ID) { 00340 dawg_args->permuter = NO_PERM; 00341 return NO_PERM; 00342 } 00343 00344 // Initialization. 00345 PermuterType curr_perm = NO_PERM; 00346 dawg_args->updated_dawgs->clear(); 00347 00348 // Go over the active_dawgs vector and insert DawgPosition records 00349 // with the updated ref (an edge with the corresponding unichar id) into 00350 // dawg_args->updated_pos. 00351 for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) { 00352 const DawgPosition &pos = (*dawg_args->active_dawgs)[a]; 00353 const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : NULL; 00354 const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : NULL; 00355 00356 if (!dawg && !punc_dawg) { 00357 // shouldn't happen. 00358 tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n"); 00359 continue; 00360 } 00361 if (!dawg) { 00362 // We're in the punctuation dawg. A core dawg has not been chosen. 00363 NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref); 00364 EDGE_REF punc_transition_edge = punc_dawg->edge_char_of( 00365 punc_node, Dawg::kPatternUnicharID, word_end); 00366 if (punc_transition_edge != NO_EDGE) { 00367 // Find all successors, and see which can transition. 00368 const SuccessorList &slist = *(successors_[pos.punc_index]); 00369 for (int s = 0; s < slist.length(); ++s) { 00370 int sdawg_index = slist[s]; 00371 const Dawg *sdawg = dawgs_[sdawg_index]; 00372 UNICHAR_ID ch = char_for_dawg(unichar_id, sdawg); 00373 EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end); 00374 if (dawg_edge != NO_EDGE) { 00375 if (dawg_debug_level >=3) { 00376 tprintf("Letter found in dawg %d\n", sdawg_index); 00377 } 00378 dawg_args->updated_dawgs->add_unique( 00379 DawgPosition(sdawg_index, dawg_edge, 00380 pos.punc_index, punc_transition_edge, false), 00381 dawg_debug_level > 0, 00382 "Append transition from punc dawg to current dawgs: "); 00383 if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter(); 00384 } 00385 } 00386 } 00387 EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id, 00388 word_end); 00389 if (punc_edge != NO_EDGE) { 00390 if (dawg_debug_level >=3) { 00391 tprintf("Letter found in punctuation dawg\n"); 00392 } 00393 dawg_args->updated_dawgs->add_unique( 00394 DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false), 00395 dawg_debug_level > 0, 00396 "Extend punctuation dawg: "); 00397 if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM; 00398 } 00399 continue; 00400 } 00401 00402 if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) { 00403 // We can end the main word here. 00404 // If we can continue on the punc ref, add that possibility. 00405 NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref); 00406 EDGE_REF punc_edge = punc_node == NO_EDGE ? NO_EDGE 00407 : punc_dawg->edge_char_of(punc_node, unichar_id, word_end); 00408 if (punc_edge != NO_EDGE) { 00409 dawg_args->updated_dawgs->add_unique( 00410 DawgPosition(pos.dawg_index, pos.dawg_ref, 00411 pos.punc_index, punc_edge, true), 00412 dawg_debug_level > 0, 00413 "Return to punctuation dawg: "); 00414 if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter(); 00415 } 00416 } 00417 00418 if (pos.back_to_punc) continue; 00419 00420 // If we are dealing with the pattern dawg, look up all the 00421 // possible edges, not only for the exact unichar_id, but also 00422 // for all its character classes (alpha, digit, etc). 00423 if (dawg->type() == DAWG_TYPE_PATTERN) { 00424 ProcessPatternEdges(dawg, pos, unichar_id, word_end, 00425 dawg_args->updated_dawgs, &curr_perm); 00426 // There can't be any successors to dawg that is of type 00427 // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition. 00428 continue; 00429 } 00430 00431 // Find the edge out of the node for the unichar_id. 00432 NODE_REF node = GetStartingNode(dawg, pos.dawg_ref); 00433 EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE 00434 : dawg->edge_char_of(node, char_for_dawg(unichar_id, dawg), word_end); 00435 00436 if (dawg_debug_level >= 3) { 00437 tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", 00438 pos.dawg_index, node, edge); 00439 } 00440 00441 if (edge != NO_EDGE) { // the unichar was found in the current dawg 00442 if (dawg_debug_level >=3) { 00443 tprintf("Letter found in dawg %d\n", pos.dawg_index); 00444 } 00445 if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) { 00446 if (dawg_debug_level >= 3) { 00447 tprintf("Punctuation constraint not satisfied at end of word.\n"); 00448 } 00449 continue; 00450 } 00451 if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter(); 00452 dawg_args->updated_dawgs->add_unique( 00453 DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, 00454 false), 00455 dawg_debug_level > 0, 00456 "Append current dawg to updated active dawgs: "); 00457 } 00458 } // end for 00459 // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM 00460 // or if we found the current letter in a non-punctuation dawg. This 00461 // allows preserving information on which dawg the "core" word came from. 00462 // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM. 00463 if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM || 00464 (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) { 00465 dawg_args->permuter = curr_perm; 00466 } 00467 if (dawg_debug_level >= 2) { 00468 tprintf("Returning %d for permuter code for this character.\n"); 00469 } 00470 return dawg_args->permuter; 00471 } 00472 00473 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, 00474 UNICHAR_ID unichar_id, bool word_end, 00475 DawgPositionVector *updated_dawgs, 00476 PermuterType *curr_perm) const { 00477 NODE_REF node = GetStartingNode(dawg, pos.dawg_ref); 00478 // Try to find the edge corresponding to the exact unichar_id and to all the 00479 // edges corresponding to the character class of unichar_id. 00480 GenericVector<UNICHAR_ID> unichar_id_patterns; 00481 unichar_id_patterns.push_back(unichar_id); 00482 dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), 00483 &unichar_id_patterns); 00484 for (int i = 0; i < unichar_id_patterns.size(); ++i) { 00485 // On the first iteration check all the outgoing edges. 00486 // On the second iteration check all self-loops. 00487 for (int k = 0; k < 2; ++k) { 00488 EDGE_REF edge = (k == 0) 00489 ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end) 00490 : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_patterns[i], word_end); 00491 if (edge == NO_EDGE) continue; 00492 if (dawg_debug_level >= 3) { 00493 tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", 00494 pos.dawg_index, node, edge); 00495 tprintf("Letter found in pattern dawg %d\n", pos.dawg_index); 00496 } 00497 if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter(); 00498 updated_dawgs->add_unique( 00499 DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, 00500 pos.back_to_punc), 00501 dawg_debug_level > 0, 00502 "Append current dawg to updated active dawgs: "); 00503 } 00504 } 00505 } 00506 00507 // Fill the given active_dawgs vector with dawgs that could contain the 00508 // beginning of the word. If hyphenated() returns true, copy the entries 00509 // from hyphen_active_dawgs_ instead. 00510 void Dict::init_active_dawgs(DawgPositionVector *active_dawgs, 00511 bool ambigs_mode) const { 00512 int i; 00513 if (hyphenated()) { 00514 *active_dawgs = hyphen_active_dawgs_; 00515 if (dawg_debug_level >= 3) { 00516 for (i = 0; i < hyphen_active_dawgs_.size(); ++i) { 00517 tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n", 00518 hyphen_active_dawgs_[i].dawg_index, 00519 hyphen_active_dawgs_[i].dawg_ref); 00520 } 00521 } 00522 } else { 00523 default_dawgs(active_dawgs, ambigs_mode); 00524 } 00525 } 00526 00527 void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, 00528 bool suppress_patterns) const { 00529 bool punc_dawg_available = 00530 (punc_dawg_ != NULL) && 00531 punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE; 00532 00533 for (int i = 0; i < dawgs_.length(); i++) { 00534 if (dawgs_[i] != NULL && 00535 !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) { 00536 int dawg_ty = dawgs_[i]->type(); 00537 bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty]; 00538 if (dawg_ty == DAWG_TYPE_PUNCTUATION) { 00539 *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false); 00540 if (dawg_debug_level >= 3) { 00541 tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, 00542 NO_EDGE); 00543 } 00544 } else if (!punc_dawg_available || !subsumed_by_punc) { 00545 *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false); 00546 if (dawg_debug_level >= 3) { 00547 tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE); 00548 } 00549 } 00550 } 00551 } 00552 } 00553 00554 void Dict::add_document_word(const WERD_CHOICE &best_choice) { 00555 // Do not add hyphenated word parts to the document dawg. 00556 // hyphen_word_ will be non-NULL after the set_hyphen_word() is 00557 // called when the first part of the hyphenated word is 00558 // discovered and while the second part of the word is recognized. 00559 // hyphen_word_ is cleared in cc_recg() before the next word on 00560 // the line is recognized. 00561 if (hyphen_word_) return; 00562 00563 char filename[CHARS_PER_LINE]; 00564 FILE *doc_word_file; 00565 int stringlen = best_choice.length(); 00566 00567 if (valid_word(best_choice) || stringlen < 2) 00568 return; 00569 00570 // Discard words that contain >= kDocDictMaxRepChars repeating unichars. 00571 if (best_choice.length() >= kDocDictMaxRepChars) { 00572 int num_rep_chars = 1; 00573 UNICHAR_ID uch_id = best_choice.unichar_id(0); 00574 for (int i = 1; i < best_choice.length(); ++i) { 00575 if (best_choice.unichar_id(i) != uch_id) { 00576 num_rep_chars = 1; 00577 uch_id = best_choice.unichar_id(i); 00578 } else { 00579 ++num_rep_chars; 00580 if (num_rep_chars == kDocDictMaxRepChars) return; 00581 } 00582 } 00583 } 00584 00585 if (best_choice.certainty() < doc_dict_certainty_threshold || 00586 stringlen == 2) { 00587 if (best_choice.certainty() < doc_dict_pending_threshold) 00588 return; 00589 00590 if (!pending_words_->word_in_dawg(best_choice)) { 00591 if (stringlen > 2 || 00592 (stringlen == 2 && 00593 getUnicharset().get_isupper(best_choice.unichar_id(0)) && 00594 getUnicharset().get_isupper(best_choice.unichar_id(1)))) { 00595 pending_words_->add_word_to_dawg(best_choice); 00596 } 00597 return; 00598 } 00599 } 00600 00601 if (save_doc_words) { 00602 strcpy(filename, getCCUtil()->imagefile.string()); 00603 strcat(filename, ".doc"); 00604 doc_word_file = open_file (filename, "a"); 00605 fprintf(doc_word_file, "%s\n", 00606 best_choice.debug_string().string()); 00607 fclose(doc_word_file); 00608 } 00609 document_words_->add_word_to_dawg(best_choice); 00610 } 00611 00612 void Dict::adjust_word(WERD_CHOICE *word, 00613 bool nonword, 00614 XHeightConsistencyEnum xheight_consistency, 00615 float additional_adjust, 00616 bool modify_rating, 00617 bool debug) { 00618 bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() && 00619 word->GetTopScriptID() == getUnicharset().han_sid()); 00620 bool case_is_ok = (is_han || case_ok(*word, getUnicharset())); 00621 bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word)); 00622 00623 float adjust_factor = additional_adjust; 00624 float new_rating = word->rating(); 00625 new_rating += kRatingPad; 00626 const char *xheight_triggered = ""; 00627 if (word->length() > 1) { 00628 // Calculate x-height and y-offset consistency penalties. 00629 switch (xheight_consistency) { 00630 case XH_INCONSISTENT: 00631 adjust_factor += xheight_penalty_inconsistent; 00632 xheight_triggered = ", xhtBAD"; 00633 break; 00634 case XH_SUBNORMAL: 00635 adjust_factor += xheight_penalty_subscripts; 00636 xheight_triggered = ", xhtSUB"; 00637 break; 00638 case XH_GOOD: 00639 // leave the factor alone - all good! 00640 break; 00641 } 00642 // TODO(eger): if nonword is true, but there is a "core" thats' a dict 00643 // word, negate nonword status. 00644 } else { 00645 if (debug) { 00646 tprintf("Consistency could not be calculated.\n"); 00647 } 00648 } 00649 if (debug) { 00650 tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "", 00651 word->unichar_string().string(), word->rating(), 00652 xheight_triggered); 00653 } 00654 00655 if (nonword) { // non-dictionary word 00656 if (case_is_ok && punc_is_ok) { 00657 adjust_factor += segment_penalty_dict_nonword; 00658 new_rating *= adjust_factor; 00659 if (debug) tprintf(", W"); 00660 } else { 00661 adjust_factor += segment_penalty_garbage; 00662 new_rating *= adjust_factor; 00663 if (debug) { 00664 if (!case_is_ok) tprintf(", C"); 00665 if (!punc_is_ok) tprintf(", P"); 00666 } 00667 } 00668 } else { // dictionary word 00669 if (case_is_ok) { 00670 if (!is_han && freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) { 00671 word->set_permuter(FREQ_DAWG_PERM); 00672 adjust_factor += segment_penalty_dict_frequent_word; 00673 new_rating *= adjust_factor; 00674 if (debug) tprintf(", F"); 00675 } else { 00676 adjust_factor += segment_penalty_dict_case_ok; 00677 new_rating *= adjust_factor; 00678 if (debug) tprintf(", "); 00679 } 00680 } else { 00681 adjust_factor += segment_penalty_dict_case_bad; 00682 new_rating *= adjust_factor; 00683 if (debug) tprintf(", C"); 00684 } 00685 } 00686 new_rating -= kRatingPad; 00687 if (modify_rating) word->set_rating(new_rating); 00688 if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating); 00689 word->set_adjust_factor(adjust_factor); 00690 } 00691 00692 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const { 00693 const WERD_CHOICE *word_ptr = &word; 00694 WERD_CHOICE temp_word(word.unicharset()); 00695 if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) { 00696 copy_hyphen_info(&temp_word); 00697 temp_word += word; 00698 word_ptr = &temp_word; 00699 } 00700 if (word_ptr->length() == 0) return NO_PERM; 00701 // Allocate vectors for holding current and updated 00702 // active_dawgs and initialize them. 00703 DawgPositionVector *active_dawgs = new DawgPositionVector[2]; 00704 init_active_dawgs(&(active_dawgs[0]), false); 00705 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM); 00706 int last_index = word_ptr->length() - 1; 00707 // Call leter_is_okay for each letter in the word. 00708 for (int i = hyphen_base_size(); i <= last_index; ++i) { 00709 if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i), 00710 i == last_index))) break; 00711 // Swap active_dawgs, constraints with the corresponding updated vector. 00712 if (dawg_args.updated_dawgs == &(active_dawgs[1])) { 00713 dawg_args.updated_dawgs = &(active_dawgs[0]); 00714 ++(dawg_args.active_dawgs); 00715 } else { 00716 ++(dawg_args.updated_dawgs); 00717 dawg_args.active_dawgs = &(active_dawgs[0]); 00718 } 00719 } 00720 delete[] active_dawgs; 00721 return valid_word_permuter(dawg_args.permuter, numbers_ok) ? 00722 dawg_args.permuter : NO_PERM; 00723 } 00724 00725 bool Dict::valid_bigram(const WERD_CHOICE &word1, 00726 const WERD_CHOICE &word2) const { 00727 if (bigram_dawg_ == NULL) return false; 00728 00729 // Extract the core word from the middle of each word with any digits 00730 // replaced with question marks. 00731 int w1start, w1end, w2start, w2end; 00732 word1.punct_stripped(&w1start, &w1end); 00733 word2.punct_stripped(&w2start, &w2end); 00734 00735 // We don't want to penalize a single guillemet, hyphen, etc. 00736 // But our bigram list doesn't have any information about punctuation. 00737 if (w1start >= w1end) return word1.length() < 3; 00738 if (w2start >= w2end) return word2.length() < 3; 00739 00740 const UNICHARSET& uchset = getUnicharset(); 00741 GenericVector<UNICHAR_ID> bigram_string; 00742 bigram_string.reserve(w1end + w2end + 1); 00743 for (int i = w1start; i < w1end; i++) { 00744 const GenericVector<UNICHAR_ID>& normed_ids = 00745 getUnicharset().normed_ids(word1.unichar_id(i)); 00746 if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) 00747 bigram_string.push_back(question_unichar_id_); 00748 else 00749 bigram_string += normed_ids; 00750 } 00751 bigram_string.push_back(UNICHAR_SPACE); 00752 for (int i = w2start; i < w2end; i++) { 00753 const GenericVector<UNICHAR_ID>& normed_ids = 00754 getUnicharset().normed_ids(word2.unichar_id(i)); 00755 if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) 00756 bigram_string.push_back(question_unichar_id_); 00757 else 00758 bigram_string += normed_ids; 00759 } 00760 WERD_CHOICE normalized_word(&uchset, bigram_string.size()); 00761 for (int i = 0; i < bigram_string.size(); ++i) { 00762 normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1, 00763 0.0f, 0.0f); 00764 } 00765 return bigram_dawg_->word_in_dawg(normalized_word); 00766 } 00767 00768 bool Dict::valid_punctuation(const WERD_CHOICE &word) { 00769 if (word.length() == 0) return NO_PERM; 00770 int i; 00771 WERD_CHOICE new_word(word.unicharset()); 00772 int last_index = word.length() - 1; 00773 int new_len = 0; 00774 for (i = 0; i <= last_index; ++i) { 00775 UNICHAR_ID unichar_id = (word.unichar_id(i)); 00776 if (getUnicharset().get_ispunctuation(unichar_id)) { 00777 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0); 00778 } else if (!getUnicharset().get_isalpha(unichar_id) && 00779 !getUnicharset().get_isdigit(unichar_id)) { 00780 return false; // neither punc, nor alpha, nor digit 00781 } else if ((new_len = new_word.length()) == 0 || 00782 new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) { 00783 new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0); 00784 } 00785 } 00786 for (i = 0; i < dawgs_.size(); ++i) { 00787 if (dawgs_[i] != NULL && 00788 dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION && 00789 dawgs_[i]->word_in_dawg(new_word)) return true; 00790 } 00791 return false; 00792 } 00793 00794 00795 } // namespace tesseract