tesseract 3.03: dict.h source listing
// File:        dict.h
// Description: dict class.
// Author:      Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#ifndef TESSERACT_DICT_DICT_H_
#define TESSERACT_DICT_DICT_H_

#include "ambigs.h"
#include "dawg.h"
#include "dawg_cache.h"
#include "host.h"
#include "oldlist.h"
#include "ratngs.h"
#include "stopper.h"
#include "trie.h"
#include "unicharset.h"
#include "params_training_featdef.h"

class MATRIX;
class WERD_RES;

#define MAX_WERD_LENGTH (inT64) 128
#define NO_RATING       -1

struct CHAR_FRAGMENT_INFO {
  UNICHAR_ID unichar_id;
  const CHAR_FRAGMENT *fragment;
  int num_fragments;
  float rating;
  float certainty;
};

namespace tesseract {

typedef GenericVector<Dawg *> DawgVector;

//
// Constants
//
static const int kRatingPad = 4;
static const char kDictWildcard[] = "\u2606";  // WHITE STAR
static const int kDictMaxWildcards = 2;  // max wildcards for a word
// TODO(daria): If hyphens are different in different languages and can be
// inferred from training data we should load their values dynamically.
static const char kHyphenSymbol[] = "-";
static const char kSlashSymbol[] = "/";
static const char kQuestionSymbol[] = "?";
static const char kApostropheSymbol[] = "'";
static const int kMaxNumDawgEdgees = 2000000;
static const int kMaxDocDawgEdges = 250000;
static const int kMaxUserDawgEdges = 50000;
static const float kSimCertaintyScale = -10.0;   // similarity matcher scaling
static const float kSimCertaintyOffset = -10.0;  // similarity matcher offset
static const float kSimilarityFloor = 100.0;     // worst E*L product to stop on
static const int kDocDictMaxRepChars = 4;
// Enum for describing whether the x-height for the word is consistent:
//  0 - everything is good.
//  1 - there are one or two secondary (but consistent) baselines
//      [think subscript and superscript], or there is an oversized
//      first character.
//  2 - the word is inconsistent.
enum XHeightConsistencyEnum {XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT};

struct DawgArgs {
  DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
      : active_dawgs(d), updated_dawgs(up), permuter(p) {}

  DawgPositionVector *active_dawgs;
  DawgPositionVector *updated_dawgs;
  PermuterType permuter;
};

class Dict {
 public:
  Dict(CCUtil* image_ptr);
  ~Dict();
  const CCUtil* getCCUtil() const {
    return ccutil_;
  }
  CCUtil* getCCUtil() {
    return ccutil_;
  }
  const UNICHARSET& getUnicharset() const {
    return getCCUtil()->unicharset;
  }
  UNICHARSET& getUnicharset() {
    return getCCUtil()->unicharset;
  }
  const UnicharAmbigs &getUnicharAmbigs() const {
    return getCCUtil()->unichar_ambigs;
  }

  // Returns true if unichar_id is a word compounding character like - or /.
  inline bool compound_marker(UNICHAR_ID unichar_id) {
    const GenericVector<UNICHAR_ID>& normed_ids =
        getUnicharset().normed_ids(unichar_id);
    return normed_ids.size() == 1 &&
        (normed_ids[0] == hyphen_unichar_id_ ||
         normed_ids[0] == slash_unichar_id_);
  }
  // Returns true if unichar_id is an apostrophe-like character that may
  // separate prefix/suffix words from a main body word.
  inline bool is_apostrophe(UNICHAR_ID unichar_id) {
    const GenericVector<UNICHAR_ID>& normed_ids =
        getUnicharset().normed_ids(unichar_id);
    return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
  }

  /* hyphen.cpp ************************************************************/

  inline bool hyphenated() const {
    return !last_word_on_line_ && hyphen_word_;
  }
  inline int hyphen_base_size() const {
    return this->hyphenated() ? hyphen_word_->length() : 0;
  }
  inline void copy_hyphen_info(WERD_CHOICE *word) const {
    if (this->hyphenated()) {
      *word = *hyphen_word_;
      if (hyphen_debug_level) word->print("copy_hyphen_info: ");
    }
  }
  inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
    if (!last_word_on_line_ || first_pos)
      return false;
    const GenericVector<UNICHAR_ID>& normed_ids =
        getUnicharset().normed_ids(unichar_id);
    return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
  }
  inline bool has_hyphen_end(const WERD_CHOICE &word) const {
    int word_index = word.length() - 1;
    return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
  }
  void reset_hyphen_vars(bool last_word_on_line);
  void set_hyphen_word(const WERD_CHOICE &word,
                       const DawgPositionVector &active_dawgs);
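
  // Usage sketch (illustrative; the calling code and variable names below are
  // assumptions, not taken from this header): how the hyphen state is
  // typically threaded from one text line to the next.
  //
  //   // While starting a new word:
  //   WERD_CHOICE word(&dict->getUnicharset());
  //   dict->copy_hyphen_info(&word);        // prepend stored base word, if any
  //   ... append this word's own character choices ...
  //   // If the chosen word ends the line with a trailing hyphen:
  //   if (dict->has_hyphen_end(word))
  //     dict->set_hyphen_word(word, active_dawgs);
  //   // Between words, keep the line-position flag up to date:
  //   dict->reset_hyphen_vars(word_is_last_on_line);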

  /* permdawg.cpp ************************************************************/
  // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().
  // When this function is refactored, permdawg.cpp can be removed.

  inline void update_best_choice(const WERD_CHOICE &word,
                                 WERD_CHOICE *best_choice) {
    if (word.rating() < best_choice->rating()) {
      *best_choice = word;
    }
  }
  void init_active_dawgs(DawgPositionVector *active_dawgs,
                         bool ambigs_mode) const;
  // Fill the given vector with the default collection of any-length dawgs.
  void default_dawgs(DawgPositionVector *anylength_dawgs,
                     bool suppress_patterns) const;

  WERD_CHOICE *dawg_permute_and_select(
      const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
  void go_deeper_dawg_fxn(
      const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
      int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
      bool word_ending, WERD_CHOICE *word, float certainties[],
      float *limit, WERD_CHOICE *best_choice, int *attempts_left,
      void *void_more_args);

  void (Dict::*go_deeper_fxn_)(const char *debug,
                               const BLOB_CHOICE_LIST_VECTOR &char_choices,
                               int char_choice_index,
                               const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                               bool word_ending, WERD_CHOICE *word,
                               float certainties[], float *limit,
                               WERD_CHOICE *best_choice, int *attempts_left,
                               void *void_more_args);
  //
  // Helper functions for dawg_permute_and_select().
  //
  void permute_choices(
      const char *debug,
      const BLOB_CHOICE_LIST_VECTOR &char_choices,
      int char_choice_index,
      const CHAR_FRAGMENT_INFO *prev_char_frag_info,
      WERD_CHOICE *word,
      float certainties[],
      float *limit,
      WERD_CHOICE *best_choice,
      int *attempts_left,
      void *more_args);

  void append_choices(
      const char *debug,
      const BLOB_CHOICE_LIST_VECTOR &char_choices,
      const BLOB_CHOICE &blob_choice,
      int char_choice_index,
      const CHAR_FRAGMENT_INFO *prev_char_frag_info,
      WERD_CHOICE *word,
      float certainties[],
      float *limit,
      WERD_CHOICE *best_choice,
      int *attempts_left,
      void *more_args);

  bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
                           float curr_rating, float curr_certainty,
                           const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                           const char *debug, int word_ending,
                           CHAR_FRAGMENT_INFO *char_frag_info);
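
  // Usage sketch (illustrative; ownership of the returned pointer and the
  // failure convention are assumptions, not taken from this header): driving
  // the dictionary permuter over per-blob classifier choices.
  //
  //   BLOB_CHOICE_LIST_VECTOR char_choices;
  //   ... one BLOB_CHOICE_LIST per blob, filled by the classifier ...
  //   WERD_CHOICE *dict_word =
  //       dict->dawg_permute_and_select(char_choices, rating_limit);
  //   if (dict_word != NULL) {
  //     dict_word->print("dawg choice: ");
  //     delete dict_word;
  //   }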

  /* stopper.cpp *************************************************************/
  bool NoDangerousAmbig(WERD_CHOICE *BestChoice,
                        DANGERR *fixpt,
                        bool fix_replaceable,
                        MATRIX* ratings);
  // Replaces the corresponding wrong ngram in werd_choice with the correct
  // one. The whole correct n-gram is inserted into the ratings matrix and
  // the werd_choice: no more fragments! Rating and certainty of new entries
  // in the matrix and werd_choice are the sum and mean of the wrong ngram,
  // respectively.
  // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
  // mystring", with a new entry in the ratings matrix for ".
  void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
                    UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
                    MATRIX *ratings);

  int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
  int UniformCertainties(const WERD_CHOICE& word);
  bool AcceptableChoice(const WERD_CHOICE& best_choice,
                        XHeightConsistencyEnum xheight_consistency);
  bool AcceptableResult(WERD_RES* word);
  void EndDangerousAmbigs();
  void DebugWordChoices();
  void SettupStopperPass1();
  void SettupStopperPass2();

  /* context.cpp *************************************************************/
  int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset);
  bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);

  /* dict.cpp ****************************************************************/

  static DawgCache *GlobalDawgCache();
  void Load(DawgCache *dawg_cache);
  void End();

  // Resets the document dictionary analogous to ResetAdaptiveClassifier.
  void ResetDocumentDictionary() {
    if (pending_words_ != NULL)
      pending_words_->clear();
    if (document_words_ != NULL)
      document_words_->clear();
  }

  int def_letter_is_okay(void* void_dawg_args,
                         UNICHAR_ID unichar_id, bool word_end) const;

  int (Dict::*letter_is_okay_)(void* void_dawg_args,
                               UNICHAR_ID unichar_id, bool word_end) const;
  int LetterIsOkay(void* void_dawg_args,
                   UNICHAR_ID unichar_id, bool word_end) const {
    return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
  }
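
  // Usage sketch (illustrative; the vector handling and the NO_PERM test are
  // assumptions, not taken from this header): walking a word through the
  // loaded dawgs one unichar at a time.
  //
  //   DawgPositionVector active, updated;
  //   dict->init_active_dawgs(&active, false);
  //   DawgArgs dawg_args(&active, &updated, NO_PERM);
  //   for (int i = 0; i < word.length(); ++i) {
  //     updated.clear();
  //     dict->LetterIsOkay(&dawg_args, word.unichar_id(i),
  //                        i + 1 == word.length());
  //     *dawg_args.active_dawgs = *dawg_args.updated_dawgs;  // advance
  //   }
  //   bool found = dawg_args.permuter != NO_PERM;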

  double (Dict::*probability_in_context_)(const char* lang,
                                          const char* context,
                                          int context_bytes,
                                          const char* character,
                                          int character_bytes);
  double ProbabilityInContext(const char* context,
                              int context_bytes,
                              const char* character,
                              int character_bytes) {
    return (this->*probability_in_context_)(
        getCCUtil()->lang.string(),
        context, context_bytes,
        character, character_bytes);
  }

  double def_probability_in_context(
      const char* lang, const char* context, int context_bytes,
      const char* character, int character_bytes) {
    (void) context;
    (void) context_bytes;
    (void) character;
    (void) character_bytes;
    return 0.0;
  }
  double ngram_probability_in_context(const char* lang,
                                      const char* context,
                                      int context_bytes,
                                      const char* character,
                                      int character_bytes);

  // Interface with params model.
  float (Dict::*params_model_classify_)(const char *lang, void *path);
  float ParamsModelClassify(const char *lang, void *path);
  // Calls the params_model_classify_ member function.
  float CallParamsModelClassify(void *path) {
    ASSERT_HOST(params_model_classify_ != NULL);
    return (this->*params_model_classify_)(
        getCCUtil()->lang.string(), path);
  }

  inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; }
  inline const UNICHAR_ID WildcardID() const {
    return wildcard_unichar_id_;
  }
  inline const int NumDawgs() const { return dawgs_.size(); }
  inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
  inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
  inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
  static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
    if (edge_ref == NO_EDGE) return 0;  // beginning to explore the dawg
    NODE_REF node = dawg->next_node(edge_ref);
    if (node == 0) node = NO_EDGE;  // end of word
    return node;
  }

  // Given a unichar from a string and a given dawg, return the unichar
  // we should use to match in that dawg type. (For example, in the number
  // dawg, all digits are transformed to kPatternUnicharID.)
  inline UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const {
    if (!dawg) return ch;
    switch (dawg->type()) {
      case DAWG_TYPE_NUMBER:
        return getUnicharset().get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
      default:
        return ch;
    }
  }

  void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info,
                           UNICHAR_ID unichar_id, bool word_end,
                           DawgPositionVector *updated_dawgs,
                           PermuterType *current_permuter) const;

  inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
    return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
            perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
            perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
            (numbers_ok && perm == NUMBER_PERM));
  }
  int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
  int valid_word(const WERD_CHOICE &word) const {
    return valid_word(word, false);  // return NO_PERM for words with digits
  }
  int valid_word_or_number(const WERD_CHOICE &word) const {
    return valid_word(word, true);  // return NUMBER_PERM for valid numbers
  }
  int valid_word(const char *string) const {
    WERD_CHOICE word(string, getUnicharset());
    return valid_word(word);
  }
  // Do the two WERD_CHOICEs form a meaningful bigram?
  bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
  bool valid_punctuation(const WERD_CHOICE &word);
  int good_choice(const WERD_CHOICE &choice);
  void add_document_word(const WERD_CHOICE &best_choice);
  void adjust_word(WERD_CHOICE *word,
                   bool nonword, XHeightConsistencyEnum xheight_consistency,
                   float additional_adjust,
                   bool modify_rating,
                   bool debug);
  inline void SetWordsegRatingAdjustFactor(float f) {
    wordseg_rating_adjust_factor_ = f;
  }
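
  // Usage sketch (illustrative): querying the loaded dawgs for a string.
  // valid_word() returns a PermuterType, with NO_PERM meaning "not found".
  //
  //   if (dict->valid_word("hello") != NO_PERM) {
  //     ... "hello" is in one of the word dawgs ...
  //   }
  //   if (dict->valid_word_or_number("1234") != NO_PERM) {
  //     ... accepted, possibly via the number dawg ...
  //   }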

 private:
  CCUtil* ccutil_;
  UnicharAmbigs *dang_ambigs_table_;
  UnicharAmbigs *replace_ambigs_table_;
  FLOAT32 reject_offset_;
  // Cached UNICHAR_IDs:
  UNICHAR_ID wildcard_unichar_id_;    // kDictWildcard.
  UNICHAR_ID apostrophe_unichar_id_;  // kApostropheSymbol.
  UNICHAR_ID question_unichar_id_;    // kQuestionSymbol.
  UNICHAR_ID slash_unichar_id_;       // kSlashSymbol.
  UNICHAR_ID hyphen_unichar_id_;      // kHyphenSymbol.
  // Hyphen-related variables.
  WERD_CHOICE *hyphen_word_;
  DawgPositionVector hyphen_active_dawgs_;
  bool last_word_on_line_;
  // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
  // matching. The first member of each list is taken as canonical. For
  // example, the first list contains hyphens and dashes with the first symbol
  // being the ASCII hyphen-minus.
  GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;
  // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
  DawgCache *dawg_cache_;
  bool dawg_cache_is_ours_;  // we should delete our own dawg_cache_
  // Dawgs.
  DawgVector dawgs_;
  SuccessorListsVector successors_;
  Trie *pending_words_;
  // bigram_dawg_ points to a dawg of two-word bigrams which always take
  // precedence if any of them are present on the best choices list for a
  // word pair. The bigrams are stored as space-separated words where:
  // (1) leading and trailing punctuation has been removed from each word and
  // (2) any digits have been replaced with '?' marks.
  Dawg *bigram_dawg_;
  // TODO(daria): need to support multiple languages in the future,
  // so maybe will need to maintain a list of dawgs of each kind.
  Dawg *freq_dawg_;
  Dawg *unambig_dawg_;
  Dawg *punc_dawg_;
  Trie *document_words_;
  float wordseg_rating_adjust_factor_;
  // File for recording ambiguities discovered during dictionary search.
  FILE *output_ambig_words_file_;

 public:
  STRING_VAR_H(user_words_suffix, "", "A list of user-provided words.");
  STRING_VAR_H(user_patterns_suffix, "",
               "A list of user-provided patterns.");
  BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
  BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
  BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
  BOOL_VAR_H(load_punc_dawg, true,
             "Load dawg with punctuation patterns.");
  BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
  BOOL_VAR_H(load_bigram_dawg, true,
             "Load dawg with special word bigrams.");
  double_VAR_H(xheight_penalty_subscripts, 0.125,
               "Score penalty (0.1 = 10%) added if there are subscripts "
               "or superscripts in a word, but it is otherwise OK.");
  double_VAR_H(xheight_penalty_inconsistent, 0.25,
               "Score penalty (0.1 = 10%) added if an xheight is "
               "inconsistent.");
  double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
               "Score multiplier for word matches which have good case and "
               "are frequent in the given language (lower is better).");

  double_VAR_H(segment_penalty_dict_case_ok, 1.1,
               "Score multiplier for word matches that have good case "
               "(lower is better).");

  double_VAR_H(segment_penalty_dict_case_bad, 1.3125,
               "Default score multiplier for word matches, which may have "
               "case issues (lower is better).");

  // TODO(daria): remove this param when the ngram permuter is deprecated.
  double_VAR_H(segment_penalty_ngram_best_choice, 1.24,
               "Multiplier for the best choice from the ngram model.");

  double_VAR_H(segment_penalty_dict_nonword, 1.25,
               "Score multiplier for glyph fragment segmentations which "
               "do not match a dictionary word (lower is better).");

  double_VAR_H(segment_penalty_garbage, 1.50,
               "Score multiplier for poorly cased strings that are not in "
               "the dictionary and generally look like garbage (lower is "
               "better).");
  STRING_VAR_H(output_ambig_words_file, "",
               "Output file for ambiguities found in the dictionary");
  INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info, "
            "to 2 for more details, to 3 to see all the debug messages");
  INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
  INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list.");
  BOOL_VAR_H(use_only_first_uft8_step, false,
             "Use only the first UTF8 step of the given string "
             "when computing log probabilities.");
  double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
  double_VAR_H(stopper_nondict_certainty_base, -2.50,
               "Certainty threshold for non-dict words");
  double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0,
               "Reject certainty offset");
  INT_VAR_H(stopper_smallword_size, 2,
            "Size of dict word to be treated as non-dict word");
  double_VAR_H(stopper_certainty_per_char, -0.50,
               "Certainty to add for each dict char above small word size.");
  double_VAR_H(stopper_allowable_character_badness, 3.0,
               "Max certainty variation allowed in a word (in sigma)");
  INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
  BOOL_VAR_H(stopper_no_acceptable_choices, false,
             "Make AcceptableChoice() always return false. Useful "
             "when there is a need to explore all segmentations");
  BOOL_VAR_H(save_raw_choices, false,
             "Deprecated - backward compatibility only");
  INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
  STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information "
               "should be printed to stdout");
  STRING_VAR_H(word_to_debug_lengths, "",
               "Lengths of unichars in word_to_debug");
  INT_VAR_H(fragments_debug, 0, "Debug character fragments");
  BOOL_VAR_H(segment_nonalphabetic_script, false,
             "Don't use any alphabetic-specific tricks. "
             "Set to true in the traineddata config file for "
             "scripts that are cursive or inherently fixed-pitch");
  BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
  double_VAR_H(doc_dict_pending_threshold, 0.0,
               "Worst certainty for using pending dictionary");
  double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty "
               "for words that can be inserted into the document dictionary");
  INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different "
            "character choices to consider during permutation. "
            "This limit is especially useful when user patterns "
            "are specified, since overly generic patterns can result in "
            "dawg search exploring an overly large number of options.");
};
}  // namespace tesseract

#endif  // TESSERACT_DICT_DICT_H_
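
// Usage sketch (illustrative; the initialization details below are
// assumptions, not taken from this header): typical Dict lifecycle.
//
//   tesseract::Dict dict(&ccutil);   // ccutil supplies the UNICHARSET, lang
//   dict.Load(tesseract::Dict::GlobalDawgCache());
//   int permuter = dict.valid_word("example");
//   dict.End();
//
// The *_VAR_H members declared above are ordinary Tesseract parameters.
// Outside this class they are usually set through a config file line such as
//   load_system_dawg  F
// or, from the API, with TessBaseAPI::SetVariable("dawg_debug_level", "1");
// the dawg-loading flags only take effect if set before the dictionary is
// loaded.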