// File:        language_model.h
// Description: Functions that utilize the knowledge about the properties,
//              structure and statistics of the language to help segmentation
//              search.
// Author:      Daria Antonova
// Created:     Mon Nov 11 11:26:43 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_
#define TESSERACT_WORDREC_LANGUAGE_MODEL_H_

#include "associate.h"
#include "dawg.h"
#include "dict.h"
#include "fontinfo.h"
#include "intproto.h"
#include "lm_consistency.h"
#include "lm_pain_points.h"
#include "lm_state.h"
#include "matrix.h"
#include "params.h"
#include "pageres.h"
#include "params_model.h"

namespace tesseract {

// This class contains the data structures and functions necessary
// to represent and use the knowledge about the language.
class LanguageModel {
 public:
  // Masks for keeping track of top choices that should not be pruned out.
  static const LanguageModelFlagsType kSmallestRatingFlag = 0x1;
  static const LanguageModelFlagsType kLowerCaseFlag = 0x2;
  static const LanguageModelFlagsType kUpperCaseFlag = 0x4;
  static const LanguageModelFlagsType kDigitFlag = 0x8;
  static const LanguageModelFlagsType kXhtConsistentFlag = 0x10;
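
  // These masks are plain bit flags (stored in
  // ViterbiStateEntry::top_choice_flags, see PrunablePath() below), so they
  // are combined and tested with the usual bitwise operators, e.g.:
  //   LanguageModelFlagsType flags = kLowerCaseFlag | kSmallestRatingFlag;
  //   if (flags & kLowerCaseFlag) { ... }  // path is a top lower-case choice
  //   flags &= ~kSmallestRatingFlag;       // clear a single flag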

  // Denominator for normalizing per-letter ngram cost when deriving
  // penalty adjustments.
  static const float kMaxAvgNgramCost;

  LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict);
  ~LanguageModel();

  // Fills the given floats array with features extracted from the path
  // represented by the given ViterbiStateEntry. See
  // ccstruct/params_training_featdef.h for feature information.
  // Note: the function assumes that features points to an array of size
  // PTRAIN_NUM_FEATURE_TYPES.
  static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse,
                                      float features[]);

  // Updates the data structures that are used for the duration of the
  // segmentation search on the current word.
  void InitForWord(const WERD_CHOICE *prev_word,
                   bool fixed_pitch, float max_char_wh_ratio,
                   float rating_cert_scale);

  // Updates the language model state of the given BLOB_CHOICE_LIST (from
  // the ratings matrix) and its parent. Updates pain_points if new
  // problematic points are found in the segmentation graph.
  //
  // At most language_model_viterbi_list_max_size entries are kept in each
  // LanguageModelState.viterbi_state_entries list.
  // At most language_model_viterbi_list_max_num_prunable of those are prunable
  // (non-dictionary) paths.
  // The entries that represent dictionary word paths are kept at the front
  // of the list.
  // The list is ordered by a cost that is computed collectively by several
  // language model components (currently the dawg and ngram components).
  bool UpdateState(
      bool just_classified,
      int curr_col, int curr_row,
      BLOB_CHOICE_LIST *curr_list,
      LanguageModelState *parent_node,
      LMPainPoints *pain_points,
      WERD_RES *word_res,
      BestChoiceBundle *best_choice_bundle,
      BlamerBundle *blamer_bundle);

  // Returns true if an acceptable best choice was discovered.
  inline bool AcceptableChoiceFound() { return acceptable_choice_found_; }
  inline void SetAcceptableChoiceFound(bool val) {
    acceptable_choice_found_ = val;
  }
  // Returns a reference to the ParamsModel.
  inline ParamsModel &getParamsModel() { return params_model_; }

 protected:

  inline float CertaintyScore(float cert) {
    if (language_model_use_sigmoidal_certainty) {
      // cert is assumed to be between 0 and -dict_->certainty_scale.
      // If you enable language_model_use_sigmoidal_certainty, you
      // need to adjust language_model_ngram_nonmatch_score as well.
      cert = -cert / dict_->certainty_scale;
      return 1.0f / (1.0f + exp(10.0f * cert));
    } else {
      return (-1.0f / cert);
    }
  }

  inline float ComputeAdjustment(int num_problems, float penalty) {
    if (num_problems == 0) return 0.0f;
    if (num_problems == 1) return penalty;
    return (penalty + (language_model_penalty_increment *
                       static_cast<float>(num_problems - 1)));
  }
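
  // Worked example for the two helpers above (assuming the default
  // language_model_penalty_increment of 0.01 and a dict_->certainty_scale
  // of 20.0):
  //   ComputeAdjustment(3, 0.1) == 0.1 + 0.01 * 2 == 0.12
  //     (the first problem costs the full penalty; each additional
  //      problem adds one increment);
  //   CertaintyScore(0.0f)   == 0.5 in sigmoidal mode (best certainty);
  //   CertaintyScore(-20.0f) == 1/(1 + e^10), i.e. nearly 0 (worst).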

  // Computes the adjustment to the ratings sum based on the given
  // consistency_info. The paths with invalid punctuation, inconsistent
  // case and character type are penalized proportionally to the number
  // of inconsistencies on the path.
  inline float ComputeConsistencyAdjustment(
      const LanguageModelDawgInfo *dawg_info,
      const LMConsistencyInfo &consistency_info) {
    if (dawg_info != NULL) {
      return ComputeAdjustment(consistency_info.NumInconsistentCase(),
                               language_model_penalty_case);
    }
    return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
                              language_model_penalty_punc) +
            ComputeAdjustment(consistency_info.NumInconsistentCase(),
                              language_model_penalty_case) +
            ComputeAdjustment(consistency_info.NumInconsistentChartype(),
                              language_model_penalty_chartype) +
            ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
                              language_model_penalty_spacing) +
            (consistency_info.inconsistent_script ?
             language_model_penalty_script : 0.0f) +
            (consistency_info.inconsistent_font ?
             language_model_penalty_font : 0.0f));
  }
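
  // Worked example: for a non-dictionary path (dawg_info == NULL) with two
  // case inconsistencies, one spacing inconsistency and an inconsistent
  // script, the default penalties declared below give
  //   (0.1 + 0.01) + 0.05 + 0.5 == 0.66.
  // For dictionary paths (dawg_info != NULL) only the case component
  // is applied.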

  // Returns an adjusted ratings sum that includes inconsistency penalties,
  // penalties for non-dictionary paths and paths with dips in ngram
  // probability.
  float ComputeAdjustedPathCost(ViterbiStateEntry *vse);

  // Finds the first lower-case letter, the first upper-case letter and the
  // first digit in curr_list. Uses the first character in the list in place
  // of empty results. Returns true if both alphas and digits are found.
  bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
                             BLOB_CHOICE **first_lower,
                             BLOB_CHOICE **first_upper,
                             BLOB_CHOICE **first_digit) const;

  // Forces there to be at least one entry in the overall set of the
  // viterbi_state_entries of each element of parent_node that has the
  // top_choice_flag set for lower, upper and digit, using the same rules as
  // GetTopLowerUpperDigit, setting the flag on the first suitable candidate
  // found, whether or not the flag is set on some other parent.
  // Returns 1 if both alphas and digits are found among the parents, -1 if no
  // parents are found at all (a legitimate case), and 0 otherwise.
  int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const;

  // Finds the next ViterbiStateEntry with which the given unichar_id can
  // combine sensibly, taking into account any mixed alnum/mixed case
  // situation, and whether this combination has been inspected before.
  ViterbiStateEntry* GetNextParentVSE(
      bool just_classified, bool mixed_alnum,
      const BLOB_CHOICE* bc, LanguageModelFlagsType blob_choice_flags,
      const UNICHARSET& unicharset, WERD_RES* word_res,
      ViterbiStateEntry_IT* vse_it,
      LanguageModelFlagsType* top_choice_flags) const;

  // Helper function that computes the cost of the path composed of the
  // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE.
  // If the new path looks good enough, adds a new ViterbiStateEntry to the
  // list of viterbi entries in the given BLOB_CHOICE and returns true.
  bool AddViterbiStateEntry(
      LanguageModelFlagsType top_choice_flags, float denom, bool word_end,
      int curr_col, int curr_row, BLOB_CHOICE *b,
      LanguageModelState *curr_state, ViterbiStateEntry *parent_vse,
      LMPainPoints *pain_points, WERD_RES *word_res,
      BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);

  // Determines whether a potential entry is a true top choice and
  // updates new_vse->top_choice_flags accordingly.
  //
  // Note: the function assumes that new_vse and lms are not NULL.
  void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse,
                             const ViterbiStateEntry *parent_vse,
                             LanguageModelState *lms);

  // Calls dict_->LetterIsOkay() with DawgArgs initialized from parent_vse
  // and the unichar from b.unichar_id(). Constructs and returns a
  // LanguageModelDawgInfo with updated active dawgs, constraints and
  // permuter.
  //
  // Note: the caller is responsible for deleting the returned pointer.
  LanguageModelDawgInfo *GenerateDawgInfo(bool word_end,
                                          int curr_col, int curr_row,
                                          const BLOB_CHOICE &b,
                                          const ViterbiStateEntry *parent_vse);

  // Computes p(unichar | parent context) and records it in ngram_cost.
  // If b.unichar_id() is an unlikely continuation of the parent context,
  // sets found_small_prob to true and returns NULL.
  // Otherwise creates a new LanguageModelNgramInfo entry containing the
  // updated context (that includes b.unichar_id() at the end) and returns it.
  //
  // Note: the caller is responsible for deleting the returned pointer.
  LanguageModelNgramInfo *GenerateNgramInfo(
      const char *unichar, float certainty, float denom,
      int curr_col, int curr_row, float outline_length,
      const ViterbiStateEntry *parent_vse);

  // Computes -(log(prob(classifier)) + log(prob(ngram model)))
  // for the given unichar in the given context. If there are multiple
  // unichars at one position, takes the average of their probabilities.
  // UNICHAR::utf8_step() is used to separate out individual UTF8 characters,
  // since probability_in_context() can only handle one at a time (while
  // unicharset might contain ngrams and glyphs composed from multiple UTF8
  // characters).
  float ComputeNgramCost(const char *unichar, float certainty, float denom,
                         const char *context, int *unichar_step_len,
                         bool *found_small_prob, float *ngram_prob);
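
  // Illustrative walk-through: for a unicharset entry composed of several
  // UTF8 characters (e.g. a ligature such as "ffi" stored as one unichar),
  // ComputeNgramCost() steps through the string with UNICHAR::utf8_step(),
  // queries probability_in_context() once per character, averages the
  // resulting probabilities (with language_model_ngram_small_prob as a
  // floor), and combines the negated log of that average with the
  // normalized classifier score to form the returned cost.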

  // Computes the normalization factor for the classifier confidences
  // (used by ComputeNgramCost()).
  float ComputeDenom(BLOB_CHOICE_LIST *curr_list);

  // Fills the given consistency_info based on parent_vse.consistency_info
  // and on the consistency of the given unichar_id with parent_vse.
  void FillConsistencyInfo(
      int curr_col, bool word_end, BLOB_CHOICE *b,
      ViterbiStateEntry *parent_vse,
      WERD_RES *word_res,
      LMConsistencyInfo *consistency_info);

  // Constructs a WERD_CHOICE by recording the unichar_ids of the BLOB_CHOICEs
  // on the path represented by the given ViterbiStateEntry. The path is
  // re-constructed by following the parent pointers in the language model
  // state entries. If the constructed WERD_CHOICE is better than the best/raw
  // choice recorded in the best_choice_bundle, this function updates the
  // corresponding fields and sets best_choice_bundle->updated to true.
  void UpdateBestChoice(ViterbiStateEntry *vse,
                        LMPainPoints *pain_points,
                        WERD_RES *word_res,
                        BestChoiceBundle *best_choice_bundle,
                        BlamerBundle *blamer_bundle);

  // Constructs a WERD_CHOICE by tracing parent pointers starting with
  // the given ViterbiStateEntry. Returns the constructed word.
  // The caller is responsible for freeing memory associated with the
  // returned WERD_CHOICE.
  WERD_CHOICE *ConstructWord(ViterbiStateEntry *vse,
                             WERD_RES *word_res,
                             DANGERR *fixpt,
                             BlamerBundle *blamer_bundle,
                             bool *truth_path);

  // Wrapper around AssociateUtils::ComputeStats().
  inline void ComputeAssociateStats(int col, int row,
                                    float max_char_wh_ratio,
                                    ViterbiStateEntry *parent_vse,
                                    WERD_RES *word_res,
                                    AssociateStats *associate_stats) {
    AssociateUtils::ComputeStats(
        col, row,
        (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
        (parent_vse != NULL) ? parent_vse->length : 0,
        fixed_pitch_, max_char_wh_ratio,
        word_res, language_model_debug_level > 2, associate_stats);
  }

  // Returns true if a path with such top_choice_flags and dawg_info
  // could be pruned out (i.e. is neither a system/user/frequent dictionary
  // path nor a top choice path).
  // In non-space-delimited languages all paths can be "somewhat" dictionary
  // words. In such languages we cannot do dictionary-driven path pruning,
  // so paths with non-empty dawg_info are considered prunable.
  inline bool PrunablePath(const ViterbiStateEntry &vse) {
    if (vse.top_choice_flags) return false;
    if (vse.dawg_info != NULL &&
        (vse.dawg_info->permuter == SYSTEM_DAWG_PERM ||
         vse.dawg_info->permuter == USER_DAWG_PERM ||
         vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
    return true;
  }

  // Returns true if the given ViterbiStateEntry represents an acceptable path.
  inline bool AcceptablePath(const ViterbiStateEntry &vse) {
    return (vse.dawg_info != NULL || vse.Consistent() ||
            (vse.ngram_info != NULL && !vse.ngram_info->pruned));
  }
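
  // Taken together, the two predicates above split paths into bands:
  // top-choice paths and system/user/frequent dictionary paths are never
  // pruned (PrunablePath() == false); dictionary paths, consistent paths
  // and paths with un-pruned ngram info are acceptable (AcceptablePath()
  // == true); everything else is prunable, subject to the viterbi list
  // size limits declared below.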

 public:
  // Parameters.
  INT_VAR_H(language_model_debug_level, 0, "Language model debug level");
  BOOL_VAR_H(language_model_ngram_on, false,
             "Turn on/off the use of character ngram model");
  INT_VAR_H(language_model_ngram_order, 8,
            "Maximum order of the character ngram model");
  INT_VAR_H(language_model_viterbi_list_max_num_prunable, 10,
            "Maximum number of prunable (those for which PrunablePath() is"
            " true) entries in each viterbi list recorded in BLOB_CHOICEs");
  INT_VAR_H(language_model_viterbi_list_max_size, 500,
            "Maximum size of viterbi lists recorded in BLOB_CHOICEs");
  double_VAR_H(language_model_ngram_small_prob, 0.000001,
               "To avoid overly small denominators use this as the floor"
               " of the probability returned by the ngram model");
  double_VAR_H(language_model_ngram_nonmatch_score, -40.0,
               "Average classifier score of a non-matching unichar");
  BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step, false,
             "Use only the first UTF8 step of the given string"
             " when computing log probabilities");
  double_VAR_H(language_model_ngram_scale_factor, 0.03,
               "Strength of the character ngram model relative to the"
               " character classifier");
  double_VAR_H(language_model_ngram_rating_factor, 16.0,
               "Factor to bring log-probs into the same range as ratings"
               " when multiplied by outline length");
  BOOL_VAR_H(language_model_ngram_space_delimited_language, true,
             "Words are delimited by space");
  INT_VAR_H(language_model_min_compound_length, 3,
            "Minimum length of compound words");
  // Penalties used for adjusting path costs and final word rating.
  double_VAR_H(language_model_penalty_non_freq_dict_word, 0.1,
               "Penalty for words not in the frequent word dictionary");
  double_VAR_H(language_model_penalty_non_dict_word, 0.15,
               "Penalty for non-dictionary words");
  double_VAR_H(language_model_penalty_punc, 0.2,
               "Penalty for inconsistent punctuation");
  double_VAR_H(language_model_penalty_case, 0.1,
               "Penalty for inconsistent case");
  double_VAR_H(language_model_penalty_script, 0.5,
               "Penalty for inconsistent script");
  double_VAR_H(language_model_penalty_chartype, 0.3,
               "Penalty for inconsistent character type");
  double_VAR_H(language_model_penalty_font, 0.00,
               "Penalty for inconsistent font");
  double_VAR_H(language_model_penalty_spacing, 0.05,
               "Penalty for inconsistent spacing");
  double_VAR_H(language_model_penalty_increment, 0.01, "Penalty increment");
  INT_VAR_H(wordrec_display_segmentations, 0, "Display Segmentations");
  BOOL_VAR_H(language_model_use_sigmoidal_certainty, false,
             "Use sigmoidal score for certainty");
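
  // All of the above are ordinary Tesseract variables, so client code can
  // tune them without recompiling. A minimal sketch (assuming the standard
  // TessBaseAPI variable mechanism; error handling omitted):
  //
  //   tesseract::TessBaseAPI api;
  //   api.Init(NULL, "eng");
  //   api.SetVariable("language_model_ngram_on", "1");
  //   api.SetVariable("language_model_penalty_non_dict_word", "0.2");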

 protected:
  // Member Variables.

  // Temporary DawgArgs struct that is re-used across different words to
  // avoid dynamic memory re-allocation (should be cleared before each use).
  DawgArgs *dawg_args_;
  // Scaling for recovering blob outline length from rating and certainty.
  float rating_cert_scale_;

  // The following variables are set at construction time.

  // Pointer to the fontinfo table (not owned by LanguageModel).
  const UnicityTable<FontInfo> *fontinfo_table_;

  // Pointer to the Dict class that is used for querying the dictionaries
  // (the pointer is not owned by LanguageModel).
  Dict *dict_;

  // TODO(daria): the following variables should become LanguageModel params
  // when the old code in bestfirst.cpp and heuristic.cpp is deprecated.
  //
  // Set to true if we are dealing with fixed pitch text
  // (set to assume_fixed_pitch_char_segment).
  bool fixed_pitch_;
  // Max char width-to-height ratio allowed
  // (set to segsearch_max_char_wh_ratio).
  float max_char_wh_ratio_;

  // The following variables are initialized by InitForWord().

  // String representation of the classification of the previous word
  // (since this is only used by the character ngram model component,
  // only the last language_model_ngram_order characters of the word
  // are stored).
  STRING prev_word_str_;
  int prev_word_unichar_step_len_;
  // Active dawg vectors.
  DawgPositionVector *very_beginning_active_dawgs_;  // includes continuation
  DawgPositionVector *beginning_active_dawgs_;
  // Set to true if an acceptable choice was discovered.
  // Note: it would be nice to use this to terminate the search once an
  // acceptable choice is found. However, we do not do that: once an
  // acceptable choice is found, the search finishes looking for alternative
  // choices in the current segmentation graph and then exits (no more
  // classifications are done after an acceptable choice is found).
  // This is needed in order to let the search find the words very close to
  // the best choice in rating (e.g. what/What, Cat/cat, etc.) and log these
  // choices. This way the stopper will know that the best choice is not
  // ambiguous (i.e. there are choices in the best choice list with ratings
  // close to the very best one) and will be less likely to mis-adapt.
  bool acceptable_choice_found_;
  // Set to true if a choice representing correct segmentation was explored.
  bool correct_segmentation_explored_;

  // Params model containing weights for computing ViterbiStateEntry costs.
  ParamsModel params_model_;
};

}  // namespace tesseract

#endif  // TESSERACT_WORDREC_LANGUAGE_MODEL_H_