tesseract
3.03
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: permdawg.c (Formerly permdawg.c) 00005 * Description: Scale word choices by a dictionary 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Tue Jul 9 15:43:18 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 /*---------------------------------------------------------------------- 00026 I n c l u d e s 00027 ----------------------------------------------------------------------*/ 00028 00029 #include "cutil.h" 00030 #include "dawg.h" 00031 #include "freelist.h" 00032 #include "globals.h" 00033 #include "ndminx.h" 00034 #include "stopper.h" 00035 #include "tprintf.h" 00036 #include "params.h" 00037 00038 #include <ctype.h> 00039 #include "dict.h" 00040 00041 /*---------------------------------------------------------------------- 00042 F u n c t i o n s 00043 ----------------------------------------------------------------------*/ 00044 namespace tesseract { 00045 00052 void Dict::go_deeper_dawg_fxn( 00053 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, 00054 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00055 bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, 00056 WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) { 00057 DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args); 00058 word_ending = (char_choice_index == char_choices.size()-1); 00059 int word_index = word->length() - 1; 00060 if (best_choice->rating() < *limit) return; 00061 // Look up char in DAWG 00062 00063 // If the current unichar is an ngram first try calling 00064 // letter_is_okay() for each unigram it contains separately. 00065 UNICHAR_ID orig_uch_id = word->unichar_id(word_index); 00066 bool checked_unigrams = false; 00067 if (getUnicharset().get_isngram(orig_uch_id)) { 00068 if (dawg_debug_level) { 00069 tprintf("checking unigrams in an ngram %s\n", 00070 getUnicharset().debug_str(orig_uch_id).string()); 00071 } 00072 int num_unigrams = 0; 00073 word->remove_last_unichar_id(); 00074 GenericVector<UNICHAR_ID> encoding; 00075 const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id); 00076 // Since the string came out of the unicharset, failure is impossible. 00077 ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, NULL, 00078 NULL)); 00079 bool unigrams_ok = true; 00080 // Construct DawgArgs that reflect the current state. 00081 DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs); 00082 DawgPositionVector unigram_updated_dawgs; 00083 DawgArgs unigram_dawg_args(&unigram_active_dawgs, 00084 &unigram_updated_dawgs, 00085 more_args->permuter); 00086 // Check unigrams in the ngram with letter_is_okay(). 00087 for (int i = 0; unigrams_ok && i < encoding.size(); ++i) { 00088 UNICHAR_ID uch_id = encoding[i]; 00089 ASSERT_HOST(uch_id != INVALID_UNICHAR_ID); 00090 ++num_unigrams; 00091 word->append_unichar_id(uch_id, 1, 0.0, 0.0); 00092 unigrams_ok = (this->*letter_is_okay_)( 00093 &unigram_dawg_args, 00094 word->unichar_id(word_index+num_unigrams-1), 00095 word_ending && i == encoding.size() - 1); 00096 (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs); 00097 if (dawg_debug_level) { 00098 tprintf("unigram %s is %s\n", 00099 getUnicharset().debug_str(uch_id).string(), 00100 unigrams_ok ? "OK" : "not OK"); 00101 } 00102 } 00103 // Restore the word and copy the updated dawg state if needed. 00104 while (num_unigrams-- > 0) word->remove_last_unichar_id(); 00105 word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0); 00106 if (unigrams_ok) { 00107 checked_unigrams = true; 00108 more_args->permuter = unigram_dawg_args.permuter; 00109 *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs); 00110 } 00111 } 00112 00113 // Check which dawgs from the dawgs_ vector contain the word 00114 // up to and including the current unichar. 00115 if (checked_unigrams || (this->*letter_is_okay_)( 00116 more_args, word->unichar_id(word_index), word_ending)) { 00117 // Add a new word choice 00118 if (word_ending) { 00119 if (dawg_debug_level) { 00120 tprintf("found word = %s\n", word->debug_string().string()); 00121 } 00122 if (strcmp(output_ambig_words_file.string(), "") != 0) { 00123 if (output_ambig_words_file_ == NULL) { 00124 output_ambig_words_file_ = 00125 fopen(output_ambig_words_file.string(), "wb+"); 00126 if (output_ambig_words_file_ == NULL) { 00127 tprintf("Failed to open output_ambig_words_file %s\n", 00128 output_ambig_words_file.string()); 00129 exit(1); 00130 } 00131 STRING word_str; 00132 word->string_and_lengths(&word_str, NULL); 00133 word_str += " "; 00134 fprintf(output_ambig_words_file_, "%s", word_str.string()); 00135 } 00136 STRING word_str; 00137 word->string_and_lengths(&word_str, NULL); 00138 word_str += " "; 00139 fprintf(output_ambig_words_file_, "%s", word_str.string()); 00140 } 00141 WERD_CHOICE *adjusted_word = word; 00142 adjusted_word->set_permuter(more_args->permuter); 00143 update_best_choice(*adjusted_word, best_choice); 00144 } else { // search the next letter 00145 // Make updated_* point to the next entries in the DawgPositionVector 00146 // arrays (that were originally created in dawg_permute_and_select) 00147 ++(more_args->updated_dawgs); 00148 // Make active_dawgs and constraints point to the updated ones. 00149 ++(more_args->active_dawgs); 00150 permute_choices(debug, char_choices, char_choice_index + 1, 00151 prev_char_frag_info, word, certainties, limit, 00152 best_choice, attempts_left, more_args); 00153 // Restore previous state to explore another letter in this position. 00154 --(more_args->updated_dawgs); 00155 --(more_args->active_dawgs); 00156 } 00157 } else { 00158 if (dawg_debug_level) { 00159 tprintf("last unichar not OK at index %d in %s\n", 00160 word_index, word->debug_string().string()); 00161 } 00162 } 00163 } 00164 00165 00175 WERD_CHOICE *Dict::dawg_permute_and_select( 00176 const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) { 00177 WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset()); 00178 best_choice->make_bad(); 00179 best_choice->set_rating(rating_limit); 00180 if (char_choices.length() == 0 || char_choices.length() > MAX_WERD_LENGTH) 00181 return best_choice; 00182 DawgPositionVector *active_dawgs = 00183 new DawgPositionVector[char_choices.length() + 1]; 00184 init_active_dawgs(&(active_dawgs[0]), true); 00185 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM); 00186 WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH); 00187 00188 float certainties[MAX_WERD_LENGTH]; 00189 this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn; 00190 int attempts_left = max_permuter_attempts; 00191 permute_choices((dawg_debug_level) ? "permute_dawg_debug" : NULL, 00192 char_choices, 0, NULL, &word, certainties, &rating_limit, best_choice, 00193 &attempts_left, &dawg_args); 00194 delete[] active_dawgs; 00195 return best_choice; 00196 } 00197 00204 void Dict::permute_choices( 00205 const char *debug, 00206 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00207 int char_choice_index, 00208 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00209 WERD_CHOICE *word, 00210 float certainties[], 00211 float *limit, 00212 WERD_CHOICE *best_choice, 00213 int *attempts_left, 00214 void *more_args) { 00215 if (debug) { 00216 tprintf("%s permute_choices: char_choice_index=%d" 00217 " limit=%g rating=%g, certainty=%g word=%s\n", 00218 debug, char_choice_index, *limit, word->rating(), 00219 word->certainty(), word->debug_string().string()); 00220 } 00221 if (char_choice_index < char_choices.length()) { 00222 BLOB_CHOICE_IT blob_choice_it; 00223 blob_choice_it.set_to_list(char_choices.get(char_choice_index)); 00224 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); 00225 blob_choice_it.forward()) { 00226 (*attempts_left)--; 00227 append_choices(debug, char_choices, *(blob_choice_it.data()), 00228 char_choice_index, prev_char_frag_info, word, 00229 certainties, limit, best_choice, attempts_left, more_args); 00230 if (*attempts_left <= 0) { 00231 if (debug) tprintf("permute_choices(): attempts_left is 0\n"); 00232 break; 00233 } 00234 } 00235 } 00236 } 00237 00246 void Dict::append_choices( 00247 const char *debug, 00248 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00249 const BLOB_CHOICE &blob_choice, 00250 int char_choice_index, 00251 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00252 WERD_CHOICE *word, 00253 float certainties[], 00254 float *limit, 00255 WERD_CHOICE *best_choice, 00256 int *attempts_left, 00257 void *more_args) { 00258 int word_ending = 00259 (char_choice_index == char_choices.length() - 1) ? true : false; 00260 00261 // Deal with fragments. 00262 CHAR_FRAGMENT_INFO char_frag_info; 00263 if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), 00264 blob_choice.certainty(), prev_char_frag_info, debug, 00265 word_ending, &char_frag_info)) { 00266 return; // blob_choice must be an invalid fragment 00267 } 00268 // Search the next letter if this character is a fragment. 00269 if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) { 00270 permute_choices(debug, char_choices, char_choice_index + 1, 00271 &char_frag_info, word, certainties, limit, 00272 best_choice, attempts_left, more_args); 00273 return; 00274 } 00275 00276 // Add the next unichar. 00277 float old_rating = word->rating(); 00278 float old_certainty = word->certainty(); 00279 uinT8 old_permuter = word->permuter(); 00280 certainties[word->length()] = char_frag_info.certainty; 00281 word->append_unichar_id_space_allocated( 00282 char_frag_info.unichar_id, char_frag_info.num_fragments, 00283 char_frag_info.rating, char_frag_info.certainty); 00284 00285 // Explore the next unichar. 00286 (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, 00287 &char_frag_info, word_ending, word, certainties, 00288 limit, best_choice, attempts_left, more_args); 00289 00290 // Remove the unichar we added to explore other choices in it's place. 00291 word->remove_last_unichar_id(); 00292 word->set_rating(old_rating); 00293 word->set_certainty(old_certainty); 00294 word->set_permuter(old_permuter); 00295 } 00296 00322 bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, 00323 float curr_rating, float curr_certainty, 00324 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00325 const char *debug, int word_ending, 00326 CHAR_FRAGMENT_INFO *char_frag_info) { 00327 const CHAR_FRAGMENT *this_fragment = 00328 getUnicharset().get_fragment(curr_unichar_id); 00329 const CHAR_FRAGMENT *prev_fragment = 00330 prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL; 00331 00332 // Print debug info for fragments. 00333 if (debug && (prev_fragment || this_fragment)) { 00334 tprintf("%s check fragments: choice=%s word_ending=%d\n", debug, 00335 getUnicharset().debug_str(curr_unichar_id).string(), 00336 word_ending); 00337 if (prev_fragment) { 00338 tprintf("prev_fragment %s\n", prev_fragment->to_string().string()); 00339 } 00340 if (this_fragment) { 00341 tprintf("this_fragment %s\n", this_fragment->to_string().string()); 00342 } 00343 } 00344 00345 char_frag_info->unichar_id = curr_unichar_id; 00346 char_frag_info->fragment = this_fragment; 00347 char_frag_info->rating = curr_rating; 00348 char_frag_info->certainty = curr_certainty; 00349 char_frag_info->num_fragments = 1; 00350 if (prev_fragment && !this_fragment) { 00351 if (debug) tprintf("Skip choice with incomplete fragment\n"); 00352 return false; 00353 } 00354 if (this_fragment) { 00355 // We are dealing with a fragment. 00356 char_frag_info->unichar_id = INVALID_UNICHAR_ID; 00357 if (prev_fragment) { 00358 if (!this_fragment->is_continuation_of(prev_fragment)) { 00359 if (debug) tprintf("Non-matching fragment piece\n"); 00360 return false; 00361 } 00362 if (this_fragment->is_ending()) { 00363 char_frag_info->unichar_id = 00364 getUnicharset().unichar_to_id(this_fragment->get_unichar()); 00365 char_frag_info->fragment = NULL; 00366 if (debug) { 00367 tprintf("Built character %s from fragments\n", 00368 getUnicharset().debug_str( 00369 char_frag_info->unichar_id).string()); 00370 } 00371 } else { 00372 if (debug) tprintf("Record fragment continuation\n"); 00373 char_frag_info->fragment = this_fragment; 00374 } 00375 // Update certainty and rating. 00376 char_frag_info->rating = 00377 prev_char_frag_info->rating + curr_rating; 00378 char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1; 00379 char_frag_info->certainty = 00380 MIN(curr_certainty, prev_char_frag_info->certainty); 00381 } else { 00382 if (this_fragment->is_beginning()) { 00383 if (debug) tprintf("Record fragment beginning\n"); 00384 } else { 00385 if (debug) { 00386 tprintf("Non-starting fragment piece with no prev_fragment\n"); 00387 } 00388 return false; 00389 } 00390 } 00391 } 00392 if (word_ending && char_frag_info->fragment) { 00393 if (debug) tprintf("Word can not end with a fragment\n"); 00394 return false; 00395 } 00396 return true; 00397 } 00398 00399 } // namespace tesseract