tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/dict/permdawg.cpp
Go to the documentation of this file.
00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:        permdawg.c  (Formerly permdawg.c)
00005  * Description:  Scale word choices by a dictionary
00006  * Author:       Mark Seaman, OCR Technology
00007  * Created:      Fri Oct 16 14:37:00 1987
00008  * Modified:     Tue Jul  9 15:43:18 1991 (Mark Seaman) marks@hpgrlt
00009  * Language:     C
00010  * Package:      N/A
00011  * Status:       Reusable Software Component
00012  *
00013  * (c) Copyright 1987, Hewlett-Packard Company.
00014  ** Licensed under the Apache License, Version 2.0 (the "License");
00015  ** you may not use this file except in compliance with the License.
00016  ** You may obtain a copy of the License at
00017  ** http://www.apache.org/licenses/LICENSE-2.0
00018  ** Unless required by applicable law or agreed to in writing, software
00019  ** distributed under the License is distributed on an "AS IS" BASIS,
00020  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00021  ** See the License for the specific language governing permissions and
00022  ** limitations under the License.
00023  *
00024  *********************************************************************************/
00025 /*----------------------------------------------------------------------
00026               I n c l u d e s
00027 ----------------------------------------------------------------------*/
00028 
00029 #include "cutil.h"
00030 #include "dawg.h"
00031 #include "freelist.h"
00032 #include "globals.h"
00033 #include "ndminx.h"
00034 #include "stopper.h"
00035 #include "tprintf.h"
00036 #include "params.h"
00037 
00038 #include <ctype.h>
00039 #include "dict.h"
00040 
00041 /*----------------------------------------------------------------------
00042               F u n c t i o n s
00043 ----------------------------------------------------------------------*/
00044 namespace tesseract {
00045 
00052 void Dict::go_deeper_dawg_fxn(
00053     const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
00054     int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00055     bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
00056     WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) {
00057   DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args);
00058   word_ending = (char_choice_index == char_choices.size()-1);
00059   int word_index = word->length() - 1;
00060   if (best_choice->rating() < *limit) return;
00061   // Look up char in DAWG
00062 
00063   // If the current unichar is an ngram first try calling
00064   // letter_is_okay() for each unigram it contains separately.
00065   UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
00066   bool checked_unigrams = false;
00067   if (getUnicharset().get_isngram(orig_uch_id)) {
00068     if (dawg_debug_level) {
00069       tprintf("checking unigrams in an ngram %s\n",
00070               getUnicharset().debug_str(orig_uch_id).string());
00071     }
00072     int num_unigrams = 0;
00073     word->remove_last_unichar_id();
00074     GenericVector<UNICHAR_ID> encoding;
00075     const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
00076     // Since the string came out of the unicharset, failure is impossible.
00077     ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, NULL,
00078                                               NULL));
00079     bool unigrams_ok = true;
00080     // Construct DawgArgs that reflect the current state.
00081     DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
00082     DawgPositionVector unigram_updated_dawgs;
00083     DawgArgs unigram_dawg_args(&unigram_active_dawgs,
00084                                &unigram_updated_dawgs,
00085                                more_args->permuter);
00086     // Check unigrams in the ngram with letter_is_okay().
00087     for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
00088       UNICHAR_ID uch_id = encoding[i];
00089       ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
00090       ++num_unigrams;
00091       word->append_unichar_id(uch_id, 1, 0.0, 0.0);
00092       unigrams_ok = (this->*letter_is_okay_)(
00093           &unigram_dawg_args,
00094           word->unichar_id(word_index+num_unigrams-1),
00095           word_ending && i == encoding.size() - 1);
00096       (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
00097       if (dawg_debug_level) {
00098         tprintf("unigram %s is %s\n",
00099                 getUnicharset().debug_str(uch_id).string(),
00100                 unigrams_ok ? "OK" : "not OK");
00101       }
00102     }
00103     // Restore the word and copy the updated dawg state if needed.
00104     while (num_unigrams-- > 0) word->remove_last_unichar_id();
00105     word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
00106     if (unigrams_ok) {
00107       checked_unigrams = true;
00108       more_args->permuter = unigram_dawg_args.permuter;
00109       *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
00110     }
00111   }
00112 
00113   // Check which dawgs from the dawgs_ vector contain the word
00114   // up to and including the current unichar.
00115   if (checked_unigrams || (this->*letter_is_okay_)(
00116       more_args, word->unichar_id(word_index), word_ending)) {
00117     // Add a new word choice
00118     if (word_ending) {
00119       if (dawg_debug_level) {
00120         tprintf("found word = %s\n", word->debug_string().string());
00121       }
00122       if (strcmp(output_ambig_words_file.string(), "") != 0) {
00123         if (output_ambig_words_file_ == NULL) {
00124           output_ambig_words_file_ =
00125               fopen(output_ambig_words_file.string(), "wb+");
00126           if (output_ambig_words_file_ == NULL) {
00127             tprintf("Failed to open output_ambig_words_file %s\n",
00128                     output_ambig_words_file.string());
00129             exit(1);
00130           }
00131           STRING word_str;
00132           word->string_and_lengths(&word_str, NULL);
00133           word_str += " ";
00134           fprintf(output_ambig_words_file_, "%s", word_str.string());
00135         }
00136         STRING word_str;
00137         word->string_and_lengths(&word_str, NULL);
00138         word_str += " ";
00139         fprintf(output_ambig_words_file_, "%s", word_str.string());
00140       }
00141       WERD_CHOICE *adjusted_word = word;
00142       adjusted_word->set_permuter(more_args->permuter);
00143       update_best_choice(*adjusted_word, best_choice);
00144     } else {  // search the next letter
00145       // Make updated_* point to the next entries in the DawgPositionVector
00146       // arrays (that were originally created in dawg_permute_and_select)
00147       ++(more_args->updated_dawgs);
00148       // Make active_dawgs and constraints point to the updated ones.
00149       ++(more_args->active_dawgs);
00150       permute_choices(debug, char_choices, char_choice_index + 1,
00151                       prev_char_frag_info, word, certainties, limit,
00152                       best_choice, attempts_left, more_args);
00153       // Restore previous state to explore another letter in this position.
00154       --(more_args->updated_dawgs);
00155       --(more_args->active_dawgs);
00156     }
00157   } else {
00158       if (dawg_debug_level) {
00159         tprintf("last unichar not OK at index %d in %s\n",
00160                 word_index, word->debug_string().string());
00161     }
00162   }
00163 }
00164 
00165 
00175 WERD_CHOICE *Dict::dawg_permute_and_select(
00176     const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
00177   WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
00178   best_choice->make_bad();
00179   best_choice->set_rating(rating_limit);
00180   if (char_choices.length() == 0 || char_choices.length() > MAX_WERD_LENGTH)
00181     return best_choice;
00182   DawgPositionVector *active_dawgs =
00183       new DawgPositionVector[char_choices.length() + 1];
00184   init_active_dawgs(&(active_dawgs[0]), true);
00185   DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
00186   WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
00187 
00188   float certainties[MAX_WERD_LENGTH];
00189   this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
00190   int attempts_left = max_permuter_attempts;
00191   permute_choices((dawg_debug_level) ? "permute_dawg_debug" : NULL,
00192       char_choices, 0, NULL, &word, certainties, &rating_limit, best_choice,
00193       &attempts_left, &dawg_args);
00194   delete[] active_dawgs;
00195   return best_choice;
00196 }
00197 
00204 void Dict::permute_choices(
00205     const char *debug,
00206     const BLOB_CHOICE_LIST_VECTOR &char_choices,
00207     int char_choice_index,
00208     const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00209     WERD_CHOICE *word,
00210     float certainties[],
00211     float *limit,
00212     WERD_CHOICE *best_choice,
00213     int *attempts_left,
00214     void *more_args) {
00215   if (debug) {
00216     tprintf("%s permute_choices: char_choice_index=%d"
00217             " limit=%g rating=%g, certainty=%g word=%s\n",
00218             debug, char_choice_index, *limit, word->rating(),
00219             word->certainty(), word->debug_string().string());
00220   }
00221   if (char_choice_index < char_choices.length()) {
00222     BLOB_CHOICE_IT blob_choice_it;
00223     blob_choice_it.set_to_list(char_choices.get(char_choice_index));
00224     for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
00225          blob_choice_it.forward()) {
00226       (*attempts_left)--;
00227       append_choices(debug, char_choices, *(blob_choice_it.data()),
00228                      char_choice_index, prev_char_frag_info, word,
00229                      certainties, limit, best_choice, attempts_left, more_args);
00230       if (*attempts_left <= 0) {
00231         if (debug) tprintf("permute_choices(): attempts_left is 0\n");
00232         break;
00233       }
00234     }
00235   }
00236 }
00237 
00246 void Dict::append_choices(
00247     const char *debug,
00248     const BLOB_CHOICE_LIST_VECTOR &char_choices,
00249     const BLOB_CHOICE &blob_choice,
00250     int char_choice_index,
00251     const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00252     WERD_CHOICE *word,
00253     float certainties[],
00254     float *limit,
00255     WERD_CHOICE *best_choice,
00256     int *attempts_left,
00257     void *more_args) {
00258   int word_ending =
00259     (char_choice_index == char_choices.length() - 1) ? true : false;
00260 
00261   // Deal with fragments.
00262   CHAR_FRAGMENT_INFO char_frag_info;
00263   if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
00264                            blob_choice.certainty(), prev_char_frag_info, debug,
00265                            word_ending, &char_frag_info)) {
00266     return;  // blob_choice must be an invalid fragment
00267   }
00268   // Search the next letter if this character is a fragment.
00269   if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
00270     permute_choices(debug, char_choices, char_choice_index + 1,
00271                     &char_frag_info, word, certainties, limit,
00272                     best_choice, attempts_left, more_args);
00273     return;
00274   }
00275 
00276   // Add the next unichar.
00277   float old_rating = word->rating();
00278   float old_certainty = word->certainty();
00279   uinT8 old_permuter = word->permuter();
00280   certainties[word->length()] = char_frag_info.certainty;
00281   word->append_unichar_id_space_allocated(
00282       char_frag_info.unichar_id, char_frag_info.num_fragments,
00283       char_frag_info.rating, char_frag_info.certainty);
00284 
00285   // Explore the next unichar.
00286   (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
00287                           &char_frag_info, word_ending, word, certainties,
00288                           limit, best_choice, attempts_left, more_args);
00289 
00290   // Remove the unichar we added to explore other choices in it's place.
00291   word->remove_last_unichar_id();
00292   word->set_rating(old_rating);
00293   word->set_certainty(old_certainty);
00294   word->set_permuter(old_permuter);
00295 }
00296 
00322 bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id,
00323                                float curr_rating, float curr_certainty,
00324                                const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00325                                const char *debug, int word_ending,
00326                                CHAR_FRAGMENT_INFO *char_frag_info) {
00327   const CHAR_FRAGMENT *this_fragment =
00328     getUnicharset().get_fragment(curr_unichar_id);
00329   const CHAR_FRAGMENT *prev_fragment =
00330     prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL;
00331 
00332   // Print debug info for fragments.
00333   if (debug && (prev_fragment || this_fragment)) {
00334     tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
00335             getUnicharset().debug_str(curr_unichar_id).string(),
00336             word_ending);
00337     if (prev_fragment) {
00338       tprintf("prev_fragment %s\n", prev_fragment->to_string().string());
00339     }
00340     if (this_fragment) {
00341       tprintf("this_fragment %s\n", this_fragment->to_string().string());
00342     }
00343   }
00344 
00345   char_frag_info->unichar_id = curr_unichar_id;
00346   char_frag_info->fragment = this_fragment;
00347   char_frag_info->rating = curr_rating;
00348   char_frag_info->certainty = curr_certainty;
00349   char_frag_info->num_fragments = 1;
00350   if (prev_fragment && !this_fragment) {
00351     if (debug) tprintf("Skip choice with incomplete fragment\n");
00352     return false;
00353   }
00354   if (this_fragment) {
00355     // We are dealing with a fragment.
00356     char_frag_info->unichar_id = INVALID_UNICHAR_ID;
00357     if (prev_fragment) {
00358       if (!this_fragment->is_continuation_of(prev_fragment)) {
00359         if (debug) tprintf("Non-matching fragment piece\n");
00360         return false;
00361       }
00362       if (this_fragment->is_ending()) {
00363         char_frag_info->unichar_id =
00364           getUnicharset().unichar_to_id(this_fragment->get_unichar());
00365         char_frag_info->fragment = NULL;
00366         if (debug) {
00367           tprintf("Built character %s from fragments\n",
00368                   getUnicharset().debug_str(
00369                       char_frag_info->unichar_id).string());
00370         }
00371       } else {
00372         if (debug) tprintf("Record fragment continuation\n");
00373         char_frag_info->fragment = this_fragment;
00374       }
00375       // Update certainty and rating.
00376       char_frag_info->rating =
00377         prev_char_frag_info->rating + curr_rating;
00378       char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
00379       char_frag_info->certainty =
00380         MIN(curr_certainty, prev_char_frag_info->certainty);
00381     } else {
00382       if (this_fragment->is_beginning()) {
00383         if (debug) tprintf("Record fragment beginning\n");
00384       } else {
00385         if (debug) {
00386           tprintf("Non-starting fragment piece with no prev_fragment\n");
00387         }
00388         return false;
00389       }
00390     }
00391   }
00392   if (word_ending && char_frag_info->fragment) {
00393     if (debug) tprintf("Word can not end with a fragment\n");
00394     return false;
00395   }
00396   return true;
00397 }
00398 
00399 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines