tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/reject.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        reject.cpp  (Formerly reject.c)
00003  * Description: Rejection functions used in tessedit
00004  * Author:              Phil Cheatle
00005  * Created:             Wed Sep 23 16:50:21 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #pragma warning(disable:4305)  // int/float warnings
00023 #endif
00024 
00025 #include          "tessvars.h"
00026 #ifdef __UNIX__
00027 #include          <assert.h>
00028 #include          <errno.h>
00029 #endif
00030 #include          "scanutils.h"
00031 #include          <ctype.h>
00032 #include          <string.h>
00033 #include          "genericvector.h"
00034 #include          "reject.h"
00035 #include          "control.h"
00036 #include          "docqual.h"
00037 #include          "secname.h"
00038 #include          "globaloc.h"  // For err_exit.
00039 #include          "globals.h"
00040 #include          "helpers.h"
00041 
00042 /* #define SECURE_NAMES done in secnames.h when necessary */
00043 
00044 #include "tesseractclass.h"
00045 
00046 // Include automatically generated configuration file if running autoconf.
00047 #ifdef HAVE_CONFIG_H
00048 #include "config_auto.h"
00049 #endif
00050 
// Expands the CLISTIZEH and CLISTIZE macros for the STRING type.
// NOTE(review): presumably these generate the declaration and the definition
// of a CLIST-based list of STRING — confirm against the macro definitions,
// which are not visible in this file.
00051 CLISTIZEH (STRING) CLISTIZE (STRING)
00052 
00053 /*************************************************************************
00054  * set_done()
00055  *
00056  * Set the done flag based on the word acceptability criteria
00057  *************************************************************************/
00058 
00059 namespace tesseract {
00060 void Tesseract::set_done(WERD_RES *word, inT16 pass) {
00061   word->done = word->tess_accepted &&
00062       (strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
00063   bool word_is_ambig = word->best_choice->dangerous_ambig_found();
00064   bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
00065       word->best_choice->permuter() == FREQ_DAWG_PERM ||
00066       word->best_choice->permuter() == USER_DAWG_PERM;
00067   if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
00068       one_ell_conflict(word, FALSE)) {
00069     if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
00070     word->done = FALSE;
00071   }
00072   if (word->done && ((!word_from_dict &&
00073       word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
00074     if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
00075       word->done = FALSE;
00076   }
00077   if (tessedit_rejection_debug) {
00078     tprintf("set_done(): done=%d\n", word->done);
00079     word->best_choice->print("");
00080   }
00081 }
00082 
00083 
00084 /*************************************************************************
00085  * make_reject_map()
00086  *
00087  * Sets the done flag to indicate whether the resylt is acceptable.
00088  *
00089  * Sets a reject map for the word.
00090  *************************************************************************/
00091 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
00092   int i;
00093   int offset;
00094 
00095   flip_0O(word);
00096   check_debug_pt(word, -1);     // For trap only
00097   set_done(word, pass);  // Set acceptance
00098   word->reject_map.initialise(word->best_choice->unichar_lengths().length());
00099   reject_blanks(word);
00100   /*
00101   0: Rays original heuristic - the baseline
00102   */
00103   if (tessedit_reject_mode == 0) {
00104     if (!word->done)
00105       reject_poor_matches(word);
00106   } else if (tessedit_reject_mode == 5) {
00107     /*
00108     5: Reject I/1/l from words where there is no strong contextual confirmation;
00109       the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
00110       and the whole of any words which are very small
00111     */
00112     if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
00113       word->reject_map.rej_word_small_xht();
00114     } else {
00115       one_ell_conflict(word, TRUE);
00116       /*
00117         Originally the code here just used the done flag. Now I have duplicated
00118         and unpacked the conditions for setting the done flag so that each
00119         mechanism can be turned on or off independently. This works WITHOUT
00120         affecting the done flag setting.
00121       */
00122       if (rej_use_tess_accepted && !word->tess_accepted)
00123         word->reject_map.rej_word_not_tess_accepted ();
00124 
00125       if (rej_use_tess_blanks &&
00126         (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
00127         word->reject_map.rej_word_contains_blanks ();
00128 
00129       WERD_CHOICE* best_choice = word->best_choice;
00130       if (rej_use_good_perm) {
00131         if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
00132              best_choice->permuter() == FREQ_DAWG_PERM ||
00133              best_choice->permuter() == USER_DAWG_PERM) &&
00134             (!rej_use_sensible_wd ||
00135              acceptable_word_string(*word->uch_set,
00136                                     best_choice->unichar_string().string(),
00137                                     best_choice->unichar_lengths().string()) !=
00138                                         AC_UNACCEPTABLE)) {
00139           // PASSED TEST
00140         } else if (best_choice->permuter() == NUMBER_PERM) {
00141           if (rej_alphas_in_number_perm) {
00142             for (i = 0, offset = 0;
00143                  best_choice->unichar_string()[offset] != '\0';
00144                  offset += best_choice->unichar_lengths()[i++]) {
00145               if (word->reject_map[i].accepted() &&
00146                   word->uch_set->get_isalpha(
00147                       best_choice->unichar_string().string() + offset,
00148                       best_choice->unichar_lengths()[i]))
00149                 word->reject_map[i].setrej_bad_permuter();
00150               // rej alpha
00151             }
00152           }
00153         } else {
00154           word->reject_map.rej_word_bad_permuter();
00155         }
00156       }
00157       /* Ambig word rejection was here once !!*/
00158     }
00159   } else {
00160     tprintf("BAD tessedit_reject_mode\n");
00161     err_exit();
00162   }
00163 
00164   if (tessedit_image_border > -1)
00165     reject_edge_blobs(word);
00166 
00167   check_debug_pt (word, 10);
00168   if (tessedit_rejection_debug) {
00169     tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
00170     tprintf("Certainty: %f     Rating: %f\n",
00171       word->best_choice->certainty (), word->best_choice->rating ());
00172     tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
00173   }
00174 
00175   flip_hyphens(word);
00176   check_debug_pt(word, 20);
00177 }
00178 }  // namespace tesseract
00179 
00180 
00181 void reject_blanks(WERD_RES *word) {
00182   inT16 i;
00183   inT16 offset;
00184 
00185   for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
00186        offset += word->best_choice->unichar_lengths()[i], i += 1) {
00187     if (word->best_choice->unichar_string()[offset] == ' ')
00188                                  //rej unrecognised blobs
00189       word->reject_map[i].setrej_tess_failure ();
00190   }
00191 }
00192 
00193 namespace tesseract {
00194 void Tesseract::reject_I_1_L(WERD_RES *word) {
00195   inT16 i;
00196   inT16 offset;
00197 
00198   for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
00199        offset += word->best_choice->unichar_lengths()[i], i += 1) {
00200     if (STRING (conflict_set_I_l_1).
00201     contains (word->best_choice->unichar_string()[offset])) {
00202                                  //rej 1Il conflict
00203       word->reject_map[i].setrej_1Il_conflict ();
00204     }
00205   }
00206 }
00207 }  // namespace tesseract
00208 
00209 
00210 void reject_poor_matches(WERD_RES *word) {
00211   float threshold = compute_reject_threshold(word->best_choice);
00212   for (int i = 0; i < word->best_choice->length(); ++i) {
00213     if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
00214       word->reject_map[i].setrej_tess_failure();
00215     else if (word->best_choice->certainty(i) < threshold)
00216       word->reject_map[i].setrej_poor_match();
00217   }
00218 }
00219 
00220 
00221 /**********************************************************************
00222  * compute_reject_threshold
00223  *
00224  * Set a rejection threshold for this word.
00225  * Initially this is a trivial function which looks for the largest
00226  * gap in the certainty value.
00227  **********************************************************************/
00228 
00229 float compute_reject_threshold(WERD_CHOICE* word) {
00230   float threshold;               // rejection threshold
00231   float bestgap = 0.0f;          // biggest gap
00232   float gapstart;                // bottom of gap
00233                                  // super iterator
00234   BLOB_CHOICE_IT choice_it;      // real iterator
00235 
00236   int blob_count = word->length();
00237   GenericVector<float> ratings;
00238   ratings.init_to_size(blob_count, 0.0f);
00239   for (int i = 0; i < blob_count; ++i) {
00240     ratings[i] = word->certainty(i);
00241   }
00242   ratings.sort();
00243   gapstart = ratings[0] - 1;     // all reject if none better
00244   if (blob_count >= 3) {
00245     for (int index = 0; index < blob_count - 1; index++) {
00246       if (ratings[index + 1] - ratings[index] > bestgap) {
00247         bestgap = ratings[index + 1] - ratings[index];
00248         // find biggest
00249         gapstart = ratings[index];
00250       }
00251     }
00252   }
00253   threshold = gapstart + bestgap / 2;
00254 
00255   return threshold;
00256 }
00257 
00258 
00259 /*************************************************************************
00260  * reject_edge_blobs()
00261  *
00262  * If the word is perilously close to the edge of the image, reject those blobs
00263  * in the word which are too close to the edge as they could be clipped.
00264  *************************************************************************/
00265 namespace tesseract {
00266 void Tesseract::reject_edge_blobs(WERD_RES *word) {
00267   TBOX word_box = word->word->bounding_box();
00268   // Use the box_word as it is already denormed back to image coordinates.
00269   int blobcount = word->box_word->length();
00270 
00271   if (word_box.left() < tessedit_image_border ||
00272       word_box.bottom() < tessedit_image_border ||
00273       word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
00274       word_box.top() + tessedit_image_border > ImageHeight() - 1) {
00275     ASSERT_HOST(word->reject_map.length() == blobcount);
00276     for (int blobindex = 0; blobindex < blobcount; blobindex++) {
00277       TBOX blob_box = word->box_word->BlobBox(blobindex);
00278       if (blob_box.left() < tessedit_image_border ||
00279           blob_box.bottom() < tessedit_image_border ||
00280           blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
00281           blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
00282         word->reject_map[blobindex].setrej_edge_char();
00283         // Close to edge
00284       }
00285     }
00286   }
00287 }
00288 
00289 /**********************************************************************
00290  * one_ell_conflict()
00291  *
00292  * Identify words where there is a potential I/l/1 error.
00293  * - A bundle of contextual heuristics!
00294  **********************************************************************/
// Decides whether the best choice of word_res has an unresolved I/l/1
// ambiguity.
//
// word_res   - word to examine. Characters of its best_choice string are
//              temporarily overwritten while alternative spellings are
//              looked up with safe_dict_word().
// update_map - when TRUE, the offending characters are also marked in
//              word_res->reject_map (setrej_1Il_conflict() / reject_I_1_L()).
// Returns TRUE when a conflict is reported, FALSE otherwise.
//
// The checks run in this order:
//   1. No conflict-set character in the word            -> FALSE.
//   2. No alphanumeric outside the conflict set          -> TRUE.
//   3. Dictionary-backed word: flip a leading single-byte I/l and report a
//      conflict only if the alternative is also a dictionary word.
//   4. Otherwise try the same flip regardless of permuter.
//   5. Words containing a digit other than '1'.
//   6. Fallback on the word shape from acceptable_word_string().
00295 BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
00296   const char *word;
00297   const char *lengths;
00298   inT16 word_len;                //its length
00299   inT16 first_alphanum_index_;
00300   inT16 first_alphanum_offset_;
00301   inT16 i;
00302   inT16 offset;
00303   BOOL8 non_conflict_set_char;   //non conf set a/n?
00304   BOOL8 conflict = FALSE;
00305   BOOL8 allow_1s;
00306   ACCEPTABLE_WERD_TYPE word_type;
00307   BOOL8 dict_perm_type;
00308   BOOL8 dict_word_ok;
00309   int dict_word_type;
00310 
00311   word = word_res->best_choice->unichar_string().string ();
00312   lengths = word_res->best_choice->unichar_lengths().string();
  // One length byte per unichar, so this is the number of unichars.
00313   word_len = strlen (lengths);
00314   /*
00315     If there are no occurrences of the conflict set characters then the word
00316     is OK.
00317   */
00318   if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
00319     return FALSE;
00320 
00321   /*
00322     There is a conflict if there are NO other (confirmed) alphanumerics apart
00323     from those in the conflict set.
00324   */
00325 
  // The scan stops at the first alphanumeric that is not in the conflict set.
00326   for (i = 0, offset = 0, non_conflict_set_char = FALSE;
00327        (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
00328     non_conflict_set_char =
00329         (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
00330             word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
00331         !STRING (conflict_set_I_l_1).contains (word[offset]);
00332   if (!non_conflict_set_char) {
00333     if (update_map)
00334       reject_I_1_L(word_res);
00335     return TRUE;
00336   }
00337 
00338   /*
00339     If the word is accepted by a dawg permuter, and the first alpha character
00340     is "I" or "l", check to see if the alternative is also a dawg word. If it
00341     is, then there is a potential error otherwise the word is ok.
00342   */
00343 
00344   dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00345     (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
00346     (rej_trust_doc_dawg &&
00347     (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
00348     (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
00349   dict_word_type = dict_word(*(word_res->best_choice));
00350   dict_word_ok = (dict_word_type > 0) &&
00351     (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
00352 
00353   if ((rej_1Il_use_dict_word && dict_word_ok) ||
00354     (rej_1Il_trust_permuter_type && dict_perm_type) ||
00355   (dict_perm_type && dict_word_ok)) {
    // NOTE(review): first_alphanum_index()/first_alphanum_offset() return -1
    // when they find nothing, and they consult the Tesseract-level unicharset
    // whereas the scan above used word_res->uch_set - confirm the two always
    // agree, otherwise lengths[-1] / word[-1] would be read below.
00356     first_alphanum_index_ = first_alphanum_index (word, lengths);
00357     first_alphanum_offset_ = first_alphanum_offset (word, lengths);
00358     if (lengths[first_alphanum_index_] == 1 &&
00359         word[first_alphanum_offset_] == 'I') {
      // Try the 'l' spelling, then write 'I' back on both outcomes.
00360       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
00361       if (safe_dict_word(word_res) > 0) {
00362         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
00363         if (update_map)
00364           word_res->reject_map[first_alphanum_index_].
00365             setrej_1Il_conflict();
00366         return TRUE;
00367       }
00368       else {
00369         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
00370         return FALSE;
00371       }
00372     }
00373 
00374     if (lengths[first_alphanum_index_] == 1 &&
00375         word[first_alphanum_offset_] == 'l') {
      // Try the 'I' spelling, then write 'l' back on both outcomes.
00376       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
00377       if (safe_dict_word(word_res) > 0) {
00378         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
00379         if (update_map)
00380           word_res->reject_map[first_alphanum_index_].
00381             setrej_1Il_conflict();
00382         return TRUE;
00383       }
00384       else {
00385         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
00386         return FALSE;
00387       }
00388     }
00389     return FALSE;
00390   }
00391 
00392   /*
00393     NEW 1Il code. The old code relied on permuter types too much. In fact,
00394     tess will use TOP_CHOICE permute for good things like "palette".
00395     In this code the string is examined independently to see if it looks like
00396     a well formed word.
00397   */
00398 
00399   /*
00400     REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
00401     dictionary word.
00402   */
00403   first_alphanum_index_ = first_alphanum_index (word, lengths);
00404   first_alphanum_offset_ = first_alphanum_offset (word, lengths);
00405   if (lengths[first_alphanum_index_] == 1 &&
00406       word[first_alphanum_offset_] == 'l') {
00407     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
    // NOTE(review): unlike the dictionary branch above, the successful path
    // returns without writing the original character back - confirm this is
    // intended.
00408     if (safe_dict_word(word_res) > 0)
00409       return FALSE;
00410     else
00411       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
00412   }
00413   else if (lengths[first_alphanum_index_] == 1 &&
00414            word[first_alphanum_offset_] == 'I') {
00415     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
    // NOTE(review): same as above - no restore on the FALSE return.
00416     if (safe_dict_word(word_res) > 0)
00417       return FALSE;
00418     else
00419       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
00420   }
00421   /*
00422     For strings containing digits:
00423       If there are no alphas OR the numeric permuter liked the word,
00424         reject any non 1 conflict chs
00425       Else reject all conflict chs
00426   */
00427   if (word_contains_non_1_digit (word, lengths)) {
00428     allow_1s = (alpha_count (word, lengths) == 0) ||
00429       (word_res->best_choice->permuter () == NUMBER_PERM);
00430 
    // NOTE: this declaration shadows the function-level 'offset'.
00431     inT16 offset;
00432     conflict = FALSE;
00433     for (i = 0, offset = 0; word[offset] != '\0';
00434          offset += word_res->best_choice->unichar_lengths()[i++]) {
00435       if ((!allow_1s || (word[offset] != '1')) &&
00436       STRING (conflict_set_I_l_1).contains (word[offset])) {
00437         if (update_map)
00438           word_res->reject_map[i].setrej_1Il_conflict ();
00439         conflict = TRUE;
00440       }
00441     }
00442     return conflict;
00443   }
00444   /*
00445     For anything else. See if it conforms to an acceptable word type. If so,
00446     treat accordingly.
00447   */
00448   word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
00449   if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
    // Only the first alphanumeric is tested for these two word types.
00450     first_alphanum_index_ = first_alphanum_index (word, lengths);
00451     first_alphanum_offset_ = first_alphanum_offset (word, lengths);
00452     if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
00453       if (update_map)
00454         word_res->reject_map[first_alphanum_index_].
00455             setrej_1Il_conflict ();
00456       return TRUE;
00457     }
00458     else
00459       return FALSE;
00460   }
00461   else if (word_type == AC_UPPER_CASE) {
00462     return FALSE;
00463   }
00464   else {
    // Any other word type: every conflict-set character is rejected.
00465     if (update_map)
00466       reject_I_1_L(word_res);
00467     return TRUE;
00468   }
00469 }
00470 
00471 
00472 inT16 Tesseract::first_alphanum_index(const char *word,
00473                                       const char *word_lengths) {
00474   inT16 i;
00475   inT16 offset;
00476 
00477   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
00478     if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
00479         unicharset.get_isdigit(word + offset, word_lengths[i]))
00480       return i;
00481   }
00482   return -1;
00483 }
00484 
00485 inT16 Tesseract::first_alphanum_offset(const char *word,
00486                                        const char *word_lengths) {
00487   inT16 i;
00488   inT16 offset;
00489 
00490   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
00491     if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
00492         unicharset.get_isdigit(word + offset, word_lengths[i]))
00493       return offset;
00494   }
00495   return -1;
00496 }
00497 
00498 inT16 Tesseract::alpha_count(const char *word,
00499                              const char *word_lengths) {
00500   inT16 i;
00501   inT16 offset;
00502   inT16 count = 0;
00503 
00504   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
00505     if (unicharset.get_isalpha (word + offset, word_lengths[i]))
00506       count++;
00507   }
00508   return count;
00509 }
00510 
00511 
00512 BOOL8 Tesseract::word_contains_non_1_digit(const char *word,
00513                                            const char *word_lengths) {
00514   inT16 i;
00515   inT16 offset;
00516 
00517   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
00518     if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
00519         (word_lengths[i] != 1 || word[offset] != '1'))
00520       return TRUE;
00521   }
00522   return FALSE;
00523 }
00524 
00525 /*************************************************************************
00526  * dont_allow_1Il()
00527  * Dont unreject LONE accepted 1Il conflict set chars
00528  *************************************************************************/
00529 void Tesseract::dont_allow_1Il(WERD_RES *word) {
00530   int i = 0;
00531   int offset;
00532   int word_len = word->reject_map.length();
00533   const char *s = word->best_choice->unichar_string().string();
00534   const char *lengths = word->best_choice->unichar_lengths().string();
00535   BOOL8 accepted_1Il = FALSE;
00536 
00537   for (i = 0, offset = 0; i < word_len;
00538        offset += word->best_choice->unichar_lengths()[i++]) {
00539     if (word->reject_map[i].accepted()) {
00540       if (STRING(conflict_set_I_l_1).contains(s[offset])) {
00541         accepted_1Il = TRUE;
00542       } else {
00543         if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
00544             word->uch_set->get_isdigit(s + offset, lengths[i]))
00545           return;                // >=1 non 1Il ch accepted
00546       }
00547     }
00548   }
00549   if (!accepted_1Il)
00550     return;                      //Nothing to worry about
00551 
00552   for (i = 0, offset = 0; i < word_len;
00553        offset += word->best_choice->unichar_lengths()[i++]) {
00554     if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
00555       word->reject_map[i].accepted())
00556       word->reject_map[i].setrej_postNN_1Il();
00557   }
00558 }
00559 
00560 
00561 inT16 Tesseract::count_alphanums(WERD_RES *word_res) {
00562   int count = 0;
00563   const WERD_CHOICE *best_choice = word_res->best_choice;
00564   for (int i = 0; i < word_res->reject_map.length(); ++i) {
00565     if ((word_res->reject_map[i].accepted()) &&
00566         (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
00567             word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
00568       count++;
00569     }
00570   }
00571   return count;
00572 }
00573 
00574 
00575 // reject all if most rejected.
00576 void Tesseract::reject_mostly_rejects(WERD_RES *word) {
00577   /* Reject the whole of the word if the fraction of rejects exceeds a limit */
00578 
00579   if ((float) word->reject_map.reject_count() / word->reject_map.length() >=
00580     rej_whole_of_mostly_reject_word_fract)
00581     word->reject_map.rej_word_mostly_rej();
00582 }
00583 
00584 
00585 BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
00586   inT16 char_quality;
00587   inT16 accepted_char_quality;
00588 
00589   if (word->best_choice->unichar_lengths().length() <= 1)
00590     return FALSE;
00591 
00592   if (!STRING(ok_repeated_ch_non_alphanum_wds).
00593     contains(word->best_choice->unichar_string()[0]))
00594     return FALSE;
00595 
00596   UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
00597   for (int i = 1; i < word->best_choice->length(); ++i) {
00598     if (word->best_choice->unichar_id(i) != uch_id) return FALSE;
00599   }
00600 
00601   word_char_quality(word, row, &char_quality, &accepted_char_quality);
00602 
00603   if ((word->best_choice->unichar_lengths().length () == char_quality) &&
00604     (char_quality == accepted_char_quality))
00605     return TRUE;
00606   else
00607     return FALSE;
00608 }
00609 
00610 inT16 Tesseract::safe_dict_word(const WERD_RES *werd_res) {
00611   const WERD_CHOICE &word = *werd_res->best_choice;
00612   int dict_word_type = werd_res->tesseract->dict_word(word);
00613   return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
00614 }
00615 
00616 // Note: After running this function word_res->ratings
00617 // might not contain the right BLOB_CHOICE corresponding to each character
00618 // in word_res->best_choice.
00619 void Tesseract::flip_hyphens(WERD_RES *word_res) {
00620   WERD_CHOICE *best_choice = word_res->best_choice;
00621   int i;
00622   int prev_right = -9999;
00623   int next_left;
00624   TBOX out_box;
00625   float aspect_ratio;
00626 
00627   if (tessedit_lower_flip_hyphen <= 1)
00628     return;
00629 
00630   int num_blobs = word_res->rebuild_word->NumBlobs();
00631   UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
00632   for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
00633     TBLOB* blob = word_res->rebuild_word->blobs[i];
00634     out_box = blob->bounding_box();
00635     if (i + 1 == num_blobs)
00636       next_left = 9999;
00637     else
00638       next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
00639     // Dont touch small or touching blobs - it is too dangerous.
00640     if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
00641         (out_box.left() > prev_right) && (out_box.right() < next_left)) {
00642       aspect_ratio = out_box.width() / (float) out_box.height();
00643       if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
00644         if (aspect_ratio >= tessedit_upper_flip_hyphen &&
00645             word_res->uch_set->contains_unichar_id(unichar_dash) &&
00646             word_res->uch_set->get_enabled(unichar_dash)) {
00647           /* Certain HYPHEN */
00648           best_choice->set_unichar_id(unichar_dash, i);
00649           if (word_res->reject_map[i].rejected())
00650             word_res->reject_map[i].setrej_hyphen_accept();
00651         }
00652         if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
00653           word_res->reject_map[i].accepted())
00654                                  //Suspected HYPHEN
00655           word_res->reject_map[i].setrej_hyphen ();
00656       }
00657       else if (best_choice->unichar_id(i) == unichar_dash) {
00658         if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
00659           (word_res->reject_map[i].rejected()))
00660           word_res->reject_map[i].setrej_hyphen_accept();
00661         //Certain HYPHEN
00662 
00663         if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
00664           (word_res->reject_map[i].accepted()))
00665                                  //Suspected HYPHEN
00666           word_res->reject_map[i].setrej_hyphen();
00667       }
00668     }
00669     prev_right = out_box.right();
00670   }
00671 }
00672 
00673 // Note: After running this function word_res->ratings
00674 // might not contain the right BLOB_CHOICE corresponding to each character
00675 // in word_res->best_choice.
// Re-labels ambiguous "0" (zero) and "O" (capital o) characters of
// word_res->best_choice according to their immediate neighbours.
//
// Returns without changes when tessedit_flip_0O is off, when any upper-case
// or digit blob fails the box height test in the first loop (possible
// sub/superscript), or when "0" or "O" is missing from or disabled in the
// word's unicharset. Only unichar ids of best_choice are rewritten here.
00676 void Tesseract::flip_0O(WERD_RES *word_res) {
00677   WERD_CHOICE *best_choice = word_res->best_choice;
00678   int i;
00679   TBOX out_box;
00680 
00681   if (!tessedit_flip_0O)
00682     return;
00683 
00684   int num_blobs = word_res->rebuild_word->NumBlobs();
  // Pre-check: give up on the whole word if any upper-case/digit blob has
  // its top below, or its bottom above, the limits tested here.
00685   for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
00686     TBLOB* blob = word_res->rebuild_word->blobs[i];
00687     if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
00688         word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
00689       out_box = blob->bounding_box();
00690       if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
00691         (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
00692         return;                  //Beware words with sub/superscripts
00693     }
00694   }
00695   UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
00696   UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
00697   if (unichar_0 == INVALID_UNICHAR_ID ||
00698       !word_res->uch_set->get_enabled(unichar_0) ||
00699       unichar_O == INVALID_UNICHAR_ID ||
00700       !word_res->uch_set->get_enabled(unichar_O)) {
00701     return;  // 0 or O are not present/enabled in unicharset
00702   }
  // Starts at 1 because every rule below looks at the character at i-1.
  // The rules are applied in sequence, not exclusively, and some of them
  // advance i past characters they have already rewritten.
00703   for (i = 1; i < best_choice->length(); ++i) {
00704     if (best_choice->unichar_id(i) == unichar_0 ||
00705         best_choice->unichar_id(i) == unichar_O) {
00706       /* A0A */
00707       if ((i+1) < best_choice->length() &&
00708           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00709           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
00710         best_choice->set_unichar_id(unichar_O, i);
00711       }
00712       /* A00A */
00713       if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00714           (i+1) < best_choice->length() &&
00715           (best_choice->unichar_id(i+1) == unichar_0 ||
00716            best_choice->unichar_id(i+1) == unichar_O) &&
00717           (i+2) < best_choice->length() &&
00718           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
00719         best_choice->set_unichar_id(unichar_O, i);
00720         i++;
00721       }
00722       /* AA0<non digit or end of word> */
00723       if ((i > 1) &&
00724           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
00725           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00726           (((i+1) < best_choice->length() &&
00727             !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
00728             !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
00729             !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
00730            (i == best_choice->length() - 1))) {
00731         best_choice->set_unichar_id(unichar_O, i);
00732       }
00733       /* 9O9 */
00734       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00735           (i+1) < best_choice->length() &&
00736           non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
00737         best_choice->set_unichar_id(unichar_0, i);
00738       }
00739       /* 9OOO */
00740       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00741           (i+2) < best_choice->length() &&
00742           (best_choice->unichar_id(i+1) == unichar_0 ||
00743            best_choice->unichar_id(i+1) == unichar_O) &&
00744           (best_choice->unichar_id(i+2) == unichar_0 ||
00745            best_choice->unichar_id(i+2) == unichar_O)) {
00746         best_choice->set_unichar_id(unichar_0, i);
00747         best_choice->set_unichar_id(unichar_0, i+1);
00748         best_choice->set_unichar_id(unichar_0, i+2);
00749         i += 2;
00750       }
00751       /* 9OO<non upper> */
00752       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00753           (i+2) < best_choice->length() &&
00754           (best_choice->unichar_id(i+1) == unichar_0 ||
00755           best_choice->unichar_id(i+1) == unichar_O) &&
00756           !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
00757         best_choice->set_unichar_id(unichar_0, i);
00758         best_choice->set_unichar_id(unichar_0, i+1);
00759         i++;
00760       }
00761       /* 9O<non upper> */
00762       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
00763           (i+1) < best_choice->length() &&
00764           !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
00765         best_choice->set_unichar_id(unichar_0, i);
00766       }
00767       /* 9[.,]OOO.. */
00768       if ((i > 1) &&
00769           (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
00770               word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
00771           (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
00772            best_choice->unichar_id(i-2) == unichar_O)) {
00773         if (best_choice->unichar_id(i-2) == unichar_O) {
00774           best_choice->set_unichar_id(unichar_0, i-2);
00775         }
        // Turn the whole run of 0/O characters after the separator into 0.
00776         while (i < best_choice->length() &&
00777                (best_choice->unichar_id(i) == unichar_O ||
00778                 best_choice->unichar_id(i) == unichar_0)) {
00779           best_choice->set_unichar_id(unichar_0, i);
00780           i++;
00781         }
        // Step back one: the while loop stops one past the last rewritten
        // character and the enclosing for loop increments i again.
00782         i--;
00783       }
00784     }
00785   }
00786 }
00787 
00788 BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
00789   return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
00790 }
00791 
00792 BOOL8 Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
00793   return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
00794 }
00795 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines