tesseract
3.03
|
00001 /********************************************************************** 00002 * File: reject.cpp (Formerly reject.c) 00003 * Description: Rejection functions used in tessedit 00004 * Author: Phil Cheatle 00005 * Created: Wed Sep 23 16:50:21 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #pragma warning(disable:4305) // int/float warnings 00023 #endif 00024 00025 #include "tessvars.h" 00026 #ifdef __UNIX__ 00027 #include <assert.h> 00028 #include <errno.h> 00029 #endif 00030 #include "scanutils.h" 00031 #include <ctype.h> 00032 #include <string.h> 00033 #include "genericvector.h" 00034 #include "reject.h" 00035 #include "control.h" 00036 #include "docqual.h" 00037 #include "secname.h" 00038 #include "globaloc.h" // For err_exit. 00039 #include "globals.h" 00040 #include "helpers.h" 00041 00042 /* #define SECURE_NAMES done in secnames.h when necessary */ 00043 00044 #include "tesseractclass.h" 00045 00046 // Include automatically generated configuration file if running autoconf. 00047 #ifdef HAVE_CONFIG_H 00048 #include "config_auto.h" 00049 #endif 00050 00051 CLISTIZEH (STRING) CLISTIZE (STRING) 00052 00053 /************************************************************************* 00054 * set_done() 00055 * 00056 * Set the done flag based on the word acceptability criteria 00057 *************************************************************************/ 00058 00059 namespace tesseract { 00060 void Tesseract::set_done(WERD_RES *word, inT16 pass) { 00061 word->done = word->tess_accepted && 00062 (strchr(word->best_choice->unichar_string().string(), ' ') == NULL); 00063 bool word_is_ambig = word->best_choice->dangerous_ambig_found(); 00064 bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM || 00065 word->best_choice->permuter() == FREQ_DAWG_PERM || 00066 word->best_choice->permuter() == USER_DAWG_PERM; 00067 if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) && 00068 one_ell_conflict(word, FALSE)) { 00069 if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n"); 00070 word->done = FALSE; 00071 } 00072 if (word->done && ((!word_from_dict && 00073 word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) { 00074 if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n"); 00075 word->done = FALSE; 00076 } 00077 if (tessedit_rejection_debug) { 00078 tprintf("set_done(): done=%d\n", word->done); 00079 word->best_choice->print(""); 00080 } 00081 } 00082 00083 00084 /************************************************************************* 00085 * make_reject_map() 00086 * 00087 * Sets the done flag to indicate whether the resylt is acceptable. 00088 * 00089 * Sets a reject map for the word. 00090 *************************************************************************/ 00091 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) { 00092 int i; 00093 int offset; 00094 00095 flip_0O(word); 00096 check_debug_pt(word, -1); // For trap only 00097 set_done(word, pass); // Set acceptance 00098 word->reject_map.initialise(word->best_choice->unichar_lengths().length()); 00099 reject_blanks(word); 00100 /* 00101 0: Rays original heuristic - the baseline 00102 */ 00103 if (tessedit_reject_mode == 0) { 00104 if (!word->done) 00105 reject_poor_matches(word); 00106 } else if (tessedit_reject_mode == 5) { 00107 /* 00108 5: Reject I/1/l from words where there is no strong contextual confirmation; 00109 the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls); 00110 and the whole of any words which are very small 00111 */ 00112 if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) { 00113 word->reject_map.rej_word_small_xht(); 00114 } else { 00115 one_ell_conflict(word, TRUE); 00116 /* 00117 Originally the code here just used the done flag. Now I have duplicated 00118 and unpacked the conditions for setting the done flag so that each 00119 mechanism can be turned on or off independently. This works WITHOUT 00120 affecting the done flag setting. 00121 */ 00122 if (rej_use_tess_accepted && !word->tess_accepted) 00123 word->reject_map.rej_word_not_tess_accepted (); 00124 00125 if (rej_use_tess_blanks && 00126 (strchr (word->best_choice->unichar_string().string (), ' ') != NULL)) 00127 word->reject_map.rej_word_contains_blanks (); 00128 00129 WERD_CHOICE* best_choice = word->best_choice; 00130 if (rej_use_good_perm) { 00131 if ((best_choice->permuter() == SYSTEM_DAWG_PERM || 00132 best_choice->permuter() == FREQ_DAWG_PERM || 00133 best_choice->permuter() == USER_DAWG_PERM) && 00134 (!rej_use_sensible_wd || 00135 acceptable_word_string(*word->uch_set, 00136 best_choice->unichar_string().string(), 00137 best_choice->unichar_lengths().string()) != 00138 AC_UNACCEPTABLE)) { 00139 // PASSED TEST 00140 } else if (best_choice->permuter() == NUMBER_PERM) { 00141 if (rej_alphas_in_number_perm) { 00142 for (i = 0, offset = 0; 00143 best_choice->unichar_string()[offset] != '\0'; 00144 offset += best_choice->unichar_lengths()[i++]) { 00145 if (word->reject_map[i].accepted() && 00146 word->uch_set->get_isalpha( 00147 best_choice->unichar_string().string() + offset, 00148 best_choice->unichar_lengths()[i])) 00149 word->reject_map[i].setrej_bad_permuter(); 00150 // rej alpha 00151 } 00152 } 00153 } else { 00154 word->reject_map.rej_word_bad_permuter(); 00155 } 00156 } 00157 /* Ambig word rejection was here once !!*/ 00158 } 00159 } else { 00160 tprintf("BAD tessedit_reject_mode\n"); 00161 err_exit(); 00162 } 00163 00164 if (tessedit_image_border > -1) 00165 reject_edge_blobs(word); 00166 00167 check_debug_pt (word, 10); 00168 if (tessedit_rejection_debug) { 00169 tprintf("Permuter Type = %d\n", word->best_choice->permuter ()); 00170 tprintf("Certainty: %f Rating: %f\n", 00171 word->best_choice->certainty (), word->best_choice->rating ()); 00172 tprintf("Dict word: %d\n", dict_word(*(word->best_choice))); 00173 } 00174 00175 flip_hyphens(word); 00176 check_debug_pt(word, 20); 00177 } 00178 } // namespace tesseract 00179 00180 00181 void reject_blanks(WERD_RES *word) { 00182 inT16 i; 00183 inT16 offset; 00184 00185 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0'; 00186 offset += word->best_choice->unichar_lengths()[i], i += 1) { 00187 if (word->best_choice->unichar_string()[offset] == ' ') 00188 //rej unrecognised blobs 00189 word->reject_map[i].setrej_tess_failure (); 00190 } 00191 } 00192 00193 namespace tesseract { 00194 void Tesseract::reject_I_1_L(WERD_RES *word) { 00195 inT16 i; 00196 inT16 offset; 00197 00198 for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0'; 00199 offset += word->best_choice->unichar_lengths()[i], i += 1) { 00200 if (STRING (conflict_set_I_l_1). 00201 contains (word->best_choice->unichar_string()[offset])) { 00202 //rej 1Il conflict 00203 word->reject_map[i].setrej_1Il_conflict (); 00204 } 00205 } 00206 } 00207 } // namespace tesseract 00208 00209 00210 void reject_poor_matches(WERD_RES *word) { 00211 float threshold = compute_reject_threshold(word->best_choice); 00212 for (int i = 0; i < word->best_choice->length(); ++i) { 00213 if (word->best_choice->unichar_id(i) == UNICHAR_SPACE) 00214 word->reject_map[i].setrej_tess_failure(); 00215 else if (word->best_choice->certainty(i) < threshold) 00216 word->reject_map[i].setrej_poor_match(); 00217 } 00218 } 00219 00220 00221 /********************************************************************** 00222 * compute_reject_threshold 00223 * 00224 * Set a rejection threshold for this word. 00225 * Initially this is a trivial function which looks for the largest 00226 * gap in the certainty value. 00227 **********************************************************************/ 00228 00229 float compute_reject_threshold(WERD_CHOICE* word) { 00230 float threshold; // rejection threshold 00231 float bestgap = 0.0f; // biggest gap 00232 float gapstart; // bottom of gap 00233 // super iterator 00234 BLOB_CHOICE_IT choice_it; // real iterator 00235 00236 int blob_count = word->length(); 00237 GenericVector<float> ratings; 00238 ratings.init_to_size(blob_count, 0.0f); 00239 for (int i = 0; i < blob_count; ++i) { 00240 ratings[i] = word->certainty(i); 00241 } 00242 ratings.sort(); 00243 gapstart = ratings[0] - 1; // all reject if none better 00244 if (blob_count >= 3) { 00245 for (int index = 0; index < blob_count - 1; index++) { 00246 if (ratings[index + 1] - ratings[index] > bestgap) { 00247 bestgap = ratings[index + 1] - ratings[index]; 00248 // find biggest 00249 gapstart = ratings[index]; 00250 } 00251 } 00252 } 00253 threshold = gapstart + bestgap / 2; 00254 00255 return threshold; 00256 } 00257 00258 00259 /************************************************************************* 00260 * reject_edge_blobs() 00261 * 00262 * If the word is perilously close to the edge of the image, reject those blobs 00263 * in the word which are too close to the edge as they could be clipped. 00264 *************************************************************************/ 00265 namespace tesseract { 00266 void Tesseract::reject_edge_blobs(WERD_RES *word) { 00267 TBOX word_box = word->word->bounding_box(); 00268 // Use the box_word as it is already denormed back to image coordinates. 00269 int blobcount = word->box_word->length(); 00270 00271 if (word_box.left() < tessedit_image_border || 00272 word_box.bottom() < tessedit_image_border || 00273 word_box.right() + tessedit_image_border > ImageWidth() - 1 || 00274 word_box.top() + tessedit_image_border > ImageHeight() - 1) { 00275 ASSERT_HOST(word->reject_map.length() == blobcount); 00276 for (int blobindex = 0; blobindex < blobcount; blobindex++) { 00277 TBOX blob_box = word->box_word->BlobBox(blobindex); 00278 if (blob_box.left() < tessedit_image_border || 00279 blob_box.bottom() < tessedit_image_border || 00280 blob_box.right() + tessedit_image_border > ImageWidth() - 1 || 00281 blob_box.top() + tessedit_image_border > ImageHeight() - 1) { 00282 word->reject_map[blobindex].setrej_edge_char(); 00283 // Close to edge 00284 } 00285 } 00286 } 00287 } 00288 00289 /********************************************************************** 00290 * one_ell_conflict() 00291 * 00292 * Identify words where there is a potential I/l/1 error. 00293 * - A bundle of contextual heuristics! 00294 **********************************************************************/ 00295 BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { 00296 const char *word; 00297 const char *lengths; 00298 inT16 word_len; //its length 00299 inT16 first_alphanum_index_; 00300 inT16 first_alphanum_offset_; 00301 inT16 i; 00302 inT16 offset; 00303 BOOL8 non_conflict_set_char; //non conf set a/n? 00304 BOOL8 conflict = FALSE; 00305 BOOL8 allow_1s; 00306 ACCEPTABLE_WERD_TYPE word_type; 00307 BOOL8 dict_perm_type; 00308 BOOL8 dict_word_ok; 00309 int dict_word_type; 00310 00311 word = word_res->best_choice->unichar_string().string (); 00312 lengths = word_res->best_choice->unichar_lengths().string(); 00313 word_len = strlen (lengths); 00314 /* 00315 If there are no occurrences of the conflict set characters then the word 00316 is OK. 00317 */ 00318 if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL) 00319 return FALSE; 00320 00321 /* 00322 There is a conflict if there are NO other (confirmed) alphanumerics apart 00323 from those in the conflict set. 00324 */ 00325 00326 for (i = 0, offset = 0, non_conflict_set_char = FALSE; 00327 (i < word_len) && !non_conflict_set_char; offset += lengths[i++]) 00328 non_conflict_set_char = 00329 (word_res->uch_set->get_isalpha(word + offset, lengths[i]) || 00330 word_res->uch_set->get_isdigit(word + offset, lengths[i])) && 00331 !STRING (conflict_set_I_l_1).contains (word[offset]); 00332 if (!non_conflict_set_char) { 00333 if (update_map) 00334 reject_I_1_L(word_res); 00335 return TRUE; 00336 } 00337 00338 /* 00339 If the word is accepted by a dawg permuter, and the first alpha character 00340 is "I" or "l", check to see if the alternative is also a dawg word. If it 00341 is, then there is a potential error otherwise the word is ok. 00342 */ 00343 00344 dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) || 00345 (word_res->best_choice->permuter () == USER_DAWG_PERM) || 00346 (rej_trust_doc_dawg && 00347 (word_res->best_choice->permuter () == DOC_DAWG_PERM)) || 00348 (word_res->best_choice->permuter () == FREQ_DAWG_PERM); 00349 dict_word_type = dict_word(*(word_res->best_choice)); 00350 dict_word_ok = (dict_word_type > 0) && 00351 (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM)); 00352 00353 if ((rej_1Il_use_dict_word && dict_word_ok) || 00354 (rej_1Il_trust_permuter_type && dict_perm_type) || 00355 (dict_perm_type && dict_word_ok)) { 00356 first_alphanum_index_ = first_alphanum_index (word, lengths); 00357 first_alphanum_offset_ = first_alphanum_offset (word, lengths); 00358 if (lengths[first_alphanum_index_] == 1 && 00359 word[first_alphanum_offset_] == 'I') { 00360 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00361 if (safe_dict_word(word_res) > 0) { 00362 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00363 if (update_map) 00364 word_res->reject_map[first_alphanum_index_]. 00365 setrej_1Il_conflict(); 00366 return TRUE; 00367 } 00368 else { 00369 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00370 return FALSE; 00371 } 00372 } 00373 00374 if (lengths[first_alphanum_index_] == 1 && 00375 word[first_alphanum_offset_] == 'l') { 00376 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00377 if (safe_dict_word(word_res) > 0) { 00378 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00379 if (update_map) 00380 word_res->reject_map[first_alphanum_index_]. 00381 setrej_1Il_conflict(); 00382 return TRUE; 00383 } 00384 else { 00385 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00386 return FALSE; 00387 } 00388 } 00389 return FALSE; 00390 } 00391 00392 /* 00393 NEW 1Il code. The old code relied on permuter types too much. In fact, 00394 tess will use TOP_CHOICE permute for good things like "palette". 00395 In this code the string is examined independently to see if it looks like 00396 a well formed word. 00397 */ 00398 00399 /* 00400 REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a 00401 dictionary word. 00402 */ 00403 first_alphanum_index_ = first_alphanum_index (word, lengths); 00404 first_alphanum_offset_ = first_alphanum_offset (word, lengths); 00405 if (lengths[first_alphanum_index_] == 1 && 00406 word[first_alphanum_offset_] == 'l') { 00407 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00408 if (safe_dict_word(word_res) > 0) 00409 return FALSE; 00410 else 00411 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00412 } 00413 else if (lengths[first_alphanum_index_] == 1 && 00414 word[first_alphanum_offset_] == 'I') { 00415 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; 00416 if (safe_dict_word(word_res) > 0) 00417 return FALSE; 00418 else 00419 word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; 00420 } 00421 /* 00422 For strings containing digits: 00423 If there are no alphas OR the numeric permuter liked the word, 00424 reject any non 1 conflict chs 00425 Else reject all conflict chs 00426 */ 00427 if (word_contains_non_1_digit (word, lengths)) { 00428 allow_1s = (alpha_count (word, lengths) == 0) || 00429 (word_res->best_choice->permuter () == NUMBER_PERM); 00430 00431 inT16 offset; 00432 conflict = FALSE; 00433 for (i = 0, offset = 0; word[offset] != '\0'; 00434 offset += word_res->best_choice->unichar_lengths()[i++]) { 00435 if ((!allow_1s || (word[offset] != '1')) && 00436 STRING (conflict_set_I_l_1).contains (word[offset])) { 00437 if (update_map) 00438 word_res->reject_map[i].setrej_1Il_conflict (); 00439 conflict = TRUE; 00440 } 00441 } 00442 return conflict; 00443 } 00444 /* 00445 For anything else. See if it conforms to an acceptable word type. If so, 00446 treat accordingly. 00447 */ 00448 word_type = acceptable_word_string(*word_res->uch_set, word, lengths); 00449 if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) { 00450 first_alphanum_index_ = first_alphanum_index (word, lengths); 00451 first_alphanum_offset_ = first_alphanum_offset (word, lengths); 00452 if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) { 00453 if (update_map) 00454 word_res->reject_map[first_alphanum_index_]. 00455 setrej_1Il_conflict (); 00456 return TRUE; 00457 } 00458 else 00459 return FALSE; 00460 } 00461 else if (word_type == AC_UPPER_CASE) { 00462 return FALSE; 00463 } 00464 else { 00465 if (update_map) 00466 reject_I_1_L(word_res); 00467 return TRUE; 00468 } 00469 } 00470 00471 00472 inT16 Tesseract::first_alphanum_index(const char *word, 00473 const char *word_lengths) { 00474 inT16 i; 00475 inT16 offset; 00476 00477 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { 00478 if (unicharset.get_isalpha(word + offset, word_lengths[i]) || 00479 unicharset.get_isdigit(word + offset, word_lengths[i])) 00480 return i; 00481 } 00482 return -1; 00483 } 00484 00485 inT16 Tesseract::first_alphanum_offset(const char *word, 00486 const char *word_lengths) { 00487 inT16 i; 00488 inT16 offset; 00489 00490 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { 00491 if (unicharset.get_isalpha(word + offset, word_lengths[i]) || 00492 unicharset.get_isdigit(word + offset, word_lengths[i])) 00493 return offset; 00494 } 00495 return -1; 00496 } 00497 00498 inT16 Tesseract::alpha_count(const char *word, 00499 const char *word_lengths) { 00500 inT16 i; 00501 inT16 offset; 00502 inT16 count = 0; 00503 00504 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { 00505 if (unicharset.get_isalpha (word + offset, word_lengths[i])) 00506 count++; 00507 } 00508 return count; 00509 } 00510 00511 00512 BOOL8 Tesseract::word_contains_non_1_digit(const char *word, 00513 const char *word_lengths) { 00514 inT16 i; 00515 inT16 offset; 00516 00517 for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) { 00518 if (unicharset.get_isdigit (word + offset, word_lengths[i]) && 00519 (word_lengths[i] != 1 || word[offset] != '1')) 00520 return TRUE; 00521 } 00522 return FALSE; 00523 } 00524 00525 /************************************************************************* 00526 * dont_allow_1Il() 00527 * Dont unreject LONE accepted 1Il conflict set chars 00528 *************************************************************************/ 00529 void Tesseract::dont_allow_1Il(WERD_RES *word) { 00530 int i = 0; 00531 int offset; 00532 int word_len = word->reject_map.length(); 00533 const char *s = word->best_choice->unichar_string().string(); 00534 const char *lengths = word->best_choice->unichar_lengths().string(); 00535 BOOL8 accepted_1Il = FALSE; 00536 00537 for (i = 0, offset = 0; i < word_len; 00538 offset += word->best_choice->unichar_lengths()[i++]) { 00539 if (word->reject_map[i].accepted()) { 00540 if (STRING(conflict_set_I_l_1).contains(s[offset])) { 00541 accepted_1Il = TRUE; 00542 } else { 00543 if (word->uch_set->get_isalpha(s + offset, lengths[i]) || 00544 word->uch_set->get_isdigit(s + offset, lengths[i])) 00545 return; // >=1 non 1Il ch accepted 00546 } 00547 } 00548 } 00549 if (!accepted_1Il) 00550 return; //Nothing to worry about 00551 00552 for (i = 0, offset = 0; i < word_len; 00553 offset += word->best_choice->unichar_lengths()[i++]) { 00554 if (STRING(conflict_set_I_l_1).contains(s[offset]) && 00555 word->reject_map[i].accepted()) 00556 word->reject_map[i].setrej_postNN_1Il(); 00557 } 00558 } 00559 00560 00561 inT16 Tesseract::count_alphanums(WERD_RES *word_res) { 00562 int count = 0; 00563 const WERD_CHOICE *best_choice = word_res->best_choice; 00564 for (int i = 0; i < word_res->reject_map.length(); ++i) { 00565 if ((word_res->reject_map[i].accepted()) && 00566 (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) || 00567 word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) { 00568 count++; 00569 } 00570 } 00571 return count; 00572 } 00573 00574 00575 // reject all if most rejected. 00576 void Tesseract::reject_mostly_rejects(WERD_RES *word) { 00577 /* Reject the whole of the word if the fraction of rejects exceeds a limit */ 00578 00579 if ((float) word->reject_map.reject_count() / word->reject_map.length() >= 00580 rej_whole_of_mostly_reject_word_fract) 00581 word->reject_map.rej_word_mostly_rej(); 00582 } 00583 00584 00585 BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) { 00586 inT16 char_quality; 00587 inT16 accepted_char_quality; 00588 00589 if (word->best_choice->unichar_lengths().length() <= 1) 00590 return FALSE; 00591 00592 if (!STRING(ok_repeated_ch_non_alphanum_wds). 00593 contains(word->best_choice->unichar_string()[0])) 00594 return FALSE; 00595 00596 UNICHAR_ID uch_id = word->best_choice->unichar_id(0); 00597 for (int i = 1; i < word->best_choice->length(); ++i) { 00598 if (word->best_choice->unichar_id(i) != uch_id) return FALSE; 00599 } 00600 00601 word_char_quality(word, row, &char_quality, &accepted_char_quality); 00602 00603 if ((word->best_choice->unichar_lengths().length () == char_quality) && 00604 (char_quality == accepted_char_quality)) 00605 return TRUE; 00606 else 00607 return FALSE; 00608 } 00609 00610 inT16 Tesseract::safe_dict_word(const WERD_RES *werd_res) { 00611 const WERD_CHOICE &word = *werd_res->best_choice; 00612 int dict_word_type = werd_res->tesseract->dict_word(word); 00613 return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type; 00614 } 00615 00616 // Note: After running this function word_res->ratings 00617 // might not contain the right BLOB_CHOICE corresponding to each character 00618 // in word_res->best_choice. 00619 void Tesseract::flip_hyphens(WERD_RES *word_res) { 00620 WERD_CHOICE *best_choice = word_res->best_choice; 00621 int i; 00622 int prev_right = -9999; 00623 int next_left; 00624 TBOX out_box; 00625 float aspect_ratio; 00626 00627 if (tessedit_lower_flip_hyphen <= 1) 00628 return; 00629 00630 int num_blobs = word_res->rebuild_word->NumBlobs(); 00631 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-"); 00632 for (i = 0; i < best_choice->length() && i < num_blobs; ++i) { 00633 TBLOB* blob = word_res->rebuild_word->blobs[i]; 00634 out_box = blob->bounding_box(); 00635 if (i + 1 == num_blobs) 00636 next_left = 9999; 00637 else 00638 next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left(); 00639 // Dont touch small or touching blobs - it is too dangerous. 00640 if ((out_box.width() > 8 * word_res->denorm.x_scale()) && 00641 (out_box.left() > prev_right) && (out_box.right() < next_left)) { 00642 aspect_ratio = out_box.width() / (float) out_box.height(); 00643 if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) { 00644 if (aspect_ratio >= tessedit_upper_flip_hyphen && 00645 word_res->uch_set->contains_unichar_id(unichar_dash) && 00646 word_res->uch_set->get_enabled(unichar_dash)) { 00647 /* Certain HYPHEN */ 00648 best_choice->set_unichar_id(unichar_dash, i); 00649 if (word_res->reject_map[i].rejected()) 00650 word_res->reject_map[i].setrej_hyphen_accept(); 00651 } 00652 if ((aspect_ratio > tessedit_lower_flip_hyphen) && 00653 word_res->reject_map[i].accepted()) 00654 //Suspected HYPHEN 00655 word_res->reject_map[i].setrej_hyphen (); 00656 } 00657 else if (best_choice->unichar_id(i) == unichar_dash) { 00658 if ((aspect_ratio >= tessedit_upper_flip_hyphen) && 00659 (word_res->reject_map[i].rejected())) 00660 word_res->reject_map[i].setrej_hyphen_accept(); 00661 //Certain HYPHEN 00662 00663 if ((aspect_ratio <= tessedit_lower_flip_hyphen) && 00664 (word_res->reject_map[i].accepted())) 00665 //Suspected HYPHEN 00666 word_res->reject_map[i].setrej_hyphen(); 00667 } 00668 } 00669 prev_right = out_box.right(); 00670 } 00671 } 00672 00673 // Note: After running this function word_res->ratings 00674 // might not contain the right BLOB_CHOICE corresponding to each character 00675 // in word_res->best_choice. 00676 void Tesseract::flip_0O(WERD_RES *word_res) { 00677 WERD_CHOICE *best_choice = word_res->best_choice; 00678 int i; 00679 TBOX out_box; 00680 00681 if (!tessedit_flip_0O) 00682 return; 00683 00684 int num_blobs = word_res->rebuild_word->NumBlobs(); 00685 for (i = 0; i < best_choice->length() && i < num_blobs; ++i) { 00686 TBLOB* blob = word_res->rebuild_word->blobs[i]; 00687 if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) || 00688 word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) { 00689 out_box = blob->bounding_box(); 00690 if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) || 00691 (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) 00692 return; //Beware words with sub/superscripts 00693 } 00694 } 00695 UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0"); 00696 UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O"); 00697 if (unichar_0 == INVALID_UNICHAR_ID || 00698 !word_res->uch_set->get_enabled(unichar_0) || 00699 unichar_O == INVALID_UNICHAR_ID || 00700 !word_res->uch_set->get_enabled(unichar_O)) { 00701 return; // 0 or O are not present/enabled in unicharset 00702 } 00703 for (i = 1; i < best_choice->length(); ++i) { 00704 if (best_choice->unichar_id(i) == unichar_0 || 00705 best_choice->unichar_id(i) == unichar_O) { 00706 /* A0A */ 00707 if ((i+1) < best_choice->length() && 00708 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) && 00709 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) { 00710 best_choice->set_unichar_id(unichar_O, i); 00711 } 00712 /* A00A */ 00713 if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) && 00714 (i+1) < best_choice->length() && 00715 (best_choice->unichar_id(i+1) == unichar_0 || 00716 best_choice->unichar_id(i+1) == unichar_O) && 00717 (i+2) < best_choice->length() && 00718 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) { 00719 best_choice->set_unichar_id(unichar_O, i); 00720 i++; 00721 } 00722 /* AA0<non digit or end of word> */ 00723 if ((i > 1) && 00724 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) && 00725 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) && 00726 (((i+1) < best_choice->length() && 00727 !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) && 00728 !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") && 00729 !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) || 00730 (i == best_choice->length() - 1))) { 00731 best_choice->set_unichar_id(unichar_O, i); 00732 } 00733 /* 9O9 */ 00734 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && 00735 (i+1) < best_choice->length() && 00736 non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) { 00737 best_choice->set_unichar_id(unichar_0, i); 00738 } 00739 /* 9OOO */ 00740 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && 00741 (i+2) < best_choice->length() && 00742 (best_choice->unichar_id(i+1) == unichar_0 || 00743 best_choice->unichar_id(i+1) == unichar_O) && 00744 (best_choice->unichar_id(i+2) == unichar_0 || 00745 best_choice->unichar_id(i+2) == unichar_O)) { 00746 best_choice->set_unichar_id(unichar_0, i); 00747 best_choice->set_unichar_id(unichar_0, i+1); 00748 best_choice->set_unichar_id(unichar_0, i+2); 00749 i += 2; 00750 } 00751 /* 9OO<non upper> */ 00752 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && 00753 (i+2) < best_choice->length() && 00754 (best_choice->unichar_id(i+1) == unichar_0 || 00755 best_choice->unichar_id(i+1) == unichar_O) && 00756 !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) { 00757 best_choice->set_unichar_id(unichar_0, i); 00758 best_choice->set_unichar_id(unichar_0, i+1); 00759 i++; 00760 } 00761 /* 9O<non upper> */ 00762 if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) && 00763 (i+1) < best_choice->length() && 00764 !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) { 00765 best_choice->set_unichar_id(unichar_0, i); 00766 } 00767 /* 9[.,]OOO.. */ 00768 if ((i > 1) && 00769 (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") || 00770 word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) && 00771 (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) || 00772 best_choice->unichar_id(i-2) == unichar_O)) { 00773 if (best_choice->unichar_id(i-2) == unichar_O) { 00774 best_choice->set_unichar_id(unichar_0, i-2); 00775 } 00776 while (i < best_choice->length() && 00777 (best_choice->unichar_id(i) == unichar_O || 00778 best_choice->unichar_id(i) == unichar_0)) { 00779 best_choice->set_unichar_id(unichar_0, i); 00780 i++; 00781 } 00782 i--; 00783 } 00784 } 00785 } 00786 } 00787 00788 BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) { 00789 return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O"); 00790 } 00791 00792 BOOL8 Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) { 00793 return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0"); 00794 } 00795 } // namespace tesseract