tesseract
3.03
|
00001 /****************************************************************** 00002 * File: docqual.cpp (Formerly docqual.c) 00003 * Description: Document Quality Metrics 00004 * Author: Phil Cheatle 00005 * Created: Mon May 9 11:27:28 BST 1994 00006 * 00007 * (C) Copyright 1994, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #include <ctype.h> 00025 #include "docqual.h" 00026 #include "reject.h" 00027 #include "tesscallback.h" 00028 #include "tessvars.h" 00029 #include "secname.h" 00030 #include "globals.h" 00031 #include "tesseractclass.h" 00032 00033 namespace tesseract{ 00034 00035 // A little class to provide the callbacks as we have no pre-bound args. 00036 struct DocQualCallbacks { 00037 explicit DocQualCallbacks(WERD_RES* word0) 00038 : word(word0), match_count(0), accepted_match_count(0) {} 00039 00040 void CountMatchingBlobs(int index) { 00041 ++match_count; 00042 } 00043 00044 void CountAcceptedBlobs(int index) { 00045 if (word->reject_map[index].accepted()) 00046 ++accepted_match_count; 00047 ++match_count; 00048 } 00049 00050 void AcceptIfGoodQuality(int index) { 00051 if (word->reject_map[index].accept_if_good_quality()) 00052 word->reject_map[index].setrej_quality_accept(); 00053 } 00054 00055 WERD_RES* word; 00056 inT16 match_count; 00057 inT16 accepted_match_count; 00058 }; 00059 00060 /************************************************************************* 00061 * word_blob_quality() 00062 * How many blobs in the box_word are identical to those of the inword? 00063 * ASSUME blobs in both initial word and box_word are in ascending order of 00064 * left hand blob edge. 00065 *************************************************************************/ 00066 inT16 Tesseract::word_blob_quality(WERD_RES *word, ROW *row) { 00067 if (word->bln_boxes == NULL || 00068 word->rebuild_word == NULL || word->rebuild_word->blobs.empty()) 00069 return 0; 00070 00071 DocQualCallbacks cb(word); 00072 word->bln_boxes->ProcessMatchedBlobs( 00073 *word->rebuild_word, 00074 NewPermanentTessCallback(&cb, &DocQualCallbacks::CountMatchingBlobs)); 00075 return cb.match_count; 00076 } 00077 00078 inT16 Tesseract::word_outline_errs(WERD_RES *word) { 00079 inT16 i = 0; 00080 inT16 err_count = 0; 00081 00082 if (word->rebuild_word != NULL) { 00083 for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) { 00084 TBLOB* blob = word->rebuild_word->blobs[b]; 00085 err_count += count_outline_errs(word->best_choice->unichar_string()[i], 00086 blob->NumOutlines()); 00087 i++; 00088 } 00089 } 00090 return err_count; 00091 } 00092 00093 /************************************************************************* 00094 * word_char_quality() 00095 * Combination of blob quality and outline quality - how many good chars are 00096 * there? - I.e chars which pass the blob AND outline tests. 00097 *************************************************************************/ 00098 void Tesseract::word_char_quality(WERD_RES *word, 00099 ROW *row, 00100 inT16 *match_count, 00101 inT16 *accepted_match_count) { 00102 if (word->bln_boxes == NULL || 00103 word->rebuild_word == NULL || word->rebuild_word->blobs.empty()) 00104 return; 00105 00106 DocQualCallbacks cb(word); 00107 word->bln_boxes->ProcessMatchedBlobs( 00108 *word->rebuild_word, 00109 NewPermanentTessCallback(&cb, &DocQualCallbacks::CountAcceptedBlobs)); 00110 *match_count = cb.match_count; 00111 *accepted_match_count = cb.accepted_match_count; 00112 } 00113 00114 /************************************************************************* 00115 * unrej_good_chs() 00116 * Unreject POTENTIAL rejects if the blob passes the blob and outline checks 00117 *************************************************************************/ 00118 void Tesseract::unrej_good_chs(WERD_RES *word, ROW *row) { 00119 if (word->bln_boxes == NULL || 00120 word->rebuild_word == NULL || word->rebuild_word->blobs.empty()) 00121 return; 00122 00123 DocQualCallbacks cb(word); 00124 word->bln_boxes->ProcessMatchedBlobs( 00125 *word->rebuild_word, 00126 NewPermanentTessCallback(&cb, &DocQualCallbacks::AcceptIfGoodQuality)); 00127 } 00128 00129 inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) { 00130 int expected_outline_count; 00131 00132 if (STRING (outlines_odd).contains (c)) 00133 return 0; //Dont use this char 00134 else if (STRING (outlines_2).contains (c)) 00135 expected_outline_count = 2; 00136 else 00137 expected_outline_count = 1; 00138 return abs (outline_count - expected_outline_count); 00139 } 00140 00141 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, 00142 BOOL8 good_quality_doc) { 00143 if ((tessedit_good_quality_unrej && good_quality_doc)) 00144 unrej_good_quality_words(page_res_it); 00145 doc_and_block_rejection(page_res_it, good_quality_doc); 00146 if (unlv_tilde_crunching) { 00147 tilde_crunch(page_res_it); 00148 tilde_delete(page_res_it); 00149 } 00150 } 00151 00152 00153 /************************************************************************* 00154 * unrej_good_quality_words() 00155 * Accept potential rejects in words which pass the following checks: 00156 * - Contains a potential reject 00157 * - Word looks like a sensible alpha word. 00158 * - Word segmentation is the same as the original image 00159 * - All characters have the expected number of outlines 00160 * NOTE - the rejection counts are recalculated after unrejection 00161 * - CANT do it in a single pass without a bit of fiddling 00162 * - keep it simple but inefficient 00163 *************************************************************************/ 00164 void Tesseract::unrej_good_quality_words( //unreject potential 00165 PAGE_RES_IT &page_res_it) { 00166 WERD_RES *word; 00167 ROW_RES *current_row; 00168 BLOCK_RES *current_block; 00169 int i; 00170 00171 page_res_it.restart_page (); 00172 while (page_res_it.word () != NULL) { 00173 check_debug_pt (page_res_it.word (), 100); 00174 if (bland_unrej) { 00175 word = page_res_it.word (); 00176 for (i = 0; i < word->reject_map.length (); i++) { 00177 if (word->reject_map[i].accept_if_good_quality ()) 00178 word->reject_map[i].setrej_quality_accept (); 00179 } 00180 page_res_it.forward (); 00181 } 00182 else if ((page_res_it.row ()->char_count > 0) && 00183 ((page_res_it.row ()->rej_count / 00184 (float) page_res_it.row ()->char_count) <= 00185 quality_rowrej_pc)) { 00186 word = page_res_it.word (); 00187 if (word->reject_map.quality_recoverable_rejects() && 00188 (tessedit_unrej_any_wd || 00189 acceptable_word_string(*word->uch_set, 00190 word->best_choice->unichar_string().string(), 00191 word->best_choice->unichar_lengths().string()) 00192 != AC_UNACCEPTABLE)) { 00193 unrej_good_chs(word, page_res_it.row ()->row); 00194 } 00195 page_res_it.forward (); 00196 } 00197 else { 00198 /* Skip to end of dodgy row */ 00199 current_row = page_res_it.row (); 00200 while ((page_res_it.word () != NULL) && 00201 (page_res_it.row () == current_row)) 00202 page_res_it.forward (); 00203 } 00204 check_debug_pt (page_res_it.word (), 110); 00205 } 00206 page_res_it.restart_page (); 00207 page_res_it.page_res->char_count = 0; 00208 page_res_it.page_res->rej_count = 0; 00209 current_block = NULL; 00210 current_row = NULL; 00211 while (page_res_it.word () != NULL) { 00212 if (current_block != page_res_it.block ()) { 00213 current_block = page_res_it.block (); 00214 current_block->char_count = 0; 00215 current_block->rej_count = 0; 00216 } 00217 if (current_row != page_res_it.row ()) { 00218 current_row = page_res_it.row (); 00219 current_row->char_count = 0; 00220 current_row->rej_count = 0; 00221 current_row->whole_word_rej_count = 0; 00222 } 00223 page_res_it.rej_stat_word (); 00224 page_res_it.forward (); 00225 } 00226 } 00227 00228 00229 /************************************************************************* 00230 * doc_and_block_rejection() 00231 * 00232 * If the page has too many rejects - reject all of it. 00233 * If any block has too many rejects - reject all words in the block 00234 *************************************************************************/ 00235 00236 void Tesseract::doc_and_block_rejection( //reject big chunks 00237 PAGE_RES_IT &page_res_it, 00238 BOOL8 good_quality_doc) { 00239 inT16 block_no = 0; 00240 inT16 row_no = 0; 00241 BLOCK_RES *current_block; 00242 ROW_RES *current_row; 00243 00244 BOOL8 rej_word; 00245 BOOL8 prev_word_rejected; 00246 inT16 char_quality = 0; 00247 inT16 accepted_char_quality; 00248 00249 if (page_res_it.page_res->rej_count * 100.0 / 00250 page_res_it.page_res->char_count > tessedit_reject_doc_percent) { 00251 reject_whole_page(page_res_it); 00252 if (tessedit_debug_doc_rejection) { 00253 tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", 00254 page_res_it.page_res->char_count, 00255 page_res_it.page_res->rej_count); 00256 } 00257 } else { 00258 if (tessedit_debug_doc_rejection) { 00259 tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", 00260 page_res_it.page_res->char_count, 00261 page_res_it.page_res->rej_count); 00262 } 00263 00264 /* Walk blocks testing for block rejection */ 00265 00266 page_res_it.restart_page(); 00267 WERD_RES* word; 00268 while ((word = page_res_it.word()) != NULL) { 00269 current_block = page_res_it.block(); 00270 block_no = current_block->block->index(); 00271 if (current_block->char_count > 0 && 00272 (current_block->rej_count * 100.0 / current_block->char_count) > 00273 tessedit_reject_block_percent) { 00274 if (tessedit_debug_block_rejection) { 00275 tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", 00276 block_no, current_block->char_count, 00277 current_block->rej_count); 00278 } 00279 prev_word_rejected = FALSE; 00280 while ((word = page_res_it.word()) != NULL && 00281 (page_res_it.block() == current_block)) { 00282 if (tessedit_preserve_blk_rej_perfect_wds) { 00283 rej_word = word->reject_map.reject_count() > 0 || 00284 word->reject_map.length () < tessedit_preserve_min_wd_len; 00285 if (rej_word && tessedit_dont_blkrej_good_wds && 00286 word->reject_map.length() >= tessedit_preserve_min_wd_len && 00287 acceptable_word_string( 00288 *word->uch_set, 00289 word->best_choice->unichar_string().string(), 00290 word->best_choice->unichar_lengths().string()) != 00291 AC_UNACCEPTABLE) { 00292 word_char_quality(word, page_res_it.row()->row, 00293 &char_quality, 00294 &accepted_char_quality); 00295 rej_word = char_quality != word->reject_map.length(); 00296 } 00297 } else { 00298 rej_word = TRUE; 00299 } 00300 if (rej_word) { 00301 /* 00302 Reject spacing if both current and prev words are rejected. 00303 NOTE - this is NOT restricted to FUZZY spaces. - When tried this 00304 generated more space errors. 00305 */ 00306 if (tessedit_use_reject_spaces && 00307 prev_word_rejected && 00308 page_res_it.prev_row() == page_res_it.row() && 00309 word->word->space() == 1) 00310 word->reject_spaces = TRUE; 00311 word->reject_map.rej_word_block_rej(); 00312 } 00313 prev_word_rejected = rej_word; 00314 page_res_it.forward(); 00315 } 00316 } else { 00317 if (tessedit_debug_block_rejection) { 00318 tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", 00319 block_no, page_res_it.block()->char_count, 00320 page_res_it.block()->rej_count); 00321 } 00322 00323 /* Walk rows in block testing for row rejection */ 00324 row_no = 0; 00325 while ((word = page_res_it.word()) != NULL && 00326 page_res_it.block() == current_block) { 00327 current_row = page_res_it.row(); 00328 row_no++; 00329 /* Reject whole row if: 00330 fraction of chars on row which are rejected exceed a limit AND 00331 fraction rejects which occur in WHOLE WERD rejects is LESS THAN a 00332 limit 00333 */ 00334 if (current_row->char_count > 0 && 00335 (current_row->rej_count * 100.0 / current_row->char_count) > 00336 tessedit_reject_row_percent && 00337 (current_row->whole_word_rej_count * 100.0 / 00338 current_row->rej_count) < 00339 tessedit_whole_wd_rej_row_percent) { 00340 if (tessedit_debug_block_rejection) { 00341 tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", 00342 row_no, current_row->char_count, 00343 current_row->rej_count); 00344 } 00345 prev_word_rejected = FALSE; 00346 while ((word = page_res_it.word()) != NULL && 00347 page_res_it.row () == current_row) { 00348 /* Preserve words on good docs unless they are mostly rejected*/ 00349 if (!tessedit_row_rej_good_docs && good_quality_doc) { 00350 rej_word = word->reject_map.reject_count() / 00351 static_cast<float>(word->reject_map.length()) > 00352 tessedit_good_doc_still_rowrej_wd; 00353 } else if (tessedit_preserve_row_rej_perfect_wds) { 00354 /* Preserve perfect words anyway */ 00355 rej_word = word->reject_map.reject_count() > 0 || 00356 word->reject_map.length () < tessedit_preserve_min_wd_len; 00357 if (rej_word && tessedit_dont_rowrej_good_wds && 00358 word->reject_map.length() >= tessedit_preserve_min_wd_len && 00359 acceptable_word_string(*word->uch_set, 00360 word->best_choice->unichar_string().string(), 00361 word->best_choice->unichar_lengths().string()) != 00362 AC_UNACCEPTABLE) { 00363 word_char_quality(word, page_res_it.row()->row, 00364 &char_quality, 00365 &accepted_char_quality); 00366 rej_word = char_quality != word->reject_map.length(); 00367 } 00368 } else { 00369 rej_word = TRUE; 00370 } 00371 if (rej_word) { 00372 /* 00373 Reject spacing if both current and prev words are rejected. 00374 NOTE - this is NOT restricted to FUZZY spaces. - When tried 00375 this generated more space errors. 00376 */ 00377 if (tessedit_use_reject_spaces && 00378 prev_word_rejected && 00379 page_res_it.prev_row() == page_res_it.row() && 00380 word->word->space () == 1) 00381 word->reject_spaces = TRUE; 00382 word->reject_map.rej_word_row_rej(); 00383 } 00384 prev_word_rejected = rej_word; 00385 page_res_it.forward(); 00386 } 00387 } else { 00388 if (tessedit_debug_block_rejection) { 00389 tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", 00390 row_no, current_row->char_count, current_row->rej_count); 00391 } 00392 while (page_res_it.word() != NULL && 00393 page_res_it.row() == current_row) 00394 page_res_it.forward(); 00395 } 00396 } 00397 } 00398 } 00399 } 00400 } 00401 00402 } // namespace tesseract 00403 00404 00405 /************************************************************************* 00406 * reject_whole_page() 00407 * Dont believe any of it - set the reject map to 00..00 in all words 00408 * 00409 *************************************************************************/ 00410 00411 void reject_whole_page(PAGE_RES_IT &page_res_it) { 00412 page_res_it.restart_page (); 00413 while (page_res_it.word () != NULL) { 00414 page_res_it.word ()->reject_map.rej_word_doc_rej (); 00415 page_res_it.forward (); 00416 } 00417 //whole page is rejected 00418 page_res_it.page_res->rejected = TRUE; 00419 } 00420 00421 namespace tesseract { 00422 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) { 00423 WERD_RES *word; 00424 GARBAGE_LEVEL garbage_level; 00425 PAGE_RES_IT copy_it; 00426 BOOL8 prev_potential_marked = FALSE; 00427 BOOL8 found_terrible_word = FALSE; 00428 BOOL8 ok_dict_word; 00429 00430 page_res_it.restart_page(); 00431 while (page_res_it.word() != NULL) { 00432 POLY_BLOCK* pb = page_res_it.block()->block->poly_block(); 00433 if (pb != NULL && !pb->IsText()) { 00434 page_res_it.forward(); 00435 continue; 00436 } 00437 word = page_res_it.word(); 00438 00439 if (crunch_early_convert_bad_unlv_chs) 00440 convert_bad_unlv_chs(word); 00441 00442 if (crunch_early_merge_tess_fails) 00443 word->merge_tess_fails(); 00444 00445 if (word->reject_map.accept_count () != 0) { 00446 found_terrible_word = FALSE; 00447 //Forget earlier potential crunches 00448 prev_potential_marked = FALSE; 00449 } 00450 else { 00451 ok_dict_word = safe_dict_word(word); 00452 garbage_level = garbage_word (word, ok_dict_word); 00453 00454 if ((garbage_level != G_NEVER_CRUNCH) && 00455 (terrible_word_crunch (word, garbage_level))) { 00456 if (crunch_debug > 0) { 00457 tprintf ("T CRUNCHING: \"%s\"\n", 00458 word->best_choice->unichar_string().string()); 00459 } 00460 word->unlv_crunch_mode = CR_KEEP_SPACE; 00461 if (prev_potential_marked) { 00462 while (copy_it.word () != word) { 00463 if (crunch_debug > 0) { 00464 tprintf ("P1 CRUNCHING: \"%s\"\n", 00465 copy_it.word()->best_choice->unichar_string().string()); 00466 } 00467 copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE; 00468 copy_it.forward (); 00469 } 00470 prev_potential_marked = FALSE; 00471 } 00472 found_terrible_word = TRUE; 00473 } 00474 else if ((garbage_level != G_NEVER_CRUNCH) && 00475 (potential_word_crunch (word, 00476 garbage_level, ok_dict_word))) { 00477 if (found_terrible_word) { 00478 if (crunch_debug > 0) { 00479 tprintf ("P2 CRUNCHING: \"%s\"\n", 00480 word->best_choice->unichar_string().string()); 00481 } 00482 word->unlv_crunch_mode = CR_KEEP_SPACE; 00483 } 00484 else if (!prev_potential_marked) { 00485 copy_it = page_res_it; 00486 prev_potential_marked = TRUE; 00487 if (crunch_debug > 1) { 00488 tprintf ("P3 CRUNCHING: \"%s\"\n", 00489 word->best_choice->unichar_string().string()); 00490 } 00491 } 00492 } 00493 else { 00494 found_terrible_word = FALSE; 00495 //Forget earlier potential crunches 00496 prev_potential_marked = FALSE; 00497 if (crunch_debug > 2) { 00498 tprintf ("NO CRUNCH: \"%s\"\n", 00499 word->best_choice->unichar_string().string()); 00500 } 00501 } 00502 } 00503 page_res_it.forward (); 00504 } 00505 } 00506 00507 00508 BOOL8 Tesseract::terrible_word_crunch(WERD_RES *word, 00509 GARBAGE_LEVEL garbage_level) { 00510 float rating_per_ch; 00511 int adjusted_len; 00512 int crunch_mode = 0; 00513 00514 if ((word->best_choice->unichar_string().length () == 0) || 00515 (strspn (word->best_choice->unichar_string().string(), " ") == 00516 word->best_choice->unichar_string().length ())) 00517 crunch_mode = 1; 00518 else { 00519 adjusted_len = word->reject_map.length (); 00520 if (adjusted_len > crunch_rating_max) 00521 adjusted_len = crunch_rating_max; 00522 rating_per_ch = word->best_choice->rating () / adjusted_len; 00523 00524 if (rating_per_ch > crunch_terrible_rating) 00525 crunch_mode = 2; 00526 else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) 00527 crunch_mode = 3; 00528 else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) && 00529 (garbage_level != G_OK)) 00530 crunch_mode = 4; 00531 else if ((rating_per_ch > crunch_poor_garbage_rate) && 00532 (garbage_level != G_OK)) 00533 crunch_mode = 5; 00534 } 00535 if (crunch_mode > 0) { 00536 if (crunch_debug > 2) { 00537 tprintf ("Terrible_word_crunch (%d) on \"%s\"\n", 00538 crunch_mode, word->best_choice->unichar_string().string()); 00539 } 00540 return TRUE; 00541 } 00542 else 00543 return FALSE; 00544 } 00545 00546 BOOL8 Tesseract::potential_word_crunch(WERD_RES *word, 00547 GARBAGE_LEVEL garbage_level, 00548 BOOL8 ok_dict_word) { 00549 float rating_per_ch; 00550 int adjusted_len; 00551 const char *str = word->best_choice->unichar_string().string(); 00552 const char *lengths = word->best_choice->unichar_lengths().string(); 00553 BOOL8 word_crunchable; 00554 int poor_indicator_count = 0; 00555 00556 word_crunchable = !crunch_leave_accept_strings || 00557 word->reject_map.length() < 3 || 00558 (acceptable_word_string(*word->uch_set, 00559 str, lengths) == AC_UNACCEPTABLE && 00560 !ok_dict_word); 00561 00562 adjusted_len = word->reject_map.length(); 00563 if (adjusted_len > 10) 00564 adjusted_len = 10; 00565 rating_per_ch = word->best_choice->rating() / adjusted_len; 00566 00567 if (rating_per_ch > crunch_pot_poor_rate) { 00568 if (crunch_debug > 2) { 00569 tprintf("Potential poor rating on \"%s\"\n", 00570 word->best_choice->unichar_string().string()); 00571 } 00572 poor_indicator_count++; 00573 } 00574 00575 if (word_crunchable && 00576 word->best_choice->certainty() < crunch_pot_poor_cert) { 00577 if (crunch_debug > 2) { 00578 tprintf("Potential poor cert on \"%s\"\n", 00579 word->best_choice->unichar_string().string()); 00580 } 00581 poor_indicator_count++; 00582 } 00583 00584 if (garbage_level != G_OK) { 00585 if (crunch_debug > 2) { 00586 tprintf("Potential garbage on \"%s\"\n", 00587 word->best_choice->unichar_string().string()); 00588 } 00589 poor_indicator_count++; 00590 } 00591 return poor_indicator_count >= crunch_pot_indicators; 00592 } 00593 00594 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { 00595 WERD_RES *word; 00596 PAGE_RES_IT copy_it; 00597 BOOL8 deleting_from_bol = FALSE; 00598 BOOL8 marked_delete_point = FALSE; 00599 inT16 debug_delete_mode; 00600 CRUNCH_MODE delete_mode; 00601 inT16 x_debug_delete_mode; 00602 CRUNCH_MODE x_delete_mode; 00603 00604 page_res_it.restart_page(); 00605 while (page_res_it.word() != NULL) { 00606 word = page_res_it.word(); 00607 00608 delete_mode = word_deletable (word, debug_delete_mode); 00609 if (delete_mode != CR_NONE) { 00610 if (word->word->flag (W_BOL) || deleting_from_bol) { 00611 if (crunch_debug > 0) { 00612 tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n", 00613 debug_delete_mode, 00614 word->best_choice->unichar_string().string()); 00615 } 00616 word->unlv_crunch_mode = delete_mode; 00617 deleting_from_bol = TRUE; 00618 } else if (word->word->flag(W_EOL)) { 00619 if (marked_delete_point) { 00620 while (copy_it.word() != word) { 00621 x_delete_mode = word_deletable (copy_it.word (), 00622 x_debug_delete_mode); 00623 if (crunch_debug > 0) { 00624 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n", 00625 x_debug_delete_mode, 00626 copy_it.word()->best_choice->unichar_string().string()); 00627 } 00628 copy_it.word ()->unlv_crunch_mode = x_delete_mode; 00629 copy_it.forward (); 00630 } 00631 } 00632 if (crunch_debug > 0) { 00633 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n", 00634 debug_delete_mode, 00635 word->best_choice->unichar_string().string()); 00636 } 00637 word->unlv_crunch_mode = delete_mode; 00638 deleting_from_bol = FALSE; 00639 marked_delete_point = FALSE; 00640 } 00641 else { 00642 if (!marked_delete_point) { 00643 copy_it = page_res_it; 00644 marked_delete_point = TRUE; 00645 } 00646 } 00647 } 00648 else { 00649 deleting_from_bol = FALSE; 00650 //Forget earlier potential crunches 00651 marked_delete_point = FALSE; 00652 } 00653 /* 00654 The following step has been left till now as the tess fails are used to 00655 determine if the word is deletable. 00656 */ 00657 if (!crunch_early_merge_tess_fails) 00658 word->merge_tess_fails(); 00659 page_res_it.forward (); 00660 } 00661 } 00662 00663 00664 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) { 00665 int i; 00666 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-"); 00667 UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" "); 00668 UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~"); 00669 UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^"); 00670 for (i = 0; i < word_res->reject_map.length(); ++i) { 00671 if (word_res->best_choice->unichar_id(i) == unichar_tilde) { 00672 word_res->best_choice->set_unichar_id(unichar_dash, i); 00673 if (word_res->reject_map[i].accepted ()) 00674 word_res->reject_map[i].setrej_unlv_rej (); 00675 } 00676 if (word_res->best_choice->unichar_id(i) == unichar_pow) { 00677 word_res->best_choice->set_unichar_id(unichar_space, i); 00678 if (word_res->reject_map[i].accepted ()) 00679 word_res->reject_map[i].setrej_unlv_rej (); 00680 } 00681 } 00682 } 00683 00684 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { 00685 enum STATES 00686 { 00687 JUNK, 00688 FIRST_UPPER, 00689 FIRST_LOWER, 00690 FIRST_NUM, 00691 SUBSEQUENT_UPPER, 00692 SUBSEQUENT_LOWER, 00693 SUBSEQUENT_NUM 00694 }; 00695 const char *str = word->best_choice->unichar_string().string(); 00696 const char *lengths = word->best_choice->unichar_lengths().string(); 00697 STATES state = JUNK; 00698 int len = 0; 00699 int isolated_digits = 0; 00700 int isolated_alphas = 0; 00701 int bad_char_count = 0; 00702 int tess_rejs = 0; 00703 int dodgy_chars = 0; 00704 int ok_chars; 00705 UNICHAR_ID last_char = -1; 00706 int alpha_repetition_count = 0; 00707 int longest_alpha_repetition_count = 0; 00708 int longest_lower_run_len = 0; 00709 int lower_string_count = 0; 00710 int longest_upper_run_len = 0; 00711 int upper_string_count = 0; 00712 int total_alpha_count = 0; 00713 int total_digit_count = 0; 00714 00715 for (; *str != '\0'; str += *(lengths++)) { 00716 len++; 00717 if (word->uch_set->get_isupper (str, *lengths)) { 00718 total_alpha_count++; 00719 switch (state) { 00720 case SUBSEQUENT_UPPER: 00721 case FIRST_UPPER: 00722 state = SUBSEQUENT_UPPER; 00723 upper_string_count++; 00724 if (longest_upper_run_len < upper_string_count) 00725 longest_upper_run_len = upper_string_count; 00726 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) { 00727 alpha_repetition_count++; 00728 if (longest_alpha_repetition_count < alpha_repetition_count) { 00729 longest_alpha_repetition_count = alpha_repetition_count; 00730 } 00731 } 00732 else { 00733 last_char = word->uch_set->unichar_to_id(str, *lengths); 00734 alpha_repetition_count = 1; 00735 } 00736 break; 00737 case FIRST_NUM: 00738 isolated_digits++; 00739 default: 00740 state = FIRST_UPPER; 00741 last_char = word->uch_set->unichar_to_id(str, *lengths); 00742 alpha_repetition_count = 1; 00743 upper_string_count = 1; 00744 break; 00745 } 00746 } 00747 else if (word->uch_set->get_islower (str, *lengths)) { 00748 total_alpha_count++; 00749 switch (state) { 00750 case SUBSEQUENT_LOWER: 00751 case FIRST_LOWER: 00752 state = SUBSEQUENT_LOWER; 00753 lower_string_count++; 00754 if (longest_lower_run_len < lower_string_count) 00755 longest_lower_run_len = lower_string_count; 00756 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) { 00757 alpha_repetition_count++; 00758 if (longest_alpha_repetition_count < alpha_repetition_count) { 00759 longest_alpha_repetition_count = alpha_repetition_count; 00760 } 00761 } 00762 else { 00763 last_char = word->uch_set->unichar_to_id(str, *lengths); 00764 alpha_repetition_count = 1; 00765 } 00766 break; 00767 case FIRST_NUM: 00768 isolated_digits++; 00769 default: 00770 state = FIRST_LOWER; 00771 last_char = word->uch_set->unichar_to_id(str, *lengths); 00772 alpha_repetition_count = 1; 00773 lower_string_count = 1; 00774 break; 00775 } 00776 } 00777 else if (word->uch_set->get_isdigit (str, *lengths)) { 00778 total_digit_count++; 00779 switch (state) { 00780 case FIRST_NUM: 00781 state = SUBSEQUENT_NUM; 00782 case SUBSEQUENT_NUM: 00783 break; 00784 case FIRST_UPPER: 00785 case FIRST_LOWER: 00786 isolated_alphas++; 00787 default: 00788 state = FIRST_NUM; 00789 break; 00790 } 00791 } 00792 else { 00793 if (*lengths == 1 && *str == ' ') 00794 tess_rejs++; 00795 else 00796 bad_char_count++; 00797 switch (state) { 00798 case FIRST_NUM: 00799 isolated_digits++; 00800 break; 00801 case FIRST_UPPER: 00802 case FIRST_LOWER: 00803 isolated_alphas++; 00804 default: 00805 break; 00806 } 00807 state = JUNK; 00808 } 00809 } 00810 00811 switch (state) { 00812 case FIRST_NUM: 00813 isolated_digits++; 00814 break; 00815 case FIRST_UPPER: 00816 case FIRST_LOWER: 00817 isolated_alphas++; 00818 default: 00819 break; 00820 } 00821 00822 if (crunch_include_numerals) { 00823 total_alpha_count += total_digit_count - isolated_digits; 00824 } 00825 00826 if (crunch_leave_ok_strings && len >= 4 && 00827 2 * (total_alpha_count - isolated_alphas) > len && 00828 longest_alpha_repetition_count < crunch_long_repetitions) { 00829 if ((crunch_accept_ok && 00830 acceptable_word_string(*word->uch_set, str, lengths) != 00831 AC_UNACCEPTABLE) || 00832 longest_lower_run_len > crunch_leave_lc_strings || 00833 longest_upper_run_len > crunch_leave_uc_strings) 00834 return G_NEVER_CRUNCH; 00835 } 00836 if (word->reject_map.length() > 1 && 00837 strpbrk(str, " ") == NULL && 00838 (word->best_choice->permuter() == SYSTEM_DAWG_PERM || 00839 word->best_choice->permuter() == FREQ_DAWG_PERM || 00840 word->best_choice->permuter() == USER_DAWG_PERM || 00841 word->best_choice->permuter() == NUMBER_PERM || 00842 acceptable_word_string(*word->uch_set, str, lengths) != 00843 AC_UNACCEPTABLE || ok_dict_word)) 00844 return G_OK; 00845 00846 ok_chars = len - bad_char_count - isolated_digits - 00847 isolated_alphas - tess_rejs; 00848 00849 if (crunch_debug > 3) { 00850 tprintf("garbage_word: \"%s\"\n", 00851 word->best_choice->unichar_string().string()); 00852 tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", 00853 len, 00854 bad_char_count, isolated_digits, isolated_alphas, tess_rejs); 00855 } 00856 if (bad_char_count == 0 && 00857 tess_rejs == 0 && 00858 (len > isolated_digits + isolated_alphas || len <= 2)) 00859 return G_OK; 00860 00861 if (tess_rejs > ok_chars || 00862 (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) 00863 return G_TERRIBLE; 00864 00865 if (len > 4) { 00866 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + 00867 isolated_alphas; 00868 if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5) 00869 return G_DODGY; 00870 else 00871 return G_OK; 00872 } else { 00873 dodgy_chars = 2 * tess_rejs + bad_char_count; 00874 if ((len == 4 && dodgy_chars > 2) || 00875 (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) 00876 return G_DODGY; 00877 else 00878 return G_OK; 00879 } 00880 } 00881 00882 00883 /************************************************************************* 00884 * word_deletable() 00885 * DELETE WERDS AT ENDS OF ROWS IF 00886 * Word is crunched && 00887 * ( string length = 0 OR 00888 * > 50% of chars are "|" (before merging) OR 00889 * certainty < -10 OR 00890 * rating /char > 60 OR 00891 * TOP of word is more than 0.5 xht BELOW baseline OR 00892 * BOTTOM of word is more than 0.5 xht ABOVE xht OR 00893 * length of word < 3xht OR 00894 * height of word < 0.7 xht OR 00895 * height of word > 3.0 xht OR 00896 * >75% of the outline BBs have longest dimension < 0.5xht 00897 *************************************************************************/ 00898 00899 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, inT16 &delete_mode) { 00900 int word_len = word->reject_map.length (); 00901 float rating_per_ch; 00902 TBOX box; //BB of word 00903 00904 if (word->unlv_crunch_mode == CR_NONE) { 00905 delete_mode = 0; 00906 return CR_NONE; 00907 } 00908 00909 if (word_len == 0) { 00910 delete_mode = 1; 00911 return CR_DELETE; 00912 } 00913 00914 if (word->rebuild_word != NULL) { 00915 // Cube leaves rebuild_word NULL. 00916 box = word->rebuild_word->bounding_box(); 00917 if (box.height () < crunch_del_min_ht * kBlnXHeight) { 00918 delete_mode = 4; 00919 return CR_DELETE; 00920 } 00921 00922 if (noise_outlines(word->rebuild_word)) { 00923 delete_mode = 5; 00924 return CR_DELETE; 00925 } 00926 } 00927 00928 if ((failure_count (word) * 1.5) > word_len) { 00929 delete_mode = 2; 00930 return CR_LOOSE_SPACE; 00931 } 00932 00933 if (word->best_choice->certainty () < crunch_del_cert) { 00934 delete_mode = 7; 00935 return CR_LOOSE_SPACE; 00936 } 00937 00938 rating_per_ch = word->best_choice->rating () / word_len; 00939 00940 if (rating_per_ch > crunch_del_rating) { 00941 delete_mode = 8; 00942 return CR_LOOSE_SPACE; 00943 } 00944 00945 if (box.top () < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) { 00946 delete_mode = 9; 00947 return CR_LOOSE_SPACE; 00948 } 00949 00950 if (box.bottom () > 00951 kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) { 00952 delete_mode = 10; 00953 return CR_LOOSE_SPACE; 00954 } 00955 00956 if (box.height () > crunch_del_max_ht * kBlnXHeight) { 00957 delete_mode = 11; 00958 return CR_LOOSE_SPACE; 00959 } 00960 00961 if (box.width () < crunch_del_min_width * kBlnXHeight) { 00962 delete_mode = 3; 00963 return CR_LOOSE_SPACE; 00964 } 00965 00966 delete_mode = 0; 00967 return CR_NONE; 00968 } 00969 00970 inT16 Tesseract::failure_count(WERD_RES *word) { 00971 const char *str = word->best_choice->unichar_string().string(); 00972 int tess_rejs = 0; 00973 00974 for (; *str != '\0'; str++) { 00975 if (*str == ' ') 00976 tess_rejs++; 00977 } 00978 return tess_rejs; 00979 } 00980 00981 00982 BOOL8 Tesseract::noise_outlines(TWERD *word) { 00983 TBOX box; // BB of outline 00984 inT16 outline_count = 0; 00985 inT16 small_outline_count = 0; 00986 inT16 max_dimension; 00987 float small_limit = kBlnXHeight * crunch_small_outlines_size; 00988 00989 for (int b = 0; b < word->NumBlobs(); ++b) { 00990 TBLOB* blob = word->blobs[b]; 00991 for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) { 00992 outline_count++; 00993 box = ol->bounding_box(); 00994 if (box.height() > box.width()) 00995 max_dimension = box.height(); 00996 else 00997 max_dimension = box.width(); 00998 if (max_dimension < small_limit) 00999 small_outline_count++; 01000 } 01001 } 01002 return small_outline_count >= outline_count; 01003 } 01004 01005 } // namespace tesseract