tesseract 3.03
fixspace.cpp
/******************************************************************
 * File:        fixspace.cpp  (Formerly fixspace.c)
 * Description: Implements a pass over the page res, exploring the alternative
 *              spacing possibilities, trying to use context to improve the
 *              word spacing
 * Author:      Phil Cheatle
 * Created:     Thu Oct 21 11:38:43 BST 1993
 *
 * (C) Copyright 1993, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include <ctype.h>
#include "reject.h"
#include "statistc.h"
#include "control.h"
#include "fixspace.h"
#include "genblob.h"
#include "tessvars.h"
#include "tessbox.h"
#include "secname.h"
#include "globals.h"
#include "tesseractclass.h"

#define PERFECT_WERDS   999
#define MAXSPACING      128      /*max expected spacing in pix */

namespace tesseract {

void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
                                 inT32 word_count,
                                 PAGE_RES *page_res) {
  BLOCK_RES_IT block_res_it;
  ROW_RES_IT row_res_it;
  WERD_RES_IT word_res_it_from;
  WERD_RES_IT word_res_it_to;
  WERD_RES *word_res;
  WERD_RES_LIST fuzzy_space_words;
  inT16 new_length;
  BOOL8 prevent_null_wd_fixsp;   // DONT process blobless wds
  inT32 word_index;              // current word

  block_res_it.set_to_list(&page_res->block_res_list);
  word_index = 0;
  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
       block_res_it.forward()) {
    row_res_it.set_to_list(&block_res_it.data()->row_res_list);
    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
         row_res_it.forward()) {
      word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
      while (!word_res_it_from.at_last()) {
        word_res = word_res_it_from.data();
        while (!word_res_it_from.at_last() &&
               !(word_res->combination ||
                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
          fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
                         block_res_it.data()->block);
          word_res = word_res_it_from.forward();
          word_index++;
          if (monitor != NULL) {
            monitor->ocr_alive = TRUE;
            monitor->progress = 90 + 5 * word_index / word_count;
            if (monitor->deadline_exceeded() ||
                (monitor->cancel != NULL &&
                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
              return;
          }
        }

        if (!word_res_it_from.at_last()) {
          word_res_it_to = word_res_it_from;
          prevent_null_wd_fixsp =
              word_res->word->cblob_list()->empty();
          if (check_debug_pt(word_res, 60))
            debug_fix_space_level.set_value(10);
          word_res_it_to.forward();
          word_index++;
          if (monitor != NULL) {
            monitor->ocr_alive = TRUE;
            monitor->progress = 90 + 5 * word_index / word_count;
            if (monitor->deadline_exceeded() ||
                (monitor->cancel != NULL &&
                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
              return;
          }
          while (!word_res_it_to.at_last() &&
                 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
                  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
            if (check_debug_pt(word_res, 60))
              debug_fix_space_level.set_value(10);
            if (word_res->word->cblob_list()->empty())
              prevent_null_wd_fixsp = TRUE;
            word_res = word_res_it_to.forward();
          }
          if (check_debug_pt(word_res, 60))
            debug_fix_space_level.set_value(10);
          if (word_res->word->cblob_list()->empty())
            prevent_null_wd_fixsp = TRUE;
          if (prevent_null_wd_fixsp) {
            word_res_it_from = word_res_it_to;
          } else {
            fuzzy_space_words.assign_to_sublist(&word_res_it_from,
                                                &word_res_it_to);
            fix_fuzzy_space_list(fuzzy_space_words,
                                 row_res_it.data()->row,
                                 block_res_it.data()->block);
            new_length = fuzzy_space_words.length();
            word_res_it_from.add_list_before(&fuzzy_space_words);
            for (;
                 !word_res_it_from.at_last() && new_length > 0;
                 new_length--) {
              word_res_it_from.forward();
            }
          }
          if (test_pt)
            debug_fix_space_level.set_value(0);
        }
        fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
                       block_res_it.data()->block);
        // Last word in row
      }
    }
  }
}

void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
                                     ROW *row,
                                     BLOCK* block) {
  inT16 best_score;
  WERD_RES_LIST current_perm;
  inT16 current_score;
  BOOL8 improved = FALSE;

  best_score = eval_word_spacing(best_perm);  // default score
  dump_words(best_perm, best_score, 1, improved);

  if (best_score != PERFECT_WERDS)
    initialise_search(best_perm, current_perm);

  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
    match_current_words(current_perm, row, block);
    current_score = eval_word_spacing(current_perm);
    dump_words(current_perm, current_score, 2, improved);
    if (current_score > best_score) {
      best_perm.clear();
      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
      best_score = current_score;
      improved = TRUE;
    }
    if (current_score < PERFECT_WERDS)
      transform_to_next_perm(current_perm);
  }
  dump_words(best_perm, best_score, 3, improved);
}

}  // namespace tesseract

void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
  WERD_RES_IT src_it(&src_list);
  WERD_RES_IT new_it(&new_list);
  WERD_RES *src_wd;
  WERD_RES *new_wd;

  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
    src_wd = src_it.data();
    if (!src_wd->combination) {
      new_wd = WERD_RES::deep_copy(src_wd);
      new_wd->combination = FALSE;
      new_wd->part_of_combo = FALSE;
      new_it.add_after_then_move(new_wd);
    }
  }
}


namespace tesseract {
void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
                                    BLOCK* block) {
  WERD_RES_IT word_it(&words);
  WERD_RES *word;
  // Since we are not using PAGE_RES to iterate over words, we need to update
  // prev_word_best_choice_ before calling classify_word_pass2().
  prev_word_best_choice_ = NULL;
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    word = word_it.data();
    if ((!word->part_of_combo) && (word->box_word == NULL)) {
      WordData word_data(block, row, word);
      SetupWordPassN(2, &word_data);
      classify_word_and_language(&Tesseract::classify_word_pass2, &word_data);
    }
    prev_word_best_choice_ = word->best_choice;
  }
}


inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
  WERD_RES_IT word_res_it(&word_res_list);
  inT16 total_score = 0;
  inT16 word_count = 0;
  inT16 done_word_count = 0;
  inT16 word_len;
  inT16 i;
  inT16 offset;
  WERD_RES *word;                 // current word
  inT16 prev_word_score = 0;
  BOOL8 prev_word_done = FALSE;
  BOOL8 prev_char_1 = FALSE;      // prev ch a "1/I/l"?
  BOOL8 prev_char_digit = FALSE;  // prev ch 2..9 or 0
  BOOL8 current_char_1 = FALSE;
  BOOL8 current_word_ok_so_far;
  STRING punct_chars = "!\"`',.:;";
  BOOL8 prev_char_punct = FALSE;
  BOOL8 current_char_punct = FALSE;
  BOOL8 word_done = FALSE;

  do {
    word = word_res_it.data();
    word_done = fixspace_thinks_word_done(word);
    word_count++;
    if (word->tess_failed) {
      total_score += prev_word_score;
      if (prev_word_done)
        done_word_count++;
      prev_word_score = 0;
      prev_char_1 = FALSE;
      prev_char_digit = FALSE;
      prev_word_done = FALSE;
    } else {
      /*
        Can we add the prev word score and potentially count this word?
        Yes IF it didnt end in a 1 when the first char of this word is a digit
          AND it didnt end in a digit when the first char of this word is a 1
      */
      word_len = word->reject_map.length();
      current_word_ok_so_far = FALSE;
      if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
            (prev_char_digit && (
                (word_done &&
                 word->best_choice->unichar_lengths().string()[0] == 1 &&
                 word->best_choice->unichar_string()[0] == '1') ||
                (!word_done && STRING(conflict_set_I_l_1).contains(
                     word->best_choice->unichar_string()[0])))))) {
        total_score += prev_word_score;
        if (prev_word_done)
          done_word_count++;
        current_word_ok_so_far = word_done;
      }

      if (current_word_ok_so_far) {
        prev_word_done = TRUE;
        prev_word_score = word_len;
      } else {
        prev_word_done = FALSE;
        prev_word_score = 0;
      }

      /* Add 1 to total score for every joined 1 regardless of context and
         rejtn */
      for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
        current_char_1 = word->best_choice->unichar_string()[i] == '1';
        if (prev_char_1 || (current_char_1 && (i > 0)))
          total_score++;
        prev_char_1 = current_char_1;
      }

      /* Add 1 to total score for every joined punctuation regardless of context
         and rejtn */
      if (tessedit_prefer_joined_punct) {
        for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
             offset += word->best_choice->unichar_lengths()[i++]) {
          current_char_punct =
              punct_chars.contains(word->best_choice->unichar_string()[offset]);
          if (prev_char_punct || (current_char_punct && i > 0))
            total_score++;
          prev_char_punct = current_char_punct;
        }
      }
      prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
      for (i = 0, offset = 0; i < word_len - 1;
           offset += word->best_choice->unichar_lengths()[i++]);
      prev_char_1 =
          ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
           || (!word_done && STRING(conflict_set_I_l_1).contains(
                 word->best_choice->unichar_string()[offset])));
    }
    /* Find next word */
    do {
      word_res_it.forward();
    } while (word_res_it.data()->part_of_combo);
  } while (!word_res_it.at_first());
  total_score += prev_word_score;
  if (prev_word_done)
    done_word_count++;
  if (done_word_count == word_count)
    return PERFECT_WERDS;
  else
    return total_score;
}

BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
  int i;
  int offset;

  for (i = 0, offset = 0; i < char_position;
       offset += word->best_choice->unichar_lengths()[i++]);
  return (
      word->uch_set->get_isdigit(
          word->best_choice->unichar_string().string() + offset,
          word->best_choice->unichar_lengths()[i]) ||
      (word->best_choice->permuter() == NUMBER_PERM &&
       STRING(numeric_punctuation).contains(
           word->best_choice->unichar_string().string()[offset])));
}

}  // namespace tesseract


void transform_to_next_perm(WERD_RES_LIST &words) {
  WERD_RES_IT word_it(&words);
  WERD_RES_IT prev_word_it(&words);
  WERD_RES *word;
  WERD_RES *prev_word;
  WERD_RES *combo;
  WERD *copy_word;
  inT16 prev_right = -MAX_INT16;
  TBOX box;
  inT16 gap;
  inT16 min_gap = MAX_INT16;

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    word = word_it.data();
    if (!word->part_of_combo) {
      box = word->word->bounding_box();
      if (prev_right > -MAX_INT16) {
        gap = box.left() - prev_right;
        if (gap < min_gap)
          min_gap = gap;
      }
      prev_right = box.right();
    }
  }
  if (min_gap < MAX_INT16) {
    prev_right = -MAX_INT16;  // back to start
    word_it.set_to_list(&words);
    // Note: we can't use cycle_pt due to inserted combos at start of list.
    for (; (prev_right == -MAX_INT16) || !word_it.at_first();
         word_it.forward()) {
      word = word_it.data();
      if (!word->part_of_combo) {
        box = word->word->bounding_box();
        if (prev_right > -MAX_INT16) {
          gap = box.left() - prev_right;
          if (gap <= min_gap) {
            prev_word = prev_word_it.data();
            if (prev_word->combination) {
              combo = prev_word;
            } else {
              /* Make a new combination and insert before
               * the first word being joined. */
              copy_word = new WERD;
              *copy_word = *(prev_word->word);
              // deep copy
              combo = new WERD_RES(copy_word);
              combo->combination = TRUE;
              combo->x_height = prev_word->x_height;
              prev_word->part_of_combo = TRUE;
              prev_word_it.add_before_then_move(combo);
            }
            combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
            if (word->combination) {
              combo->word->join_on(word->word);
              // Move blobs to combo
              // old combo no longer needed
              delete word_it.extract();
            } else {
              // Copy current wd to combo
              combo->copy_on(word);
              word->part_of_combo = TRUE;
            }
            combo->done = FALSE;
            combo->ClearResults();
          } else {
            prev_word_it = word_it;  // catch up
          }
        }
        prev_right = box.right();
      }
    }
  } else {
    words.clear();  // signal termination
  }
}

namespace tesseract {
void Tesseract::dump_words(WERD_RES_LIST &perm, inT16 score,
                           inT16 mode, BOOL8 improved) {
  WERD_RES_IT word_res_it(&perm);

  if (debug_fix_space_level > 0) {
    if (mode == 1) {
      stats_.dump_words_str = "";
      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
           word_res_it.forward()) {
        if (!word_res_it.data()->part_of_combo) {
          stats_.dump_words_str +=
              word_res_it.data()->best_choice->unichar_string();
          stats_.dump_words_str += ' ';
        }
      }
    }

#ifndef SECURE_NAMES
    if (debug_fix_space_level > 1) {
      switch (mode) {
        case 1:
          tprintf("EXTRACTED (%d): \"", score);
          break;
        case 2:
          tprintf("TESTED (%d): \"", score);
          break;
        case 3:
          tprintf("RETURNED (%d): \"", score);
          break;
      }

      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
           word_res_it.forward()) {
        if (!word_res_it.data()->part_of_combo) {
          tprintf("%s/%1d ",
                  word_res_it.data()->best_choice->unichar_string().string(),
                  (int)word_res_it.data()->best_choice->permuter());
        }
      }
      tprintf("\"\n");
    } else if (improved) {
      tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
           word_res_it.forward()) {
        if (!word_res_it.data()->part_of_combo) {
          tprintf("%s/%1d ",
                  word_res_it.data()->best_choice->unichar_string().string(),
                  (int)word_res_it.data()->best_choice->permuter());
        }
      }
      tprintf("\"\n");
    }
#endif
  }
}

BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
  if (word->done)
    return TRUE;

  /*
    Use all the standard pass 2 conditions for mode 5 in set_done() in
    reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT
    CARE WHETHER WE HAVE of/at on/an etc.
  */
  if (fixsp_done_mode > 0 &&
      (word->tess_accepted ||
       (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
       fixsp_done_mode == 3) &&
      (strchr(word->best_choice->unichar_string().string(), ' ') == NULL) &&
      ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
       (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
       (word->best_choice->permuter() == USER_DAWG_PERM) ||
       (word->best_choice->permuter() == NUMBER_PERM))) {
    return TRUE;
  } else {
    return FALSE;
  }
}


void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
                               BLOCK* block) {
  WERD_RES *word_res;
  WERD_RES_LIST sub_word_list;
  WERD_RES_IT sub_word_list_it(&sub_word_list);
  inT16 blob_index;
  inT16 new_length;
  float junk;

  word_res = word_res_it.data();
  if (word_res->word->flag(W_REP_CHAR) ||
      word_res->combination ||
      word_res->part_of_combo ||
      !word_res->word->flag(W_DONT_CHOP))
    return;

  blob_index = worst_noise_blob(word_res, &junk);
  if (blob_index < 0)
    return;

  if (debug_fix_space_level > 1) {
    tprintf("FP fixspace working on \"%s\"\n",
            word_res->best_choice->unichar_string().string());
  }
  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
  sub_word_list_it.add_after_stay_put(word_res_it.extract());
  fix_noisy_space_list(sub_word_list, row, block);
  new_length = sub_word_list.length();
  word_res_it.add_list_before(&sub_word_list);
  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
    word_res_it.forward();
  }
}

void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
                                     BLOCK* block) {
  inT16 best_score;
  WERD_RES_IT best_perm_it(&best_perm);
  WERD_RES_LIST current_perm;
  WERD_RES_IT current_perm_it(&current_perm);
  WERD_RES *old_word_res;
  inT16 current_score;
  BOOL8 improved = FALSE;

  best_score = fp_eval_word_spacing(best_perm);  // default score

  dump_words(best_perm, best_score, 1, improved);

  old_word_res = best_perm_it.data();
  // Even deep_copy doesn't copy the underlying WERD unless its combination
  // flag is true!.
  old_word_res->combination = TRUE;   // Kludge to force deep copy
  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
  old_word_res->combination = FALSE;  // Undo kludge

  break_noisiest_blob_word(current_perm);

  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
    match_current_words(current_perm, row, block);
    current_score = fp_eval_word_spacing(current_perm);
    dump_words(current_perm, current_score, 2, improved);
    if (current_score > best_score) {
      best_perm.clear();
      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
      best_score = current_score;
      improved = TRUE;
    }
    if (current_score < PERFECT_WERDS) {
      break_noisiest_blob_word(current_perm);
    }
  }
  dump_words(best_perm, best_score, 3, improved);
}


void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
  WERD_RES_IT word_it(&words);
  WERD_RES_IT worst_word_it;
  float worst_noise_score = 9999;
  int worst_blob_index = -1;  // Noisiest blob of noisiest wd
  int blob_index;             // of wds noisiest blob
  float noise_score;          // of wds noisiest blob
  WERD_RES *word_res;
  C_BLOB_IT blob_it;
  C_BLOB_IT rej_cblob_it;
  C_BLOB_LIST new_blob_list;
  C_BLOB_IT new_blob_it;
  C_BLOB_IT new_rej_cblob_it;
  WERD *new_word;
  inT16 start_of_noise_blob;
  inT16 i;

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    blob_index = worst_noise_blob(word_it.data(), &noise_score);
    if (blob_index > -1 && worst_noise_score > noise_score) {
      worst_noise_score = noise_score;
      worst_blob_index = blob_index;
      worst_word_it = word_it;
    }
  }
  if (worst_blob_index < 0) {
    words.clear();  // signal termination
    return;
  }

  /* Now split the worst_word_it */

  word_res = worst_word_it.data();

  /* Move blobs before noise blob to a new bloblist */

  new_blob_it.set_to_list(&new_blob_list);
  blob_it.set_to_list(word_res->word->cblob_list());
  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
    new_blob_it.add_after_then_move(blob_it.extract());
  }
  start_of_noise_blob = blob_it.data()->bounding_box().left();
  delete blob_it.extract();  // throw out noise blob

  new_word = new WERD(&new_blob_list, word_res->word);
  new_word->set_flag(W_EOL, FALSE);
  word_res->word->set_flag(W_BOL, FALSE);
  word_res->word->set_blanks(1);  // After break

  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
  for (;
       (!rej_cblob_it.empty() &&
        (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
       rej_cblob_it.forward()) {
    new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
  }

  WERD_RES* new_word_res = new WERD_RES(new_word);
  new_word_res->combination = TRUE;
  worst_word_it.add_before_then_move(new_word_res);

  word_res->ClearResults();
}

inT16 Tesseract::worst_noise_blob(WERD_RES *word_res,
                                  float *worst_noise_score) {
  float noise_score[512];
  int i;
  int min_noise_blob;    // 1st contender
  int max_noise_blob;    // last contender
  int non_noise_count;
  int worst_noise_blob;  // Worst blob
  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
  float non_noise_limit = kBlnXHeight * 0.8;

  if (word_res->rebuild_word == NULL)
    return -1;  // Can't handle cube words.

  // Normalised.
  int blob_count = word_res->box_word->length();
  ASSERT_HOST(blob_count <= 512);
  if (blob_count < 5)
    return -1;  // too short to split

  /* Get the noise scores for all blobs */

#ifndef SECURE_NAMES
  if (debug_fix_space_level > 5)
    tprintf("FP fixspace Noise metrics for \"%s\": ",
            word_res->best_choice->unichar_string().string());
#endif

  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
    TBLOB* blob = word_res->rebuild_word->blobs[i];
    if (word_res->reject_map[i].accepted())
      noise_score[i] = non_noise_limit;
    else
      noise_score[i] = blob_noise_score(blob);

    if (debug_fix_space_level > 5)
      tprintf("%1.1f ", noise_score[i]);
  }
  if (debug_fix_space_level > 5)
    tprintf("\n");

  /* Now find the worst one which is far enough away from the end of the word */

  non_noise_count = 0;
  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;

  min_noise_blob = i;

  non_noise_count = 0;
  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
       i--) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;

  max_noise_blob = i;

  if (min_noise_blob > max_noise_blob)
    return -1;

  *worst_noise_score = small_limit;
  worst_noise_blob = -1;
  for (i = min_noise_blob; i <= max_noise_blob; i++) {
    if (noise_score[i] < *worst_noise_score) {
      worst_noise_blob = i;
      *worst_noise_score = noise_score[i];
    }
  }
  return worst_noise_blob;
}

float Tesseract::blob_noise_score(TBLOB *blob) {
  TBOX box;  // BB of outline
  inT16 outline_count = 0;
  inT16 max_dimension;
  inT16 largest_outline_dimension = 0;

  for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
    outline_count++;
    box = ol->bounding_box();
    if (box.height() > box.width()) {
      max_dimension = box.height();
    } else {
      max_dimension = box.width();
    }

    if (largest_outline_dimension < max_dimension)
      largest_outline_dimension = max_dimension;
  }

  if (outline_count > 5) {
    // penalise LOTS of blobs
    largest_outline_dimension *= 2;
  }

  box = blob->bounding_box();
  if (box.bottom() > kBlnBaselineOffset * 4 ||
      box.top() < kBlnBaselineOffset / 2) {
    // Lax blob is if high or low
    largest_outline_dimension /= 2;
  }

  return largest_outline_dimension;
}
}  // namespace tesseract

void fixspace_dbg(WERD_RES *word) {
  TBOX box = word->word->bounding_box();
  BOOL8 show_map_detail = FALSE;
  inT16 i;

  box.print();
  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
  tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
          word->word->cblob_list()->length(),
          word->rebuild_word->NumBlobs(),
          word->box_word->length());
  word->reject_map.print(debug_fp);
tprintf("\n"); 00812 if (show_map_detail) { 00813 tprintf("\"%s\"\n", word->best_choice->unichar_string().string()); 00814 for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) { 00815 tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]); 00816 word->reject_map[i].full_print(debug_fp); 00817 } 00818 } 00819 00820 tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); 00821 tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); 00822 } 00823 00824 00833 namespace tesseract { 00834 inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) { 00835 WERD_RES_IT word_it(&word_res_list); 00836 WERD_RES *word; 00837 inT16 word_length; 00838 inT16 score = 0; 00839 inT16 i; 00840 float small_limit = kBlnXHeight * fixsp_small_outlines_size; 00841 00842 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00843 word = word_it.data(); 00844 if (word->rebuild_word == NULL) 00845 continue; // Can't handle cube words. 00846 word_length = word->reject_map.length(); 00847 if (word->done || 00848 word->tess_accepted || 00849 word->best_choice->permuter() == SYSTEM_DAWG_PERM || 00850 word->best_choice->permuter() == FREQ_DAWG_PERM || 00851 word->best_choice->permuter() == USER_DAWG_PERM || 00852 safe_dict_word(word) > 0) { 00853 int num_blobs = word->rebuild_word->NumBlobs(); 00854 UNICHAR_ID space = word->uch_set->unichar_to_id(" "); 00855 for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) { 00856 TBLOB* blob = word->rebuild_word->blobs[i]; 00857 if (word->best_choice->unichar_id(i) == space || 00858 blob_noise_score(blob) < small_limit) { 00859 score -= 1; // penalise possibly erroneous non-space 00860 } else if (word->reject_map[i].accepted()) { 00861 score++; 00862 } 00863 } 00864 } 00865 } 00866 if (score < 0) 00867 score = 0; 00868 return score; 00869 } 00870 00871 } // namespace tesseract