/******************************************************************
 * File:        fixspace.cpp  (Formerly fixspace.c)
 * Description: Implements a pass over the page res, exploring the alternative
 *              spacing possibilities, trying to use context to improve the
 *              word spacing
 * Author:      Phil Cheatle
 * Created:     Thu Oct 21 11:38:43 BST 1993
 *
 * (C) Copyright 1993, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include <ctype.h>
#include "reject.h"
#include "statistc.h"
#include "control.h"
#include "fixspace.h"
#include "genblob.h"
#include "tessvars.h"
#include "tessbox.h"
#include "secname.h"
#include "globals.h"
#include "tesseractclass.h"

#define PERFECT_WERDS   999
#define MAXSPACING      128      /* max expected spacing in pix */

namespace tesseract {

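/**
 * fix_fuzzy_spaces()
 *
 * Walk over the page finding sequences of words joined by fuzzy spaces.
 * Extract each sequence as a sublist, explore its alternative spacing
 * arrangements with fix_fuzzy_space_list(), then splice the best arrangement
 * back into the row. Sequences containing blobless words are left unchanged,
 * and the progress monitor is updated and checked for cancellation as words
 * are processed.
 */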
void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
                                 inT32 word_count,
                                 PAGE_RES *page_res) {
  BLOCK_RES_IT block_res_it;
  ROW_RES_IT row_res_it;
  WERD_RES_IT word_res_it_from;
  WERD_RES_IT word_res_it_to;
  WERD_RES *word_res;
  WERD_RES_LIST fuzzy_space_words;
  inT16 new_length;
  BOOL8 prevent_null_wd_fixsp;   // Don't process blobless words.
  inT32 word_index;              // current word

  block_res_it.set_to_list(&page_res->block_res_list);
  word_index = 0;
  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
       block_res_it.forward()) {
    row_res_it.set_to_list(&block_res_it.data()->row_res_list);
    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
         row_res_it.forward()) {
      word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
      while (!word_res_it_from.at_last()) {
        word_res = word_res_it_from.data();
        while (!word_res_it_from.at_last() &&
               !(word_res->combination ||
                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
          fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
                         block_res_it.data()->block);
          word_res = word_res_it_from.forward();
          word_index++;
          if (monitor != NULL) {
            monitor->ocr_alive = TRUE;
            monitor->progress = 90 + 5 * word_index / word_count;
            if (monitor->deadline_exceeded() ||
                (monitor->cancel != NULL &&
                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
              return;
          }
        }

        if (!word_res_it_from.at_last()) {
          word_res_it_to = word_res_it_from;
          prevent_null_wd_fixsp =
            word_res->word->cblob_list()->empty();
          if (check_debug_pt(word_res, 60))
            debug_fix_space_level.set_value(10);
          word_res_it_to.forward();
          word_index++;
          if (monitor != NULL) {
            monitor->ocr_alive = TRUE;
            monitor->progress = 90 + 5 * word_index / word_count;
            if (monitor->deadline_exceeded() ||
                (monitor->cancel != NULL &&
                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
              return;
          }
          while (!word_res_it_to.at_last() &&
                 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
                  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
            if (check_debug_pt(word_res, 60))
              debug_fix_space_level.set_value(10);
            if (word_res->word->cblob_list()->empty())
              prevent_null_wd_fixsp = TRUE;
            word_res = word_res_it_to.forward();
          }
          if (check_debug_pt(word_res, 60))
            debug_fix_space_level.set_value(10);
          if (word_res->word->cblob_list()->empty())
            prevent_null_wd_fixsp = TRUE;
          if (prevent_null_wd_fixsp) {
            word_res_it_from = word_res_it_to;
          } else {
            fuzzy_space_words.assign_to_sublist(&word_res_it_from,
                                                &word_res_it_to);
            fix_fuzzy_space_list(fuzzy_space_words,
                                 row_res_it.data()->row,
                                 block_res_it.data()->block);
            new_length = fuzzy_space_words.length();
            word_res_it_from.add_list_before(&fuzzy_space_words);
            for (;
                 !word_res_it_from.at_last() && new_length > 0;
                 new_length--) {
              word_res_it_from.forward();
            }
          }
          if (test_pt)
            debug_fix_space_level.set_value(0);
        }
        fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
                       block_res_it.data()->block);
        // Last word in row
      }
    }
  }
}

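/**
 * fix_fuzzy_space_list()
 *
 * Score the initial word list for a fuzzy-space sequence with
 * eval_word_spacing(), then repeatedly close the smallest remaining word gaps
 * (transform_to_next_perm), re-classify the result and re-score, keeping the
 * best-scoring permutation in best_perm. Stops when a perfect score is
 * reached or no permutations remain.
 */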
void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
                                     ROW *row,
                                     BLOCK* block) {
  inT16 best_score;
  WERD_RES_LIST current_perm;
  inT16 current_score;
  BOOL8 improved = FALSE;

  best_score = eval_word_spacing(best_perm);  // default score
  dump_words(best_perm, best_score, 1, improved);

  if (best_score != PERFECT_WERDS)
    initialise_search(best_perm, current_perm);

  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
    match_current_words(current_perm, row, block);
    current_score = eval_word_spacing(current_perm);
    dump_words(current_perm, current_score, 2, improved);
    if (current_score > best_score) {
      best_perm.clear();
      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
      best_score = current_score;
      improved = TRUE;
    }
    if (current_score < PERFECT_WERDS)
      transform_to_next_perm(current_perm);
  }
  dump_words(best_perm, best_score, 3, improved);
}

}  // namespace tesseract

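/**
 * initialise_search()
 *
 * Deep copy the non-combination words from the source list into a fresh list
 * so that the search can modify them without disturbing the original
 * permutation.
 */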
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
  WERD_RES_IT src_it(&src_list);
  WERD_RES_IT new_it(&new_list);
  WERD_RES *src_wd;
  WERD_RES *new_wd;

  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
    src_wd = src_it.data();
    if (!src_wd->combination) {
      new_wd = WERD_RES::deep_copy(src_wd);
      new_wd->combination = FALSE;
      new_wd->part_of_combo = FALSE;
      new_it.add_after_then_move(new_wd);
    }
  }
}


namespace tesseract {
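/**
 * match_current_words()
 *
 * Run the pass-2 classifier over every word in the list that is not part of a
 * combination and has not yet been recognized (box_word == NULL), keeping
 * prev_word_best_choice_ up to date so the classifier has its usual context.
 */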
void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
                                    BLOCK* block) {
  WERD_RES_IT word_it(&words);
  WERD_RES *word;
  // Since we are not using PAGE_RES to iterate over words, we need to update
  // prev_word_best_choice_ before calling classify_word_pass2().
  prev_word_best_choice_ = NULL;
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    word = word_it.data();
    if ((!word->part_of_combo) && (word->box_word == NULL)) {
      WordData word_data(block, row, word);
      SetupWordPassN(2, &word_data);
      classify_word_and_language(&Tesseract::classify_word_pass2, &word_data);
    }
    prev_word_best_choice_ = word->best_choice;
  }
}

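/**
 * eval_word_spacing()
 *
 * Score a permutation of words. The basic measure is the number of characters
 * in words that are contextually confirmed ("done"); if every word is done
 * the permutation is deemed perfect and PERFECT_WERDS is returned.
 *
 * A word is not counted when the join across the fuzzy space looks like a
 * broken number: a word ending in "1"/"I"/"l" followed by a word starting
 * with a digit, or a word ending in a digit followed by a word starting with
 * "1" (or "I"/"l" if not done), since "561 63" would otherwise score the same
 * as "56163". Joined "1"s, and optionally joined punctuation
 * (tessedit_prefer_joined_punct), add to the score regardless of context or
 * rejection.
 */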
inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
  WERD_RES_IT word_res_it(&word_res_list);
  inT16 total_score = 0;
  inT16 word_count = 0;
  inT16 done_word_count = 0;
  inT16 word_len;
  inT16 i;
  inT16 offset;
  WERD_RES *word;                 // current word
  inT16 prev_word_score = 0;
  BOOL8 prev_word_done = FALSE;
  BOOL8 prev_char_1 = FALSE;      // prev ch a "1/I/l"?
  BOOL8 prev_char_digit = FALSE;  // prev ch 2..9 or 0
  BOOL8 current_char_1 = FALSE;
  BOOL8 current_word_ok_so_far;
  STRING punct_chars = "!\"`',.:;";
  BOOL8 prev_char_punct = FALSE;
  BOOL8 current_char_punct = FALSE;
  BOOL8 word_done = FALSE;

  do {
    word = word_res_it.data();
    word_done = fixspace_thinks_word_done(word);
    word_count++;
    if (word->tess_failed) {
      total_score += prev_word_score;
      if (prev_word_done)
        done_word_count++;
      prev_word_score = 0;
      prev_char_1 = FALSE;
      prev_char_digit = FALSE;
      prev_word_done = FALSE;
    } else {
      /*
        Can we add the prev word score and potentially count this word?
        Yes IF it didn't end in a 1 when the first char of this word is a digit
          AND it didn't end in a digit when the first char of this word is a 1
      */
      word_len = word->reject_map.length();
      current_word_ok_so_far = FALSE;
      if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
            (prev_char_digit && (
                (word_done &&
                 word->best_choice->unichar_lengths().string()[0] == 1 &&
                 word->best_choice->unichar_string()[0] == '1') ||
                (!word_done && STRING(conflict_set_I_l_1).contains(
                      word->best_choice->unichar_string()[0])))))) {
        total_score += prev_word_score;
        if (prev_word_done)
          done_word_count++;
        current_word_ok_so_far = word_done;
      }

      if (current_word_ok_so_far) {
        prev_word_done = TRUE;
        prev_word_score = word_len;
      } else {
        prev_word_done = FALSE;
        prev_word_score = 0;
      }

      /* Add 1 to total score for every joined 1 regardless of context and
         rejection */
      for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
        current_char_1 = word->best_choice->unichar_string()[i] == '1';
        if (prev_char_1 || (current_char_1 && (i > 0)))
          total_score++;
        prev_char_1 = current_char_1;
      }

      /* Add 1 to total score for every joined punctuation regardless of
         context and rejection */
      if (tessedit_prefer_joined_punct) {
        for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
             offset += word->best_choice->unichar_lengths()[i++]) {
          current_char_punct =
            punct_chars.contains(word->best_choice->unichar_string()[offset]);
          if (prev_char_punct || (current_char_punct && i > 0))
            total_score++;
          prev_char_punct = current_char_punct;
        }
      }
      prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
      for (i = 0, offset = 0; i < word_len - 1;
           offset += word->best_choice->unichar_lengths()[i++]);
      prev_char_1 =
          ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
           || (!word_done && STRING(conflict_set_I_l_1).contains(
                   word->best_choice->unichar_string()[offset])));
    }
    /* Find next word */
    do {
      word_res_it.forward();
    } while (word_res_it.data()->part_of_combo);
  } while (!word_res_it.at_first());
  total_score += prev_word_score;
  if (prev_word_done)
    done_word_count++;
  if (done_word_count == word_count)
    return PERFECT_WERDS;
  else
    return total_score;
}

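/**
 * digit_or_numeric_punct()
 *
 * Return TRUE if the unichar at the given character position is a digit, or
 * is numeric punctuation in a word that was recognized as a number
 * (NUMBER_PERM permuter).
 */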
BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
  int i;
  int offset;

  for (i = 0, offset = 0; i < char_position;
       offset += word->best_choice->unichar_lengths()[i++]);
  return (
      word->uch_set->get_isdigit(
          word->best_choice->unichar_string().string() + offset,
          word->best_choice->unichar_lengths()[i]) ||
      (word->best_choice->permuter() == NUMBER_PERM &&
       STRING(numeric_punctuation).contains(
           word->best_choice->unichar_string().string()[offset])));
}

}  // namespace tesseract


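/**
 * transform_to_next_perm()
 *
 * Find the smallest gap between adjacent words in the list, then close every
 * gap of that size by joining the words on either side into a combination
 * word, creating a new combination WERD_RES where necessary; the joined words
 * are re-recognized by the caller. If no gaps remain the list is cleared to
 * signal termination.
 */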
void transform_to_next_perm(WERD_RES_LIST &words) {
  WERD_RES_IT word_it(&words);
  WERD_RES_IT prev_word_it(&words);
  WERD_RES *word;
  WERD_RES *prev_word;
  WERD_RES *combo;
  WERD *copy_word;
  inT16 prev_right = -MAX_INT16;
  TBOX box;
  inT16 gap;
  inT16 min_gap = MAX_INT16;

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    word = word_it.data();
    if (!word->part_of_combo) {
      box = word->word->bounding_box();
      if (prev_right > -MAX_INT16) {
        gap = box.left() - prev_right;
        if (gap < min_gap)
          min_gap = gap;
      }
      prev_right = box.right();
    }
  }
  if (min_gap < MAX_INT16) {
    prev_right = -MAX_INT16;        // back to start
    word_it.set_to_list(&words);
    // Note: we can't use cycle_pt due to inserted combos at start of list.
    for (; (prev_right == -MAX_INT16) || !word_it.at_first();
         word_it.forward()) {
      word = word_it.data();
      if (!word->part_of_combo) {
        box = word->word->bounding_box();
        if (prev_right > -MAX_INT16) {
          gap = box.left() - prev_right;
          if (gap <= min_gap) {
            prev_word = prev_word_it.data();
            if (prev_word->combination) {
              combo = prev_word;
            } else {
              /* Make a new combination and insert before
               * the first word being joined. */
              copy_word = new WERD;
              *copy_word = *(prev_word->word);
              // deep copy
              combo = new WERD_RES(copy_word);
              combo->combination = TRUE;
              combo->x_height = prev_word->x_height;
              prev_word->part_of_combo = TRUE;
              prev_word_it.add_before_then_move(combo);
            }
            combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
            if (word->combination) {
              combo->word->join_on(word->word);
              // Move blobs to combo
              // old combo no longer needed
              delete word_it.extract();
            } else {
              // Copy current word to combo
              combo->copy_on(word);
              word->part_of_combo = TRUE;
            }
            combo->done = FALSE;
            combo->ClearResults();
          } else {
            prev_word_it = word_it;  // catch up
          }
        }
        prev_right = box.right();
      }
    }
  } else {
    words.clear();  // signal termination
  }
}

namespace tesseract {
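/**
 * dump_words()
 *
 * Debug output, controlled by debug_fix_space_level. Mode 1 is the extracted
 * word list, mode 2 a permutation under test, mode 3 the permutation finally
 * returned. At level > 1 every call is printed with its score; at level 1
 * output is only produced once an improvement has been found.
 */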
void Tesseract::dump_words(WERD_RES_LIST &perm, inT16 score,
                           inT16 mode, BOOL8 improved) {
  WERD_RES_IT word_res_it(&perm);

  if (debug_fix_space_level > 0) {
    if (mode == 1) {
      stats_.dump_words_str = "";
      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
           word_res_it.forward()) {
        if (!word_res_it.data()->part_of_combo) {
          stats_.dump_words_str +=
              word_res_it.data()->best_choice->unichar_string();
          stats_.dump_words_str += ' ';
        }
      }
    }

    #ifndef SECURE_NAMES
    if (debug_fix_space_level > 1) {
      switch (mode) {
        case 1:
          tprintf("EXTRACTED (%d): \"", score);
          break;
        case 2:
          tprintf("TESTED (%d): \"", score);
          break;
        case 3:
          tprintf("RETURNED (%d): \"", score);
          break;
      }

      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
           word_res_it.forward()) {
        if (!word_res_it.data()->part_of_combo) {
          tprintf("%s/%1d ",
                  word_res_it.data()->best_choice->unichar_string().string(),
                  (int)word_res_it.data()->best_choice->permuter());
        }
      }
      tprintf("\"\n");
    } else if (improved) {
      tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
           word_res_it.forward()) {
        if (!word_res_it.data()->part_of_combo) {
          tprintf("%s/%1d ",
                  word_res_it.data()->best_choice->unichar_string().string(),
                  (int)word_res_it.data()->best_choice->permuter());
        }
      }
      tprintf("\"\n");
    }
    #endif
  }
}

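/**
 * fixspace_thinks_word_done()
 *
 * Should fix_fuzzy_spaces treat this word as contextually confirmed? A word
 * already marked done qualifies; otherwise, depending on fixsp_done_mode, an
 * accepted or unrejected single word validated by a dictionary (or recognized
 * as a number) is treated as done even if it is ambiguous.
 */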
BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
  if (word->done)
    return TRUE;

  /*
    Use all the standard pass 2 conditions for mode 5 in set_done() in
    reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
    CARE WHETHER WE HAVE of/at, on/an etc.
  */
  if (fixsp_done_mode > 0 &&
      (word->tess_accepted ||
       (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
       fixsp_done_mode == 3) &&
      (strchr(word->best_choice->unichar_string().string(), ' ') == NULL) &&
      ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
       (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
       (word->best_choice->permuter() == USER_DAWG_PERM) ||
       (word->best_choice->permuter() == NUMBER_PERM))) {
    return TRUE;
  } else {
    return FALSE;
  }
}


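/**
 * fix_sp_fp_word()
 *
 * Fixed-pitch spacing repair for a single word: if the word contains a blob
 * that looks like noise, try re-segmenting it with that blob treated as a
 * space (fix_noisy_space_list) and splice the result back into the row in
 * place of the original word. Skips combinations, repeated-character words
 * and words that are not marked W_DONT_CHOP.
 */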
void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
                               BLOCK* block) {
  WERD_RES *word_res;
  WERD_RES_LIST sub_word_list;
  WERD_RES_IT sub_word_list_it(&sub_word_list);
  inT16 blob_index;
  inT16 new_length;
  float junk;

  word_res = word_res_it.data();
  if (word_res->word->flag(W_REP_CHAR) ||
      word_res->combination ||
      word_res->part_of_combo ||
      !word_res->word->flag(W_DONT_CHOP))
    return;

  blob_index = worst_noise_blob(word_res, &junk);
  if (blob_index < 0)
    return;

  if (debug_fix_space_level > 1) {
    tprintf("FP fixspace working on \"%s\"\n",
            word_res->best_choice->unichar_string().string());
  }
  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
  sub_word_list_it.add_after_stay_put(word_res_it.extract());
  fix_noisy_space_list(sub_word_list, row, block);
  new_length = sub_word_list.length();
  word_res_it.add_list_before(&sub_word_list);
  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
    word_res_it.forward();
  }
}

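/**
 * fix_noisy_space_list()
 *
 * Fixed-pitch analogue of fix_fuzzy_space_list(): starting from a single
 * word, repeatedly break out the noisiest blob as a word boundary
 * (break_noisiest_blob_word), re-classify, and keep the best permutation as
 * scored by fp_eval_word_spacing().
 */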
void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
                                     BLOCK* block) {
  inT16 best_score;
  WERD_RES_IT best_perm_it(&best_perm);
  WERD_RES_LIST current_perm;
  WERD_RES_IT current_perm_it(&current_perm);
  WERD_RES *old_word_res;
  inT16 current_score;
  BOOL8 improved = FALSE;

  best_score = fp_eval_word_spacing(best_perm);  // default score

  dump_words(best_perm, best_score, 1, improved);

  old_word_res = best_perm_it.data();
  // Even deep_copy doesn't copy the underlying WERD unless its combination
  // flag is true!
  old_word_res->combination = TRUE;   // Kludge to force deep copy
  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
  old_word_res->combination = FALSE;  // Undo kludge

  break_noisiest_blob_word(current_perm);

  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
    match_current_words(current_perm, row, block);
    current_score = fp_eval_word_spacing(current_perm);
    dump_words(current_perm, current_score, 2, improved);
    if (current_score > best_score) {
      best_perm.clear();
      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
      best_score = current_score;
      improved = TRUE;
    }
    if (current_score < PERFECT_WERDS) {
      break_noisiest_blob_word(current_perm);
    }
  }
  dump_words(best_perm, best_score, 3, improved);
}


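/**
 * break_noisiest_blob_word()
 *
 * Find the word in the list containing the blob that looks most like noise,
 * then split that word in two at the noise blob, deleting the blob itself.
 * If no suitable noise blob exists, clear the list to signal termination.
 */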
void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
  WERD_RES_IT word_it(&words);
  WERD_RES_IT worst_word_it;
  float worst_noise_score = 9999;
  int worst_blob_index = -1;     // Noisiest blob of noisiest wd
  int blob_index;                // of wds noisiest blob
  float noise_score;             // of wds noisiest blob
  WERD_RES *word_res;
  C_BLOB_IT blob_it;
  C_BLOB_IT rej_cblob_it;
  C_BLOB_LIST new_blob_list;
  C_BLOB_IT new_blob_it;
  C_BLOB_IT new_rej_cblob_it;
  WERD *new_word;
  inT16 start_of_noise_blob;
  inT16 i;

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    blob_index = worst_noise_blob(word_it.data(), &noise_score);
    if (blob_index > -1 && worst_noise_score > noise_score) {
      worst_noise_score = noise_score;
      worst_blob_index = blob_index;
      worst_word_it = word_it;
    }
  }
  if (worst_blob_index < 0) {
    words.clear();          // signal termination
    return;
  }

  /* Now split the worst_word_it */

  word_res = worst_word_it.data();

  /* Move blobs before noise blob to a new bloblist */

  new_blob_it.set_to_list(&new_blob_list);
  blob_it.set_to_list(word_res->word->cblob_list());
  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
    new_blob_it.add_after_then_move(blob_it.extract());
  }
  start_of_noise_blob = blob_it.data()->bounding_box().left();
  delete blob_it.extract();     // throw out noise blob

  new_word = new WERD(&new_blob_list, word_res->word);
  new_word->set_flag(W_EOL, FALSE);
  word_res->word->set_flag(W_BOL, FALSE);
  word_res->word->set_blanks(1);  // After break

  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
  for (;
       (!rej_cblob_it.empty() &&
        (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
       rej_cblob_it.forward()) {
    new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
  }

  WERD_RES* new_word_res = new WERD_RES(new_word);
  new_word_res->combination = TRUE;
  worst_word_it.add_before_then_move(new_word_res);

  word_res->ClearResults();
}

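/**
 * worst_noise_blob()
 *
 * Return the index of the blob in the word that looks most like noise (its
 * score is returned through worst_noise_score), or -1 if the word is too
 * short, cannot be analysed, or has no sufficiently noisy blob that is far
 * enough from both ends of the word.
 */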
inT16 Tesseract::worst_noise_blob(WERD_RES *word_res,
                                  float *worst_noise_score) {
  float noise_score[512];
  int i;
  int min_noise_blob;            // 1st contender
  int max_noise_blob;            // last contender
  int non_noise_count;
  int worst_noise_blob;          // Worst blob
  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
  float non_noise_limit = kBlnXHeight * 0.8;

  if (word_res->rebuild_word == NULL)
    return -1;  // Can't handle cube words.

  // Normalised.
  int blob_count = word_res->box_word->length();
  ASSERT_HOST(blob_count <= 512);
  if (blob_count < 5)
    return -1;                   // too short to split

  /* Get the noise scores for all blobs */

  #ifndef SECURE_NAMES
  if (debug_fix_space_level > 5)
    tprintf("FP fixspace Noise metrics for \"%s\": ",
            word_res->best_choice->unichar_string().string());
  #endif

  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
    TBLOB* blob = word_res->rebuild_word->blobs[i];
    if (word_res->reject_map[i].accepted())
      noise_score[i] = non_noise_limit;
    else
      noise_score[i] = blob_noise_score(blob);

    if (debug_fix_space_level > 5)
      tprintf("%1.1f ", noise_score[i]);
  }
  if (debug_fix_space_level > 5)
    tprintf("\n");

  /* Now find the worst one which is far enough away from the end of the word */

  non_noise_count = 0;
  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;

  min_noise_blob = i;

  non_noise_count = 0;
  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
       i--) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;

  max_noise_blob = i;

  if (min_noise_blob > max_noise_blob)
    return -1;

  *worst_noise_score = small_limit;
  worst_noise_blob = -1;
  for (i = min_noise_blob; i <= max_noise_blob; i++) {
    if (noise_score[i] < *worst_noise_score) {
      worst_noise_blob = i;
      *worst_noise_score = noise_score[i];
    }
  }
  return worst_noise_blob;
}

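/**
 * blob_noise_score()
 *
 * Estimate how noise-like a blob is: the score is the largest dimension of
 * its largest outline, doubled if the blob has more than 5 outlines and
 * halved if it sits unusually high or low relative to the normalised
 * baseline. Smaller scores mean the blob looks more like noise.
 */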
float Tesseract::blob_noise_score(TBLOB *blob) {
  TBOX box;                       // BB of outline
  inT16 outline_count = 0;
  inT16 max_dimension;
  inT16 largest_outline_dimension = 0;

  for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
    outline_count++;
    box = ol->bounding_box();
    if (box.height() > box.width()) {
      max_dimension = box.height();
    } else {
      max_dimension = box.width();
    }

    if (largest_outline_dimension < max_dimension)
      largest_outline_dimension = max_dimension;
  }

  if (outline_count > 5) {
    // penalise LOTS of blobs
    largest_outline_dimension *= 2;
  }

  box = blob->bounding_box();
  if (box.bottom() > kBlnBaselineOffset * 4 ||
      box.top() < kBlnBaselineOffset / 2) {
    // Be lax (treat as more noise-like) if the blob is unusually high or low.
    largest_outline_dimension /= 2;
  }

  return largest_outline_dimension;
}
}  // namespace tesseract

void fixspace_dbg(WERD_RES *word) {
  TBOX box = word->word->bounding_box();
  BOOL8 show_map_detail = FALSE;
  inT16 i;

  box.print();
  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
  tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
          word->word->cblob_list()->length(),
          word->rebuild_word->NumBlobs(),
          word->box_word->length());
  word->reject_map.print(debug_fp);
  tprintf("\n");
  if (show_map_detail) {
    tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
    for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
      tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
      word->reject_map[i].full_print(debug_fp);
    }
  }

  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
}


namespace tesseract {
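/**
 * fp_eval_word_spacing()
 *
 * Evaluation function for fixed-pitch word lists. For each word that is done,
 * accepted, or validated by a dictionary, accepted characters add one to the
 * score, while spaces and blobs small enough to be noise subtract one; the
 * result is clamped at zero.
 */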
inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
  WERD_RES_IT word_it(&word_res_list);
  WERD_RES *word;
  inT16 word_length;
  inT16 score = 0;
  inT16 i;
  float small_limit = kBlnXHeight * fixsp_small_outlines_size;

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    word = word_it.data();
    if (word->rebuild_word == NULL)
      continue;  // Can't handle cube words.
    word_length = word->reject_map.length();
    if (word->done ||
        word->tess_accepted ||
        word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
        word->best_choice->permuter() == FREQ_DAWG_PERM ||
        word->best_choice->permuter() == USER_DAWG_PERM ||
        safe_dict_word(word) > 0) {
      int num_blobs = word->rebuild_word->NumBlobs();
      UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
      for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
        TBLOB* blob = word->rebuild_word->blobs[i];
        if (word->best_choice->unichar_id(i) == space ||
            blob_noise_score(blob) < small_limit) {
          score -= 1;  // penalise possibly erroneous non-space
        } else if (word->reject_map[i].accepted()) {
          score++;
        }
      }
    }
  }
  if (score < 0)
    score = 0;
  return score;
}

}  // namespace tesseract