tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/docqual.cpp
Go to the documentation of this file.
00001 /******************************************************************
00002  * File:        docqual.cpp  (Formerly docqual.c)
00003  * Description: Document Quality Metrics
00004  * Author:              Phil Cheatle
00005  * Created:             Mon May  9 11:27:28 BST 1994
00006  *
00007  * (C) Copyright 1994, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023 
00024 #include          <ctype.h>
00025 #include          "docqual.h"
00026 #include          "reject.h"
00027 #include          "tesscallback.h"
00028 #include          "tessvars.h"
00029 #include          "secname.h"
00030 #include          "globals.h"
00031 #include          "tesseractclass.h"
00032 
00033 namespace tesseract{
00034 
00035 // A little class to provide the callbacks as we have no pre-bound args.
00036 struct DocQualCallbacks {
00037   explicit DocQualCallbacks(WERD_RES* word0)
00038     : word(word0), match_count(0), accepted_match_count(0) {}
00039 
00040   void CountMatchingBlobs(int index) {
00041     ++match_count;
00042   }
00043 
00044   void CountAcceptedBlobs(int index) {
00045     if (word->reject_map[index].accepted())
00046       ++accepted_match_count;
00047     ++match_count;
00048   }
00049 
00050   void AcceptIfGoodQuality(int index) {
00051     if (word->reject_map[index].accept_if_good_quality())
00052       word->reject_map[index].setrej_quality_accept();
00053   }
00054 
00055   WERD_RES* word;
00056   inT16 match_count;
00057   inT16 accepted_match_count;
00058 };
00059 
00060 /*************************************************************************
00061  * word_blob_quality()
00062  * How many blobs in the box_word are identical to those of the inword?
00063  * ASSUME blobs in both initial word and box_word are in ascending order of
00064  * left hand blob edge.
00065  *************************************************************************/
00066 inT16 Tesseract::word_blob_quality(WERD_RES *word, ROW *row) {
00067   if (word->bln_boxes == NULL ||
00068       word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
00069     return 0;
00070 
00071   DocQualCallbacks cb(word);
00072   word->bln_boxes->ProcessMatchedBlobs(
00073       *word->rebuild_word,
00074       NewPermanentTessCallback(&cb, &DocQualCallbacks::CountMatchingBlobs));
00075   return cb.match_count;
00076 }
00077 
00078 inT16 Tesseract::word_outline_errs(WERD_RES *word) {
00079   inT16 i = 0;
00080   inT16 err_count = 0;
00081 
00082   if (word->rebuild_word != NULL) {
00083     for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
00084       TBLOB* blob = word->rebuild_word->blobs[b];
00085       err_count += count_outline_errs(word->best_choice->unichar_string()[i],
00086                                       blob->NumOutlines());
00087       i++;
00088     }
00089   }
00090   return err_count;
00091 }
00092 
00093 /*************************************************************************
00094  * word_char_quality()
00095  * Combination of blob quality and outline quality - how many good chars are
00096  * there? - I.e chars which pass the blob AND outline tests.
00097  *************************************************************************/
00098 void Tesseract::word_char_quality(WERD_RES *word,
00099                                   ROW *row,
00100                                   inT16 *match_count,
00101                                   inT16 *accepted_match_count) {
00102   if (word->bln_boxes == NULL ||
00103       word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
00104     return;
00105 
00106   DocQualCallbacks cb(word);
00107   word->bln_boxes->ProcessMatchedBlobs(
00108       *word->rebuild_word,
00109       NewPermanentTessCallback(&cb, &DocQualCallbacks::CountAcceptedBlobs));
00110   *match_count = cb.match_count;
00111   *accepted_match_count = cb.accepted_match_count;
00112 }
00113 
00114 /*************************************************************************
00115  * unrej_good_chs()
00116  * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
00117  *************************************************************************/
00118 void Tesseract::unrej_good_chs(WERD_RES *word, ROW *row) {
00119   if (word->bln_boxes == NULL ||
00120       word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
00121     return;
00122 
00123   DocQualCallbacks cb(word);
00124   word->bln_boxes->ProcessMatchedBlobs(
00125       *word->rebuild_word,
00126       NewPermanentTessCallback(&cb, &DocQualCallbacks::AcceptIfGoodQuality));
00127 }
00128 
00129 inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) {
00130   int expected_outline_count;
00131 
00132   if (STRING (outlines_odd).contains (c))
00133     return 0;                    //Dont use this char
00134   else if (STRING (outlines_2).contains (c))
00135     expected_outline_count = 2;
00136   else
00137     expected_outline_count = 1;
00138   return abs (outline_count - expected_outline_count);
00139 }
00140 
00141 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
00142                                         BOOL8 good_quality_doc) {
00143   if ((tessedit_good_quality_unrej && good_quality_doc))
00144     unrej_good_quality_words(page_res_it);
00145   doc_and_block_rejection(page_res_it, good_quality_doc);
00146   if (unlv_tilde_crunching) {
00147     tilde_crunch(page_res_it);
00148     tilde_delete(page_res_it);
00149   }
00150 }
00151 
00152 
00153 /*************************************************************************
00154  * unrej_good_quality_words()
00155  * Accept potential rejects in words which pass the following checks:
00156  *    - Contains a potential reject
00157  *    - Word looks like a sensible alpha word.
00158  *    - Word segmentation is the same as the original image
00159  *              - All characters have the expected number of outlines
00160  * NOTE - the rejection counts are recalculated after unrejection
00161  *      - CANT do it in a single pass without a bit of fiddling
00162  *              - keep it simple but inefficient
00163  *************************************************************************/
00164 void Tesseract::unrej_good_quality_words(  //unreject potential
00165                                          PAGE_RES_IT &page_res_it) {
00166   WERD_RES *word;
00167   ROW_RES *current_row;
00168   BLOCK_RES *current_block;
00169   int i;
00170 
00171   page_res_it.restart_page ();
00172   while (page_res_it.word () != NULL) {
00173     check_debug_pt (page_res_it.word (), 100);
00174     if (bland_unrej) {
00175       word = page_res_it.word ();
00176       for (i = 0; i < word->reject_map.length (); i++) {
00177         if (word->reject_map[i].accept_if_good_quality ())
00178           word->reject_map[i].setrej_quality_accept ();
00179       }
00180       page_res_it.forward ();
00181     }
00182     else if ((page_res_it.row ()->char_count > 0) &&
00183       ((page_res_it.row ()->rej_count /
00184       (float) page_res_it.row ()->char_count) <=
00185     quality_rowrej_pc)) {
00186       word = page_res_it.word ();
00187       if (word->reject_map.quality_recoverable_rejects() &&
00188           (tessedit_unrej_any_wd ||
00189            acceptable_word_string(*word->uch_set,
00190                                   word->best_choice->unichar_string().string(),
00191                                   word->best_choice->unichar_lengths().string())
00192                != AC_UNACCEPTABLE)) {
00193         unrej_good_chs(word, page_res_it.row ()->row);
00194       }
00195       page_res_it.forward ();
00196     }
00197     else {
00198       /* Skip to end of dodgy row */
00199       current_row = page_res_it.row ();
00200       while ((page_res_it.word () != NULL) &&
00201         (page_res_it.row () == current_row))
00202         page_res_it.forward ();
00203     }
00204     check_debug_pt (page_res_it.word (), 110);
00205   }
00206   page_res_it.restart_page ();
00207   page_res_it.page_res->char_count = 0;
00208   page_res_it.page_res->rej_count = 0;
00209   current_block = NULL;
00210   current_row = NULL;
00211   while (page_res_it.word () != NULL) {
00212     if (current_block != page_res_it.block ()) {
00213       current_block = page_res_it.block ();
00214       current_block->char_count = 0;
00215       current_block->rej_count = 0;
00216     }
00217     if (current_row != page_res_it.row ()) {
00218       current_row = page_res_it.row ();
00219       current_row->char_count = 0;
00220       current_row->rej_count = 0;
00221       current_row->whole_word_rej_count = 0;
00222     }
00223     page_res_it.rej_stat_word ();
00224     page_res_it.forward ();
00225   }
00226 }
00227 
00228 
00229 /*************************************************************************
00230  * doc_and_block_rejection()
00231  *
00232  * If the page has too many rejects - reject all of it.
00233  * If any block has too many rejects - reject all words in the block
00234  *************************************************************************/
00235 
00236 void Tesseract::doc_and_block_rejection(  //reject big chunks
00237                                         PAGE_RES_IT &page_res_it,
00238                                         BOOL8 good_quality_doc) {
00239   inT16 block_no = 0;
00240   inT16 row_no = 0;
00241   BLOCK_RES *current_block;
00242   ROW_RES *current_row;
00243 
00244   BOOL8 rej_word;
00245   BOOL8 prev_word_rejected;
00246   inT16 char_quality = 0;
00247   inT16 accepted_char_quality;
00248 
00249   if (page_res_it.page_res->rej_count * 100.0 /
00250       page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
00251     reject_whole_page(page_res_it);
00252     if (tessedit_debug_doc_rejection) {
00253       tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
00254               page_res_it.page_res->char_count,
00255               page_res_it.page_res->rej_count);
00256     }
00257   } else {
00258     if (tessedit_debug_doc_rejection) {
00259       tprintf("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n",
00260               page_res_it.page_res->char_count,
00261               page_res_it.page_res->rej_count);
00262     }
00263 
00264     /* Walk blocks testing for block rejection */
00265 
00266     page_res_it.restart_page();
00267     WERD_RES* word;
00268     while ((word = page_res_it.word()) != NULL) {
00269       current_block = page_res_it.block();
00270       block_no = current_block->block->index();
00271       if (current_block->char_count > 0 &&
00272           (current_block->rej_count * 100.0 / current_block->char_count) >
00273            tessedit_reject_block_percent) {
00274         if (tessedit_debug_block_rejection) {
00275           tprintf("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n",
00276                   block_no, current_block->char_count,
00277                   current_block->rej_count);
00278         }
00279         prev_word_rejected = FALSE;
00280         while ((word = page_res_it.word()) != NULL &&
00281                (page_res_it.block() == current_block)) {
00282           if (tessedit_preserve_blk_rej_perfect_wds) {
00283             rej_word = word->reject_map.reject_count() > 0 ||
00284                 word->reject_map.length () < tessedit_preserve_min_wd_len;
00285             if (rej_word && tessedit_dont_blkrej_good_wds &&
00286                 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
00287                 acceptable_word_string(
00288                     *word->uch_set,
00289                     word->best_choice->unichar_string().string(),
00290                     word->best_choice->unichar_lengths().string()) !=
00291                 AC_UNACCEPTABLE) {
00292               word_char_quality(word, page_res_it.row()->row,
00293                                 &char_quality,
00294                                 &accepted_char_quality);
00295               rej_word = char_quality !=  word->reject_map.length();
00296             }
00297           } else {
00298             rej_word = TRUE;
00299           }
00300           if (rej_word) {
00301             /*
00302               Reject spacing if both current and prev words are rejected.
00303               NOTE - this is NOT restricted to FUZZY spaces. - When tried this
00304               generated more space errors.
00305             */
00306             if (tessedit_use_reject_spaces &&
00307                 prev_word_rejected &&
00308                 page_res_it.prev_row() == page_res_it.row() &&
00309                 word->word->space() == 1)
00310               word->reject_spaces = TRUE;
00311             word->reject_map.rej_word_block_rej();
00312           }
00313           prev_word_rejected = rej_word;
00314           page_res_it.forward();
00315         }
00316       } else {
00317         if (tessedit_debug_block_rejection) {
00318           tprintf("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n",
00319                   block_no, page_res_it.block()->char_count,
00320                   page_res_it.block()->rej_count);
00321         }
00322 
00323         /* Walk rows in block testing for row rejection */
00324         row_no = 0;
00325         while ((word = page_res_it.word()) != NULL &&
00326                page_res_it.block() == current_block) {
00327           current_row = page_res_it.row();
00328           row_no++;
00329           /* Reject whole row if:
00330             fraction of chars on row which are rejected exceed a limit AND
00331             fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
00332             limit
00333           */
00334           if (current_row->char_count > 0 &&
00335               (current_row->rej_count * 100.0 / current_row->char_count) >
00336               tessedit_reject_row_percent &&
00337               (current_row->whole_word_rej_count * 100.0 /
00338                   current_row->rej_count) <
00339               tessedit_whole_wd_rej_row_percent) {
00340             if (tessedit_debug_block_rejection) {
00341               tprintf("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n",
00342                       row_no, current_row->char_count,
00343                       current_row->rej_count);
00344             }
00345             prev_word_rejected = FALSE;
00346             while ((word = page_res_it.word()) != NULL &&
00347                    page_res_it.row () == current_row) {
00348               /* Preserve words on good docs unless they are mostly rejected*/
00349               if (!tessedit_row_rej_good_docs && good_quality_doc) {
00350                 rej_word = word->reject_map.reject_count() /
00351                     static_cast<float>(word->reject_map.length()) >
00352                     tessedit_good_doc_still_rowrej_wd;
00353               } else if (tessedit_preserve_row_rej_perfect_wds) {
00354                 /* Preserve perfect words anyway */
00355                 rej_word = word->reject_map.reject_count() > 0 ||
00356                     word->reject_map.length () < tessedit_preserve_min_wd_len;
00357                 if (rej_word && tessedit_dont_rowrej_good_wds &&
00358                     word->reject_map.length() >= tessedit_preserve_min_wd_len &&
00359                     acceptable_word_string(*word->uch_set,
00360                         word->best_choice->unichar_string().string(),
00361                         word->best_choice->unichar_lengths().string()) !=
00362                             AC_UNACCEPTABLE) {
00363                   word_char_quality(word, page_res_it.row()->row,
00364                                     &char_quality,
00365                                     &accepted_char_quality);
00366                   rej_word = char_quality != word->reject_map.length();
00367                 }
00368               } else {
00369                 rej_word = TRUE;
00370               }
00371               if (rej_word) {
00372                 /*
00373                   Reject spacing if both current and prev words are rejected.
00374                   NOTE - this is NOT restricted to FUZZY spaces. - When tried
00375                   this generated more space errors.
00376                 */
00377                 if (tessedit_use_reject_spaces &&
00378                     prev_word_rejected &&
00379                     page_res_it.prev_row() == page_res_it.row() &&
00380                     word->word->space () == 1)
00381                   word->reject_spaces = TRUE;
00382                 word->reject_map.rej_word_row_rej();
00383               }
00384               prev_word_rejected = rej_word;
00385               page_res_it.forward();
00386             }
00387           } else {
00388             if (tessedit_debug_block_rejection) {
00389               tprintf("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n",
00390                       row_no, current_row->char_count, current_row->rej_count);
00391             }
00392             while (page_res_it.word() != NULL &&
00393                    page_res_it.row() == current_row)
00394               page_res_it.forward();
00395           }
00396         }
00397       }
00398     }
00399   }
00400 }
00401 
00402 }  // namespace tesseract
00403 
00404 
00405 /*************************************************************************
00406  * reject_whole_page()
00407  * Dont believe any of it - set the reject map to 00..00 in all words
00408  *
00409  *************************************************************************/
00410 
00411 void reject_whole_page(PAGE_RES_IT &page_res_it) {
00412   page_res_it.restart_page ();
00413   while (page_res_it.word () != NULL) {
00414     page_res_it.word ()->reject_map.rej_word_doc_rej ();
00415     page_res_it.forward ();
00416   }
00417                                  //whole page is rejected
00418   page_res_it.page_res->rejected = TRUE;
00419 }
00420 
00421 namespace tesseract {
00422 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
00423   WERD_RES *word;
00424   GARBAGE_LEVEL garbage_level;
00425   PAGE_RES_IT copy_it;
00426   BOOL8 prev_potential_marked = FALSE;
00427   BOOL8 found_terrible_word = FALSE;
00428   BOOL8 ok_dict_word;
00429 
00430   page_res_it.restart_page();
00431   while (page_res_it.word() != NULL) {
00432     POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
00433     if (pb != NULL && !pb->IsText()) {
00434       page_res_it.forward();
00435       continue;
00436     }
00437     word = page_res_it.word();
00438 
00439     if (crunch_early_convert_bad_unlv_chs)
00440       convert_bad_unlv_chs(word);
00441 
00442     if (crunch_early_merge_tess_fails)
00443       word->merge_tess_fails();
00444 
00445     if (word->reject_map.accept_count () != 0) {
00446       found_terrible_word = FALSE;
00447                                  //Forget earlier potential crunches
00448       prev_potential_marked = FALSE;
00449     }
00450     else {
00451       ok_dict_word = safe_dict_word(word);
00452       garbage_level = garbage_word (word, ok_dict_word);
00453 
00454       if ((garbage_level != G_NEVER_CRUNCH) &&
00455       (terrible_word_crunch (word, garbage_level))) {
00456         if (crunch_debug > 0) {
00457           tprintf ("T CRUNCHING: \"%s\"\n",
00458             word->best_choice->unichar_string().string());
00459         }
00460         word->unlv_crunch_mode = CR_KEEP_SPACE;
00461         if (prev_potential_marked) {
00462           while (copy_it.word () != word) {
00463             if (crunch_debug > 0) {
00464               tprintf ("P1 CRUNCHING: \"%s\"\n",
00465                 copy_it.word()->best_choice->unichar_string().string());
00466             }
00467             copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
00468             copy_it.forward ();
00469           }
00470           prev_potential_marked = FALSE;
00471         }
00472         found_terrible_word = TRUE;
00473       }
00474       else if ((garbage_level != G_NEVER_CRUNCH) &&
00475         (potential_word_crunch (word,
00476       garbage_level, ok_dict_word))) {
00477         if (found_terrible_word) {
00478           if (crunch_debug > 0) {
00479             tprintf ("P2 CRUNCHING: \"%s\"\n",
00480               word->best_choice->unichar_string().string());
00481           }
00482           word->unlv_crunch_mode = CR_KEEP_SPACE;
00483         }
00484         else if (!prev_potential_marked) {
00485           copy_it = page_res_it;
00486           prev_potential_marked = TRUE;
00487           if (crunch_debug > 1) {
00488             tprintf ("P3 CRUNCHING: \"%s\"\n",
00489               word->best_choice->unichar_string().string());
00490           }
00491         }
00492       }
00493       else {
00494         found_terrible_word = FALSE;
00495                                  //Forget earlier potential crunches
00496         prev_potential_marked = FALSE;
00497         if (crunch_debug > 2) {
00498           tprintf ("NO CRUNCH: \"%s\"\n",
00499             word->best_choice->unichar_string().string());
00500         }
00501       }
00502     }
00503     page_res_it.forward ();
00504   }
00505 }
00506 
00507 
00508 BOOL8 Tesseract::terrible_word_crunch(WERD_RES *word,
00509                                       GARBAGE_LEVEL garbage_level) {
00510   float rating_per_ch;
00511   int adjusted_len;
00512   int crunch_mode = 0;
00513 
00514   if ((word->best_choice->unichar_string().length () == 0) ||
00515     (strspn (word->best_choice->unichar_string().string(), " ") ==
00516     word->best_choice->unichar_string().length ()))
00517     crunch_mode = 1;
00518   else {
00519     adjusted_len = word->reject_map.length ();
00520     if (adjusted_len > crunch_rating_max)
00521       adjusted_len = crunch_rating_max;
00522     rating_per_ch = word->best_choice->rating () / adjusted_len;
00523 
00524     if (rating_per_ch > crunch_terrible_rating)
00525       crunch_mode = 2;
00526     else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
00527       crunch_mode = 3;
00528     else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
00529       (garbage_level != G_OK))
00530       crunch_mode = 4;
00531     else if ((rating_per_ch > crunch_poor_garbage_rate) &&
00532       (garbage_level != G_OK))
00533       crunch_mode = 5;
00534   }
00535   if (crunch_mode > 0) {
00536     if (crunch_debug > 2) {
00537       tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
00538         crunch_mode, word->best_choice->unichar_string().string());
00539     }
00540     return TRUE;
00541   }
00542   else
00543     return FALSE;
00544 }
00545 
00546 BOOL8 Tesseract::potential_word_crunch(WERD_RES *word,
00547                                        GARBAGE_LEVEL garbage_level,
00548                                        BOOL8 ok_dict_word) {
00549   float rating_per_ch;
00550   int adjusted_len;
00551   const char *str = word->best_choice->unichar_string().string();
00552   const char *lengths = word->best_choice->unichar_lengths().string();
00553   BOOL8 word_crunchable;
00554   int poor_indicator_count = 0;
00555 
00556   word_crunchable = !crunch_leave_accept_strings ||
00557                     word->reject_map.length() < 3 ||
00558                     (acceptable_word_string(*word->uch_set,
00559                                             str, lengths) == AC_UNACCEPTABLE &&
00560                      !ok_dict_word);
00561 
00562   adjusted_len = word->reject_map.length();
00563   if (adjusted_len > 10)
00564     adjusted_len = 10;
00565   rating_per_ch = word->best_choice->rating() / adjusted_len;
00566 
00567   if (rating_per_ch > crunch_pot_poor_rate) {
00568     if (crunch_debug > 2) {
00569       tprintf("Potential poor rating on \"%s\"\n",
00570               word->best_choice->unichar_string().string());
00571     }
00572     poor_indicator_count++;
00573   }
00574 
00575   if (word_crunchable &&
00576       word->best_choice->certainty() < crunch_pot_poor_cert) {
00577     if (crunch_debug > 2) {
00578       tprintf("Potential poor cert on \"%s\"\n",
00579               word->best_choice->unichar_string().string());
00580     }
00581     poor_indicator_count++;
00582   }
00583 
00584   if (garbage_level != G_OK) {
00585     if (crunch_debug > 2) {
00586       tprintf("Potential garbage on \"%s\"\n",
00587               word->best_choice->unichar_string().string());
00588     }
00589     poor_indicator_count++;
00590   }
00591   return poor_indicator_count >= crunch_pot_indicators;
00592 }
00593 
00594 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
00595   WERD_RES *word;
00596   PAGE_RES_IT copy_it;
00597   BOOL8 deleting_from_bol = FALSE;
00598   BOOL8 marked_delete_point = FALSE;
00599   inT16 debug_delete_mode;
00600   CRUNCH_MODE delete_mode;
00601   inT16 x_debug_delete_mode;
00602   CRUNCH_MODE x_delete_mode;
00603 
00604   page_res_it.restart_page();
00605   while (page_res_it.word() != NULL) {
00606     word = page_res_it.word();
00607 
00608     delete_mode = word_deletable (word, debug_delete_mode);
00609     if (delete_mode != CR_NONE) {
00610       if (word->word->flag (W_BOL) || deleting_from_bol) {
00611         if (crunch_debug > 0) {
00612           tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
00613             debug_delete_mode,
00614             word->best_choice->unichar_string().string());
00615         }
00616         word->unlv_crunch_mode = delete_mode;
00617         deleting_from_bol = TRUE;
00618       } else if (word->word->flag(W_EOL)) {
00619         if (marked_delete_point) {
00620           while (copy_it.word() != word) {
00621             x_delete_mode = word_deletable (copy_it.word (),
00622               x_debug_delete_mode);
00623             if (crunch_debug > 0) {
00624               tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
00625                 x_debug_delete_mode,
00626                 copy_it.word()->best_choice->unichar_string().string());
00627             }
00628             copy_it.word ()->unlv_crunch_mode = x_delete_mode;
00629             copy_it.forward ();
00630           }
00631         }
00632         if (crunch_debug > 0) {
00633           tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
00634             debug_delete_mode,
00635             word->best_choice->unichar_string().string());
00636         }
00637         word->unlv_crunch_mode = delete_mode;
00638         deleting_from_bol = FALSE;
00639         marked_delete_point = FALSE;
00640       }
00641       else {
00642         if (!marked_delete_point) {
00643           copy_it = page_res_it;
00644           marked_delete_point = TRUE;
00645         }
00646       }
00647     }
00648     else {
00649       deleting_from_bol = FALSE;
00650                                  //Forget earlier potential crunches
00651       marked_delete_point = FALSE;
00652     }
00653     /*
00654       The following step has been left till now as the tess fails are used to
00655       determine if the word is deletable.
00656     */
00657     if (!crunch_early_merge_tess_fails)
00658       word->merge_tess_fails();
00659     page_res_it.forward ();
00660   }
00661 }
00662 
00663 
00664 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
00665   int i;
00666   UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
00667   UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
00668   UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
00669   UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
00670   for (i = 0; i < word_res->reject_map.length(); ++i) {
00671     if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
00672       word_res->best_choice->set_unichar_id(unichar_dash, i);
00673       if (word_res->reject_map[i].accepted ())
00674         word_res->reject_map[i].setrej_unlv_rej ();
00675     }
00676     if (word_res->best_choice->unichar_id(i) == unichar_pow) {
00677       word_res->best_choice->set_unichar_id(unichar_space, i);
00678       if (word_res->reject_map[i].accepted ())
00679         word_res->reject_map[i].setrej_unlv_rej ();
00680     }
00681   }
00682 }
00683 
00684 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
00685   enum STATES
00686   {
00687     JUNK,
00688     FIRST_UPPER,
00689     FIRST_LOWER,
00690     FIRST_NUM,
00691     SUBSEQUENT_UPPER,
00692     SUBSEQUENT_LOWER,
00693     SUBSEQUENT_NUM
00694   };
00695   const char *str = word->best_choice->unichar_string().string();
00696   const char *lengths = word->best_choice->unichar_lengths().string();
00697   STATES state = JUNK;
00698   int len = 0;
00699   int isolated_digits = 0;
00700   int isolated_alphas = 0;
00701   int bad_char_count = 0;
00702   int tess_rejs = 0;
00703   int dodgy_chars = 0;
00704   int ok_chars;
00705   UNICHAR_ID last_char = -1;
00706   int alpha_repetition_count = 0;
00707   int longest_alpha_repetition_count = 0;
00708   int longest_lower_run_len = 0;
00709   int lower_string_count = 0;
00710   int longest_upper_run_len = 0;
00711   int upper_string_count = 0;
00712   int total_alpha_count = 0;
00713   int total_digit_count = 0;
00714 
00715   for (; *str != '\0'; str += *(lengths++)) {
00716     len++;
00717     if (word->uch_set->get_isupper (str, *lengths)) {
00718       total_alpha_count++;
00719       switch (state) {
00720         case SUBSEQUENT_UPPER:
00721         case FIRST_UPPER:
00722           state = SUBSEQUENT_UPPER;
00723           upper_string_count++;
00724           if (longest_upper_run_len < upper_string_count)
00725             longest_upper_run_len = upper_string_count;
00726           if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
00727             alpha_repetition_count++;
00728             if (longest_alpha_repetition_count < alpha_repetition_count) {
00729               longest_alpha_repetition_count = alpha_repetition_count;
00730             }
00731           }
00732           else {
00733             last_char = word->uch_set->unichar_to_id(str, *lengths);
00734             alpha_repetition_count = 1;
00735           }
00736           break;
00737         case FIRST_NUM:
00738           isolated_digits++;
00739         default:
00740           state = FIRST_UPPER;
00741           last_char = word->uch_set->unichar_to_id(str, *lengths);
00742           alpha_repetition_count = 1;
00743           upper_string_count = 1;
00744           break;
00745       }
00746     }
00747     else if (word->uch_set->get_islower (str, *lengths)) {
00748       total_alpha_count++;
00749       switch (state) {
00750         case SUBSEQUENT_LOWER:
00751         case FIRST_LOWER:
00752           state = SUBSEQUENT_LOWER;
00753           lower_string_count++;
00754           if (longest_lower_run_len < lower_string_count)
00755             longest_lower_run_len = lower_string_count;
00756           if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
00757             alpha_repetition_count++;
00758             if (longest_alpha_repetition_count < alpha_repetition_count) {
00759               longest_alpha_repetition_count = alpha_repetition_count;
00760             }
00761           }
00762           else {
00763             last_char = word->uch_set->unichar_to_id(str, *lengths);
00764             alpha_repetition_count = 1;
00765           }
00766           break;
00767         case FIRST_NUM:
00768           isolated_digits++;
00769         default:
00770           state = FIRST_LOWER;
00771           last_char = word->uch_set->unichar_to_id(str, *lengths);
00772           alpha_repetition_count = 1;
00773           lower_string_count = 1;
00774           break;
00775       }
00776     }
00777     else if (word->uch_set->get_isdigit (str, *lengths)) {
00778       total_digit_count++;
00779       switch (state) {
00780         case FIRST_NUM:
00781           state = SUBSEQUENT_NUM;
00782         case SUBSEQUENT_NUM:
00783           break;
00784         case FIRST_UPPER:
00785         case FIRST_LOWER:
00786           isolated_alphas++;
00787         default:
00788           state = FIRST_NUM;
00789           break;
00790       }
00791     }
00792     else {
00793       if (*lengths == 1 && *str == ' ')
00794         tess_rejs++;
00795       else
00796         bad_char_count++;
00797       switch (state) {
00798         case FIRST_NUM:
00799           isolated_digits++;
00800           break;
00801         case FIRST_UPPER:
00802         case FIRST_LOWER:
00803           isolated_alphas++;
00804         default:
00805           break;
00806       }
00807       state = JUNK;
00808     }
00809   }
00810 
00811   switch (state) {
00812     case FIRST_NUM:
00813       isolated_digits++;
00814       break;
00815     case FIRST_UPPER:
00816     case FIRST_LOWER:
00817       isolated_alphas++;
00818     default:
00819       break;
00820   }
00821 
00822   if (crunch_include_numerals) {
00823     total_alpha_count += total_digit_count - isolated_digits;
00824   }
00825 
00826   if (crunch_leave_ok_strings && len >= 4 &&
00827       2 * (total_alpha_count - isolated_alphas) > len &&
00828       longest_alpha_repetition_count < crunch_long_repetitions) {
00829     if ((crunch_accept_ok &&
00830          acceptable_word_string(*word->uch_set, str, lengths) !=
00831              AC_UNACCEPTABLE) ||
00832         longest_lower_run_len > crunch_leave_lc_strings ||
00833         longest_upper_run_len > crunch_leave_uc_strings)
00834       return G_NEVER_CRUNCH;
00835   }
00836   if (word->reject_map.length() > 1 &&
00837       strpbrk(str, " ") == NULL &&
00838       (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
00839        word->best_choice->permuter() == FREQ_DAWG_PERM ||
00840        word->best_choice->permuter() == USER_DAWG_PERM ||
00841        word->best_choice->permuter() == NUMBER_PERM ||
00842        acceptable_word_string(*word->uch_set, str, lengths) !=
00843            AC_UNACCEPTABLE || ok_dict_word))
00844     return G_OK;
00845 
00846   ok_chars = len - bad_char_count - isolated_digits -
00847     isolated_alphas - tess_rejs;
00848 
00849   if (crunch_debug > 3) {
00850     tprintf("garbage_word: \"%s\"\n",
00851             word->best_choice->unichar_string().string());
00852     tprintf("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n",
00853             len,
00854             bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
00855   }
00856   if (bad_char_count == 0 &&
00857       tess_rejs == 0 &&
00858       (len > isolated_digits + isolated_alphas || len <= 2))
00859     return G_OK;
00860 
00861   if (tess_rejs > ok_chars ||
00862       (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
00863     return G_TERRIBLE;
00864 
00865   if (len > 4) {
00866     dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
00867         isolated_alphas;
00868     if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
00869       return G_DODGY;
00870     else
00871       return G_OK;
00872   } else {
00873     dodgy_chars = 2 * tess_rejs + bad_char_count;
00874     if ((len == 4 && dodgy_chars > 2) ||
00875         (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
00876       return G_DODGY;
00877     else
00878       return G_OK;
00879   }
00880 }
00881 
00882 
00883 /*************************************************************************
00884  * word_deletable()
00885  *     DELETE WERDS AT ENDS OF ROWS IF
00886  *        Word is crunched &&
00887  *        ( string length = 0                                          OR
00888  *          > 50% of chars are "|" (before merging)                    OR
00889  *          certainty < -10                                            OR
00890  *          rating /char > 60                                          OR
00891  *          TOP of word is more than 0.5 xht BELOW baseline            OR
00892  *          BOTTOM of word is more than 0.5 xht ABOVE xht              OR
00893  *          length of word < 3xht                                      OR
00894  *          height of word < 0.7 xht                                   OR
00895  *          height of word > 3.0 xht                                   OR
00896  *          >75% of the outline BBs have longest dimension < 0.5xht
00897  *************************************************************************/
00898 
00899 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, inT16 &delete_mode) {
00900   int word_len = word->reject_map.length ();
00901   float rating_per_ch;
00902   TBOX box;                       //BB of word
00903 
00904   if (word->unlv_crunch_mode == CR_NONE) {
00905     delete_mode = 0;
00906     return CR_NONE;
00907   }
00908 
00909   if (word_len == 0) {
00910     delete_mode = 1;
00911     return CR_DELETE;
00912   }
00913 
00914   if (word->rebuild_word != NULL) {
00915     // Cube leaves rebuild_word NULL.
00916     box = word->rebuild_word->bounding_box();
00917     if (box.height () < crunch_del_min_ht * kBlnXHeight) {
00918       delete_mode = 4;
00919       return CR_DELETE;
00920     }
00921 
00922     if (noise_outlines(word->rebuild_word)) {
00923       delete_mode = 5;
00924       return CR_DELETE;
00925     }
00926   }
00927 
00928   if ((failure_count (word) * 1.5) > word_len) {
00929     delete_mode = 2;
00930     return CR_LOOSE_SPACE;
00931   }
00932 
00933   if (word->best_choice->certainty () < crunch_del_cert) {
00934     delete_mode = 7;
00935     return CR_LOOSE_SPACE;
00936   }
00937 
00938   rating_per_ch = word->best_choice->rating () / word_len;
00939 
00940   if (rating_per_ch > crunch_del_rating) {
00941     delete_mode = 8;
00942     return CR_LOOSE_SPACE;
00943   }
00944 
00945   if (box.top () < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
00946     delete_mode = 9;
00947     return CR_LOOSE_SPACE;
00948   }
00949 
00950   if (box.bottom () >
00951   kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
00952     delete_mode = 10;
00953     return CR_LOOSE_SPACE;
00954   }
00955 
00956   if (box.height () > crunch_del_max_ht * kBlnXHeight) {
00957     delete_mode = 11;
00958     return CR_LOOSE_SPACE;
00959   }
00960 
00961   if (box.width () < crunch_del_min_width * kBlnXHeight) {
00962     delete_mode = 3;
00963     return CR_LOOSE_SPACE;
00964   }
00965 
00966   delete_mode = 0;
00967   return CR_NONE;
00968 }
00969 
00970 inT16 Tesseract::failure_count(WERD_RES *word) {
00971   const char *str = word->best_choice->unichar_string().string();
00972   int tess_rejs = 0;
00973 
00974   for (; *str != '\0'; str++) {
00975     if (*str == ' ')
00976       tess_rejs++;
00977   }
00978   return tess_rejs;
00979 }
00980 
00981 
00982 BOOL8 Tesseract::noise_outlines(TWERD *word) {
00983   TBOX box;                       // BB of outline
00984   inT16 outline_count = 0;
00985   inT16 small_outline_count = 0;
00986   inT16 max_dimension;
00987   float small_limit = kBlnXHeight * crunch_small_outlines_size;
00988 
00989   for (int b = 0; b < word->NumBlobs(); ++b) {
00990     TBLOB* blob = word->blobs[b];
00991     for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
00992       outline_count++;
00993       box = ol->bounding_box();
00994       if (box.height() > box.width())
00995         max_dimension = box.height();
00996       else
00997         max_dimension = box.width();
00998       if (max_dimension < small_limit)
00999         small_outline_count++;
01000     }
01001   }
01002   return small_outline_count >= outline_count;
01003 }
01004 
01005 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines