tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/output.cpp
Go to the documentation of this file.
00001 /******************************************************************
00002  * File:        output.cpp  (Formerly output.c)
00003  * Description: Output pass
00004  * Author:                                      Phil Cheatle
00005  * Created:                                     Thu Aug  4 10:56:08 BST 1994
00006  *
00007  * (C) Copyright 1994, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023 
00024 #include <string.h>
00025 #include <ctype.h>
00026 #ifdef __UNIX__
00027 #include          <assert.h>
00028 #include          <unistd.h>
00029 #include          <errno.h>
00030 #endif
00031 #include "helpers.h"
00032 #include "tessvars.h"
00033 #include "control.h"
00034 #include "secname.h"
00035 #include "reject.h"
00036 #include "docqual.h"
00037 #include "output.h"
00038 #include "globals.h"
00039 #include "tesseractclass.h"
00040 
// Output-format constants.
// The CTRL_* macros are single ASCII control characters, spelled here as hex
// escapes. determine_newline_type() returns CTRL_NEWLINE or CTRL_HARDLINE.
#define EPAPER_EXT    ".ep"    // "epaper" extension string
#define PAGE_YSIZE    3508     // page height constant (units not stated here)
#define CTRL_INSET    '\x14'   // DC4 = text inset
#define CTRL_FONT     '\x0e'   // SO  = font change
#define CTRL_DEFAULT  '\x0f'   // SI  = default font
#define CTRL_SHIFT    '\x12'   // DC2 = x shift
#define CTRL_TAB      '\x09'   // HT  = tab
#define CTRL_NEWLINE  '\x0a'   // LF  = newline
#define CTRL_HARDLINE '\x0d'   // CR  = carriage return
00050 
/**********************************************************************
 * pixels_to_pts
 *
 * Convert an integer number of pixels to the nearest integer number of
 * points, computed as pixels * 72 / pix_res (so pix_res is expressed in
 * pixels per 72 points).
 *
 * pixels   length to convert; may be negative.
 * pix_res  resolution; must be non-zero (not checked here).
 *
 * Returns the converted value rounded to the nearest integer, with exact
 * halves rounded away from zero.
 **********************************************************************/

inT32 pixels_to_pts(               //convert coords
                    inT32 pixels,
                    inT32 pix_res  //resolution
                   ) {
  // Keep the intermediate in double: a float only has 24 bits of mantissa
  // and would silently alter large results.
  const double pts = pixels * 72.0 / pix_res;

  // The integer conversion truncates toward zero, so the half-unit offset
  // has to be applied in the direction of the sign to round to nearest.
  if (pts < 0.0)
    return -static_cast<inT32>(0.5 - pts);
  return static_cast<inT32>(pts + 0.5);
}
00067 
00068 namespace tesseract {
00069 void Tesseract::output_pass(  //Tess output pass //send to api
00070                             PAGE_RES_IT &page_res_it,
00071                             const TBOX *target_word_box) {
00072   BLOCK_RES *block_of_last_word;
00073   BOOL8 force_eol;               //During output
00074   BLOCK *nextblock;              //block of next word
00075   WERD *nextword;                //next word
00076 
00077   page_res_it.restart_page ();
00078   block_of_last_word = NULL;
00079   while (page_res_it.word () != NULL) {
00080     check_debug_pt (page_res_it.word (), 120);
00081 
00082         if (target_word_box)
00083         {
00084 
00085                 TBOX current_word_box=page_res_it.word ()->word->bounding_box();
00086                 FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
00087                 if (!target_word_box->contains(center_pt))
00088                 {
00089                         page_res_it.forward ();
00090                         continue;
00091                 }
00092 
00093         }
00094     if (tessedit_write_block_separators &&
00095     block_of_last_word != page_res_it.block ()) {
00096       block_of_last_word = page_res_it.block ();
00097     }
00098 
00099     force_eol = (tessedit_write_block_separators &&
00100       (page_res_it.block () != page_res_it.next_block ())) ||
00101       (page_res_it.next_word () == NULL);
00102 
00103     if (page_res_it.next_word () != NULL)
00104       nextword = page_res_it.next_word ()->word;
00105     else
00106       nextword = NULL;
00107     if (page_res_it.next_block () != NULL)
00108       nextblock = page_res_it.next_block ()->block;
00109     else
00110       nextblock = NULL;
00111                                  //regardless of tilde crunching
00112     write_results(page_res_it,
00113                   determine_newline_type(page_res_it.word()->word,
00114                                          page_res_it.block()->block,
00115                                          nextword, nextblock), force_eol);
00116     page_res_it.forward();
00117   }
00118 }
00119 
00120 
00121 /*************************************************************************
00122  * write_results()
00123  *
00124  * All recognition and rejection has now been done. Generate the following:
00125  *   .txt file     - giving the final best choices with NO highlighting
00126  *   .raw file     - giving the tesseract top choice output for each word
00127  *   .map file     - showing how the .txt file has been rejected in the .ep file
00128  *   epchoice list - a list of one element per word, containing the text for the
00129  *                   epaper. Reject strings are inserted.
00130  *   inset list    - a list of bounding boxes of reject insets - indexed by the
00131  *                   reject strings in the epchoice text.
00132  *************************************************************************/
00133 void Tesseract::write_results(PAGE_RES_IT &page_res_it,
00134                               char newline_type,  // type of newline
00135                               BOOL8 force_eol) {  // override tilde crunch?
00136   WERD_RES *word = page_res_it.word();
00137   const UNICHARSET &uchset = *word->uch_set;
00138   int i;
00139   BOOL8 need_reject = FALSE;
00140   UNICHAR_ID space = uchset.unichar_to_id(" ");
00141 
00142   if ((word->unlv_crunch_mode != CR_NONE ||
00143        word->best_choice->length() == 0) &&
00144       !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
00145     if ((word->unlv_crunch_mode != CR_DELETE) &&
00146         (!stats_.tilde_crunch_written ||
00147          ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
00148           (word->word->space () > 0) &&
00149           !word->word->flag (W_FUZZY_NON) &&
00150           !word->word->flag (W_FUZZY_SP)))) {
00151       if (!word->word->flag (W_BOL) &&
00152           (word->word->space () > 0) &&
00153           !word->word->flag (W_FUZZY_NON) &&
00154           !word->word->flag (W_FUZZY_SP)) {
00155         stats_.last_char_was_tilde = false;
00156       }
00157       need_reject = TRUE;
00158     }
00159     if ((need_reject && !stats_.last_char_was_tilde) ||
00160         (force_eol && stats_.write_results_empty_block)) {
00161       /* Write a reject char - mark as rejected unless zero_rejection mode */
00162       stats_.last_char_was_tilde = TRUE;
00163       stats_.tilde_crunch_written = true;
00164       stats_.last_char_was_newline = false;
00165       stats_.write_results_empty_block = false;
00166     }
00167 
00168     if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
00169       stats_.tilde_crunch_written = false;
00170       stats_.last_char_was_newline = true;
00171       stats_.last_char_was_tilde = false;
00172     }
00173 
00174     if (force_eol)
00175       stats_.write_results_empty_block = true;
00176     return;
00177   }
00178 
00179   /* NORMAL PROCESSING of non tilde crunched words */
00180 
00181   stats_.tilde_crunch_written = false;
00182   if (newline_type)
00183     stats_.last_char_was_newline = true;
00184   else
00185     stats_.last_char_was_newline = false;
00186   stats_.write_results_empty_block = force_eol;  // about to write a real word
00187 
00188   if (unlv_tilde_crunching &&
00189       stats_.last_char_was_tilde &&
00190       (word->word->space() == 0) &&
00191       !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
00192       (word->best_choice->unichar_id(0) == space)) {
00193     /* Prevent adjacent tilde across words - we know that adjacent tildes within
00194        words have been removed */
00195     word->MergeAdjacentBlobs(0);
00196   }
00197   if (newline_type ||
00198     (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
00199     stats_.last_char_was_tilde = false;
00200   else {
00201     if (word->reject_map.length () > 0) {
00202       if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
00203         stats_.last_char_was_tilde = true;
00204       else
00205         stats_.last_char_was_tilde = false;
00206     }
00207     else if (word->word->space () > 0)
00208       stats_.last_char_was_tilde = false;
00209     /* else it is unchanged as there are no output chars */
00210   }
00211 
00212   ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
00213 
00214   set_unlv_suspects(word);
00215   check_debug_pt (word, 120);
00216   if (tessedit_rejection_debug) {
00217     tprintf ("Dict word: \"%s\": %d\n",
00218              word->best_choice->debug_string().string(),
00219              dict_word(*(word->best_choice)));
00220   }
00221   if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
00222     if (tessedit_zero_rejection) {
00223       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
00224       for (i = 0; i < word->best_choice->length(); ++i) {
00225         if (word->reject_map[i].rejected())
00226           word->reject_map[i].setrej_minimal_rej_accept();
00227       }
00228     }
00229     if (tessedit_minimal_rejection) {
00230       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
00231       for (i = 0; i < word->best_choice->length(); ++i) {
00232         if ((word->best_choice->unichar_id(i) != space) &&
00233             word->reject_map[i].rejected())
00234           word->reject_map[i].setrej_minimal_rej_accept();
00235       }
00236     }
00237   }
00238 }
00239 }  // namespace tesseract
00240 
00241 /**********************************************************************
00242  * determine_newline_type
00243  *
00244  * Find whether we have a wrapping or hard newline.
00245  * Return FALSE if not at end of line.
00246  **********************************************************************/
00247 
00248 char determine_newline_type(                   //test line ends
00249                             WERD *word,        //word to do
00250                             BLOCK *block,      //current block
00251                             WERD *next_word,   //next word
00252                             BLOCK *next_block  //block of next word
00253                            ) {
00254   inT16 end_gap;                 //to right edge
00255   inT16 width;                   //of next word
00256   TBOX word_box;                  //bounding
00257   TBOX next_box;                  //next word
00258   TBOX block_box;                 //block bounding
00259 
00260   if (!word->flag (W_EOL))
00261     return FALSE;                //not end of line
00262   if (next_word == NULL || next_block == NULL || block != next_block)
00263     return CTRL_NEWLINE;
00264   if (next_word->space () > 0)
00265     return CTRL_HARDLINE;        //it is tabbed
00266   word_box = word->bounding_box ();
00267   next_box = next_word->bounding_box ();
00268   block_box = block->bounding_box ();
00269                                  //gap to eol
00270   end_gap = block_box.right () - word_box.right ();
00271   end_gap -= (inT32) block->space ();
00272   width = next_box.right () - next_box.left ();
00273   //      tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
00274   //              block_box.right(),word_box.right(),end_gap,
00275   //              next_box.right(),next_box.left(),width,
00276   //              end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
00277   return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
00278 }
00279 
00280 /*************************************************************************
00281  * get_rep_char()
00282  * Return the first accepted character from the repetition string. This is the
00283  * character which is repeated - as determined earlier by fix_rep_char()
00284  *************************************************************************/
00285 namespace tesseract {
00286 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) {  // what char is repeated?
00287   int i;
00288   for (i = 0; ((i < word->reject_map.length()) &&
00289                (word->reject_map[i].rejected())); ++i);
00290 
00291   if (i < word->reject_map.length()) {
00292     return word->best_choice->unichar_id(i);
00293   } else {
00294     return word->uch_set->unichar_to_id(unrecognised_char.string());
00295   }
00296 }
00297 
00298 /*************************************************************************
00299  * SUSPECT LEVELS
00300  *
00301  * 0 - dont reject ANYTHING
00302  * 1,2 - partial rejection
00303  * 3 - BEST
00304  *
00305  * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
00306  * tessedit_minimal_rejection.
00307  *************************************************************************/
00308 void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
00309   int len = word_res->reject_map.length();
00310   const WERD_CHOICE &word = *(word_res->best_choice);
00311   const UNICHARSET &uchset = *word.unicharset();
00312   int i;
00313   float rating_per_ch;
00314 
00315   if (suspect_level == 0) {
00316     for (i = 0; i < len; i++) {
00317       if (word_res->reject_map[i].rejected())
00318         word_res->reject_map[i].setrej_minimal_rej_accept();
00319     }
00320     return;
00321   }
00322 
00323   if (suspect_level >= 3)
00324     return;                      //Use defaults
00325 
00326   /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
00327 
00328   if (safe_dict_word(word_res) &&
00329       (count_alphas(word) > suspect_short_words)) {
00330     /* Unreject alphas in dictionary words */
00331     for (i = 0; i < len; ++i) {
00332       if (word_res->reject_map[i].rejected() &&
00333           uchset.get_isalpha(word.unichar_id(i)))
00334         word_res->reject_map[i].setrej_minimal_rej_accept();
00335     }
00336   }
00337 
00338   rating_per_ch = word.rating() / word_res->reject_map.length();
00339 
00340   if (rating_per_ch >= suspect_rating_per_ch)
00341     return;                      //Dont touch bad ratings
00342 
00343   if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
00344     /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
00345     for (i = 0; i < len; ++i) {
00346       if (word_res->reject_map[i].rejected() &&
00347           (!uchset.eq(word.unichar_id(i), " ")))
00348         word_res->reject_map[i].setrej_minimal_rej_accept();
00349     }
00350   }
00351 
00352   for (i = 0; i < len; i++) {
00353     if (word_res->reject_map[i].rejected()) {
00354       if (word_res->reject_map[i].flag(R_DOC_REJ))
00355         word_res->reject_map[i].setrej_minimal_rej_accept();
00356       if (word_res->reject_map[i].flag(R_BLOCK_REJ))
00357         word_res->reject_map[i].setrej_minimal_rej_accept();
00358       if (word_res->reject_map[i].flag(R_ROW_REJ))
00359         word_res->reject_map[i].setrej_minimal_rej_accept();
00360     }
00361   }
00362 
00363   if (suspect_level == 2)
00364     return;
00365 
00366   if (!suspect_constrain_1Il ||
00367       (word_res->reject_map.length() <= suspect_short_words)) {
00368     for (i = 0; i < len; i++) {
00369       if (word_res->reject_map[i].rejected()) {
00370         if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
00371           word_res->reject_map[i].flag(R_POSTNN_1IL)))
00372           word_res->reject_map[i].setrej_minimal_rej_accept();
00373 
00374         if (!suspect_constrain_1Il &&
00375           word_res->reject_map[i].flag(R_MM_REJECT))
00376           word_res->reject_map[i].setrej_minimal_rej_accept();
00377       }
00378     }
00379   }
00380 
00381   if (acceptable_word_string(*word_res->uch_set,
00382                              word.unichar_string().string(),
00383                              word.unichar_lengths().string()) !=
00384                                  AC_UNACCEPTABLE ||
00385       acceptable_number_string(word.unichar_string().string(),
00386                                word.unichar_lengths().string())) {
00387     if (word_res->reject_map.length() > suspect_short_words) {
00388       for (i = 0; i < len; i++) {
00389         if (word_res->reject_map[i].rejected() &&
00390           (!word_res->reject_map[i].perm_rejected() ||
00391            word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
00392            word_res->reject_map[i].flag (R_POSTNN_1IL) ||
00393            word_res->reject_map[i].flag (R_MM_REJECT))) {
00394           word_res->reject_map[i].setrej_minimal_rej_accept();
00395         }
00396       }
00397     }
00398   }
00399 }
00400 
00401 inT16 Tesseract::count_alphas(const WERD_CHOICE &word) {
00402   int count = 0;
00403   for (int i = 0; i < word.length(); ++i) {
00404     if (word.unicharset()->get_isalpha(word.unichar_id(i)))
00405       count++;
00406   }
00407   return count;
00408 }
00409 
00410 
00411 inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) {
00412   int count = 0;
00413   for (int i = 0; i < word.length(); ++i) {
00414     if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
00415         word.unicharset()->get_isdigit(word.unichar_id(i)))
00416       count++;
00417   }
00418   return count;
00419 }
00420 
00421 
00422 BOOL8 Tesseract::acceptable_number_string(const char *s,
00423                                           const char *lengths) {
00424   BOOL8 prev_digit = FALSE;
00425 
00426   if (*lengths == 1 && *s == '(')
00427     s++;
00428 
00429   if (*lengths == 1 &&
00430       ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
00431     s++;
00432 
00433   for (; *s != '\0'; s += *(lengths++)) {
00434     if (unicharset.get_isdigit(s, *lengths))
00435       prev_digit = TRUE;
00436     else if (prev_digit &&
00437              (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
00438       prev_digit = FALSE;
00439     else if (prev_digit && *lengths == 1 &&
00440              (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
00441       return TRUE;
00442     else if (prev_digit &&
00443              *lengths == 1 && (*s == '%') &&
00444              (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
00445              (*(s + *lengths + *(lengths + 1)) == '\0'))
00446       return TRUE;
00447     else
00448       return FALSE;
00449   }
00450   return TRUE;
00451 }
00452 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines