// tesseract 3.03
00001 /****************************************************************** 00002 * File: output.cpp (Formerly output.c) 00003 * Description: Output pass 00004 * Author: Phil Cheatle 00005 * Created: Thu Aug 4 10:56:08 BST 1994 00006 * 00007 * (C) Copyright 1994, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #include <string.h> 00025 #include <ctype.h> 00026 #ifdef __UNIX__ 00027 #include <assert.h> 00028 #include <unistd.h> 00029 #include <errno.h> 00030 #endif 00031 #include "helpers.h" 00032 #include "tessvars.h" 00033 #include "control.h" 00034 #include "secname.h" 00035 #include "reject.h" 00036 #include "docqual.h" 00037 #include "output.h" 00038 #include "globals.h" 00039 #include "tesseractclass.h" 00040 00041 #define EPAPER_EXT ".ep" 00042 #define PAGE_YSIZE 3508 00043 #define CTRL_INSET '\024' //dc4=text inset 00044 #define CTRL_FONT '\016' //so=font change 00045 #define CTRL_DEFAULT '\017' //si=default font 00046 #define CTRL_SHIFT '\022' //dc2=x shift 00047 #define CTRL_TAB '\011' //tab 00048 #define CTRL_NEWLINE '\012' //newline 00049 #define CTRL_HARDLINE '\015' //cr 00050 00051 /********************************************************************** 00052 * pixels_to_pts 00053 * 00054 * Convert 
an integer number of pixels to the nearest integer 00055 * number of points. 00056 **********************************************************************/ 00057 00058 inT32 pixels_to_pts( //convert coords 00059 inT32 pixels, 00060 inT32 pix_res //resolution 00061 ) { 00062 float pts; //converted value 00063 00064 pts = pixels * 72.0 / pix_res; 00065 return (inT32) (pts + 0.5); //round it 00066 } 00067 00068 namespace tesseract { 00069 void Tesseract::output_pass( //Tess output pass //send to api 00070 PAGE_RES_IT &page_res_it, 00071 const TBOX *target_word_box) { 00072 BLOCK_RES *block_of_last_word; 00073 BOOL8 force_eol; //During output 00074 BLOCK *nextblock; //block of next word 00075 WERD *nextword; //next word 00076 00077 page_res_it.restart_page (); 00078 block_of_last_word = NULL; 00079 while (page_res_it.word () != NULL) { 00080 check_debug_pt (page_res_it.word (), 120); 00081 00082 if (target_word_box) 00083 { 00084 00085 TBOX current_word_box=page_res_it.word ()->word->bounding_box(); 00086 FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2); 00087 if (!target_word_box->contains(center_pt)) 00088 { 00089 page_res_it.forward (); 00090 continue; 00091 } 00092 00093 } 00094 if (tessedit_write_block_separators && 00095 block_of_last_word != page_res_it.block ()) { 00096 block_of_last_word = page_res_it.block (); 00097 } 00098 00099 force_eol = (tessedit_write_block_separators && 00100 (page_res_it.block () != page_res_it.next_block ())) || 00101 (page_res_it.next_word () == NULL); 00102 00103 if (page_res_it.next_word () != NULL) 00104 nextword = page_res_it.next_word ()->word; 00105 else 00106 nextword = NULL; 00107 if (page_res_it.next_block () != NULL) 00108 nextblock = page_res_it.next_block ()->block; 00109 else 00110 nextblock = NULL; 00111 //regardless of tilde crunching 00112 write_results(page_res_it, 00113 determine_newline_type(page_res_it.word()->word, 00114 
page_res_it.block()->block, 00115 nextword, nextblock), force_eol); 00116 page_res_it.forward(); 00117 } 00118 } 00119 00120 00121 /************************************************************************* 00122 * write_results() 00123 * 00124 * All recognition and rejection has now been done. Generate the following: 00125 * .txt file - giving the final best choices with NO highlighting 00126 * .raw file - giving the tesseract top choice output for each word 00127 * .map file - showing how the .txt file has been rejected in the .ep file 00128 * epchoice list - a list of one element per word, containing the text for the 00129 * epaper. Reject strings are inserted. 00130 * inset list - a list of bounding boxes of reject insets - indexed by the 00131 * reject strings in the epchoice text. 00132 *************************************************************************/ 00133 void Tesseract::write_results(PAGE_RES_IT &page_res_it, 00134 char newline_type, // type of newline 00135 BOOL8 force_eol) { // override tilde crunch? 
00136 WERD_RES *word = page_res_it.word(); 00137 const UNICHARSET &uchset = *word->uch_set; 00138 int i; 00139 BOOL8 need_reject = FALSE; 00140 UNICHAR_ID space = uchset.unichar_to_id(" "); 00141 00142 if ((word->unlv_crunch_mode != CR_NONE || 00143 word->best_choice->length() == 0) && 00144 !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { 00145 if ((word->unlv_crunch_mode != CR_DELETE) && 00146 (!stats_.tilde_crunch_written || 00147 ((word->unlv_crunch_mode == CR_KEEP_SPACE) && 00148 (word->word->space () > 0) && 00149 !word->word->flag (W_FUZZY_NON) && 00150 !word->word->flag (W_FUZZY_SP)))) { 00151 if (!word->word->flag (W_BOL) && 00152 (word->word->space () > 0) && 00153 !word->word->flag (W_FUZZY_NON) && 00154 !word->word->flag (W_FUZZY_SP)) { 00155 stats_.last_char_was_tilde = false; 00156 } 00157 need_reject = TRUE; 00158 } 00159 if ((need_reject && !stats_.last_char_was_tilde) || 00160 (force_eol && stats_.write_results_empty_block)) { 00161 /* Write a reject char - mark as rejected unless zero_rejection mode */ 00162 stats_.last_char_was_tilde = TRUE; 00163 stats_.tilde_crunch_written = true; 00164 stats_.last_char_was_newline = false; 00165 stats_.write_results_empty_block = false; 00166 } 00167 00168 if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) { 00169 stats_.tilde_crunch_written = false; 00170 stats_.last_char_was_newline = true; 00171 stats_.last_char_was_tilde = false; 00172 } 00173 00174 if (force_eol) 00175 stats_.write_results_empty_block = true; 00176 return; 00177 } 00178 00179 /* NORMAL PROCESSING of non tilde crunched words */ 00180 00181 stats_.tilde_crunch_written = false; 00182 if (newline_type) 00183 stats_.last_char_was_newline = true; 00184 else 00185 stats_.last_char_was_newline = false; 00186 stats_.write_results_empty_block = force_eol; // about to write a real word 00187 00188 if (unlv_tilde_crunching && 00189 stats_.last_char_was_tilde && 00190 (word->word->space() == 0) && 00191 
!(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) && 00192 (word->best_choice->unichar_id(0) == space)) { 00193 /* Prevent adjacent tilde across words - we know that adjacent tildes within 00194 words have been removed */ 00195 word->MergeAdjacentBlobs(0); 00196 } 00197 if (newline_type || 00198 (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) 00199 stats_.last_char_was_tilde = false; 00200 else { 00201 if (word->reject_map.length () > 0) { 00202 if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) 00203 stats_.last_char_was_tilde = true; 00204 else 00205 stats_.last_char_was_tilde = false; 00206 } 00207 else if (word->word->space () > 0) 00208 stats_.last_char_was_tilde = false; 00209 /* else it is unchanged as there are no output chars */ 00210 } 00211 00212 ASSERT_HOST (word->best_choice->length() == word->reject_map.length()); 00213 00214 set_unlv_suspects(word); 00215 check_debug_pt (word, 120); 00216 if (tessedit_rejection_debug) { 00217 tprintf ("Dict word: \"%s\": %d\n", 00218 word->best_choice->debug_string().string(), 00219 dict_word(*(word->best_choice))); 00220 } 00221 if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) { 00222 if (tessedit_zero_rejection) { 00223 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ 00224 for (i = 0; i < word->best_choice->length(); ++i) { 00225 if (word->reject_map[i].rejected()) 00226 word->reject_map[i].setrej_minimal_rej_accept(); 00227 } 00228 } 00229 if (tessedit_minimal_rejection) { 00230 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ 00231 for (i = 0; i < word->best_choice->length(); ++i) { 00232 if ((word->best_choice->unichar_id(i) != space) && 00233 word->reject_map[i].rejected()) 00234 word->reject_map[i].setrej_minimal_rej_accept(); 00235 } 00236 } 00237 } 00238 } 00239 } // namespace tesseract 00240 00241 /********************************************************************** 00242 * determine_newline_type 00243 * 
00244 * Find whether we have a wrapping or hard newline. 00245 * Return FALSE if not at end of line. 00246 **********************************************************************/ 00247 00248 char determine_newline_type( //test line ends 00249 WERD *word, //word to do 00250 BLOCK *block, //current block 00251 WERD *next_word, //next word 00252 BLOCK *next_block //block of next word 00253 ) { 00254 inT16 end_gap; //to right edge 00255 inT16 width; //of next word 00256 TBOX word_box; //bounding 00257 TBOX next_box; //next word 00258 TBOX block_box; //block bounding 00259 00260 if (!word->flag (W_EOL)) 00261 return FALSE; //not end of line 00262 if (next_word == NULL || next_block == NULL || block != next_block) 00263 return CTRL_NEWLINE; 00264 if (next_word->space () > 0) 00265 return CTRL_HARDLINE; //it is tabbed 00266 word_box = word->bounding_box (); 00267 next_box = next_word->bounding_box (); 00268 block_box = block->bounding_box (); 00269 //gap to eol 00270 end_gap = block_box.right () - word_box.right (); 00271 end_gap -= (inT32) block->space (); 00272 width = next_box.right () - next_box.left (); 00273 // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n", 00274 // block_box.right(),word_box.right(),end_gap, 00275 // next_box.right(),next_box.left(),width, 00276 // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE); 00277 return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE; 00278 } 00279 00280 /************************************************************************* 00281 * get_rep_char() 00282 * Return the first accepted character from the repetition string. This is the 00283 * character which is repeated - as determined earlier by fix_rep_char() 00284 *************************************************************************/ 00285 namespace tesseract { 00286 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated? 
00287 int i; 00288 for (i = 0; ((i < word->reject_map.length()) && 00289 (word->reject_map[i].rejected())); ++i); 00290 00291 if (i < word->reject_map.length()) { 00292 return word->best_choice->unichar_id(i); 00293 } else { 00294 return word->uch_set->unichar_to_id(unrecognised_char.string()); 00295 } 00296 } 00297 00298 /************************************************************************* 00299 * SUSPECT LEVELS 00300 * 00301 * 0 - dont reject ANYTHING 00302 * 1,2 - partial rejection 00303 * 3 - BEST 00304 * 00305 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and 00306 * tessedit_minimal_rejection. 00307 *************************************************************************/ 00308 void Tesseract::set_unlv_suspects(WERD_RES *word_res) { 00309 int len = word_res->reject_map.length(); 00310 const WERD_CHOICE &word = *(word_res->best_choice); 00311 const UNICHARSET &uchset = *word.unicharset(); 00312 int i; 00313 float rating_per_ch; 00314 00315 if (suspect_level == 0) { 00316 for (i = 0; i < len; i++) { 00317 if (word_res->reject_map[i].rejected()) 00318 word_res->reject_map[i].setrej_minimal_rej_accept(); 00319 } 00320 return; 00321 } 00322 00323 if (suspect_level >= 3) 00324 return; //Use defaults 00325 00326 /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/ 00327 00328 if (safe_dict_word(word_res) && 00329 (count_alphas(word) > suspect_short_words)) { 00330 /* Unreject alphas in dictionary words */ 00331 for (i = 0; i < len; ++i) { 00332 if (word_res->reject_map[i].rejected() && 00333 uchset.get_isalpha(word.unichar_id(i))) 00334 word_res->reject_map[i].setrej_minimal_rej_accept(); 00335 } 00336 } 00337 00338 rating_per_ch = word.rating() / word_res->reject_map.length(); 00339 00340 if (rating_per_ch >= suspect_rating_per_ch) 00341 return; //Dont touch bad ratings 00342 00343 if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { 00344 /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ 
00345 for (i = 0; i < len; ++i) { 00346 if (word_res->reject_map[i].rejected() && 00347 (!uchset.eq(word.unichar_id(i), " "))) 00348 word_res->reject_map[i].setrej_minimal_rej_accept(); 00349 } 00350 } 00351 00352 for (i = 0; i < len; i++) { 00353 if (word_res->reject_map[i].rejected()) { 00354 if (word_res->reject_map[i].flag(R_DOC_REJ)) 00355 word_res->reject_map[i].setrej_minimal_rej_accept(); 00356 if (word_res->reject_map[i].flag(R_BLOCK_REJ)) 00357 word_res->reject_map[i].setrej_minimal_rej_accept(); 00358 if (word_res->reject_map[i].flag(R_ROW_REJ)) 00359 word_res->reject_map[i].setrej_minimal_rej_accept(); 00360 } 00361 } 00362 00363 if (suspect_level == 2) 00364 return; 00365 00366 if (!suspect_constrain_1Il || 00367 (word_res->reject_map.length() <= suspect_short_words)) { 00368 for (i = 0; i < len; i++) { 00369 if (word_res->reject_map[i].rejected()) { 00370 if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) || 00371 word_res->reject_map[i].flag(R_POSTNN_1IL))) 00372 word_res->reject_map[i].setrej_minimal_rej_accept(); 00373 00374 if (!suspect_constrain_1Il && 00375 word_res->reject_map[i].flag(R_MM_REJECT)) 00376 word_res->reject_map[i].setrej_minimal_rej_accept(); 00377 } 00378 } 00379 } 00380 00381 if (acceptable_word_string(*word_res->uch_set, 00382 word.unichar_string().string(), 00383 word.unichar_lengths().string()) != 00384 AC_UNACCEPTABLE || 00385 acceptable_number_string(word.unichar_string().string(), 00386 word.unichar_lengths().string())) { 00387 if (word_res->reject_map.length() > suspect_short_words) { 00388 for (i = 0; i < len; i++) { 00389 if (word_res->reject_map[i].rejected() && 00390 (!word_res->reject_map[i].perm_rejected() || 00391 word_res->reject_map[i].flag (R_1IL_CONFLICT) || 00392 word_res->reject_map[i].flag (R_POSTNN_1IL) || 00393 word_res->reject_map[i].flag (R_MM_REJECT))) { 00394 word_res->reject_map[i].setrej_minimal_rej_accept(); 00395 } 00396 } 00397 } 00398 } 00399 } 00400 00401 inT16 Tesseract::count_alphas(const 
WERD_CHOICE &word) { 00402 int count = 0; 00403 for (int i = 0; i < word.length(); ++i) { 00404 if (word.unicharset()->get_isalpha(word.unichar_id(i))) 00405 count++; 00406 } 00407 return count; 00408 } 00409 00410 00411 inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) { 00412 int count = 0; 00413 for (int i = 0; i < word.length(); ++i) { 00414 if (word.unicharset()->get_isalpha(word.unichar_id(i)) || 00415 word.unicharset()->get_isdigit(word.unichar_id(i))) 00416 count++; 00417 } 00418 return count; 00419 } 00420 00421 00422 BOOL8 Tesseract::acceptable_number_string(const char *s, 00423 const char *lengths) { 00424 BOOL8 prev_digit = FALSE; 00425 00426 if (*lengths == 1 && *s == '(') 00427 s++; 00428 00429 if (*lengths == 1 && 00430 ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) 00431 s++; 00432 00433 for (; *s != '\0'; s += *(lengths++)) { 00434 if (unicharset.get_isdigit(s, *lengths)) 00435 prev_digit = TRUE; 00436 else if (prev_digit && 00437 (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) 00438 prev_digit = FALSE; 00439 else if (prev_digit && *lengths == 1 && 00440 (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')'))) 00441 return TRUE; 00442 else if (prev_digit && 00443 *lengths == 1 && (*s == '%') && 00444 (*(lengths + 1) == 1 && *(s + *lengths) == ')') && 00445 (*(s + *lengths + *(lengths + 1)) == '\0')) 00446 return TRUE; 00447 else 00448 return FALSE; 00449 } 00450 return TRUE; 00451 } 00452 } // namespace tesseract