tesseract
3.03
|
00001 /****************************************************************** 00002 * File: superscript.cpp 00003 * Description: Correction pass to fix superscripts and subscripts. 00004 * Author: David Eger 00005 * Created: Mon Mar 12 14:05:00 PDT 2012 00006 * 00007 * (C) Copyright 2012, Google, Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include "normalis.h" 00021 #include "tesseractclass.h" 00022 00023 static int LeadingUnicharsToChopped(WERD_RES *word, int num_unichars) { 00024 int num_chopped = 0; 00025 for (int i = 0; i < num_unichars; i++) 00026 num_chopped += word->best_state[i]; 00027 return num_chopped; 00028 } 00029 00030 static int TrailingUnicharsToChopped(WERD_RES *word, int num_unichars) { 00031 int num_chopped = 0; 00032 for (int i = 0; i < num_unichars; i++) 00033 num_chopped += word->best_state[word->best_state.size() - 1 - i]; 00034 return num_chopped; 00035 } 00036 00037 00038 namespace tesseract { 00039 00046 void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index, 00047 int super_y_bottom, int sub_y_top, 00048 ScriptPos *leading_pos, int *num_leading_outliers, 00049 ScriptPos *trailing_pos, int *num_trailing_outliers) { 00050 ScriptPos sp_unused1, sp_unused2; 00051 int unused1, unused2; 00052 if (!leading_pos) leading_pos = &sp_unused1; 00053 if (!num_leading_outliers) num_leading_outliers = &unused1; 00054 if (!trailing_pos) trailing_pos = &sp_unused2; 00055 if (!num_trailing_outliers) num_trailing_outliers = &unused2; 00056 00057 *num_leading_outliers = *num_trailing_outliers = 0; 00058 *leading_pos = *trailing_pos = SP_NORMAL; 00059 00060 int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index); 00061 int num_chopped_pieces = word->best_state[rebuilt_blob_index]; 00062 ScriptPos last_pos = SP_NORMAL; 00063 int trailing_outliers = 0; 00064 for (int i = 0; i < num_chopped_pieces; i++) { 00065 TBOX box = word->chopped_word->blobs[chopped_start + i]->bounding_box(); 00066 ScriptPos pos = SP_NORMAL; 00067 if (box.bottom() >= super_y_bottom) { 00068 pos = SP_SUPERSCRIPT; 00069 } else if (box.top() <= sub_y_top) { 00070 pos = SP_SUBSCRIPT; 00071 } 00072 if (pos == SP_NORMAL) { 00073 if (trailing_outliers == i) { 00074 *num_leading_outliers = trailing_outliers; 00075 *leading_pos = last_pos; 00076 } 00077 trailing_outliers = 0; 00078 } else { 00079 if (pos == last_pos) { 00080 trailing_outliers++; 00081 } else { 00082 trailing_outliers = 1; 00083 } 00084 } 00085 last_pos = pos; 00086 } 00087 *num_trailing_outliers = trailing_outliers; 00088 *trailing_pos = last_pos; 00089 } 00090 00101 bool Tesseract::SubAndSuperscriptFix(WERD_RES *word) { 00102 if (word->tess_failed || word->word->flag(W_REP_CHAR) || 00103 !word->best_choice) { 00104 return false; 00105 } 00106 int num_leading, num_trailing; 00107 ScriptPos sp_leading, sp_trailing; 00108 float leading_certainty, trailing_certainty; 00109 float avg_certainty, unlikely_threshold; 00110 00111 // Calculate the number of whole suspicious characters at the edges. 00112 GetSubAndSuperscriptCandidates( 00113 word, &num_leading, &sp_leading, &leading_certainty, 00114 &num_trailing, &sp_trailing, &trailing_certainty, 00115 &avg_certainty, &unlikely_threshold); 00116 00117 const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super"; 00118 const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super"; 00119 00120 int num_blobs = word->best_choice->length(); 00121 00122 // Calculate the remainder (partial characters) at the edges. 00123 // This accounts for us having classified the best version of 00124 // a word as [speaker?'] when it was instead [speaker.^{21}] 00125 // (that is we accidentally thought the 2 was attached to the period). 00126 int num_remainder_leading = 0, num_remainder_trailing = 0; 00127 if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) { 00128 int super_y_bottom = 00129 kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom; 00130 int sub_y_top = 00131 kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top; 00132 int last_word_char = num_blobs - 1 - num_trailing; 00133 float last_char_certainty = word->best_choice->certainty(last_word_char); 00134 if (word->best_choice->unichar_id(last_word_char) != 0 && 00135 last_char_certainty <= unlikely_threshold) { 00136 ScriptPos rpos; 00137 YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top, 00138 NULL, NULL, &rpos, &num_remainder_trailing); 00139 if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0; 00140 if (num_remainder_trailing > 0 && 00141 last_char_certainty < trailing_certainty) { 00142 trailing_certainty = last_char_certainty; 00143 } 00144 } 00145 bool another_blob_available = (num_remainder_trailing == 0) || 00146 num_leading + num_trailing + 1 < num_blobs; 00147 int first_char_certainty = word->best_choice->certainty(num_leading); 00148 if (another_blob_available && 00149 word->best_choice->unichar_id(num_leading) != 0 && 00150 first_char_certainty <= unlikely_threshold) { 00151 ScriptPos lpos; 00152 YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top, 00153 &lpos, &num_remainder_leading, NULL, NULL); 00154 if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0; 00155 if (num_remainder_leading > 0 && 00156 first_char_certainty < leading_certainty) { 00157 leading_certainty = first_char_certainty; 00158 } 00159 } 00160 } 00161 00162 // If nothing to do, bail now. 00163 if (num_leading + num_trailing + 00164 num_remainder_leading + num_remainder_trailing == 0) { 00165 return false; 00166 } 00167 00168 if (superscript_debug >= 1) { 00169 tprintf("Candidate for superscript detection: %s (", 00170 word->best_choice->unichar_string().string()); 00171 if (num_leading || num_remainder_leading) { 00172 tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading, 00173 leading_pos); 00174 } 00175 if (num_trailing || num_remainder_trailing) { 00176 tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing, 00177 trailing_pos); 00178 } 00179 tprintf(")\n"); 00180 } 00181 if (superscript_debug >= 3) { 00182 word->best_choice->print(); 00183 } 00184 if (superscript_debug >= 2) { 00185 tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ", 00186 avg_certainty, unlikely_threshold); 00187 if (num_leading) 00188 tprintf("Orig. leading (min): %.2f ", leading_certainty); 00189 if (num_trailing) 00190 tprintf("Orig. trailing (min): %.2f ", trailing_certainty); 00191 tprintf("\n"); 00192 } 00193 00194 // We've now calculated the number of rebuilt blobs we want to carve off. 00195 // However, split_word() works from TBLOBs in chopped_word, so we need to 00196 // convert to those. 00197 int num_chopped_leading = 00198 LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading; 00199 int num_chopped_trailing = 00200 TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing; 00201 00202 int retry_leading = 0; 00203 int retry_trailing = 0; 00204 bool is_good = false; 00205 WERD_RES *revised = TrySuperscriptSplits( 00206 num_chopped_leading, leading_certainty, sp_leading, 00207 num_chopped_trailing, trailing_certainty, sp_trailing, 00208 word, &is_good, &retry_leading, &retry_trailing); 00209 if (is_good) { 00210 word->ConsumeWordResults(revised); 00211 } else if (retry_leading || retry_trailing) { 00212 int retry_chopped_leading = 00213 LeadingUnicharsToChopped(revised, retry_leading); 00214 int retry_chopped_trailing = 00215 TrailingUnicharsToChopped(revised, retry_trailing); 00216 WERD_RES *revised2 = TrySuperscriptSplits( 00217 retry_chopped_leading, leading_certainty, sp_leading, 00218 retry_chopped_trailing, trailing_certainty, sp_trailing, 00219 revised, &is_good, &retry_leading, &retry_trailing); 00220 if (is_good) { 00221 word->ConsumeWordResults(revised2); 00222 } 00223 delete revised2; 00224 } 00225 delete revised; 00226 return is_good; 00227 } 00228 00253 void Tesseract::GetSubAndSuperscriptCandidates(const WERD_RES *word, 00254 int *num_rebuilt_leading, 00255 ScriptPos *leading_pos, 00256 float *leading_certainty, 00257 int *num_rebuilt_trailing, 00258 ScriptPos *trailing_pos, 00259 float *trailing_certainty, 00260 float *avg_certainty, 00261 float *unlikely_threshold) { 00262 *avg_certainty = *unlikely_threshold = 0.0f; 00263 *num_rebuilt_leading = *num_rebuilt_trailing = 0; 00264 *leading_certainty = *trailing_certainty = 0.0f; 00265 00266 int super_y_bottom = 00267 kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom; 00268 int sub_y_top = 00269 kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top; 00270 00271 // Step one: Get an average certainty for "normally placed" characters. 00272 00273 // Counts here are of blobs in the rebuild_word / unichars in best_choice. 00274 *leading_pos = *trailing_pos = SP_NORMAL; 00275 int leading_outliers = 0; 00276 int trailing_outliers = 0; 00277 int num_normal = 0; 00278 float normal_certainty_total = 0.0f; 00279 float worst_normal_certainty = 0.0f; 00280 ScriptPos last_pos = SP_NORMAL; 00281 int num_blobs = word->rebuild_word->NumBlobs(); 00282 for (int b = 0; b < num_blobs; ++b) { 00283 TBOX box = word->rebuild_word->blobs[b]->bounding_box(); 00284 ScriptPos pos = SP_NORMAL; 00285 if (box.bottom() >= super_y_bottom) { 00286 pos = SP_SUPERSCRIPT; 00287 } else if (box.top() <= sub_y_top) { 00288 pos = SP_SUBSCRIPT; 00289 } 00290 if (pos == SP_NORMAL) { 00291 if (word->best_choice->unichar_id(b) != 0) { 00292 float char_certainty = word->best_choice->certainty(b); 00293 if (char_certainty < worst_normal_certainty) { 00294 worst_normal_certainty = char_certainty; 00295 } 00296 num_normal++; 00297 normal_certainty_total += char_certainty; 00298 } 00299 if (trailing_outliers == b) { 00300 leading_outliers = trailing_outliers; 00301 *leading_pos = last_pos; 00302 } 00303 trailing_outliers = 0; 00304 } else { 00305 if (last_pos == pos) { 00306 trailing_outliers++; 00307 } else { 00308 trailing_outliers = 1; 00309 } 00310 } 00311 last_pos = pos; 00312 } 00313 *trailing_pos = last_pos; 00314 if (num_normal >= 3) { // throw out the worst as an outlier. 00315 num_normal--; 00316 normal_certainty_total -= worst_normal_certainty; 00317 } 00318 if (num_normal > 0) { 00319 *avg_certainty = normal_certainty_total / num_normal; 00320 *unlikely_threshold = superscript_worse_certainty * (*avg_certainty); 00321 } 00322 if (num_normal == 0 || 00323 (leading_outliers == 0 && trailing_outliers == 0)) { 00324 return; 00325 } 00326 00327 // Step two: Try to split off bits of the word that are both outliers 00328 // and have much lower certainty than average 00329 // Calculate num_leading and leading_certainty. 00330 for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0; 00331 *num_rebuilt_leading < leading_outliers; 00332 (*num_rebuilt_leading)++) { 00333 float char_certainty = word->best_choice->certainty(*num_rebuilt_leading); 00334 if (char_certainty > *unlikely_threshold) { 00335 break; 00336 } 00337 if (char_certainty < *leading_certainty) { 00338 *leading_certainty = char_certainty; 00339 } 00340 } 00341 00342 // Calculate num_trailing and trailing_certainty. 00343 for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0; 00344 *num_rebuilt_trailing < trailing_outliers; 00345 (*num_rebuilt_trailing)++) { 00346 int blob_idx = num_blobs - 1 - *num_rebuilt_trailing; 00347 float char_certainty = word->best_choice->certainty(blob_idx); 00348 if (char_certainty > *unlikely_threshold) { 00349 break; 00350 } 00351 if (char_certainty < *trailing_certainty) { 00352 *trailing_certainty = char_certainty; 00353 } 00354 } 00355 } 00356 00357 00382 WERD_RES *Tesseract::TrySuperscriptSplits( 00383 int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, 00384 int num_chopped_trailing, float trailing_certainty, 00385 ScriptPos trailing_pos, 00386 WERD_RES *word, 00387 bool *is_good, 00388 int *retry_rebuild_leading, int *retry_rebuild_trailing) { 00389 int num_chopped = word->chopped_word->NumBlobs(); 00390 00391 *retry_rebuild_leading = *retry_rebuild_trailing = 0; 00392 00393 // Chop apart the word into up to three pieces. 00394 00395 BlamerBundle *bb0 = NULL; 00396 BlamerBundle *bb1 = NULL; 00397 WERD_RES *prefix = NULL; 00398 WERD_RES *core = NULL; 00399 WERD_RES *suffix = NULL; 00400 if (num_chopped_leading > 0) { 00401 prefix = new WERD_RES(*word); 00402 split_word(prefix, num_chopped_leading, &core, &bb0); 00403 } else { 00404 core = new WERD_RES(*word); 00405 } 00406 00407 if (num_chopped_trailing > 0) { 00408 int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading; 00409 split_word(core, split_pt, &suffix, &bb1); 00410 } 00411 00412 // Recognize the pieces in turn. 00413 int saved_cp_multiplier = classify_class_pruner_multiplier; 00414 int saved_im_multiplier = classify_integer_matcher_multiplier; 00415 if (prefix) { 00416 // Turn off Tesseract's y-position penalties for the leading superscript. 00417 classify_class_pruner_multiplier.set_value(0); 00418 classify_integer_matcher_multiplier.set_value(0); 00419 00420 // Adjust our expectations about the baseline for this prefix. 00421 if (superscript_debug >= 3) { 00422 tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading); 00423 } 00424 recog_word_recursive(prefix); 00425 if (superscript_debug >= 2) { 00426 tprintf(" The leading bits look like %s %s\n", 00427 ScriptPosToString(leading_pos), 00428 prefix->best_choice->unichar_string().string()); 00429 } 00430 00431 // Restore the normal y-position penalties. 00432 classify_class_pruner_multiplier.set_value(saved_cp_multiplier); 00433 classify_integer_matcher_multiplier.set_value(saved_im_multiplier); 00434 } 00435 00436 if (superscript_debug >= 3) { 00437 tprintf(" recognizing middle %d chopped blobs\n", 00438 num_chopped - num_chopped_leading - num_chopped_trailing); 00439 } 00440 00441 if (suffix) { 00442 // Turn off Tesseract's y-position penalties for the trailing superscript. 00443 classify_class_pruner_multiplier.set_value(0); 00444 classify_integer_matcher_multiplier.set_value(0); 00445 00446 if (superscript_debug >= 3) { 00447 tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing); 00448 } 00449 recog_word_recursive(suffix); 00450 if (superscript_debug >= 2) { 00451 tprintf(" The trailing bits look like %s %s\n", 00452 ScriptPosToString(trailing_pos), 00453 suffix->best_choice->unichar_string().string()); 00454 } 00455 00456 // Restore the normal y-position penalties. 00457 classify_class_pruner_multiplier.set_value(saved_cp_multiplier); 00458 classify_integer_matcher_multiplier.set_value(saved_im_multiplier); 00459 } 00460 00461 // Evaluate whether we think the results are believably better 00462 // than what we already had. 00463 bool good_prefix = !prefix || BelievableSuperscript( 00464 superscript_debug >= 1, *prefix, 00465 superscript_bettered_certainty * leading_certainty, 00466 retry_rebuild_leading, NULL); 00467 bool good_suffix = !suffix || BelievableSuperscript( 00468 superscript_debug >= 1, *suffix, 00469 superscript_bettered_certainty * trailing_certainty, 00470 NULL, retry_rebuild_trailing); 00471 00472 *is_good = good_prefix && good_suffix; 00473 if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) { 00474 // None of it is any good. Quit now. 00475 delete core; 00476 delete prefix; 00477 delete suffix; 00478 return NULL; 00479 } 00480 recog_word_recursive(core); 00481 00482 // Now paste the results together into core. 00483 if (suffix) { 00484 suffix->SetAllScriptPositions(trailing_pos); 00485 join_words(core, suffix, bb1); 00486 } 00487 if (prefix) { 00488 prefix->SetAllScriptPositions(leading_pos); 00489 join_words(prefix, core, bb0); 00490 core = prefix; 00491 prefix = NULL; 00492 } 00493 00494 if (superscript_debug >= 1) { 00495 tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT", 00496 core->best_choice->unichar_string().string()); 00497 } 00498 return core; 00499 } 00500 00501 00520 bool Tesseract::BelievableSuperscript(bool debug, 00521 const WERD_RES &word, 00522 float certainty_threshold, 00523 int *left_ok, 00524 int *right_ok) const { 00525 int initial_ok_run_count = 0; 00526 int ok_run_count = 0; 00527 float worst_certainty = 0.0f; 00528 const WERD_CHOICE &wc = *word.best_choice; 00529 00530 const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table(); 00531 for (int i = 0; i < wc.length(); i++) { 00532 TBLOB *blob = word.rebuild_word->blobs[i]; 00533 UNICHAR_ID unichar_id = wc.unichar_id(i); 00534 float char_certainty = wc.certainty(i); 00535 bool bad_certainty = char_certainty < certainty_threshold; 00536 bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id); 00537 bool is_italic = word.fontinfo && word.fontinfo->is_italic(); 00538 BLOB_CHOICE *choice = word.GetBlobChoice(i); 00539 if (choice && fontinfo_table.size() > 0) { 00540 // Get better information from the specific choice, if available. 00541 int font_id1 = choice->fontinfo_id(); 00542 bool font1_is_italic = font_id1 >= 0 00543 ? fontinfo_table.get(font_id1).is_italic() : false; 00544 int font_id2 = choice->fontinfo_id2(); 00545 is_italic = font1_is_italic && 00546 (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic()); 00547 } 00548 00549 float height_fraction = 1.0f; 00550 float char_height = blob->bounding_box().height(); 00551 float normal_height = char_height; 00552 if (wc.unicharset()->top_bottom_useful()) { 00553 int min_bot, max_bot, min_top, max_top; 00554 wc.unicharset()->get_top_bottom(unichar_id, 00555 &min_bot, &max_bot, 00556 &min_top, &max_top); 00557 float hi_height = max_top - max_bot; 00558 float lo_height = min_top - min_bot; 00559 normal_height = (hi_height + lo_height) / 2; 00560 if (normal_height >= kBlnXHeight) { 00561 // Only ding characters that we have decent information for because 00562 // they're supposed to be normal sized, not tiny specks or dashes. 00563 height_fraction = char_height / normal_height; 00564 } 00565 } 00566 bool bad_height = height_fraction < superscript_scaledown_ratio; 00567 00568 if (debug) { 00569 if (is_italic) { 00570 tprintf(" Rejecting: superscript is italic.\n"); 00571 } 00572 if (is_punc) { 00573 tprintf(" Rejecting: punctuation present.\n"); 00574 } 00575 const char *char_str = wc.unicharset()->id_to_unichar(unichar_id); 00576 if (bad_certainty) { 00577 tprintf(" Rejecting: don't believe character %s with certainty %.2f " 00578 "which is less than threshold %.2f\n", char_str, 00579 char_certainty, certainty_threshold); 00580 } 00581 if (bad_height) { 00582 tprintf(" Rejecting: character %s seems too small @ %.2f versus " 00583 "expected %.2f\n", char_str, char_height, normal_height); 00584 } 00585 } 00586 if (bad_certainty || bad_height || is_punc || is_italic) { 00587 if (ok_run_count == i) { 00588 initial_ok_run_count = ok_run_count; 00589 } 00590 ok_run_count = 0; 00591 } else { 00592 ok_run_count++; 00593 } 00594 if (char_certainty < worst_certainty) { 00595 worst_certainty = char_certainty; 00596 } 00597 } 00598 bool all_ok = ok_run_count == wc.length(); 00599 if (all_ok && debug) { 00600 tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty); 00601 } 00602 if (!all_ok) { 00603 if (left_ok) *left_ok = initial_ok_run_count; 00604 if (right_ok) *right_ok = ok_run_count; 00605 } 00606 return all_ok; 00607 } 00608 00609 00610 } // namespace tesseract