tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/superscript.cpp
Go to the documentation of this file.
00001 /******************************************************************
00002  * File:        superscript.cpp
00003  * Description: Correction pass to fix superscripts and subscripts.
00004  * Author:      David Eger
00005  * Created:     Mon Mar 12 14:05:00 PDT 2012
00006  *
00007  * (C) Copyright 2012, Google, Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include "normalis.h"
00021 #include "tesseractclass.h"
00022 
00023 static int LeadingUnicharsToChopped(WERD_RES *word, int num_unichars) {
00024   int num_chopped = 0;
00025   for (int i = 0; i < num_unichars; i++)
00026     num_chopped += word->best_state[i];
00027   return num_chopped;
00028 }
00029 
00030 static int TrailingUnicharsToChopped(WERD_RES *word, int num_unichars) {
00031   int num_chopped = 0;
00032   for (int i = 0; i < num_unichars; i++)
00033     num_chopped += word->best_state[word->best_state.size() - 1 - i];
00034   return num_chopped;
00035 }
00036 
00037 
00038 namespace tesseract {
00039 
00046 void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index,
00047                     int super_y_bottom, int sub_y_top,
00048                     ScriptPos *leading_pos, int *num_leading_outliers,
00049                     ScriptPos *trailing_pos, int *num_trailing_outliers) {
00050   ScriptPos sp_unused1, sp_unused2;
00051   int unused1, unused2;
00052   if (!leading_pos) leading_pos = &sp_unused1;
00053   if (!num_leading_outliers) num_leading_outliers = &unused1;
00054   if (!trailing_pos) trailing_pos = &sp_unused2;
00055   if (!num_trailing_outliers) num_trailing_outliers = &unused2;
00056 
00057   *num_leading_outliers = *num_trailing_outliers = 0;
00058   *leading_pos = *trailing_pos = SP_NORMAL;
00059 
00060   int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index);
00061   int num_chopped_pieces = word->best_state[rebuilt_blob_index];
00062   ScriptPos last_pos = SP_NORMAL;
00063   int trailing_outliers = 0;
00064   for (int i = 0; i < num_chopped_pieces; i++) {
00065     TBOX box = word->chopped_word->blobs[chopped_start + i]->bounding_box();
00066     ScriptPos pos = SP_NORMAL;
00067     if (box.bottom() >= super_y_bottom) {
00068       pos = SP_SUPERSCRIPT;
00069     } else if (box.top() <= sub_y_top) {
00070       pos = SP_SUBSCRIPT;
00071     }
00072     if (pos == SP_NORMAL) {
00073       if (trailing_outliers == i) {
00074         *num_leading_outliers = trailing_outliers;
00075         *leading_pos = last_pos;
00076       }
00077       trailing_outliers = 0;
00078     } else {
00079       if (pos == last_pos) {
00080         trailing_outliers++;
00081       } else {
00082         trailing_outliers = 1;
00083       }
00084     }
00085     last_pos = pos;
00086   }
00087   *num_trailing_outliers = trailing_outliers;
00088   *trailing_pos = last_pos;
00089 }
00090 
00101 bool Tesseract::SubAndSuperscriptFix(WERD_RES *word) {
00102   if (word->tess_failed || word->word->flag(W_REP_CHAR) ||
00103       !word->best_choice) {
00104     return false;
00105   }
00106   int num_leading, num_trailing;
00107   ScriptPos sp_leading, sp_trailing;
00108   float leading_certainty, trailing_certainty;
00109   float avg_certainty, unlikely_threshold;
00110 
00111   // Calculate the number of whole suspicious characters at the edges.
00112   GetSubAndSuperscriptCandidates(
00113           word, &num_leading, &sp_leading, &leading_certainty,
00114           &num_trailing, &sp_trailing, &trailing_certainty,
00115           &avg_certainty, &unlikely_threshold);
00116 
00117   const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
00118   const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
00119 
00120   int num_blobs = word->best_choice->length();
00121 
00122   // Calculate the remainder (partial characters) at the edges.
00123   // This accounts for us having classified the best version of
00124   // a word as [speaker?'] when it was instead [speaker.^{21}]
00125   // (that is we accidentally thought the 2 was attached to the period).
00126   int num_remainder_leading = 0, num_remainder_trailing = 0;
00127   if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
00128     int super_y_bottom =
00129         kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
00130     int sub_y_top =
00131         kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
00132     int last_word_char = num_blobs - 1 - num_trailing;
00133     float last_char_certainty = word->best_choice->certainty(last_word_char);
00134     if (word->best_choice->unichar_id(last_word_char) != 0 &&
00135         last_char_certainty <= unlikely_threshold) {
00136       ScriptPos rpos;
00137       YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
00138                      NULL, NULL, &rpos, &num_remainder_trailing);
00139       if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
00140       if (num_remainder_trailing > 0 &&
00141           last_char_certainty < trailing_certainty) {
00142         trailing_certainty = last_char_certainty;
00143       }
00144     }
00145     bool another_blob_available = (num_remainder_trailing == 0) ||
00146         num_leading + num_trailing + 1 < num_blobs;
00147     int first_char_certainty = word->best_choice->certainty(num_leading);
00148     if (another_blob_available &&
00149         word->best_choice->unichar_id(num_leading) != 0 &&
00150         first_char_certainty <= unlikely_threshold) {
00151       ScriptPos lpos;
00152       YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
00153                      &lpos, &num_remainder_leading, NULL, NULL);
00154       if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
00155       if (num_remainder_leading > 0 &&
00156           first_char_certainty < leading_certainty) {
00157         leading_certainty = first_char_certainty;
00158       }
00159     }
00160   }
00161 
00162   // If nothing to do, bail now.
00163   if (num_leading + num_trailing +
00164       num_remainder_leading + num_remainder_trailing == 0) {
00165     return false;
00166   }
00167 
00168   if (superscript_debug >= 1) {
00169     tprintf("Candidate for superscript detection: %s (",
00170             word->best_choice->unichar_string().string());
00171     if (num_leading || num_remainder_leading) {
00172       tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading,
00173               leading_pos);
00174     }
00175     if (num_trailing || num_remainder_trailing) {
00176       tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
00177               trailing_pos);
00178     }
00179     tprintf(")\n");
00180   }
00181   if (superscript_debug >= 3) {
00182     word->best_choice->print();
00183   }
00184   if (superscript_debug >= 2) {
00185     tprintf(" Certainties -- Average: %.2f  Unlikely thresh: %.2f  ",
00186             avg_certainty, unlikely_threshold);
00187     if (num_leading)
00188       tprintf("Orig. leading (min): %.2f  ", leading_certainty);
00189     if (num_trailing)
00190       tprintf("Orig. trailing (min): %.2f  ", trailing_certainty);
00191     tprintf("\n");
00192   }
00193 
00194   // We've now calculated the number of rebuilt blobs we want to carve off.
00195   // However, split_word() works from TBLOBs in chopped_word, so we need to
00196   // convert to those.
00197   int num_chopped_leading =
00198       LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
00199   int num_chopped_trailing =
00200       TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
00201 
00202   int retry_leading = 0;
00203   int retry_trailing = 0;
00204   bool is_good = false;
00205   WERD_RES *revised = TrySuperscriptSplits(
00206       num_chopped_leading, leading_certainty, sp_leading,
00207       num_chopped_trailing, trailing_certainty, sp_trailing,
00208       word, &is_good, &retry_leading, &retry_trailing);
00209   if (is_good) {
00210     word->ConsumeWordResults(revised);
00211   } else if (retry_leading || retry_trailing) {
00212     int retry_chopped_leading =
00213         LeadingUnicharsToChopped(revised, retry_leading);
00214     int retry_chopped_trailing =
00215         TrailingUnicharsToChopped(revised, retry_trailing);
00216     WERD_RES *revised2 = TrySuperscriptSplits(
00217         retry_chopped_leading, leading_certainty, sp_leading,
00218         retry_chopped_trailing, trailing_certainty, sp_trailing,
00219         revised, &is_good, &retry_leading, &retry_trailing);
00220     if (is_good) {
00221       word->ConsumeWordResults(revised2);
00222     }
00223     delete revised2;
00224   }
00225   delete revised;
00226   return is_good;
00227 }
00228 
00253 void Tesseract::GetSubAndSuperscriptCandidates(const WERD_RES *word,
00254                                                int *num_rebuilt_leading,
00255                                                ScriptPos *leading_pos,
00256                                                float *leading_certainty,
00257                                                int *num_rebuilt_trailing,
00258                                                ScriptPos *trailing_pos,
00259                                                float *trailing_certainty,
00260                                                float *avg_certainty,
00261                                                float *unlikely_threshold) {
00262   *avg_certainty = *unlikely_threshold = 0.0f;
00263   *num_rebuilt_leading = *num_rebuilt_trailing = 0;
00264   *leading_certainty = *trailing_certainty = 0.0f;
00265 
00266   int super_y_bottom =
00267       kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
00268   int sub_y_top =
00269       kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
00270 
00271   // Step one: Get an average certainty for "normally placed" characters.
00272 
00273   // Counts here are of blobs in the rebuild_word / unichars in best_choice.
00274   *leading_pos = *trailing_pos = SP_NORMAL;
00275   int leading_outliers = 0;
00276   int trailing_outliers = 0;
00277   int num_normal = 0;
00278   float normal_certainty_total = 0.0f;
00279   float worst_normal_certainty = 0.0f;
00280   ScriptPos last_pos = SP_NORMAL;
00281   int num_blobs = word->rebuild_word->NumBlobs();
00282   for (int b = 0; b < num_blobs; ++b) {
00283     TBOX box = word->rebuild_word->blobs[b]->bounding_box();
00284     ScriptPos pos = SP_NORMAL;
00285     if (box.bottom() >= super_y_bottom) {
00286       pos = SP_SUPERSCRIPT;
00287     } else if (box.top() <= sub_y_top) {
00288       pos = SP_SUBSCRIPT;
00289     }
00290     if (pos == SP_NORMAL) {
00291       if (word->best_choice->unichar_id(b) != 0) {
00292         float char_certainty = word->best_choice->certainty(b);
00293         if (char_certainty < worst_normal_certainty) {
00294           worst_normal_certainty = char_certainty;
00295         }
00296         num_normal++;
00297         normal_certainty_total += char_certainty;
00298       }
00299       if (trailing_outliers == b) {
00300         leading_outliers = trailing_outliers;
00301         *leading_pos = last_pos;
00302       }
00303       trailing_outliers = 0;
00304     } else {
00305       if (last_pos == pos) {
00306         trailing_outliers++;
00307       } else {
00308         trailing_outliers = 1;
00309       }
00310     }
00311     last_pos = pos;
00312   }
00313   *trailing_pos = last_pos;
00314   if (num_normal >= 3) {  // throw out the worst as an outlier.
00315     num_normal--;
00316     normal_certainty_total -= worst_normal_certainty;
00317   }
00318   if (num_normal > 0) {
00319     *avg_certainty = normal_certainty_total / num_normal;
00320     *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
00321   }
00322   if (num_normal == 0 ||
00323       (leading_outliers == 0 && trailing_outliers == 0)) {
00324     return;
00325   }
00326 
00327   // Step two: Try to split off bits of the word that are both outliers
00328   //           and have much lower certainty than average
00329   // Calculate num_leading and leading_certainty.
00330   for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
00331        *num_rebuilt_leading < leading_outliers;
00332        (*num_rebuilt_leading)++) {
00333     float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
00334     if (char_certainty > *unlikely_threshold) {
00335       break;
00336     }
00337     if (char_certainty < *leading_certainty) {
00338       *leading_certainty = char_certainty;
00339     }
00340   }
00341 
00342   // Calculate num_trailing and trailing_certainty.
00343   for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
00344        *num_rebuilt_trailing < trailing_outliers;
00345        (*num_rebuilt_trailing)++) {
00346     int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
00347     float char_certainty = word->best_choice->certainty(blob_idx);
00348     if (char_certainty > *unlikely_threshold) {
00349       break;
00350     }
00351     if (char_certainty < *trailing_certainty) {
00352       *trailing_certainty = char_certainty;
00353     }
00354   }
00355 }
00356 
00357 
00382 WERD_RES *Tesseract::TrySuperscriptSplits(
00383     int num_chopped_leading, float leading_certainty, ScriptPos leading_pos,
00384     int num_chopped_trailing, float trailing_certainty,
00385     ScriptPos trailing_pos,
00386     WERD_RES *word,
00387     bool *is_good,
00388     int *retry_rebuild_leading, int *retry_rebuild_trailing) {
00389   int num_chopped = word->chopped_word->NumBlobs();
00390 
00391   *retry_rebuild_leading = *retry_rebuild_trailing = 0;
00392 
00393   // Chop apart the word into up to three pieces.
00394 
00395   BlamerBundle *bb0 = NULL;
00396   BlamerBundle *bb1 = NULL;
00397   WERD_RES *prefix = NULL;
00398   WERD_RES *core = NULL;
00399   WERD_RES *suffix = NULL;
00400   if (num_chopped_leading > 0) {
00401     prefix = new WERD_RES(*word);
00402     split_word(prefix, num_chopped_leading, &core, &bb0);
00403   } else {
00404     core = new WERD_RES(*word);
00405   }
00406 
00407   if (num_chopped_trailing > 0) {
00408     int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
00409     split_word(core, split_pt, &suffix, &bb1);
00410   }
00411 
00412   //  Recognize the pieces in turn.
00413   int saved_cp_multiplier = classify_class_pruner_multiplier;
00414   int saved_im_multiplier = classify_integer_matcher_multiplier;
00415   if (prefix) {
00416     // Turn off Tesseract's y-position penalties for the leading superscript.
00417     classify_class_pruner_multiplier.set_value(0);
00418     classify_integer_matcher_multiplier.set_value(0);
00419 
00420     // Adjust our expectations about the baseline for this prefix.
00421     if (superscript_debug >= 3) {
00422       tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
00423     }
00424     recog_word_recursive(prefix);
00425     if (superscript_debug >= 2) {
00426       tprintf(" The leading bits look like %s %s\n",
00427               ScriptPosToString(leading_pos),
00428               prefix->best_choice->unichar_string().string());
00429     }
00430 
00431     // Restore the normal y-position penalties.
00432     classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
00433     classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
00434   }
00435 
00436   if (superscript_debug >= 3) {
00437     tprintf(" recognizing middle %d chopped blobs\n",
00438             num_chopped - num_chopped_leading - num_chopped_trailing);
00439   }
00440 
00441   if (suffix) {
00442     // Turn off Tesseract's y-position penalties for the trailing superscript.
00443     classify_class_pruner_multiplier.set_value(0);
00444     classify_integer_matcher_multiplier.set_value(0);
00445 
00446     if (superscript_debug >= 3) {
00447       tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
00448     }
00449     recog_word_recursive(suffix);
00450     if (superscript_debug >= 2) {
00451       tprintf(" The trailing bits look like %s %s\n",
00452               ScriptPosToString(trailing_pos),
00453               suffix->best_choice->unichar_string().string());
00454     }
00455 
00456     // Restore the normal y-position penalties.
00457     classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
00458     classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
00459   }
00460 
00461   // Evaluate whether we think the results are believably better
00462   // than what we already had.
00463   bool good_prefix = !prefix || BelievableSuperscript(
00464       superscript_debug >= 1, *prefix,
00465       superscript_bettered_certainty * leading_certainty,
00466       retry_rebuild_leading, NULL);
00467   bool good_suffix = !suffix || BelievableSuperscript(
00468       superscript_debug >= 1, *suffix,
00469       superscript_bettered_certainty * trailing_certainty,
00470       NULL, retry_rebuild_trailing);
00471 
00472   *is_good = good_prefix && good_suffix;
00473   if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
00474     // None of it is any good. Quit now.
00475     delete core;
00476     delete prefix;
00477     delete suffix;
00478     return NULL;
00479   }
00480   recog_word_recursive(core);
00481 
00482   // Now paste the results together into core.
00483   if (suffix) {
00484     suffix->SetAllScriptPositions(trailing_pos);
00485     join_words(core, suffix, bb1);
00486   }
00487   if (prefix) {
00488     prefix->SetAllScriptPositions(leading_pos);
00489     join_words(prefix, core, bb0);
00490     core = prefix;
00491     prefix = NULL;
00492   }
00493 
00494   if (superscript_debug >= 1) {
00495     tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
00496             core->best_choice->unichar_string().string());
00497   }
00498   return core;
00499 }
00500 
00501 
00520 bool Tesseract::BelievableSuperscript(bool debug,
00521                                       const WERD_RES &word,
00522                                       float certainty_threshold,
00523                                       int *left_ok,
00524                                       int *right_ok) const {
00525   int initial_ok_run_count = 0;
00526   int ok_run_count = 0;
00527   float worst_certainty = 0.0f;
00528   const WERD_CHOICE &wc = *word.best_choice;
00529 
00530   const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
00531   for (int i = 0; i < wc.length(); i++) {
00532     TBLOB *blob = word.rebuild_word->blobs[i];
00533     UNICHAR_ID unichar_id = wc.unichar_id(i);
00534     float char_certainty = wc.certainty(i);
00535     bool bad_certainty = char_certainty < certainty_threshold;
00536     bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
00537     bool is_italic = word.fontinfo && word.fontinfo->is_italic();
00538     BLOB_CHOICE *choice = word.GetBlobChoice(i);
00539     if (choice && fontinfo_table.size() > 0) {
00540       // Get better information from the specific choice, if available.
00541       int font_id1 = choice->fontinfo_id();
00542       bool font1_is_italic = font_id1 >= 0
00543           ? fontinfo_table.get(font_id1).is_italic() : false;
00544       int font_id2 = choice->fontinfo_id2();
00545       is_italic = font1_is_italic &&
00546           (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
00547     }
00548 
00549     float height_fraction = 1.0f;
00550     float char_height = blob->bounding_box().height();
00551     float normal_height = char_height;
00552     if (wc.unicharset()->top_bottom_useful()) {
00553       int min_bot, max_bot, min_top, max_top;
00554       wc.unicharset()->get_top_bottom(unichar_id,
00555                                       &min_bot, &max_bot,
00556                                       &min_top, &max_top);
00557       float hi_height = max_top - max_bot;
00558       float lo_height = min_top - min_bot;
00559       normal_height = (hi_height + lo_height) / 2;
00560       if (normal_height >= kBlnXHeight) {
00561         // Only ding characters that we have decent information for because
00562         // they're supposed to be normal sized, not tiny specks or dashes.
00563         height_fraction = char_height / normal_height;
00564       }
00565     }
00566     bool bad_height = height_fraction < superscript_scaledown_ratio;
00567 
00568     if (debug) {
00569       if (is_italic) {
00570         tprintf(" Rejecting: superscript is italic.\n");
00571       }
00572       if (is_punc) {
00573         tprintf(" Rejecting: punctuation present.\n");
00574       }
00575       const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
00576       if (bad_certainty) {
00577         tprintf(" Rejecting: don't believe character %s with certainty %.2f "
00578                 "which is less than threshold %.2f\n", char_str,
00579                 char_certainty, certainty_threshold);
00580       }
00581       if (bad_height) {
00582         tprintf(" Rejecting: character %s seems too small @ %.2f versus "
00583                 "expected %.2f\n", char_str, char_height, normal_height);
00584       }
00585     }
00586     if (bad_certainty || bad_height || is_punc || is_italic) {
00587       if (ok_run_count == i) {
00588         initial_ok_run_count = ok_run_count;
00589       }
00590       ok_run_count = 0;
00591     } else {
00592       ok_run_count++;
00593     }
00594     if (char_certainty < worst_certainty) {
00595       worst_certainty = char_certainty;
00596     }
00597   }
00598   bool all_ok = ok_run_count == wc.length();
00599   if (all_ok && debug) {
00600     tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
00601   }
00602   if (!all_ok) {
00603     if (left_ok) *left_ok = initial_ok_run_count;
00604     if (right_ok) *right_ok = ok_run_count;
00605   }
00606   return all_ok;
00607 }
00608 
00609 
00610 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines