tesseract
3.03
|
00001 /********************************************************************** 00002 * File: fixxht.cpp (Formerly fixxht.c) 00003 * Description: Improve x_ht and look out for case inconsistencies 00004 * Author: Phil Cheatle 00005 * Created: Thu Aug 5 14:11:08 BST 1993 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include <string.h> 00021 #include <ctype.h> 00022 #include "params.h" 00023 #include "float2int.h" 00024 #include "tesseractclass.h" 00025 00026 namespace tesseract { 00027 00028 // Fixxht overview. 00029 // Premise: Initial estimate of x-height is adequate most of the time, but 00030 // occasionally it is incorrect. Most notable causes of failure are: 00031 // 1. Small caps, where the top of the caps is the same as the body text 00032 // xheight. For small caps words the xheight needs to be reduced to correctly 00033 // recognize the caps in the small caps word. 00034 // 2. All xheight lines, such as summer. Here the initial estimate will have 00035 // guessed that the blob tops are caps and will have placed the xheight too low. 00036 // 3. Noise/logos beside words, or changes in font size on a line. Such 00037 // things can blow the statistics and cause an incorrect estimate. 00038 // 00039 // Algorithm. 00040 // Compare the vertical position (top only) of alphnumerics in a word with 00041 // the range of positions in training data (in the unicharset). 00042 // See CountMisfitTops. If any characters disagree sufficiently with the 00043 // initial xheight estimate, then recalculate the xheight, re-run OCR on 00044 // the word, and if the number of vertical misfits goes down, along with 00045 // either the word rating or certainty, then keep the new xheight. 00046 // The new xheight is calculated as follows:ComputeCompatibleXHeight 00047 // For each alphanumeric character that has a vertically misplaced top 00048 // (a misfit), yet its bottom is within the acceptable range (ie it is not 00049 // likely a sub-or super-script) calculate the range of acceptable xheight 00050 // positions from its range of tops, and give each value in the range a 00051 // number of votes equal to the distance of its top from its acceptance range. 00052 // The x-height position with the median of the votes becomes the new 00053 // x-height. This assumes that most characters will be correctly recognized 00054 // even if the x-height is incorrect. This is not a terrible assumption, but 00055 // it is not great. An improvement would be to use a classifier that does 00056 // not care about vertical position or scaling at all. 00057 00058 // If the max-min top of a unicharset char is bigger than kMaxCharTopRange 00059 // then the char top cannot be used to judge misfits or suggest a new top. 00060 const int kMaxCharTopRange = 48; 00061 00062 // Returns the number of misfit blob tops in this word. 00063 int Tesseract::CountMisfitTops(WERD_RES *word_res) { 00064 int bad_blobs = 0; 00065 int num_blobs = word_res->rebuild_word->NumBlobs(); 00066 for (int blob_id = 0; blob_id < num_blobs; ++blob_id) { 00067 TBLOB* blob = word_res->rebuild_word->blobs[blob_id]; 00068 UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id); 00069 if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) { 00070 int top = blob->bounding_box().top(); 00071 if (top >= INT_FEAT_RANGE) 00072 top = INT_FEAT_RANGE - 1; 00073 int min_bottom, max_bottom, min_top, max_top; 00074 unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, 00075 &min_top, &max_top); 00076 if (max_top - min_top > kMaxCharTopRange) 00077 continue; 00078 bool bad = top < min_top - x_ht_acceptance_tolerance || 00079 top > max_top + x_ht_acceptance_tolerance; 00080 if (bad) 00081 ++bad_blobs; 00082 if (debug_x_ht_level >= 1) { 00083 tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n", 00084 unicharset.id_to_unichar(class_id), 00085 bad ? "Misfit" : "OK", top, min_top, max_top, 00086 static_cast<int>(x_ht_acceptance_tolerance)); 00087 } 00088 } 00089 } 00090 return bad_blobs; 00091 } 00092 00093 // Returns a new x-height maximally compatible with the result in word_res. 00094 // See comment above for overall algorithm. 00095 float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) { 00096 STATS top_stats(0, MAX_UINT8); 00097 int num_blobs = word_res->rebuild_word->NumBlobs(); 00098 for (int blob_id = 0; blob_id < num_blobs; ++blob_id) { 00099 TBLOB* blob = word_res->rebuild_word->blobs[blob_id]; 00100 UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id); 00101 if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) { 00102 int top = blob->bounding_box().top(); 00103 // Clip the top to the limit of normalized feature space. 00104 if (top >= INT_FEAT_RANGE) 00105 top = INT_FEAT_RANGE - 1; 00106 int bottom = blob->bounding_box().bottom(); 00107 int min_bottom, max_bottom, min_top, max_top; 00108 unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, 00109 &min_top, &max_top); 00110 // Chars with a wild top range would mess up the result so ignore them. 00111 if (max_top - min_top > kMaxCharTopRange) 00112 continue; 00113 int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top, 00114 top - (max_top + x_ht_acceptance_tolerance)); 00115 int height = top - kBlnBaselineOffset; 00116 if (debug_x_ht_level >= 20) { 00117 tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d : ", 00118 unicharset.id_to_unichar(class_id), 00119 height, min_bottom, max_bottom, min_top, max_top, 00120 bottom, top); 00121 } 00122 // Use only chars that fit in the expected bottom range, and where 00123 // the range of tops is sensibly near the xheight. 00124 if (min_bottom <= bottom + x_ht_acceptance_tolerance && 00125 bottom - x_ht_acceptance_tolerance <= max_bottom && 00126 min_top > kBlnBaselineOffset && 00127 max_top - kBlnBaselineOffset >= kBlnXHeight && 00128 misfit_dist > 0) { 00129 // Compute the x-height position using proportionality between the 00130 // actual height and expected height. 00131 int min_xht = DivRounded(height * kBlnXHeight, 00132 max_top - kBlnBaselineOffset); 00133 int max_xht = DivRounded(height * kBlnXHeight, 00134 min_top - kBlnBaselineOffset); 00135 if (debug_x_ht_level >= 20) { 00136 tprintf(" xht range min=%d, max=%d\n", 00137 min_xht, max_xht); 00138 } 00139 // The range of expected heights gets a vote equal to the distance 00140 // of the actual top from the expected top. 00141 for (int y = min_xht; y <= max_xht; ++y) 00142 top_stats.add(y, misfit_dist); 00143 } else if (debug_x_ht_level >= 20) { 00144 tprintf(" already OK\n"); 00145 } 00146 } 00147 } 00148 if (top_stats.get_total() == 0) 00149 return 0.0f; 00150 // The new xheight is just the median vote, which is then scaled out 00151 // of BLN space back to pixel space to get the x-height in pixel space. 00152 float new_xht = top_stats.median(); 00153 if (debug_x_ht_level >= 20) { 00154 tprintf("Median xht=%f\n", new_xht); 00155 tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", 00156 new_xht, new_xht / word_res->denorm.y_scale()); 00157 } 00158 // The xheight must change by at least x_ht_min_change to be used. 00159 if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change) 00160 return new_xht / word_res->denorm.y_scale(); 00161 else 00162 return 0.0f; 00163 } 00164 00165 } // namespace tesseract