tesseract
3.03
|
00001 00002 // File: associate.cpp 00003 // Description: Functions for scoring segmentation paths according to 00004 // their character widths, gap widths and seam cuts. 00005 // Author: Daria Antonova 00006 // Created: Mon Mar 8 11:26:43 PDT 2010 00007 // 00008 // (C) Copyright 2010, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 00022 #include <stdio.h> 00023 #ifdef __UNIX__ 00024 #include <assert.h> 00025 #endif 00026 #include <math.h> 00027 00028 #include "associate.h" 00029 #include "normalis.h" 00030 #include "pageres.h" 00031 00032 namespace tesseract { 00033 00034 const float AssociateUtils::kMaxFixedPitchCharAspectRatio = 2.0f; 00035 const float AssociateUtils::kMinGap = 0.03f; 00036 00037 void AssociateUtils::ComputeStats(int col, int row, 00038 const AssociateStats *parent_stats, 00039 int parent_path_length, 00040 bool fixed_pitch, 00041 float max_char_wh_ratio, 00042 WERD_RES *word_res, 00043 bool debug, 00044 AssociateStats *stats) { 00045 stats->Clear(); 00046 00047 ASSERT_HOST(word_res != NULL); 00048 if (word_res->blob_widths.empty()) { 00049 return; 00050 } 00051 if (debug) { 00052 tprintf("AssociateUtils::ComputeStats() for col=%d, row=%d%s\n", 00053 col, row, fixed_pitch ? " (fixed pitch)" : ""); 00054 } 00055 float normalizing_height = kBlnXHeight; 00056 ROW* blob_row = word_res->blob_row; 00057 // TODO(rays/daria) Can unicharset.script_has_xheight be useful here? 00058 if (fixed_pitch && blob_row != NULL) { 00059 // For fixed pitch language like CJK, we use the full text height 00060 // as the normalizing factor so we are not dependent on xheight 00061 // calculation. 00062 if (blob_row->body_size() > 0.0f) { 00063 normalizing_height = word_res->denorm.y_scale() * blob_row->body_size(); 00064 } else { 00065 normalizing_height = word_res->denorm.y_scale() * 00066 (blob_row->x_height() + blob_row->ascenders()); 00067 } 00068 if (debug) { 00069 tprintf("normalizing height = %g (scale %g xheight %g ascenders %g)\n", 00070 normalizing_height, word_res->denorm.y_scale(), 00071 blob_row->x_height(), blob_row->ascenders()); 00072 } 00073 } 00074 float wh_ratio = word_res->GetBlobsWidth(col, row) / normalizing_height; 00075 if (wh_ratio > max_char_wh_ratio) stats->bad_shape = true; 00076 // Compute the gap sum for this shape. If there are only negative or only 00077 // positive gaps, record their sum in stats->gap_sum. However, if there is 00078 // a mixture, record only the sum of the positive gaps. 00079 // TODO(antonova): explain fragment. 00080 int negative_gap_sum = 0; 00081 for (int c = col; c < row; ++c) { 00082 int gap = word_res->GetBlobsGap(c); 00083 (gap > 0) ? stats->gap_sum += gap : negative_gap_sum += gap; 00084 } 00085 if (stats->gap_sum == 0) stats->gap_sum = negative_gap_sum; 00086 if (debug) { 00087 tprintf("wh_ratio=%g (max_char_wh_ratio=%g) gap_sum=%d %s\n", 00088 wh_ratio, max_char_wh_ratio, stats->gap_sum, 00089 stats->bad_shape ? "bad_shape" : ""); 00090 } 00091 // Compute shape_cost (for fixed pitch mode). 00092 if (fixed_pitch) { 00093 bool end_row = (row == (word_res->ratings->dimension() - 1)); 00094 00095 // Ensure that the blob has gaps on the left and the right sides 00096 // (except for beginning and ending punctuation) and that there is 00097 // no cutting through ink at the blob boundaries. 00098 if (col > 0) { 00099 float left_gap = word_res->GetBlobsGap(col - 1) / normalizing_height; 00100 SEAM *left_seam = word_res->seam_array[col - 1]; 00101 if ((!end_row && left_gap < kMinGap) || left_seam->priority > 0.0f) { 00102 stats->bad_shape = true; 00103 } 00104 if (debug) { 00105 tprintf("left_gap %g, left_seam %g %s\n", left_gap, left_seam->priority, 00106 stats->bad_shape ? "bad_shape" : ""); 00107 } 00108 } 00109 float right_gap = 0.0f; 00110 if (!end_row) { 00111 right_gap = word_res->GetBlobsGap(row) / normalizing_height; 00112 SEAM *right_seam = word_res->seam_array[row]; 00113 if (right_gap < kMinGap || right_seam->priority > 0.0f) { 00114 stats->bad_shape = true; 00115 if (right_gap < kMinGap) stats->bad_fixed_pitch_right_gap = true; 00116 } 00117 if (debug) { 00118 tprintf("right_gap %g right_seam %g %s\n", 00119 right_gap, right_seam->priority, 00120 stats->bad_shape ? "bad_shape" : ""); 00121 } 00122 } 00123 00124 // Impose additional segmentation penalties if blob widths or gaps 00125 // distribution don't fit a fixed-pitch model. 00126 // Since we only know the widths and gaps of the path explored so far, 00127 // the means and variances are computed for the path so far (not 00128 // considering characters to the right of the last character on the path). 00129 stats->full_wh_ratio = wh_ratio + right_gap; 00130 if (parent_stats != NULL) { 00131 stats->full_wh_ratio_total = 00132 (parent_stats->full_wh_ratio_total + stats->full_wh_ratio); 00133 float mean = 00134 stats->full_wh_ratio_total / static_cast<float>(parent_path_length+1); 00135 stats->full_wh_ratio_var = 00136 parent_stats->full_wh_ratio_var + pow(mean-stats->full_wh_ratio, 2); 00137 } else { 00138 stats->full_wh_ratio_total = stats->full_wh_ratio; 00139 } 00140 if (debug) { 00141 tprintf("full_wh_ratio %g full_wh_ratio_total %g full_wh_ratio_var %g\n", 00142 stats->full_wh_ratio, stats->full_wh_ratio_total, 00143 stats->full_wh_ratio_var); 00144 } 00145 00146 stats->shape_cost = 00147 FixedPitchWidthCost(wh_ratio, right_gap, end_row, max_char_wh_ratio); 00148 00149 // For some reason Tesseract prefers to treat the whole CJ words 00150 // as one blob when the initial segmentation is particularly bad. 00151 // This hack is to avoid favoring such states. 00152 if (col == 0 && end_row && wh_ratio > max_char_wh_ratio) { 00153 stats->shape_cost += 10; 00154 } 00155 stats->shape_cost += stats->full_wh_ratio_var; 00156 if (debug) tprintf("shape_cost %g\n", stats->shape_cost); 00157 } 00158 } 00159 00160 float AssociateUtils::FixedPitchWidthCost(float norm_width, 00161 float right_gap, 00162 bool end_pos, 00163 float max_char_wh_ratio) { 00164 float cost = 0.0f; 00165 if (norm_width > max_char_wh_ratio) cost += norm_width; 00166 if (norm_width > kMaxFixedPitchCharAspectRatio) 00167 cost += norm_width * norm_width; // extra penalty for merging CJK chars 00168 // Penalize skinny blobs, except for punctuation in the last position. 00169 if (norm_width+right_gap < 0.5f && !end_pos) { 00170 cost += 1.0f - (norm_width + right_gap); 00171 } 00172 return cost; 00173 } 00174 00175 } // namespace tesseract