tesseract
3.03
|
00001 /********************************************************************** 00002 * tospace.cpp 00003 * 00004 * Compute fuzzy word spacing thresholds for each row. 00005 * I.e. set : max_nonspace 00006 * space_threshold 00007 * min_space 00008 * kern_size 00009 * space_size 00010 * for each row. 00011 * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE 00012 * 00013 * Note: functions in this file were originally not members of any 00014 * class or enclosed by any namespace. Now they are all static members 00015 * of the Textord class. 00016 * 00017 **********************************************************************/ 00018 00019 #include "drawtord.h" 00020 #include "ndminx.h" 00021 #include "statistc.h" 00022 #include "textord.h" 00023 #include "tovars.h" 00024 00025 // Include automatically generated configuration file if running autoconf. 00026 #ifdef HAVE_CONFIG_H 00027 #include "config_auto.h" 00028 #endif 00029 00030 #define MAXSPACING 128 /*max expected spacing in pix */ 00031 00032 namespace tesseract { 00033 void Textord::to_spacing( 00034 ICOORD page_tr, //topright of page 00035 TO_BLOCK_LIST *blocks //blocks on page 00036 ) { 00037 TO_BLOCK_IT block_it; //iterator 00038 TO_BLOCK *block; //current block; 00039 TO_ROW_IT row_it; //row iterator 00040 TO_ROW *row; //current row 00041 int block_index; //block number 00042 int row_index; //row number 00043 //estimated width of real spaces for whole block 00044 inT16 block_space_gap_width; 00045 //estimated width of non space gaps for whole block 00046 inT16 block_non_space_gap_width; 00047 BOOL8 old_text_ord_proportional;//old fixed/prop result 00048 GAPMAP *gapmap = NULL; //map of big vert gaps in blk 00049 00050 block_it.set_to_list (blocks); 00051 block_index = 1; 00052 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00053 block_it.forward ()) { 00054 block = block_it.data (); 00055 gapmap = new GAPMAP (block); 00056 block_spacing_stats(block, 00057 gapmap, 00058 old_text_ord_proportional, 00059 block_space_gap_width, 00060 block_non_space_gap_width); 00061 // Make sure relative values of block-level space and non-space gap 00062 // widths are reasonable. The ratio of 1:3 is also used in 00063 // block_spacing_stats, to corrrect the block_space_gap_width 00064 // Useful for arabic and hindi, when the non-space gap width is 00065 // often over-estimated and should not be trusted. A similar ratio 00066 // is found in block_spacing_stats. 00067 if (tosp_old_to_method && tosp_old_to_constrain_sp_kn && 00068 (float) block_space_gap_width / block_non_space_gap_width < 3.0) { 00069 block_non_space_gap_width = (inT16) floor (block_space_gap_width / 3.0); 00070 } 00071 row_it.set_to_list (block->get_rows ()); 00072 row_index = 1; 00073 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00074 row = row_it.data (); 00075 if ((row->pitch_decision == PITCH_DEF_PROP) || 00076 (row->pitch_decision == PITCH_CORR_PROP)) { 00077 if ((tosp_debug_level > 0) && !old_text_ord_proportional) 00078 tprintf ("Block %d Row %d: Now Proportional\n", 00079 block_index, row_index); 00080 row_spacing_stats(row, 00081 gapmap, 00082 block_index, 00083 row_index, 00084 block_space_gap_width, 00085 block_non_space_gap_width); 00086 } 00087 else { 00088 if ((tosp_debug_level > 0) && old_text_ord_proportional) 00089 tprintf 00090 ("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", 00091 block_index, row_index, row->pitch_decision, 00092 row->fixed_pitch); 00093 } 00094 #ifndef GRAPHICS_DISABLED 00095 if (textord_show_initial_words) 00096 plot_word_decisions (to_win, (inT16) row->fixed_pitch, row); 00097 #endif 00098 row_index++; 00099 } 00100 delete gapmap; 00101 block_index++; 00102 } 00103 } 00104 00105 00106 /************************************************************************* 00107 * block_spacing_stats() 00108 *************************************************************************/ 00109 00110 void Textord::block_spacing_stats( 00111 TO_BLOCK *block, 00112 GAPMAP *gapmap, 00113 BOOL8 &old_text_ord_proportional, 00114 inT16 &block_space_gap_width, //resulting estimate 00115 inT16 &block_non_space_gap_width //resulting estimate 00116 ) { 00117 TO_ROW_IT row_it; //row iterator 00118 TO_ROW *row; //current row 00119 BLOBNBOX_IT blob_it; //iterator 00120 00121 STATS centre_to_centre_stats (0, MAXSPACING); 00122 //DEBUG USE ONLY 00123 STATS all_gap_stats (0, MAXSPACING); 00124 STATS space_gap_stats (0, MAXSPACING); 00125 inT16 minwidth = MAX_INT16; //narrowest blob 00126 TBOX blob_box; 00127 TBOX prev_blob_box; 00128 inT16 centre_to_centre; 00129 inT16 gap_width; 00130 float real_space_threshold; 00131 float iqr_centre_to_centre; //DEBUG USE ONLY 00132 float iqr_all_gap_stats; //DEBUG USE ONLY 00133 inT32 end_of_row; 00134 inT32 row_length; 00135 00136 row_it.set_to_list (block->get_rows ()); 00137 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00138 row = row_it.data (); 00139 if (!row->blob_list ()->empty () && 00140 (!tosp_only_use_prop_rows || 00141 (row->pitch_decision == PITCH_DEF_PROP) || 00142 (row->pitch_decision == PITCH_CORR_PROP))) { 00143 blob_it.set_to_list (row->blob_list ()); 00144 blob_it.mark_cycle_pt (); 00145 end_of_row = blob_it.data_relative (-1)->bounding_box ().right (); 00146 if (tosp_use_pre_chopping) 00147 blob_box = box_next_pre_chopped (&blob_it); 00148 else if (tosp_stats_use_xht_gaps) 00149 blob_box = reduced_box_next (row, &blob_it); 00150 else 00151 blob_box = box_next (&blob_it); 00152 row_length = end_of_row - blob_box.left (); 00153 if (blob_box.width () < minwidth) 00154 minwidth = blob_box.width (); 00155 prev_blob_box = blob_box; 00156 while (!blob_it.cycled_list ()) { 00157 if (tosp_use_pre_chopping) 00158 blob_box = box_next_pre_chopped (&blob_it); 00159 else if (tosp_stats_use_xht_gaps) 00160 blob_box = reduced_box_next (row, &blob_it); 00161 else 00162 blob_box = box_next (&blob_it); 00163 if (blob_box.width () < minwidth) 00164 minwidth = blob_box.width (); 00165 gap_width = blob_box.left () - prev_blob_box.right (); 00166 if (!ignore_big_gap (row, row_length, gapmap, 00167 prev_blob_box.right (), blob_box.left ())) { 00168 all_gap_stats.add (gap_width, 1); 00169 00170 centre_to_centre = (blob_box.left () + blob_box.right () - 00171 (prev_blob_box.left () + 00172 prev_blob_box.right ())) / 2; 00173 //DEBUG 00174 centre_to_centre_stats.add (centre_to_centre, 1); 00175 // DEBUG 00176 } 00177 prev_blob_box = blob_box; 00178 } 00179 } 00180 } 00181 00182 //Inadequate samples 00183 if (all_gap_stats.get_total () <= 1) { 00184 block_non_space_gap_width = minwidth; 00185 block_space_gap_width = -1; //No est. space width 00186 //DEBUG 00187 old_text_ord_proportional = TRUE; 00188 } 00189 else { 00190 /* For debug only ..... */ 00191 iqr_centre_to_centre = centre_to_centre_stats.ile (0.75) - 00192 centre_to_centre_stats.ile (0.25); 00193 iqr_all_gap_stats = all_gap_stats.ile (0.75) - all_gap_stats.ile (0.25); 00194 old_text_ord_proportional = 00195 iqr_centre_to_centre * 2 > iqr_all_gap_stats; 00196 /* .......For debug only */ 00197 00198 /* 00199 The median of the gaps is used as an estimate of the NON-SPACE gap width. 00200 This RELIES on the assumption that there are more gaps WITHIN words than 00201 BETWEEN words in a block 00202 00203 Now try to estimate the width of a real space for all real spaces in the 00204 block. Do this by using a crude threshold to ignore "narrow" gaps, then 00205 find the median of the "wide" gaps and use this. 00206 */ 00207 block_non_space_gap_width = (inT16) floor (all_gap_stats.median ()); 00208 // median gap 00209 00210 row_it.set_to_list (block->get_rows ()); 00211 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00212 row = row_it.data (); 00213 if (!row->blob_list ()->empty () && 00214 (!tosp_only_use_prop_rows || 00215 (row->pitch_decision == PITCH_DEF_PROP) || 00216 (row->pitch_decision == PITCH_CORR_PROP))) { 00217 real_space_threshold = 00218 MAX (tosp_init_guess_kn_mult * block_non_space_gap_width, 00219 tosp_init_guess_xht_mult * row->xheight); 00220 blob_it.set_to_list (row->blob_list ()); 00221 blob_it.mark_cycle_pt (); 00222 end_of_row = 00223 blob_it.data_relative (-1)->bounding_box ().right (); 00224 if (tosp_use_pre_chopping) 00225 blob_box = box_next_pre_chopped (&blob_it); 00226 else if (tosp_stats_use_xht_gaps) 00227 blob_box = reduced_box_next (row, &blob_it); 00228 else 00229 blob_box = box_next (&blob_it); 00230 row_length = blob_box.left () - end_of_row; 00231 prev_blob_box = blob_box; 00232 while (!blob_it.cycled_list ()) { 00233 if (tosp_use_pre_chopping) 00234 blob_box = box_next_pre_chopped (&blob_it); 00235 else if (tosp_stats_use_xht_gaps) 00236 blob_box = reduced_box_next (row, &blob_it); 00237 else 00238 blob_box = box_next (&blob_it); 00239 gap_width = blob_box.left () - prev_blob_box.right (); 00240 if ((gap_width > real_space_threshold) && 00241 !ignore_big_gap (row, row_length, gapmap, 00242 prev_blob_box.right (), 00243 blob_box.left ())) { 00244 /* 00245 If tosp_use_cert_spaces is enabled, the estimate of the space gap is 00246 restricted to obvious spaces - those wider than half the xht or those 00247 with wide blobs on both sides - i.e not things that are suspect 1's or 00248 punctuation that is sometimes widely spaced. 00249 */ 00250 if (!tosp_block_use_cert_spaces || 00251 (gap_width > 00252 tosp_fuzzy_space_factor2 * row->xheight) 00253 || 00254 ((gap_width > 00255 tosp_fuzzy_space_factor1 * row->xheight) 00256 && (!tosp_narrow_blobs_not_cert 00257 || (!narrow_blob (row, prev_blob_box) 00258 && !narrow_blob (row, blob_box)))) 00259 || (wide_blob (row, prev_blob_box) 00260 && wide_blob (row, blob_box))) 00261 space_gap_stats.add (gap_width, 1); 00262 } 00263 prev_blob_box = blob_box; 00264 } 00265 } 00266 } 00267 //Inadequate samples 00268 if (space_gap_stats.get_total () <= 2) 00269 block_space_gap_width = -1;//No est. space width 00270 else 00271 block_space_gap_width = 00272 MAX ((inT16) floor (space_gap_stats.median ()), 00273 3 * block_non_space_gap_width); 00274 } 00275 } 00276 00277 00278 /************************************************************************* 00279 * row_spacing_stats() 00280 * Set values for min_space, max_non_space based on row stats only 00281 * If failure - return 0 values. 00282 *************************************************************************/ 00283 void Textord::row_spacing_stats( 00284 TO_ROW *row, 00285 GAPMAP *gapmap, 00286 inT16 block_idx, 00287 inT16 row_idx, 00288 inT16 block_space_gap_width, //estimate for block 00289 inT16 block_non_space_gap_width //estimate for block 00290 ) { 00291 //iterator 00292 BLOBNBOX_IT blob_it = row->blob_list (); 00293 STATS all_gap_stats (0, MAXSPACING); 00294 STATS cert_space_gap_stats (0, MAXSPACING); 00295 STATS all_space_gap_stats (0, MAXSPACING); 00296 STATS small_gap_stats (0, MAXSPACING); 00297 TBOX blob_box; 00298 TBOX prev_blob_box; 00299 inT16 gap_width; 00300 inT16 real_space_threshold = 0; 00301 inT16 max = 0; 00302 inT16 index; 00303 inT16 large_gap_count = 0; 00304 BOOL8 suspected_table; 00305 inT32 max_max_nonspace; //upper bound 00306 BOOL8 good_block_space_estimate = block_space_gap_width > 0; 00307 inT32 end_of_row; 00308 inT32 row_length = 0; 00309 float sane_space; 00310 inT32 sane_threshold; 00311 00312 /* Collect first pass stats for row */ 00313 00314 if (!good_block_space_estimate) 00315 block_space_gap_width = inT16 (floor (row->xheight / 2)); 00316 if (!row->blob_list ()->empty ()) { 00317 if (tosp_threshold_bias1 > 0) 00318 real_space_threshold = 00319 block_non_space_gap_width + 00320 inT16 (floor (0.5 + 00321 tosp_threshold_bias1 * (block_space_gap_width - 00322 block_non_space_gap_width))); 00323 else 00324 real_space_threshold = //Old TO method 00325 (block_space_gap_width + block_non_space_gap_width) / 2; 00326 blob_it.set_to_list (row->blob_list ()); 00327 blob_it.mark_cycle_pt (); 00328 end_of_row = blob_it.data_relative (-1)->bounding_box ().right (); 00329 if (tosp_use_pre_chopping) 00330 blob_box = box_next_pre_chopped (&blob_it); 00331 else if (tosp_stats_use_xht_gaps) 00332 blob_box = reduced_box_next (row, &blob_it); 00333 else 00334 blob_box = box_next (&blob_it); 00335 row_length = end_of_row - blob_box.left (); 00336 prev_blob_box = blob_box; 00337 while (!blob_it.cycled_list ()) { 00338 if (tosp_use_pre_chopping) 00339 blob_box = box_next_pre_chopped (&blob_it); 00340 else if (tosp_stats_use_xht_gaps) 00341 blob_box = reduced_box_next (row, &blob_it); 00342 else 00343 blob_box = box_next (&blob_it); 00344 gap_width = blob_box.left () - prev_blob_box.right (); 00345 if (ignore_big_gap (row, row_length, gapmap, 00346 prev_blob_box.right (), blob_box.left ())) 00347 large_gap_count++; 00348 else { 00349 if (gap_width >= real_space_threshold) { 00350 if (!tosp_row_use_cert_spaces || 00351 (gap_width > tosp_fuzzy_space_factor2 * row->xheight) || 00352 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) 00353 && (!tosp_narrow_blobs_not_cert 00354 || (!narrow_blob (row, prev_blob_box) 00355 && !narrow_blob (row, blob_box)))) 00356 || (wide_blob (row, prev_blob_box) 00357 && wide_blob (row, blob_box))) 00358 cert_space_gap_stats.add (gap_width, 1); 00359 all_space_gap_stats.add (gap_width, 1); 00360 } 00361 else 00362 small_gap_stats.add (gap_width, 1); 00363 all_gap_stats.add (gap_width, 1); 00364 } 00365 prev_blob_box = blob_box; 00366 } 00367 } 00368 suspected_table = (large_gap_count > 1) || 00369 ((large_gap_count > 0) && 00370 (all_gap_stats.get_total () <= tosp_few_samples)); 00371 00372 /* Now determine row kern size, space size and threshold */ 00373 00374 if ((cert_space_gap_stats.get_total () >= 00375 tosp_enough_space_samples_for_median) || 00376 ((suspected_table || 00377 all_gap_stats.get_total () <= tosp_short_row) && 00378 cert_space_gap_stats.get_total () > 0)) { 00379 old_to_method(row, 00380 &all_gap_stats, 00381 &cert_space_gap_stats, 00382 &small_gap_stats, 00383 block_space_gap_width, 00384 block_non_space_gap_width); 00385 } else { 00386 if (!tosp_recovery_isolated_row_stats || 00387 !isolated_row_stats (row, gapmap, &all_gap_stats, suspected_table, 00388 block_idx, row_idx)) { 00389 if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) 00390 tprintf ("B:%d R:%d -- Inadequate certain spaces.\n", 00391 block_idx, row_idx); 00392 if (tosp_row_use_cert_spaces1 && good_block_space_estimate) { 00393 //Use block default 00394 row->space_size = block_space_gap_width; 00395 if (all_gap_stats.get_total () > tosp_redo_kern_limit) 00396 row->kern_size = all_gap_stats.median (); 00397 else 00398 row->kern_size = block_non_space_gap_width; 00399 row->space_threshold = 00400 inT32 (floor ((row->space_size + row->kern_size) / 00401 tosp_old_sp_kn_th_factor)); 00402 } 00403 else 00404 old_to_method(row, 00405 &all_gap_stats, 00406 &all_space_gap_stats, 00407 &small_gap_stats, 00408 block_space_gap_width, 00409 block_non_space_gap_width); 00410 } 00411 } 00412 00413 if (tosp_improve_thresh && !suspected_table) 00414 improve_row_threshold(row, &all_gap_stats); 00415 00416 /* Now lets try to be careful not to do anything silly with tables when we 00417 are ignoring big gaps*/ 00418 if (tosp_sanity_method == 0) { 00419 if (suspected_table && 00420 (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) { 00421 if (tosp_debug_level > 5) 00422 tprintf ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f.\n", 00423 block_idx, row_idx, 00424 row->kern_size, row->space_threshold, row->space_size); 00425 row->space_threshold = 00426 (inT32) (tosp_table_kn_sp_ratio * row->kern_size); 00427 row->space_size = MAX (row->space_threshold + 1, row->xheight); 00428 } 00429 } 00430 else if (tosp_sanity_method == 1) { 00431 sane_space = row->space_size; 00432 /* NEVER let space size get too close to kern size */ 00433 if ((row->space_size < tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5)) 00434 || ((row->space_size - row->kern_size) < 00435 (tosp_silly_kn_sp_gap * row->xheight))) { 00436 if (good_block_space_estimate && 00437 (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) 00438 sane_space = block_space_gap_width; 00439 else 00440 sane_space = 00441 MAX (tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5), 00442 row->xheight / 2); 00443 if (tosp_debug_level > 5) 00444 tprintf 00445 ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", 00446 block_idx, row_idx, row->kern_size, row->space_threshold, 00447 row->space_size, sane_space); 00448 row->space_size = sane_space; 00449 row->space_threshold = 00450 inT32 (floor ((row->space_size + row->kern_size) / 00451 tosp_old_sp_kn_th_factor)); 00452 } 00453 /* NEVER let threshold get VERY far away from kern */ 00454 sane_threshold = inT32 (floor (tosp_max_sane_kn_thresh * 00455 MAX (row->kern_size, 2.5))); 00456 if (row->space_threshold > sane_threshold) { 00457 if (tosp_debug_level > 5) 00458 tprintf ("B:%d R:%d -- DONT BELIEVE THRESH %3.2f %d %3.2f->%d.\n", 00459 block_idx, row_idx, 00460 row->kern_size, 00461 row->space_threshold, row->space_size, sane_threshold); 00462 row->space_threshold = sane_threshold; 00463 if (row->space_size <= sane_threshold) 00464 row->space_size = row->space_threshold + 1.0f; 00465 } 00466 /* Beware of tables - there may be NO spaces */ 00467 if (suspected_table) { 00468 sane_space = MAX (tosp_table_kn_sp_ratio * row->kern_size, 00469 tosp_table_xht_sp_ratio * row->xheight); 00470 sane_threshold = inT32 (floor ((sane_space + row->kern_size) / 2)); 00471 00472 if ((row->space_size < sane_space) || 00473 (row->space_threshold < sane_threshold)) { 00474 if (tosp_debug_level > 5) 00475 tprintf ("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", 00476 block_idx, row_idx, 00477 row->kern_size, 00478 row->space_threshold, row->space_size); 00479 //the minimum sane value 00480 row->space_threshold = (inT32) sane_space; 00481 row->space_size = MAX (row->space_threshold + 1, row->xheight); 00482 } 00483 } 00484 } 00485 00486 /* Now lets try to put some error limits on the threshold */ 00487 00488 if (tosp_old_to_method) { 00489 /* Old textord made a space if gap >= threshold */ 00490 //NO FUZZY SPACES YET 00491 row->max_nonspace = row->space_threshold; 00492 //NO FUZZY SPACES YET 00493 row->min_space = row->space_threshold + 1; 00494 } 00495 else { 00496 /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */ 00497 row->min_space = 00498 MIN (inT32 (ceil (tosp_fuzzy_space_factor * row->xheight)), 00499 inT32 (row->space_size)); 00500 if (row->min_space <= row->space_threshold) 00501 //Dont be silly 00502 row->min_space = row->space_threshold + 1; 00503 /* 00504 Lets try to guess the max certain kern gap by looking at the cluster of 00505 kerns for the row. The row is proportional so the kerns should cluster 00506 tightly at the bottom of the distribution. We also expect most gaps to be 00507 kerns. Find the maximum of the kern piles between 0 and twice the kern 00508 estimate. Piles before the first one with less than 1/10 the maximum 00509 number of samples can be taken as certain kerns. 00510 00511 Of course, there are some cases where the kern peak and space peaks merge, 00512 so we will put an UPPER limit on the max certain kern gap of some fraction 00513 below the threshold. 00514 */ 00515 00516 max_max_nonspace = inT32 ((row->space_threshold + row->kern_size) / 2); 00517 00518 //default 00519 row->max_nonspace = max_max_nonspace; 00520 for (index = 0; index <= max_max_nonspace; index++) { 00521 if (all_gap_stats.pile_count (index) > max) 00522 max = all_gap_stats.pile_count (index); 00523 if ((index > row->kern_size) && 00524 (all_gap_stats.pile_count (index) < 0.1 * max)) { 00525 row->max_nonspace = index; 00526 break; 00527 } 00528 } 00529 } 00530 00531 /* Yet another algorithm - simpler this time - just choose a fraction of the 00532 threshold to space range */ 00533 00534 if ((tosp_fuzzy_sp_fraction > 0) && 00535 (row->space_size > row->space_threshold)) 00536 row->min_space = MAX (row->min_space, 00537 (inT32) ceil (row->space_threshold + 00538 tosp_fuzzy_sp_fraction * 00539 (row->space_size - 00540 row->space_threshold))); 00541 00542 /* Ensure that ANY space less than some multiplier times the kern size is 00543 fuzzy. In tables there is a risk of erroneously setting a small space size 00544 when there are no real spaces. Sometimes tables have text squashed into 00545 columns so that the kn->sp ratio is small anyway - this means that we cant 00546 use this to force a wider separation - hence we rely on context to join any 00547 dubious breaks. */ 00548 00549 if ((tosp_table_fuzzy_kn_sp_ratio > 0) && 00550 (suspected_table || tosp_fuzzy_limit_all)) 00551 row->min_space = MAX (row->min_space, 00552 (inT32) ceil (tosp_table_fuzzy_kn_sp_ratio * 00553 row->kern_size)); 00554 00555 if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) { 00556 row->max_nonspace = (inT32) floor (0.5 + row->kern_size + 00557 tosp_fuzzy_kn_fraction * 00558 (row->space_threshold - 00559 row->kern_size)); 00560 } 00561 if (row->max_nonspace > row->space_threshold) { 00562 //Dont be silly 00563 row->max_nonspace = row->space_threshold; 00564 } 00565 00566 if (tosp_debug_level > 5) 00567 tprintf 00568 ("B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) Sp:%3.2f\n", 00569 block_idx, row_idx, row_length, block_non_space_gap_width, 00570 block_space_gap_width, real_space_threshold, row->kern_size, 00571 row->max_nonspace, row->space_threshold, row->min_space, 00572 row->space_size); 00573 if (tosp_debug_level > 10) 00574 tprintf("row->kern_size = %3.2f, row->space_size = %3.2f, " 00575 "row->space_threshold = %d\n", 00576 row->kern_size, row->space_size, row->space_threshold); 00577 } 00578 00579 void Textord::old_to_method( 00580 TO_ROW *row, 00581 STATS *all_gap_stats, 00582 STATS *space_gap_stats, 00583 STATS *small_gap_stats, 00584 inT16 block_space_gap_width, //estimate for block 00585 inT16 block_non_space_gap_width //estimate for block 00586 ) { 00587 /* First, estimate row space size */ 00588 /* Old to condition was > 2 */ 00589 if (space_gap_stats->get_total () >= tosp_enough_space_samples_for_median) { 00590 //Adequate samples 00591 /* Set space size to median of spaces BUT limits it if it seems wildly out */ 00592 row->space_size = space_gap_stats->median (); 00593 if (row->space_size > block_space_gap_width * 1.5) { 00594 if (tosp_old_to_bug_fix) 00595 row->space_size = block_space_gap_width * 1.5; 00596 else 00597 //BUG??? should be *1.5 00598 row->space_size = block_space_gap_width; 00599 } 00600 if (row->space_size < (block_non_space_gap_width * 2) + 1) 00601 row->space_size = (block_non_space_gap_width * 2) + 1; 00602 } 00603 //Only 1 or 2 samples 00604 else if (space_gap_stats->get_total () >= 1) { 00605 //hence mean not median 00606 row->space_size = space_gap_stats->mean (); 00607 if (row->space_size > block_space_gap_width * 1.5) { 00608 if (tosp_old_to_bug_fix) 00609 row->space_size = block_space_gap_width * 1.5; 00610 else 00611 //BUG??? should be *1.5 00612 row->space_size = block_space_gap_width; 00613 } 00614 if (row->space_size < (block_non_space_gap_width * 3) + 1) 00615 row->space_size = (block_non_space_gap_width * 3) + 1; 00616 } 00617 else { 00618 //Use block default 00619 row->space_size = block_space_gap_width; 00620 } 00621 00622 /* Next, estimate row kern size */ 00623 if ((tosp_only_small_gaps_for_kern) && 00624 (small_gap_stats->get_total () > tosp_redo_kern_limit)) 00625 row->kern_size = small_gap_stats->median (); 00626 else if (all_gap_stats->get_total () > tosp_redo_kern_limit) 00627 row->kern_size = all_gap_stats->median (); 00628 else //old TO -SAME FOR ALL ROWS 00629 row->kern_size = block_non_space_gap_width; 00630 00631 /* Finally, estimate row space threshold */ 00632 if (tosp_threshold_bias2 > 0) { 00633 row->space_threshold = 00634 inT32 (floor (0.5 + row->kern_size + 00635 tosp_threshold_bias2 * (row->space_size - 00636 row->kern_size))); 00637 } else { 00638 /* 00639 NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold 00640 and holds this in a float. The use is with a >= test 00641 NEW textord uses an integer threshold and a > test 00642 It comes to the same thing. 00643 (Though there is a difference in that old textor has integer space_size 00644 and kern_size.) 00645 */ 00646 row->space_threshold = 00647 inT32 (floor ((row->space_size + row->kern_size) / 2)); 00648 } 00649 00650 // Apply the same logic and ratios as in row_spacing_stats to 00651 // restrict relative values of the row's space_size, kern_size, and 00652 // space_threshold 00653 if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 && 00654 ((row->space_size < 00655 tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5)) || 00656 ((row->space_size - row->kern_size) < 00657 tosp_silly_kn_sp_gap * row->xheight))) { 00658 if (row->kern_size > 2.5) 00659 row->kern_size = row->space_size / tosp_min_sane_kn_sp; 00660 row->space_threshold = inT32 (floor ((row->space_size + row->kern_size) / 00661 tosp_old_sp_kn_th_factor)); 00662 } 00663 } 00664 00665 00666 /************************************************************************* 00667 * isolated_row_stats() 00668 * Set values for min_space, max_non_space based on row stats only 00669 *************************************************************************/ 00670 BOOL8 Textord::isolated_row_stats(TO_ROW *row, 00671 GAPMAP *gapmap, 00672 STATS *all_gap_stats, 00673 BOOL8 suspected_table, 00674 inT16 block_idx, 00675 inT16 row_idx) { 00676 float kern_estimate; 00677 float crude_threshold_estimate; 00678 inT16 small_gaps_count; 00679 inT16 total; 00680 //iterator 00681 BLOBNBOX_IT blob_it = row->blob_list (); 00682 STATS cert_space_gap_stats (0, MAXSPACING); 00683 STATS all_space_gap_stats (0, MAXSPACING); 00684 STATS small_gap_stats (0, MAXSPACING); 00685 TBOX blob_box; 00686 TBOX prev_blob_box; 00687 inT16 gap_width; 00688 inT32 end_of_row; 00689 inT32 row_length; 00690 00691 kern_estimate = all_gap_stats->median (); 00692 crude_threshold_estimate = MAX (tosp_init_guess_kn_mult * kern_estimate, 00693 tosp_init_guess_xht_mult * row->xheight); 00694 small_gaps_count = stats_count_under (all_gap_stats, 00695 (inT16) 00696 ceil (crude_threshold_estimate)); 00697 total = all_gap_stats->get_total (); 00698 00699 if ((total <= tosp_redo_kern_limit) || 00700 ((small_gaps_count / (float) total) < tosp_enough_small_gaps) || 00701 (total - small_gaps_count < 1)) { 00702 if (tosp_debug_level > 5) 00703 tprintf ("B:%d R:%d -- Cant do isolated row stats.\n", 00704 block_idx, row_idx); 00705 return FALSE; 00706 } 00707 blob_it.set_to_list (row->blob_list ()); 00708 blob_it.mark_cycle_pt (); 00709 end_of_row = blob_it.data_relative (-1)->bounding_box ().right (); 00710 if (tosp_use_pre_chopping) 00711 blob_box = box_next_pre_chopped (&blob_it); 00712 else if (tosp_stats_use_xht_gaps) 00713 blob_box = reduced_box_next (row, &blob_it); 00714 else 00715 blob_box = box_next (&blob_it); 00716 row_length = end_of_row - blob_box.left (); 00717 prev_blob_box = blob_box; 00718 while (!blob_it.cycled_list ()) { 00719 if (tosp_use_pre_chopping) 00720 blob_box = box_next_pre_chopped (&blob_it); 00721 else if (tosp_stats_use_xht_gaps) 00722 blob_box = reduced_box_next (row, &blob_it); 00723 else 00724 blob_box = box_next (&blob_it); 00725 gap_width = blob_box.left () - prev_blob_box.right (); 00726 if (!ignore_big_gap (row, row_length, gapmap, 00727 prev_blob_box.right (), blob_box.left ()) && 00728 (gap_width > crude_threshold_estimate)) { 00729 if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) || 00730 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) && 00731 (!tosp_narrow_blobs_not_cert || 00732 (!narrow_blob (row, prev_blob_box) && 00733 !narrow_blob (row, blob_box)))) || 00734 (wide_blob (row, prev_blob_box) && wide_blob (row, blob_box))) 00735 cert_space_gap_stats.add (gap_width, 1); 00736 all_space_gap_stats.add (gap_width, 1); 00737 } 00738 if (gap_width < crude_threshold_estimate) 00739 small_gap_stats.add (gap_width, 1); 00740 00741 prev_blob_box = blob_box; 00742 } 00743 if (cert_space_gap_stats.get_total () >= 00744 tosp_enough_space_samples_for_median) 00745 //median 00746 row->space_size = cert_space_gap_stats.median (); 00747 else if (suspected_table && (cert_space_gap_stats.get_total () > 0)) 00748 //to avoid spaced 00749 row->space_size = cert_space_gap_stats.mean (); 00750 // 1's in tables 00751 else if (all_space_gap_stats.get_total () >= 00752 tosp_enough_space_samples_for_median) 00753 //median 00754 row->space_size = all_space_gap_stats.median (); 00755 else 00756 row->space_size = all_space_gap_stats.mean (); 00757 00758 if (tosp_only_small_gaps_for_kern) 00759 row->kern_size = small_gap_stats.median (); 00760 else 00761 row->kern_size = all_gap_stats->median (); 00762 row->space_threshold = 00763 inT32 (floor ((row->space_size + row->kern_size) / 2)); 00764 /* Sanity check */ 00765 if ((row->kern_size >= row->space_threshold) || 00766 (row->space_threshold >= row->space_size) || 00767 (row->space_threshold <= 0)) { 00768 if (tosp_debug_level > 5) 00769 tprintf ("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", 00770 block_idx, row_idx, 00771 row->kern_size, row->space_threshold, row->space_size); 00772 row->kern_size = 0.0f; 00773 row->space_threshold = 0; 00774 row->space_size = 0.0f; 00775 return FALSE; 00776 } 00777 00778 if (tosp_debug_level > 5) 00779 tprintf ("B:%d R:%d -- Isolated row stats: %f %d %f\n", 00780 block_idx, row_idx, 00781 row->kern_size, row->space_threshold, row->space_size); 00782 return TRUE; 00783 } 00784 00785 inT16 Textord::stats_count_under(STATS *stats, inT16 threshold) { 00786 inT16 index; 00787 inT16 total = 0; 00788 00789 for (index = 0; index < threshold; index++) 00790 total += stats->pile_count (index); 00791 return total; 00792 } 00793 00794 00795 /************************************************************************* 00796 * improve_row_threshold() 00797 * Try to recognise a "normal line" - 00798 * > 25 gaps 00799 * && space > 3 * kn && space > 10 00800 * (I.e. reasonably large space and kn:sp ratio) 00801 * && > 3/4 # gaps < kn + (sp - kn)/3 00802 * (I.e. most gaps are well away from space estimate) 00803 * && a gap of max( 3, (sp - kn)/3 ) empty histogram positions is found 00804 * somewhere in the histogram between kn and sp 00805 * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies 00806 * NO!!!!! the bristol line has "11" with a gap of 12 between the 1's!!! 00807 * try moving the default threshold to within this band but leave the 00808 * fuzzy limit calculation as at present. 00809 *************************************************************************/ 00810 void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) { 00811 float sp = row->space_size; 00812 float kn = row->kern_size; 00813 inT16 reqd_zero_width = 0; 00814 inT16 zero_width = 0; 00815 inT16 zero_start = 0; 00816 inT16 index = 0; 00817 00818 if (tosp_debug_level > 10) 00819 tprintf ("Improve row threshold 0"); 00820 if ((all_gap_stats->get_total () <= 25) || 00821 (sp <= 10) || 00822 (sp <= 3 * kn) || 00823 (stats_count_under (all_gap_stats, 00824 (inT16) ceil (kn + (sp - kn) / 3 + 0.5)) < 00825 (0.75 * all_gap_stats->get_total ()))) 00826 return; 00827 if (tosp_debug_level > 10) 00828 tprintf (" 1"); 00829 /* 00830 Look for the first region of all 0's in the histogram which is wider than 00831 max( 3, (sp - kn)/3 ) and starts between kn and sp. If found, and current 00832 threshold is not within it, move the threshold so that is is just inside it. 00833 */ 00834 reqd_zero_width = (inT16) floor ((sp - kn) / 3 + 0.5); 00835 if (reqd_zero_width < 3) 00836 reqd_zero_width = 3; 00837 00838 for (index = inT16 (ceil (kn)); index < inT16 (floor (sp)); index++) { 00839 if (all_gap_stats->pile_count (index) == 0) { 00840 if (zero_width == 0) 00841 zero_start = index; 00842 zero_width++; 00843 } 00844 else { 00845 if (zero_width >= reqd_zero_width) 00846 break; 00847 else { 00848 zero_width = 0; 00849 } 00850 } 00851 } 00852 index--; 00853 if (tosp_debug_level > 10) 00854 tprintf (" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n", 00855 reqd_zero_width, zero_width, zero_start, row->space_threshold); 00856 if ((zero_width < reqd_zero_width) || 00857 ((row->space_threshold >= zero_start) && 00858 (row->space_threshold <= index))) 00859 return; 00860 if (tosp_debug_level > 10) 00861 tprintf (" 2"); 00862 if (row->space_threshold < zero_start) { 00863 if (tosp_debug_level > 5) 00864 tprintf 00865 ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", 00866 kn, sp, zero_start, index, row->space_threshold, zero_start); 00867 row->space_threshold = zero_start; 00868 } 00869 if (row->space_threshold > index) { 00870 if (tosp_debug_level > 5) 00871 tprintf 00872 ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", 00873 kn, sp, zero_start, index, row->space_threshold, index); 00874 row->space_threshold = index; 00875 } 00876 } 00877 00878 00879 /********************************************************************** 00880 * make_prop_words 00881 * 00882 * Convert a TO_BLOCK to a BLOCK. 00883 **********************************************************************/ 00884 ROW *Textord::make_prop_words( 00885 TO_ROW *row, // row to make 00886 FCOORD rotation // for drawing 00887 ) { 00888 BOOL8 bol; //start of line 00889 /* prev_ values are for start of word being built. non prev_ values are for 00890 the gap between the word being built and the next one. */ 00891 BOOL8 prev_fuzzy_sp; //probably space 00892 BOOL8 prev_fuzzy_non; //probably not 00893 uinT8 prev_blanks; //in front of word 00894 BOOL8 fuzzy_sp; //probably space 00895 BOOL8 fuzzy_non; //probably not 00896 uinT8 blanks; //in front of word 00897 BOOL8 prev_gap_was_a_space = FALSE; 00898 BOOL8 break_at_next_gap = FALSE; 00899 ROW *real_row; //output row 00900 C_OUTLINE_IT cout_it; 00901 C_BLOB_LIST cblobs; 00902 C_BLOB_IT cblob_it = &cblobs; 00903 WERD_LIST words; 00904 WERD_IT word_it; //new words 00905 WERD *word; //new word 00906 WERD_IT rep_char_it; //repeated char words 00907 inT32 next_rep_char_word_right = MAX_INT32; 00908 float repetition_spacing; //gap between repetitions 00909 inT32 xstarts[2]; //row ends 00910 inT32 prev_x; //end of prev blob 00911 BLOBNBOX *bblob; //current blob 00912 TBOX blob_box; //bounding box 00913 BLOBNBOX_IT box_it; //iterator 00914 TBOX prev_blob_box; 00915 TBOX next_blob_box; 00916 inT16 prev_gap = MAX_INT16; 00917 inT16 current_gap = MAX_INT16; 00918 inT16 next_gap = MAX_INT16; 00919 inT16 prev_within_xht_gap = MAX_INT16; 00920 inT16 current_within_xht_gap = MAX_INT16; 00921 inT16 next_within_xht_gap = MAX_INT16; 00922 inT16 word_count = 0; 00923 00924 rep_char_it.set_to_list (&(row->rep_words)); 00925 if (!rep_char_it.empty ()) { 00926 next_rep_char_word_right = 00927 rep_char_it.data ()->bounding_box ().right (); 00928 } 00929 00930 prev_x = -MAX_INT16; 00931 cblob_it.set_to_list (&cblobs); 00932 box_it.set_to_list (row->blob_list ()); 00933 word_it.set_to_list (&words); 00934 bol = TRUE; 00935 prev_blanks = 0; 00936 prev_fuzzy_sp = FALSE; 00937 prev_fuzzy_non = FALSE; 00938 if (!box_it.empty ()) { 00939 xstarts[0] = box_it.data ()->bounding_box ().left (); 00940 if (xstarts[0] > next_rep_char_word_right) { 00941 /* We need to insert a repeated char word at the start of the row */ 00942 word = rep_char_it.extract (); 00943 word_it.add_after_then_move (word); 00944 /* Set spaces before repeated char word */ 00945 word->set_flag (W_BOL, TRUE); 00946 bol = FALSE; 00947 word->set_blanks (0); 00948 //NO uncertainty 00949 word->set_flag (W_FUZZY_SP, FALSE); 00950 word->set_flag (W_FUZZY_NON, FALSE); 00951 xstarts[0] = word->bounding_box ().left (); 00952 /* Set spaces after repeated char word (and leave current word set) */ 00953 repetition_spacing = find_mean_blob_spacing (word); 00954 current_gap = box_it.data ()->bounding_box ().left () - 00955 next_rep_char_word_right; 00956 current_within_xht_gap = current_gap; 00957 if (current_gap > tosp_rep_space * repetition_spacing) { 00958 prev_blanks = (uinT8) floor (current_gap / row->space_size); 00959 if (prev_blanks < 1) 00960 prev_blanks = 1; 00961 } 00962 else 00963 prev_blanks = 0; 00964 if (tosp_debug_level > 5) 00965 tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ", 00966 box_it.data ()->bounding_box ().left (), 00967 box_it.data ()->bounding_box ().bottom (), 00968 repetition_spacing, current_gap); 00969 prev_fuzzy_sp = FALSE; 00970 prev_fuzzy_non = FALSE; 00971 if (rep_char_it.empty ()) { 00972 next_rep_char_word_right = MAX_INT32; 00973 } 00974 else { 00975 rep_char_it.forward (); 00976 next_rep_char_word_right = 00977 rep_char_it.data ()->bounding_box ().right (); 00978 } 00979 } 00980 00981 peek_at_next_gap(row, 00982 box_it, 00983 next_blob_box, 00984 next_gap, 00985 next_within_xht_gap); 00986 do { 00987 bblob = box_it.data (); 00988 blob_box = bblob->bounding_box (); 00989 if (bblob->joined_to_prev ()) { 00990 if (bblob->cblob () != NULL) { 00991 cout_it.set_to_list (cblob_it.data ()->out_list ()); 00992 cout_it.move_to_last (); 00993 cout_it.add_list_after (bblob->cblob ()->out_list ()); 00994 delete bblob->cblob (); 00995 } 00996 } else { 00997 if (bblob->cblob() != NULL) 00998 cblob_it.add_after_then_move (bblob->cblob ()); 00999 prev_x = blob_box.right (); 01000 } 01001 box_it.forward (); //next one 01002 bblob = box_it.data (); 01003 blob_box = bblob->bounding_box (); 01004 01005 if (!bblob->joined_to_prev() && bblob->cblob() != NULL) { 01006 /* Real Blob - not multiple outlines or pre-chopped */ 01007 prev_gap = current_gap; 01008 prev_within_xht_gap = current_within_xht_gap; 01009 prev_blob_box = next_blob_box; 01010 current_gap = next_gap; 01011 current_within_xht_gap = next_within_xht_gap; 01012 peek_at_next_gap(row, 01013 box_it, 01014 next_blob_box, 01015 next_gap, 01016 next_within_xht_gap); 01017 01018 inT16 prev_gap_arg = prev_gap; 01019 inT16 next_gap_arg = next_gap; 01020 if (tosp_only_use_xht_gaps) { 01021 prev_gap_arg = prev_within_xht_gap; 01022 next_gap_arg = next_within_xht_gap; 01023 } 01024 // Decide if a word-break should be inserted 01025 if (blob_box.left () > next_rep_char_word_right || 01026 make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, 01027 current_gap, current_within_xht_gap, 01028 next_blob_box, next_gap_arg, 01029 blanks, fuzzy_sp, fuzzy_non, 01030 prev_gap_was_a_space, 01031 break_at_next_gap) || 01032 box_it.at_first()) { 01033 /* Form a new word out of the blobs collected */ 01034 word = new WERD (&cblobs, prev_blanks, NULL); 01035 word_count++; 01036 word_it.add_after_then_move (word); 01037 if (bol) { 01038 word->set_flag (W_BOL, TRUE); 01039 bol = FALSE; 01040 } 01041 if (prev_fuzzy_sp) 01042 //probably space 01043 word->set_flag (W_FUZZY_SP, TRUE); 01044 else if (prev_fuzzy_non) 01045 word->set_flag (W_FUZZY_NON, TRUE); 01046 //probably not 01047 01048 if (blob_box.left () > next_rep_char_word_right) { 01049 /* We need to insert a repeated char word */ 01050 word = rep_char_it.extract (); 01051 word_it.add_after_then_move (word); 01052 01053 /* Set spaces before repeated char word */ 01054 repetition_spacing = find_mean_blob_spacing (word); 01055 current_gap = word->bounding_box ().left () - prev_x; 01056 current_within_xht_gap = current_gap; 01057 if (current_gap > tosp_rep_space * repetition_spacing) { 01058 blanks = 01059 (uinT8) floor (current_gap / row->space_size); 01060 if (blanks < 1) 01061 blanks = 1; 01062 } 01063 else 01064 blanks = 0; 01065 if (tosp_debug_level > 5) 01066 tprintf 01067 ("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);", 01068 word->bounding_box ().left (), 01069 word->bounding_box ().bottom (), 01070 repetition_spacing, current_gap, blanks); 01071 word->set_blanks (blanks); 01072 //NO uncertainty 01073 word->set_flag (W_FUZZY_SP, FALSE); 01074 word->set_flag (W_FUZZY_NON, FALSE); 01075 01076 /* Set spaces after repeated char word (and leave current word set) */ 01077 current_gap = 01078 blob_box.left () - next_rep_char_word_right; 01079 if (current_gap > tosp_rep_space * repetition_spacing) { 01080 blanks = (uinT8) (current_gap / row->space_size); 01081 if (blanks < 1) 01082 blanks = 1; 01083 } 01084 else 01085 blanks = 0; 01086 if (tosp_debug_level > 5) 01087 tprintf (" Rgap:%d (%d blanks)\n", 01088 current_gap, blanks); 01089 fuzzy_sp = FALSE; 01090 fuzzy_non = FALSE; 01091 01092 if (rep_char_it.empty ()) { 01093 next_rep_char_word_right = MAX_INT32; 01094 } 01095 else { 01096 rep_char_it.forward (); 01097 next_rep_char_word_right = 01098 rep_char_it.data ()->bounding_box ().right (); 01099 } 01100 } 01101 01102 if (box_it.at_first () && rep_char_it.empty ()) { 01103 //at end of line 01104 word->set_flag (W_EOL, TRUE); 01105 xstarts[1] = prev_x; 01106 } 01107 else { 01108 prev_blanks = blanks; 01109 prev_fuzzy_sp = fuzzy_sp; 01110 prev_fuzzy_non = fuzzy_non; 01111 } 01112 } 01113 } 01114 } 01115 while (!box_it.at_first ()); //until back at start 01116 01117 /* Insert any further repeated char words */ 01118 while (!rep_char_it.empty ()) { 01119 word = rep_char_it.extract (); 01120 word_it.add_after_then_move (word); 01121 01122 /* Set spaces before repeated char word */ 01123 repetition_spacing = find_mean_blob_spacing (word); 01124 current_gap = word->bounding_box ().left () - prev_x; 01125 if (current_gap > tosp_rep_space * repetition_spacing) { 01126 blanks = (uinT8) floor (current_gap / row->space_size); 01127 if (blanks < 1) 01128 blanks = 1; 01129 } 01130 else 01131 blanks = 0; 01132 if (tosp_debug_level > 5) 01133 tprintf 01134 ("Repch wd at EOL (%d,%d). rep spacing %d; Lgap:%d (%d blanks)\n", 01135 word->bounding_box ().left (), word->bounding_box ().bottom (), 01136 repetition_spacing, current_gap, blanks); 01137 word->set_blanks (blanks); 01138 //NO uncertainty 01139 word->set_flag (W_FUZZY_SP, FALSE); 01140 word->set_flag (W_FUZZY_NON, FALSE); 01141 prev_x = word->bounding_box ().right (); 01142 if (rep_char_it.empty ()) { 01143 //at end of line 01144 word->set_flag (W_EOL, TRUE); 01145 xstarts[1] = prev_x; 01146 } 01147 else { 01148 rep_char_it.forward (); 01149 } 01150 } 01151 real_row = new ROW (row, 01152 (inT16) row->kern_size, (inT16) row->space_size); 01153 word_it.set_to_list (real_row->word_list ()); 01154 //put words in row 01155 word_it.add_list_after (&words); 01156 real_row->recalc_bounding_box (); 01157 01158 if (tosp_debug_level > 4) { 01159 tprintf ("Row: Made %d words in row ((%d,%d)(%d,%d))\n", 01160 word_count, 01161 real_row->bounding_box ().left (), 01162 real_row->bounding_box ().bottom (), 01163 real_row->bounding_box ().right (), 01164 real_row->bounding_box ().top ()); 01165 } 01166 return real_row; 01167 } 01168 return NULL; 01169 } 01170 01171 /********************************************************************** 01172 * make_blob_words 01173 * 01174 * Converts words into blobs so that each blob is a single character. 01175 * Used for chopper test. 01176 **********************************************************************/ 01177 ROW *Textord::make_blob_words( 01178 TO_ROW *row, // row to make 01179 FCOORD rotation // for drawing 01180 ) { 01181 bool bol; // start of line 01182 ROW *real_row; // output row 01183 C_OUTLINE_IT cout_it; 01184 C_BLOB_LIST cblobs; 01185 C_BLOB_IT cblob_it = &cblobs; 01186 WERD_LIST words; 01187 WERD_IT word_it; // new words 01188 WERD *word; // new word 01189 BLOBNBOX *bblob; // current blob 01190 TBOX blob_box; // bounding box 01191 BLOBNBOX_IT box_it; // iterator 01192 inT16 word_count = 0; 01193 01194 cblob_it.set_to_list(&cblobs); 01195 box_it.set_to_list(row->blob_list()); 01196 word_it.set_to_list(&words); 01197 bol = TRUE; 01198 if (!box_it.empty()) { 01199 01200 do { 01201 bblob = box_it.data(); 01202 blob_box = bblob->bounding_box(); 01203 if (bblob->joined_to_prev()) { 01204 if (bblob->cblob() != NULL) { 01205 cout_it.set_to_list(cblob_it.data()->out_list()); 01206 cout_it.move_to_last(); 01207 cout_it.add_list_after(bblob->cblob()->out_list()); 01208 delete bblob->cblob(); 01209 } 01210 } else { 01211 if (bblob->cblob() != NULL) 01212 cblob_it.add_after_then_move(bblob->cblob()); 01213 } 01214 box_it.forward(); // next one 01215 bblob = box_it.data(); 01216 blob_box = bblob->bounding_box(); 01217 01218 if (!bblob->joined_to_prev() && !cblobs.empty()) { 01219 word = new WERD(&cblobs, 1, NULL); 01220 word_count++; 01221 word_it.add_after_then_move(word); 01222 if (bol) { 01223 word->set_flag(W_BOL, TRUE); 01224 bol = FALSE; 01225 } 01226 if (box_it.at_first()) { // at end of line 01227 word->set_flag(W_EOL, TRUE); 01228 } 01229 } 01230 } 01231 while (!box_it.at_first()); // until back at start 01232 /* Setup the row with created words. */ 01233 real_row = new ROW(row, (inT16) row->kern_size, (inT16) row->space_size); 01234 word_it.set_to_list(real_row->word_list()); 01235 //put words in row 01236 word_it.add_list_after(&words); 01237 real_row->recalc_bounding_box(); 01238 if (tosp_debug_level > 4) { 01239 tprintf ("Row:Made %d words in row ((%d,%d)(%d,%d))\n", 01240 word_count, 01241 real_row->bounding_box().left(), 01242 real_row->bounding_box().bottom(), 01243 real_row->bounding_box().right(), 01244 real_row->bounding_box().top()); 01245 } 01246 return real_row; 01247 } 01248 return NULL; 01249 } 01250 01251 BOOL8 Textord::make_a_word_break( 01252 TO_ROW *row, // row being made 01253 TBOX blob_box, // for next_blob // how many blanks? 01254 inT16 prev_gap, 01255 TBOX prev_blob_box, 01256 inT16 real_current_gap, 01257 inT16 within_xht_current_gap, 01258 TBOX next_blob_box, 01259 inT16 next_gap, 01260 uinT8 &blanks, 01261 BOOL8 &fuzzy_sp, 01262 BOOL8 &fuzzy_non, 01263 BOOL8& prev_gap_was_a_space, 01264 BOOL8& break_at_next_gap) { 01265 BOOL8 space; 01266 inT16 current_gap; 01267 float fuzzy_sp_to_kn_limit; 01268 01269 if (break_at_next_gap) { 01270 break_at_next_gap = FALSE; 01271 return TRUE; 01272 } 01273 /* Inhibit using the reduced gap if 01274 The kerning is large - chars are not kerned and reducing "f"s can cause 01275 erroneous blanks 01276 OR The real gap is less than 0 01277 OR The real gap is less than the kerning estimate 01278 */ 01279 if ((row->kern_size > tosp_large_kerning * row->xheight) || 01280 ((tosp_dont_fool_with_small_kerns >= 0) && 01281 (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) 01282 //Ignore the difference 01283 within_xht_current_gap = real_current_gap; 01284 01285 if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) 01286 current_gap = within_xht_current_gap; 01287 else 01288 current_gap = real_current_gap; 01289 01290 if (tosp_old_to_method) { 01291 //Boring old method 01292 space = current_gap > row->max_nonspace; 01293 if (space && (current_gap < MAX_INT16)) { 01294 if (current_gap < row->min_space) { 01295 if (current_gap > row->space_threshold) { 01296 blanks = 1; 01297 fuzzy_sp = TRUE; 01298 fuzzy_non = FALSE; 01299 } 01300 else { 01301 blanks = 0; 01302 fuzzy_sp = FALSE; 01303 fuzzy_non = TRUE; 01304 } 01305 } 01306 else { 01307 blanks = (uinT8) (current_gap / row->space_size); 01308 if (blanks < 1) 01309 blanks = 1; 01310 fuzzy_sp = FALSE; 01311 fuzzy_non = FALSE; 01312 } 01313 } 01314 return space; 01315 } 01316 else { 01317 /* New exciting heuristic method */ 01318 if (prev_blob_box.null_box ()) // Beginning of row 01319 prev_gap_was_a_space = TRUE; 01320 01321 //Default as old TO 01322 space = current_gap > row->space_threshold; 01323 01324 /* Set defaults for the word break incase we find one. Currently there are 01325 no fuzzy spaces. Depending on the reliability of the different heuristics 01326 we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY 01327 be used if the function returns TRUE - ie the word is to be broken. 01328 */ 01329 blanks = (uinT8) (current_gap / row->space_size); 01330 if (blanks < 1) 01331 blanks = 1; 01332 fuzzy_sp = FALSE; 01333 fuzzy_non = FALSE; 01334 /* 01335 If xht measure causes gap to flip one of the 3 thresholds act accordingly - 01336 despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to 01337 context. 01338 */ 01339 if (tosp_use_xht_gaps && 01340 (real_current_gap <= row->max_nonspace) && 01341 (within_xht_current_gap > row->max_nonspace)) { 01342 space = TRUE; 01343 fuzzy_non = TRUE; 01344 #ifndef GRAPHICS_DISABLED 01345 mark_gap (blob_box, 20, 01346 prev_gap, prev_blob_box.width (), 01347 current_gap, next_blob_box.width (), next_gap); 01348 #endif 01349 } 01350 else if (tosp_use_xht_gaps && 01351 (real_current_gap <= row->space_threshold) && 01352 (within_xht_current_gap > row->space_threshold)) { 01353 space = TRUE; 01354 if (tosp_flip_fuzz_kn_to_sp) 01355 fuzzy_sp = TRUE; 01356 else 01357 fuzzy_non = TRUE; 01358 #ifndef GRAPHICS_DISABLED 01359 mark_gap (blob_box, 21, 01360 prev_gap, prev_blob_box.width (), 01361 current_gap, next_blob_box.width (), next_gap); 01362 #endif 01363 } 01364 else if (tosp_use_xht_gaps && 01365 (real_current_gap < row->min_space) && 01366 (within_xht_current_gap >= row->min_space)) { 01367 space = TRUE; 01368 #ifndef GRAPHICS_DISABLED 01369 mark_gap (blob_box, 22, 01370 prev_gap, prev_blob_box.width (), 01371 current_gap, next_blob_box.width (), next_gap); 01372 #endif 01373 } 01374 else if (tosp_force_wordbreak_on_punct && 01375 !suspected_punct_blob(row, prev_blob_box) && 01376 suspected_punct_blob(row, blob_box)) { 01377 break_at_next_gap = TRUE; 01378 } 01379 /* Now continue with normal heuristics */ 01380 else if ((current_gap < row->min_space) && 01381 (current_gap > row->space_threshold)) { 01382 /* Heuristics to turn dubious spaces to kerns */ 01383 if (tosp_pass_wide_fuzz_sp_to_context > 0) 01384 fuzzy_sp_to_kn_limit = row->kern_size + 01385 tosp_pass_wide_fuzz_sp_to_context * 01386 (row->space_size - row->kern_size); 01387 else 01388 fuzzy_sp_to_kn_limit = 99999.0f; 01389 01390 /* If current gap is significantly smaller than the previous space the other 01391 side of a narrow blob then this gap is a kern. */ 01392 if ((prev_blob_box.width () > 0) && 01393 narrow_blob (row, prev_blob_box) && 01394 prev_gap_was_a_space && 01395 (current_gap <= tosp_gap_factor * prev_gap)) { 01396 if ((tosp_all_flips_fuzzy) || 01397 (current_gap > fuzzy_sp_to_kn_limit)) { 01398 if (tosp_flip_fuzz_sp_to_kn) 01399 fuzzy_non = TRUE; 01400 else 01401 fuzzy_sp = TRUE; 01402 } 01403 else 01404 space = FALSE; 01405 #ifndef GRAPHICS_DISABLED 01406 mark_gap (blob_box, 1, 01407 prev_gap, prev_blob_box.width (), 01408 current_gap, next_blob_box.width (), next_gap); 01409 #endif 01410 } 01411 /* If current gap not much bigger than the previous kern the other side of a 01412 narrow blob then this gap is a kern as well */ 01413 else if ((prev_blob_box.width () > 0) && 01414 narrow_blob (row, prev_blob_box) && 01415 !prev_gap_was_a_space && 01416 (current_gap * tosp_gap_factor <= prev_gap)) { 01417 if ((tosp_all_flips_fuzzy) || 01418 (current_gap > fuzzy_sp_to_kn_limit)) { 01419 if (tosp_flip_fuzz_sp_to_kn) 01420 fuzzy_non = TRUE; 01421 else 01422 fuzzy_sp = TRUE; 01423 } 01424 else 01425 space = FALSE; 01426 #ifndef GRAPHICS_DISABLED 01427 mark_gap (blob_box, 2, 01428 prev_gap, prev_blob_box.width (), 01429 current_gap, next_blob_box.width (), next_gap); 01430 #endif 01431 } 01432 else if ((next_blob_box.width () > 0) && 01433 narrow_blob (row, next_blob_box) && 01434 (next_gap > row->space_threshold) && 01435 (current_gap <= tosp_gap_factor * next_gap)) { 01436 if ((tosp_all_flips_fuzzy) || 01437 (current_gap > fuzzy_sp_to_kn_limit)) { 01438 if (tosp_flip_fuzz_sp_to_kn) 01439 fuzzy_non = TRUE; 01440 else 01441 fuzzy_sp = TRUE; 01442 } 01443 else 01444 space = FALSE; 01445 #ifndef GRAPHICS_DISABLED 01446 mark_gap (blob_box, 3, 01447 prev_gap, prev_blob_box.width (), 01448 current_gap, next_blob_box.width (), next_gap); 01449 #endif 01450 } 01451 else if ((next_blob_box.width () > 0) && 01452 narrow_blob (row, next_blob_box) && 01453 (next_gap <= row->space_threshold) && 01454 (current_gap * tosp_gap_factor <= next_gap)) { 01455 if ((tosp_all_flips_fuzzy) || 01456 (current_gap > fuzzy_sp_to_kn_limit)) { 01457 if (tosp_flip_fuzz_sp_to_kn) 01458 fuzzy_non = TRUE; 01459 else 01460 fuzzy_sp = TRUE; 01461 } 01462 else 01463 space = FALSE; 01464 #ifndef GRAPHICS_DISABLED 01465 mark_gap (blob_box, 4, 01466 prev_gap, prev_blob_box.width (), 01467 current_gap, next_blob_box.width (), next_gap); 01468 #endif 01469 } 01470 else if ((((next_blob_box.width () > 0) && 01471 narrow_blob (row, next_blob_box)) || 01472 ((prev_blob_box.width () > 0) && 01473 narrow_blob (row, prev_blob_box)))) { 01474 fuzzy_sp = TRUE; 01475 #ifndef GRAPHICS_DISABLED 01476 mark_gap (blob_box, 6, 01477 prev_gap, prev_blob_box.width (), 01478 current_gap, next_blob_box.width (), next_gap); 01479 #endif 01480 } 01481 } 01482 else if ((current_gap > row->max_nonspace) && 01483 (current_gap <= row->space_threshold)) { 01484 01485 /* Heuristics to turn dubious kerns to spaces */ 01486 /* TRIED THIS BUT IT MADE THINGS WORSE 01487 if ( prev_gap == MAX_INT16 ) 01488 prev_gap = 0; // start of row 01489 if ( next_gap == MAX_INT16 ) 01490 next_gap = 0; // end of row 01491 */ 01492 if ((prev_blob_box.width () > 0) && 01493 (next_blob_box.width () > 0) && 01494 (current_gap >= 01495 tosp_kern_gap_factor1 * MAX (prev_gap, next_gap)) && 01496 wide_blob (row, prev_blob_box) && 01497 wide_blob (row, next_blob_box)) { 01498 01499 space = TRUE; 01500 /* 01501 tosp_flip_caution is an attempt to stop the default changing in cases 01502 where there is a large difference between the kern and space estimates. 01503 See problem in 'chiefs' where "have" gets split in the quotation. 01504 */ 01505 if ((tosp_flip_fuzz_kn_to_sp) && 01506 ((tosp_flip_caution <= 0) || 01507 (tosp_flip_caution * row->kern_size > row->space_size))) 01508 fuzzy_sp = TRUE; 01509 else 01510 fuzzy_non = TRUE; 01511 #ifndef GRAPHICS_DISABLED 01512 mark_gap (blob_box, 7, 01513 prev_gap, prev_blob_box.width (), 01514 current_gap, next_blob_box.width (), next_gap); 01515 #endif 01516 } else if (prev_blob_box.width() > 0 && 01517 next_blob_box.width() > 0 && 01518 current_gap > 5 && // Rule 9 handles small gap, big ratio. 01519 current_gap >= 01520 tosp_kern_gap_factor2 * MAX(prev_gap, next_gap) && 01521 !(narrow_blob(row, prev_blob_box) || 01522 suspected_punct_blob(row, prev_blob_box)) && 01523 !(narrow_blob(row, next_blob_box) || 01524 suspected_punct_blob(row, next_blob_box))) { 01525 space = TRUE; 01526 fuzzy_non = TRUE; 01527 #ifndef GRAPHICS_DISABLED 01528 mark_gap (blob_box, 8, 01529 prev_gap, prev_blob_box.width (), 01530 current_gap, next_blob_box.width (), next_gap); 01531 #endif 01532 } 01533 else if ((tosp_kern_gap_factor3 > 0) && 01534 (prev_blob_box.width () > 0) && 01535 (next_blob_box.width () > 0) && 01536 (current_gap >= tosp_kern_gap_factor3 * MAX (prev_gap, next_gap)) && 01537 (!tosp_rule_9_test_punct || 01538 (!suspected_punct_blob (row, prev_blob_box) && 01539 !suspected_punct_blob (row, next_blob_box)))) { 01540 space = TRUE; 01541 fuzzy_non = TRUE; 01542 #ifndef GRAPHICS_DISABLED 01543 mark_gap (blob_box, 9, 01544 prev_gap, prev_blob_box.width (), 01545 current_gap, next_blob_box.width (), next_gap); 01546 #endif 01547 } 01548 } 01549 if (tosp_debug_level > 10) 01550 tprintf("word break = %d current_gap = %d, prev_gap = %d, " 01551 "next_gap = %d\n", space ? 1 : 0, current_gap, 01552 prev_gap, next_gap); 01553 prev_gap_was_a_space = space && !(fuzzy_non); 01554 return space; 01555 } 01556 } 01557 01558 BOOL8 Textord::narrow_blob(TO_ROW *row, TBOX blob_box) { 01559 BOOL8 result; 01560 result = ((blob_box.width () <= tosp_narrow_fraction * row->xheight) || 01561 (((float) blob_box.width () / blob_box.height ()) <= 01562 tosp_narrow_aspect_ratio)); 01563 return result; 01564 } 01565 01566 BOOL8 Textord::wide_blob(TO_ROW *row, TBOX blob_box) { 01567 BOOL8 result; 01568 if (tosp_wide_fraction > 0) { 01569 if (tosp_wide_aspect_ratio > 0) 01570 result = ((blob_box.width () >= tosp_wide_fraction * row->xheight) && 01571 (((float) blob_box.width () / blob_box.height ()) > 01572 tosp_wide_aspect_ratio)); 01573 else 01574 result = (blob_box.width () >= tosp_wide_fraction * row->xheight); 01575 } 01576 else 01577 result = !narrow_blob (row, blob_box); 01578 return result; 01579 } 01580 01581 BOOL8 Textord::suspected_punct_blob(TO_ROW *row, TBOX box) { 01582 BOOL8 result; 01583 float baseline; 01584 float blob_x_centre; 01585 /* Find baseline of centre of blob */ 01586 blob_x_centre = (box.right () + box.left ()) / 2.0; 01587 baseline = row->baseline.y (blob_x_centre); 01588 01589 result = (box.height () <= 0.66 * row->xheight) || 01590 (box.top () < baseline + row->xheight / 2.0) || 01591 (box.bottom () > baseline + row->xheight / 2.0); 01592 return result; 01593 } 01594 01595 01596 void Textord::peek_at_next_gap(TO_ROW *row, 01597 BLOBNBOX_IT box_it, 01598 TBOX &next_blob_box, 01599 inT16 &next_gap, 01600 inT16 &next_within_xht_gap) { 01601 TBOX next_reduced_blob_box; 01602 TBOX bit_beyond; 01603 BLOBNBOX_IT reduced_box_it = box_it; 01604 01605 next_blob_box = box_next (&box_it); 01606 next_reduced_blob_box = reduced_box_next (row, &reduced_box_it); 01607 if (box_it.at_first ()) { 01608 next_gap = MAX_INT16; 01609 next_within_xht_gap = MAX_INT16; 01610 } 01611 else { 01612 bit_beyond = box_it.data ()->bounding_box (); 01613 next_gap = bit_beyond.left () - next_blob_box.right (); 01614 bit_beyond = reduced_box_next (row, &reduced_box_it); 01615 next_within_xht_gap = 01616 bit_beyond.left () - next_reduced_blob_box.right (); 01617 } 01618 } 01619 01620 01621 #ifndef GRAPHICS_DISABLED 01622 void Textord::mark_gap( 01623 TBOX blob, // blob following gap 01624 inT16 rule, // heuristic id 01625 inT16 prev_gap, 01626 inT16 prev_blob_width, 01627 inT16 current_gap, 01628 inT16 next_blob_width, 01629 inT16 next_gap) { 01630 ScrollView::Color col; //of ellipse marking flipped gap 01631 01632 switch (rule) { 01633 case 1: 01634 col = ScrollView::RED; 01635 break; 01636 case 2: 01637 col = ScrollView::CYAN; 01638 break; 01639 case 3: 01640 col = ScrollView::GREEN; 01641 break; 01642 case 4: 01643 col = ScrollView::BLACK; 01644 break; 01645 case 5: 01646 col = ScrollView::MAGENTA; 01647 break; 01648 case 6: 01649 col = ScrollView::BLUE; 01650 break; 01651 01652 case 7: 01653 col = ScrollView::WHITE; 01654 break; 01655 case 8: 01656 col = ScrollView::YELLOW; 01657 break; 01658 case 9: 01659 col = ScrollView::BLACK; 01660 break; 01661 01662 case 20: 01663 col = ScrollView::CYAN; 01664 break; 01665 case 21: 01666 col = ScrollView::GREEN; 01667 break; 01668 case 22: 01669 col = ScrollView::MAGENTA; 01670 break; 01671 default: 01672 col = ScrollView::BLACK; 01673 } 01674 if (textord_show_initial_words) { 01675 to_win->Pen(col); 01676 /* if (rule < 20) 01677 //interior_style(to_win, INT_SOLID, FALSE); 01678 else 01679 //interior_style(to_win, INT_HOLLOW, TRUE);*/ 01680 //x radius 01681 to_win->Ellipse (current_gap / 2.0f, 01682 blob.height () / 2.0f, //y radius 01683 //x centre 01684 blob.left () - current_gap / 2.0f, 01685 //y centre 01686 blob.bottom () + blob.height () / 2.0f); 01687 } 01688 if (tosp_debug_level > 5) 01689 tprintf (" (%d,%d) Sp<->Kn Rule %d %d %d %d %d\n", 01690 blob.left () - current_gap / 2, blob.bottom (), rule, 01691 prev_gap, prev_blob_width, current_gap, 01692 next_blob_width, next_gap); 01693 } 01694 #endif 01695 01696 float Textord::find_mean_blob_spacing(WERD *word) { 01697 C_BLOB_IT cblob_it; 01698 TBOX blob_box; 01699 inT32 gap_sum = 0; 01700 inT16 gap_count = 0; 01701 inT16 prev_right; 01702 01703 cblob_it.set_to_list (word->cblob_list ()); 01704 if (!cblob_it.empty ()) { 01705 cblob_it.mark_cycle_pt (); 01706 prev_right = cblob_it.data ()->bounding_box ().right (); 01707 //first blob 01708 cblob_it.forward (); 01709 for (; !cblob_it.cycled_list (); cblob_it.forward ()) { 01710 blob_box = cblob_it.data ()->bounding_box (); 01711 gap_sum += blob_box.left () - prev_right; 01712 gap_count++; 01713 prev_right = blob_box.right (); 01714 } 01715 } 01716 if (gap_count > 0) 01717 return (gap_sum / (float) gap_count); 01718 else 01719 return 0.0f; 01720 } 01721 01722 01723 BOOL8 Textord::ignore_big_gap(TO_ROW *row, 01724 inT32 row_length, 01725 GAPMAP *gapmap, 01726 inT16 left, 01727 inT16 right) { 01728 inT16 gap = right - left + 1; 01729 01730 if (tosp_ignore_big_gaps > 999) 01731 return FALSE; //Dont ignore 01732 if (tosp_ignore_big_gaps > 0) 01733 return (gap > tosp_ignore_big_gaps * row->xheight); 01734 if (gap > tosp_ignore_very_big_gaps * row->xheight) 01735 return TRUE; 01736 if (tosp_ignore_big_gaps == 0) { 01737 if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight)) 01738 return TRUE; 01739 if ((gap > 1.75 * row->xheight) && 01740 ((row_length > 35 * row->xheight) || 01741 gapmap->table_gap (left, right))) 01742 return TRUE; 01743 } 01744 else { 01745 /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table */ 01746 if ((gap > gapmap_big_gaps * row->xheight) && 01747 gapmap->table_gap (left, right)) 01748 return TRUE; 01749 } 01750 return FALSE; 01751 } 01752 01753 01754 /********************************************************************** 01755 * reduced_box_next 01756 * 01757 * Compute the bounding box of this blob with merging of x overlaps 01758 * but no pre-chopping. 01759 * Then move the iterator on to the start of the next blob. 01760 * DONT reduce the box for small things - eg punctuation. 01761 **********************************************************************/ 01762 TBOX Textord::reduced_box_next( 01763 TO_ROW *row, // current row 01764 BLOBNBOX_IT *it // iterator to blobds 01765 ) { 01766 BLOBNBOX *blob; //current blob 01767 BLOBNBOX *head_blob; //place to store box 01768 TBOX full_box; //full blob boundg box 01769 TBOX reduced_box; //box of significant part 01770 inT16 left_above_xht; //ABOVE xht left limit 01771 inT16 new_left_above_xht; //ABOVE xht left limit 01772 01773 blob = it->data (); 01774 if (blob->red_box_set ()) { 01775 reduced_box = blob->reduced_box (); 01776 do { 01777 it->forward(); 01778 blob = it->data(); 01779 } 01780 while (blob->cblob() == NULL || blob->joined_to_prev()); 01781 return reduced_box; 01782 } 01783 head_blob = blob; 01784 full_box = blob->bounding_box (); 01785 reduced_box = reduced_box_for_blob (blob, row, &left_above_xht); 01786 do { 01787 it->forward (); 01788 blob = it->data (); 01789 if (blob->cblob() == NULL) 01790 //was pre-chopped 01791 full_box += blob->bounding_box (); 01792 else if (blob->joined_to_prev ()) { 01793 reduced_box += 01794 reduced_box_for_blob(blob, row, &new_left_above_xht); 01795 left_above_xht = MIN (left_above_xht, new_left_above_xht); 01796 } 01797 } 01798 //until next real blob 01799 while (blob->cblob() == NULL || blob->joined_to_prev()); 01800 01801 if ((reduced_box.width () > 0) && 01802 ((reduced_box.left () + tosp_near_lh_edge * reduced_box.width ()) 01803 < left_above_xht) && (reduced_box.height () > 0.7 * row->xheight)) { 01804 #ifndef GRAPHICS_DISABLED 01805 if (textord_show_initial_words) 01806 reduced_box.plot (to_win, ScrollView::YELLOW, ScrollView::YELLOW); 01807 #endif 01808 } 01809 else 01810 reduced_box = full_box; 01811 head_blob->set_reduced_box (reduced_box); 01812 return reduced_box; 01813 } 01814 01815 01816 /************************************************************************* 01817 * reduced_box_for_blob() 01818 * Find box for blob which is the same height and y position as the whole blob, 01819 * but whose left limit is the left most position of the blob ABOVE the 01820 * baseline and whose right limit is the right most position of the blob BELOW 01821 * the xheight. 01822 * 01823 * 01824 * !!!!!!! WONT WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on 01825 * "home". Perhaps we need something which say if the width ABOVE the 01826 * xht alone includes the whole of the reduced width, then use the full 01827 * blob box - Might still fail on italic F 01828 * 01829 * Alternatively we could be a little less severe and only reduce the 01830 * left and right edges by half the difference between the full box and 01831 * the reduced box. 01832 * 01833 * NOTE that we need to rotate all the coordinates as 01834 * find_blob_limits finds the y min and max within a specified x band 01835 *************************************************************************/ 01836 TBOX Textord::reduced_box_for_blob( 01837 BLOBNBOX *blob, 01838 TO_ROW *row, 01839 inT16 *left_above_xht) { 01840 float baseline; 01841 float blob_x_centre; 01842 float left_limit; 01843 float right_limit; 01844 float junk; 01845 TBOX blob_box; 01846 01847 /* Find baseline of centre of blob */ 01848 01849 blob_box = blob->bounding_box (); 01850 blob_x_centre = (blob_box.left () + blob_box.right ()) / 2.0; 01851 baseline = row->baseline.y (blob_x_centre); 01852 01853 /* 01854 Find LH limit of blob ABOVE the xht. This is so that we can detect certain 01855 caps ht chars which should NOT have their box reduced: T, Y, V, W etc 01856 */ 01857 left_limit = (float) MAX_INT32; 01858 junk = (float) -MAX_INT32; 01859 find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight), 01860 static_cast<float>(MAX_INT16), left_limit, junk); 01861 if (left_limit > junk) 01862 *left_above_xht = MAX_INT16; //No area above xht 01863 else 01864 *left_above_xht = (inT16) floor (left_limit); 01865 /* 01866 Find reduced LH limit of blob - the left extent of the region ABOVE the 01867 baseline. 01868 */ 01869 left_limit = (float) MAX_INT32; 01870 junk = (float) -MAX_INT32; 01871 find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(MAX_INT16), 01872 left_limit, junk); 01873 01874 if (left_limit > junk) 01875 return TBOX (); //no area within xht so return empty box 01876 /* 01877 Find reduced RH limit of blob - the right extent of the region BELOW the xht. 01878 */ 01879 junk = (float) MAX_INT32; 01880 right_limit = (float) -MAX_INT32; 01881 find_cblob_hlimits(blob->cblob(), static_cast<float>(-MAX_INT16), 01882 (baseline + row->xheight), junk, right_limit); 01883 if (junk > right_limit) 01884 return TBOX (); //no area within xht so return empty box 01885 01886 return TBOX (ICOORD ((inT16) floor (left_limit), blob_box.bottom ()), 01887 ICOORD ((inT16) ceil (right_limit), blob_box.top ())); 01888 } 01889 } // namespace tesseract