tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/textord/tospace.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * tospace.cpp
00003  *
00004  * Compute fuzzy word spacing thresholds for each row.
00005  * I.e. set :   max_nonspace
00006  *              space_threshold
00007  *              min_space
00008  *              kern_size
00009  *              space_size
00010  * for each row.
00011  * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
00012  *
00013  * Note: functions in this file were originally not members of any
00014  * class or enclosed by any namespace. Now they are all static members
00015  * of the Textord class.
00016  *
00017  **********************************************************************/
00018 
00019 #include "drawtord.h"
00020 #include "ndminx.h"
00021 #include "statistc.h"
00022 #include "textord.h"
00023 #include "tovars.h"
00024 
00025 // Include automatically generated configuration file if running autoconf.
00026 #ifdef HAVE_CONFIG_H
00027 #include "config_auto.h"
00028 #endif
00029 
00030 #define MAXSPACING      128      /*max expected spacing in pix */
00031 
00032 namespace tesseract {
00033 void Textord::to_spacing(
00034     ICOORD page_tr,        //topright of page (not referenced in this function)
00035     TO_BLOCK_LIST *blocks  //blocks on page
00036                          ) {
        // Compute word-spacing values for every proportional row on the page.
        // For each block in `blocks`:
        //   1. build a GAPMAP for the block,
        //   2. call block_spacing_stats() to get block-level space / non-space
        //      gap width estimates,
        //   3. call row_spacing_stats() on each row whose pitch_decision is
        //      PITCH_DEF_PROP or PITCH_CORR_PROP.
        // Rows with any other pitch decision are left untouched here (apart
        // from optional debug output and plotting).
00037   TO_BLOCK_IT block_it;          //iterator
00038   TO_BLOCK *block;               //current block;
00039   TO_ROW_IT row_it;              //row iterator
00040   TO_ROW *row;                   //current row
00041   int block_index;               //block number (1-based, debug + passed on)
00042   int row_index;                 //row number (1-based, debug + passed on)
00043   //estimated width of real spaces for whole block
00044   inT16 block_space_gap_width;
00045   //estimated width of non space gaps for whole block
00046   inT16 block_non_space_gap_width;
00047   BOOL8 old_text_ord_proportional;//old fixed/prop result
00048   GAPMAP *gapmap = NULL;          //map of big vert gaps in blk
00049 
00050   block_it.set_to_list (blocks);
00051   block_index = 1;
00052   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00053   block_it.forward ()) {
00054     block = block_it.data ();
          // One GAPMAP per block; it is deleted at the end of this iteration.
00055     gapmap = new GAPMAP (block);
          // The last three arguments are reference out-parameters that
          // block_spacing_stats() assigns on every path.
00056     block_spacing_stats(block,
00057                         gapmap,
00058                         old_text_ord_proportional,
00059                         block_space_gap_width,
00060                         block_non_space_gap_width);
00061     // Make sure relative values of block-level space and non-space gap
00062     // widths are reasonable. The ratio of 1:3 is also used in
00063     // block_spacing_stats, to correct the block_space_gap_width.
00064     // Useful for arabic and hindi, when the non-space gap width is
00065     // often over-estimated and should not be trusted. A similar ratio
00066     // is found in block_spacing_stats.
          // The cast makes this a floating-point division; the constraint is
          // applied only when both tosp_old_to_method and
          // tosp_old_to_constrain_sp_kn are set.
00067     if (tosp_old_to_method && tosp_old_to_constrain_sp_kn &&
00068         (float) block_space_gap_width / block_non_space_gap_width < 3.0) {
00069       block_non_space_gap_width = (inT16) floor (block_space_gap_width / 3.0);
00070     }
00071     row_it.set_to_list (block->get_rows ());
00072     row_index = 1;
00073     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00074       row = row_it.data ();
00075       if ((row->pitch_decision == PITCH_DEF_PROP) ||
00076       (row->pitch_decision == PITCH_CORR_PROP)) {
              // Debug only: report rows that are proportional now although the
              // block-level debug heuristic said otherwise.
00077         if ((tosp_debug_level > 0) && !old_text_ord_proportional)
00078           tprintf ("Block %d Row %d: Now Proportional\n",
00079             block_index, row_index);
00080         row_spacing_stats(row,
00081                           gapmap,
00082                           block_index,
00083                           row_index,
00084                           block_space_gap_width,
00085                           block_non_space_gap_width);
00086       }
00087       else {
              // Not a proportional row: no spacing stats are computed here,
              // only an optional debug message.
00088         if ((tosp_debug_level > 0) && old_text_ord_proportional)
00089           tprintf
00090             ("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n",
00091             block_index, row_index, row->pitch_decision,
00092             row->fixed_pitch);
00093       }
00094 #ifndef GRAPHICS_DISABLED
00095       if (textord_show_initial_words)
00096         plot_word_decisions (to_win, (inT16) row->fixed_pitch, row);
00097 #endif
00098       row_index++;
00099     }
00100     delete gapmap;
00101     block_index++;
00102   }
00103 }
00104 
00105 
00106 /*************************************************************************
00107  * block_spacing_stats()
       *
       * Estimate, for a whole block, the typical width of gaps inside words
       * (block_non_space_gap_width) and of real spaces (block_space_gap_width).
       *
       * Pass 1 walks every non-empty row (only PITCH_DEF_PROP / PITCH_CORR_PROP
       * rows when tosp_only_use_prop_rows is set), records the gap between each
       * pair of successive blob boxes that ignore_big_gap() does not reject, and
       * tracks the narrowest box seen.
       *   - With <= 1 gap sample: non-space width = narrowest box width,
       *     space width = -1 (no estimate), old_text_ord_proportional = TRUE.
       *   - Otherwise: non-space width = floor(median of all gaps).
       *
       * Pass 2 revisits the same rows and collects gaps wider than
       *   MAX(tosp_init_guess_kn_mult * non-space width,
       *       tosp_init_guess_xht_mult * row->xheight)
       * (optionally restricted to "certain" spaces). With <= 2 samples the space
       * width is -1, otherwise MAX(floor(median), 3 * non-space width).
       *
       * block, gapmap              : inputs.
       * old_text_ord_proportional  : out, debug-only fixed/prop indicator.
       * block_space_gap_width      : out, space estimate or -1.
       * block_non_space_gap_width  : out, non-space (kern) estimate.
00108  *************************************************************************/
00109 
00110 void Textord::block_spacing_stats(
00111     TO_BLOCK *block,
00112     GAPMAP *gapmap,
00113     BOOL8 &old_text_ord_proportional,
00114     inT16 &block_space_gap_width,     //resulting estimate
00115     inT16 &block_non_space_gap_width  //resulting estimate
00116                                   ) {
00117   TO_ROW_IT row_it;              //row iterator
00118   TO_ROW *row;                   //current row
00119   BLOBNBOX_IT blob_it;           //iterator
00120 
00121   STATS centre_to_centre_stats (0, MAXSPACING);
00122   //DEBUG USE ONLY
00123   STATS all_gap_stats (0, MAXSPACING);
00124   STATS space_gap_stats (0, MAXSPACING);
00125   inT16 minwidth = MAX_INT16;    //narrowest blob
00126   TBOX blob_box;
00127   TBOX prev_blob_box;
00128   inT16 centre_to_centre;
00129   inT16 gap_width;
00130   float real_space_threshold;
00131   float iqr_centre_to_centre;    //DEBUG USE ONLY
00132   float iqr_all_gap_stats;       //DEBUG USE ONLY
00133   inT32 end_of_row;
00134   inT32 row_length;
00135 
        // ---- Pass 1: all gaps, narrowest blob ----
00136   row_it.set_to_list (block->get_rows ());
00137   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00138     row = row_it.data ();
00139     if (!row->blob_list ()->empty () &&
00140       (!tosp_only_use_prop_rows ||
00141       (row->pitch_decision == PITCH_DEF_PROP) ||
00142     (row->pitch_decision == PITCH_CORR_PROP))) {
00143       blob_it.set_to_list (row->blob_list ());
00144       blob_it.mark_cycle_pt ();
            // Right edge of the blob one step before the iterator's start,
            // read before the iterator is advanced.
00145       end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
00146       if (tosp_use_pre_chopping)
00147         blob_box = box_next_pre_chopped (&blob_it);
00148       else if (tosp_stats_use_xht_gaps)
00149         blob_box = reduced_box_next (row, &blob_it);
00150       else
00151         blob_box = box_next (&blob_it);
00152       row_length = end_of_row - blob_box.left ();
00153       if (blob_box.width () < minwidth)
00154         minwidth = blob_box.width ();
00155       prev_blob_box = blob_box;
00156       while (!blob_it.cycled_list ()) {
00157         if (tosp_use_pre_chopping)
00158           blob_box = box_next_pre_chopped (&blob_it);
00159         else if (tosp_stats_use_xht_gaps)
00160           blob_box = reduced_box_next (row, &blob_it);
00161         else
00162           blob_box = box_next (&blob_it);
00163         if (blob_box.width () < minwidth)
00164           minwidth = blob_box.width ();
00165         gap_width = blob_box.left () - prev_blob_box.right ();
00166         if (!ignore_big_gap (row, row_length, gapmap,
00167                              prev_blob_box.right (), blob_box.left ())) {
00168           all_gap_stats.add (gap_width, 1);
00169 
00170           centre_to_centre = (blob_box.left () + blob_box.right () -
00171             (prev_blob_box.left () +
00172              prev_blob_box.right ())) / 2;
00173           //DEBUG
00174           centre_to_centre_stats.add (centre_to_centre, 1);
00175           // DEBUG
00176         }
00177         prev_blob_box = blob_box;
00178       }
00179     }
00180   }
00181 
00182                                  //Inadequate samples
00183   if (all_gap_stats.get_total () <= 1) {
00184     block_non_space_gap_width = minwidth;
00185     block_space_gap_width = -1;  //No est. space width
00186                                  //DEBUG
00187     old_text_ord_proportional = TRUE;
00188   }
00189   else {
00190     /* For debug only ..... */
00191     iqr_centre_to_centre = centre_to_centre_stats.ile (0.75) -
00192       centre_to_centre_stats.ile (0.25);
00193     iqr_all_gap_stats = all_gap_stats.ile (0.75) - all_gap_stats.ile (0.25);
00194     old_text_ord_proportional =
00195       iqr_centre_to_centre * 2 > iqr_all_gap_stats;
00196     /* .......For debug only */
00197 
00198     /*
00199     The median of the gaps is used as an estimate of the NON-SPACE gap width.
00200     This RELIES on the assumption that there are more gaps WITHIN words than
00201     BETWEEN words in a block
00202 
00203     Now try to estimate the width of a real space for all real spaces in the
00204     block. Do this by using a crude threshold to ignore "narrow" gaps, then
00205     find the median of the "wide" gaps and use this.
00206     */
00207     block_non_space_gap_width = (inT16) floor (all_gap_stats.median ());
00208     // median gap
00209 
          // ---- Pass 2: wide gaps only ----
00210     row_it.set_to_list (block->get_rows ());
00211     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00212       row = row_it.data ();
00213       if (!row->blob_list ()->empty () &&
00214         (!tosp_only_use_prop_rows ||
00215         (row->pitch_decision == PITCH_DEF_PROP) ||
00216       (row->pitch_decision == PITCH_CORR_PROP))) {
00217         real_space_threshold =
00218           MAX (tosp_init_guess_kn_mult * block_non_space_gap_width,
00219           tosp_init_guess_xht_mult * row->xheight);
00220         blob_it.set_to_list (row->blob_list ());
00221         blob_it.mark_cycle_pt ();
00222         end_of_row =
00223           blob_it.data_relative (-1)->bounding_box ().right ();
00224         if (tosp_use_pre_chopping)
00225           blob_box = box_next_pre_chopped (&blob_it);
00226         else if (tosp_stats_use_xht_gaps)
00227           blob_box = reduced_box_next (row, &blob_it);
00228         else
00229           blob_box = box_next (&blob_it);
              // Same formula as pass 1 (and as the per-row stats routines), so
              // ignore_big_gap() sees the same row length for the same row in
              // both passes. This was previously computed with the operands
              // swapped, i.e. as the negated length.
00230         row_length = end_of_row - blob_box.left ();
00231         prev_blob_box = blob_box;
00232         while (!blob_it.cycled_list ()) {
00233           if (tosp_use_pre_chopping)
00234             blob_box = box_next_pre_chopped (&blob_it);
00235           else if (tosp_stats_use_xht_gaps)
00236             blob_box = reduced_box_next (row, &blob_it);
00237           else
00238             blob_box = box_next (&blob_it);
00239           gap_width = blob_box.left () - prev_blob_box.right ();
00240           if ((gap_width > real_space_threshold) &&
00241             !ignore_big_gap (row, row_length, gapmap,
00242             prev_blob_box.right (),
00243           blob_box.left ())) {
00244             /*
00245             If tosp_use_cert_spaces is enabled, the estimate of the space gap is
00246             restricted to obvious spaces - those wider than half the xht or those
00247             with wide blobs on both sides - i.e not things that are suspect 1's or
00248             punctuation that is sometimes widely spaced.
00249             */
00250             if (!tosp_block_use_cert_spaces ||
00251               (gap_width >
00252               tosp_fuzzy_space_factor2 * row->xheight)
00253               ||
00254               ((gap_width >
00255               tosp_fuzzy_space_factor1 * row->xheight)
00256               && (!tosp_narrow_blobs_not_cert
00257               || (!narrow_blob (row, prev_blob_box)
00258               && !narrow_blob (row, blob_box))))
00259               || (wide_blob (row, prev_blob_box)
00260               && wide_blob (row, blob_box)))
00261               space_gap_stats.add (gap_width, 1);
00262           }
00263           prev_blob_box = blob_box;
00264         }
00265       }
00266     }
00267                                  //Inadequate samples
00268     if (space_gap_stats.get_total () <= 2)
00269       block_space_gap_width = -1;//No est. space width
00270     else
            // Never let the space estimate drop below 3x the non-space width.
00271       block_space_gap_width =
00272         MAX ((inT16) floor (space_gap_stats.median ()),
00273         3 * block_non_space_gap_width);
00274   }
00275 }
00276 
00277 
00278 /*************************************************************************
00279  * row_spacing_stats()
00280  * Set values for min_space, max_nonspace, kern_size, space_size and
00281  * space_threshold on `row`. The function returns nothing (void).
       *
       * NOTE(review): the previous header said "If failure - return 0 values";
       * no such return path is visible in this function - confirm whether that
       * text referred to an older interface.
       *
       * Steps:
       *  1. Collect gap statistics for the row, split by a threshold derived
       *     from the block-level estimates.
       *  2. Choose kern_size / space_size / space_threshold via old_to_method(),
       *     isolated_row_stats() or the block defaults.
       *  3. Optionally refine the threshold (improve_row_threshold) and apply
       *     the sanity rules selected by tosp_sanity_method.
       *  4. Derive the fuzzy limits min_space and max_nonspace.
       *
       * block_idx / row_idx are used only in debug output and are passed on to
       * isolated_row_stats().
00282  *************************************************************************/
00283 void Textord::row_spacing_stats(
00284     TO_ROW *row,
00285     GAPMAP *gapmap,
00286     inT16 block_idx,
00287     inT16 row_idx,
00288     inT16 block_space_gap_width,    //estimate for block
00289     inT16 block_non_space_gap_width //estimate for block
00290                                 ) {
00291   //iterator
00292   BLOBNBOX_IT blob_it = row->blob_list ();
00293   STATS all_gap_stats (0, MAXSPACING);
00294   STATS cert_space_gap_stats (0, MAXSPACING);
00295   STATS all_space_gap_stats (0, MAXSPACING);
00296   STATS small_gap_stats (0, MAXSPACING);
00297   TBOX blob_box;
00298   TBOX prev_blob_box;
00299   inT16 gap_width;
00300   inT16 real_space_threshold = 0;
00301   inT16 max = 0;
00302   inT16 index;
00303   inT16 large_gap_count = 0;
00304   BOOL8 suspected_table;
00305   inT32 max_max_nonspace;        //upper bound
        // A block-level space estimate of <= 0 is treated as "no estimate".
00306   BOOL8 good_block_space_estimate = block_space_gap_width > 0;
00307   inT32 end_of_row;
00308   inT32 row_length = 0;
00309   float sane_space;
00310   inT32 sane_threshold;
00311 
00312   /* Collect first pass stats for row */
00313 
        // Without a usable block estimate, fall back to half the row x-height.
00314   if (!good_block_space_estimate)
00315     block_space_gap_width = inT16 (floor (row->xheight / 2));
00316   if (!row->blob_list ()->empty ()) {
          // Threshold separating "small" gaps from candidate spaces: a biased
          // interpolation between the block non-space and space widths when
          // tosp_threshold_bias1 > 0, otherwise their midpoint.
00317     if (tosp_threshold_bias1 > 0)
00318       real_space_threshold =
00319         block_non_space_gap_width +
00320         inT16 (floor (0.5 +
00321         tosp_threshold_bias1 * (block_space_gap_width -
00322                                 block_non_space_gap_width)));
00323     else
00324       real_space_threshold =     //Old TO method
00325         (block_space_gap_width + block_non_space_gap_width) / 2;
00326     blob_it.set_to_list (row->blob_list ());
00327     blob_it.mark_cycle_pt ();
00328     end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
00329     if (tosp_use_pre_chopping)
00330       blob_box = box_next_pre_chopped (&blob_it);
00331     else if (tosp_stats_use_xht_gaps)
00332       blob_box = reduced_box_next (row, &blob_it);
00333     else
00334       blob_box = box_next (&blob_it);
00335     row_length = end_of_row - blob_box.left ();
00336     prev_blob_box = blob_box;
00337     while (!blob_it.cycled_list ()) {
00338       if (tosp_use_pre_chopping)
00339         blob_box = box_next_pre_chopped (&blob_it);
00340       else if (tosp_stats_use_xht_gaps)
00341         blob_box = reduced_box_next (row, &blob_it);
00342       else
00343         blob_box = box_next (&blob_it);
00344       gap_width = blob_box.left () - prev_blob_box.right ();
            // Gaps rejected by ignore_big_gap() are only counted; they are not
            // added to any of the STATS below.
00345       if (ignore_big_gap (row, row_length, gapmap,
00346         prev_blob_box.right (), blob_box.left ()))
00347         large_gap_count++;
00348       else {
00349         if (gap_width >= real_space_threshold) {
                // Only the cert_space_gap_stats.add() is governed by the
                // condition below; all_space_gap_stats receives every gap that
                // reached the threshold.
00350           if (!tosp_row_use_cert_spaces ||
00351             (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
00352             ((gap_width > tosp_fuzzy_space_factor1 * row->xheight)
00353             && (!tosp_narrow_blobs_not_cert
00354             || (!narrow_blob (row, prev_blob_box)
00355             && !narrow_blob (row, blob_box))))
00356             || (wide_blob (row, prev_blob_box)
00357             && wide_blob (row, blob_box)))
00358             cert_space_gap_stats.add (gap_width, 1);
00359           all_space_gap_stats.add (gap_width, 1);
00360         }
00361         else
00362           small_gap_stats.add (gap_width, 1);
00363         all_gap_stats.add (gap_width, 1);
00364       }
00365       prev_blob_box = blob_box;
00366     }
00367   }
        // Table suspicion: more than one ignored big gap, or at least one
        // ignored big gap together with no more than tosp_few_samples gaps.
00368   suspected_table = (large_gap_count > 1) ||
00369       ((large_gap_count > 0) &&
00370        (all_gap_stats.get_total () <= tosp_few_samples));
00371 
00372   /* Now determine row kern size, space size and threshold */
00373 
        // Use the certain-space stats when there are enough samples for a
        // median, or when the row is short / a suspected table and has at least
        // one certain space.
00374   if ((cert_space_gap_stats.get_total () >=
00375     tosp_enough_space_samples_for_median) ||
00376     ((suspected_table ||
00377     all_gap_stats.get_total () <= tosp_short_row) &&
00378     cert_space_gap_stats.get_total () > 0)) {
00379     old_to_method(row,
00380                   &all_gap_stats,
00381                   &cert_space_gap_stats,
00382                   &small_gap_stats,
00383                   block_space_gap_width,
00384                   block_non_space_gap_width);
00385   } else {
          // isolated_row_stats() is attempted only when
          // tosp_recovery_isolated_row_stats is set; if it is not attempted or
          // returns false, fall through to the defaults below.
00386     if (!tosp_recovery_isolated_row_stats ||
00387         !isolated_row_stats (row, gapmap, &all_gap_stats, suspected_table,
00388                              block_idx, row_idx)) {
00389       if (tosp_row_use_cert_spaces && (tosp_debug_level > 5))
00390         tprintf ("B:%d R:%d -- Inadequate certain spaces.\n",
00391           block_idx, row_idx);
00392       if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
00393                                  //Use block default
00394         row->space_size = block_space_gap_width;
00395         if (all_gap_stats.get_total () > tosp_redo_kern_limit)
00396           row->kern_size = all_gap_stats.median ();
00397         else
00398           row->kern_size = block_non_space_gap_width;
00399         row->space_threshold =
00400           inT32 (floor ((row->space_size + row->kern_size) /
00401                         tosp_old_sp_kn_th_factor));
00402       }
00403       else
              // Same routine as above, but fed with all gaps over the threshold
              // rather than only the certain ones.
00404         old_to_method(row,
00405                       &all_gap_stats,
00406                       &all_space_gap_stats,
00407                       &small_gap_stats,
00408                       block_space_gap_width,
00409                       block_non_space_gap_width);
00410     }
00411   }
00412 
00413   if (tosp_improve_thresh && !suspected_table)
00414     improve_row_threshold(row, &all_gap_stats);
00415 
00416   /* Now lets try to be careful not to do anything silly with tables when we
00417   are ignoring big gaps*/
00418   if (tosp_sanity_method == 0) {
00419     if (suspected_table &&
00420     (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
00421       if (tosp_debug_level > 5)
00422         tprintf ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f.\n",
00423           block_idx, row_idx,
00424           row->kern_size, row->space_threshold, row->space_size);
00425       row->space_threshold =
00426         (inT32) (tosp_table_kn_sp_ratio * row->kern_size);
00427       row->space_size = MAX (row->space_threshold + 1, row->xheight);
00428     }
00429   }
00430   else if (tosp_sanity_method == 1) {
00431     sane_space = row->space_size;
00432     /* NEVER let space size get too close to kern size */
00433     if ((row->space_size < tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5))
00434       || ((row->space_size - row->kern_size) <
00435     (tosp_silly_kn_sp_gap * row->xheight))) {
            // Prefer the block estimate when it exists and is itself far enough
            // from the kern size; otherwise synthesise a minimum sane space.
00436       if (good_block_space_estimate &&
00437         (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size))
00438         sane_space = block_space_gap_width;
00439       else
00440         sane_space =
00441           MAX (tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5),
00442           row->xheight / 2);
00443       if (tosp_debug_level > 5)
00444         tprintf
00445           ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n",
00446           block_idx, row_idx, row->kern_size, row->space_threshold,
00447           row->space_size, sane_space);
00448       row->space_size = sane_space;
00449       row->space_threshold =
00450         inT32 (floor ((row->space_size + row->kern_size) /
00451                       tosp_old_sp_kn_th_factor));
00452     }
00453     /* NEVER let threshold get VERY far away from kern */
00454     sane_threshold = inT32 (floor (tosp_max_sane_kn_thresh *
00455       MAX (row->kern_size, 2.5)));
00456     if (row->space_threshold > sane_threshold) {
00457       if (tosp_debug_level > 5)
00458         tprintf ("B:%d R:%d -- DONT BELIEVE THRESH %3.2f %d %3.2f->%d.\n",
00459           block_idx, row_idx,
00460           row->kern_size,
00461           row->space_threshold, row->space_size, sane_threshold);
00462       row->space_threshold = sane_threshold;
            // Keep space_size strictly above the (now reduced) threshold.
00463       if (row->space_size <= sane_threshold)
00464         row->space_size = row->space_threshold + 1.0f;
00465     }
00466     /* Beware of tables - there may be NO spaces */
00467     if (suspected_table) {
00468       sane_space = MAX (tosp_table_kn_sp_ratio * row->kern_size,
00469         tosp_table_xht_sp_ratio * row->xheight);
00470       sane_threshold = inT32 (floor ((sane_space + row->kern_size) / 2));
00471 
00472       if ((row->space_size < sane_space) ||
00473       (row->space_threshold < sane_threshold)) {
00474         if (tosp_debug_level > 5)
00475           tprintf ("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n",
00476             block_idx, row_idx,
00477             row->kern_size,
00478             row->space_threshold, row->space_size);
00479                                  //the minimum sane value
00480         row->space_threshold = (inT32) sane_space;
00481         row->space_size = MAX (row->space_threshold + 1, row->xheight);
00482       }
00483     }
00484   }
00485 
00486   /* Now lets try to put some error limits on the threshold */
00487 
00488   if (tosp_old_to_method) {
00489     /* Old textord made a space if gap >= threshold */
00490                                  //NO FUZZY SPACES YET
00491     row->max_nonspace = row->space_threshold;
00492                                  //NO FUZZY SPACES       YET
00493     row->min_space = row->space_threshold + 1;
00494   }
00495   else {
00496     /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
          // NOTE(review): the "0.6" above is presumably the value of
          // tosp_fuzzy_space_factor; the code uses the parameter, not a literal.
00497     row->min_space =
00498       MIN (inT32 (ceil (tosp_fuzzy_space_factor * row->xheight)),
00499       inT32 (row->space_size));
00500     if (row->min_space <= row->space_threshold)
00501                                  //Dont be silly
00502       row->min_space = row->space_threshold + 1;
00503     /*
00504     Lets try to guess the max certain kern gap by looking at the cluster of
00505     kerns for the row. The row is proportional so the kerns should cluster
00506     tightly at the bottom of the distribution. We also expect most gaps to be
00507     kerns. Find the maximum of the kern piles between 0 and twice the kern
00508     estimate. Piles before the first one with less than 1/10 the maximum
00509     number of samples can be taken as certain kerns.
00510 
00511       Of course, there are some cases where the kern peak and space peaks merge,
00512       so we will put an UPPER limit on the max certain kern gap of some fraction
00513       below the threshold.
00514     */
00515 
00516     max_max_nonspace = inT32 ((row->space_threshold + row->kern_size) / 2);
00517 
00518                                  //default
00519     row->max_nonspace = max_max_nonspace;
          // Scan piles upward, tracking the tallest pile seen so far; stop at
          // the first pile above kern_size holding fewer than 10% of that.
00520     for (index = 0; index <= max_max_nonspace; index++) {
00521       if (all_gap_stats.pile_count (index) > max)
00522         max = all_gap_stats.pile_count (index);
00523       if ((index > row->kern_size) &&
00524       (all_gap_stats.pile_count (index) < 0.1 * max)) {
00525         row->max_nonspace = index;
00526         break;
00527       }
00528     }
00529   }
00530 
00531   /* Yet another algorithm - simpler this time - just choose a fraction of the
00532   threshold to space range */
00533 
        // This can only raise min_space (MAX with the current value).
00534   if ((tosp_fuzzy_sp_fraction > 0) &&
00535     (row->space_size > row->space_threshold))
00536     row->min_space = MAX (row->min_space,
00537       (inT32) ceil (row->space_threshold +
00538       tosp_fuzzy_sp_fraction *
00539       (row->space_size -
00540       row->space_threshold)));
00541 
00542   /* Ensure that ANY space less than some multiplier times the kern size is
00543   fuzzy.  In tables there is a risk of erroneously setting a small space size
00544   when there are no real spaces. Sometimes tables have text squashed into
00545   columns so that the kn->sp ratio is small anyway - this means that we cant
00546   use this to force a wider separation - hence we rely on context to join any
00547   dubious breaks. */
00548 
00549   if ((tosp_table_fuzzy_kn_sp_ratio > 0) &&
00550     (suspected_table || tosp_fuzzy_limit_all))
00551     row->min_space = MAX (row->min_space,
00552       (inT32) ceil (tosp_table_fuzzy_kn_sp_ratio *
00553       row->kern_size));
00554 
        // When enabled, this overrides whatever max_nonspace was chosen above.
00555   if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
00556     row->max_nonspace = (inT32) floor (0.5 + row->kern_size +
00557       tosp_fuzzy_kn_fraction *
00558       (row->space_threshold -
00559       row->kern_size));
00560   }
00561   if (row->max_nonspace > row->space_threshold) {
00562                                  //Dont be silly
00563     row->max_nonspace = row->space_threshold;
00564   }
00565 
00566   if (tosp_debug_level > 5)
00567     tprintf
00568       ("B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) Sp:%3.2f\n",
00569       block_idx, row_idx, row_length, block_non_space_gap_width,
00570       block_space_gap_width, real_space_threshold, row->kern_size,
00571       row->max_nonspace, row->space_threshold, row->min_space,
00572       row->space_size);
00573   if (tosp_debug_level > 10)
00574     tprintf("row->kern_size = %3.2f, row->space_size = %3.2f, "
00575             "row->space_threshold = %d\n",
00576             row->kern_size, row->space_size, row->space_threshold);
00577 }
00578 
00579 void Textord::old_to_method(
00580     TO_ROW *row,
00581     STATS *all_gap_stats,
00582     STATS *space_gap_stats,
00583     STATS *small_gap_stats,
00584     inT16 block_space_gap_width,     //estimate for block
00585     inT16 block_non_space_gap_width  //estimate for block
00586                             ) {
        // Sets row->space_size, row->kern_size and row->space_threshold from
        // the supplied gap statistics, falling back to the block-level
        // estimates when there are too few samples.
        //   all_gap_stats   - all gaps of the row (kern fallback source)
        //   space_gap_stats - gaps regarded as spaces by the caller
        //   small_gap_stats - gaps regarded as non-spaces by the caller
        // The three STATS objects are only read here.
00587   /* First, estimate row space size */
00588   /* Old to condition was > 2 */
00589   if (space_gap_stats->get_total () >= tosp_enough_space_samples_for_median) {
00590   //Adequate samples
00591     /* Set space size to median of spaces BUT limits it if it seems wildly out */
00592     row->space_size = space_gap_stats->median ();
00593     if (row->space_size > block_space_gap_width * 1.5) {
            // tosp_old_to_bug_fix selects between the 1.5x cap and the
            // historical behaviour of capping at the block width itself.
00594       if (tosp_old_to_bug_fix)
00595         row->space_size = block_space_gap_width * 1.5;
00596       else
00597                                  //BUG??? should be *1.5
00598         row->space_size = block_space_gap_width;
00599     }
          // Lower bound: strictly more than twice the block non-space width.
00600     if (row->space_size < (block_non_space_gap_width * 2) + 1)
00601       row->space_size = (block_non_space_gap_width * 2) + 1;
00602   }
00603                                  //Only 1 or 2 samples
00604   else if (space_gap_stats->get_total () >= 1) {
00605                                  //hence mean not median
00606     row->space_size = space_gap_stats->mean ();
00607     if (row->space_size > block_space_gap_width * 1.5) {
00608       if (tosp_old_to_bug_fix)
00609         row->space_size = block_space_gap_width * 1.5;
00610       else
00611                                  //BUG??? should be *1.5
00612         row->space_size = block_space_gap_width;
00613     }
          // With few samples the lower bound is stricter: 3x instead of 2x.
00614     if (row->space_size < (block_non_space_gap_width * 3) + 1)
00615       row->space_size = (block_non_space_gap_width * 3) + 1;
00616   }
00617   else {
00618                                  //Use block default
00619     row->space_size = block_space_gap_width;
00620   }
00621 
00622   /* Next, estimate row kern size */
        // Preference order: median of small gaps (if enabled and enough
        // samples), median of all gaps (if enough samples), block estimate.
00623   if ((tosp_only_small_gaps_for_kern) &&
00624     (small_gap_stats->get_total () > tosp_redo_kern_limit))
00625     row->kern_size = small_gap_stats->median ();
00626   else if (all_gap_stats->get_total () > tosp_redo_kern_limit)
00627     row->kern_size = all_gap_stats->median ();
00628   else                          //old TO -SAME FOR ALL ROWS
00629     row->kern_size = block_non_space_gap_width;
00630 
00631   /* Finally, estimate row space threshold */
00632   if (tosp_threshold_bias2 > 0) {
          // Rounded interpolation between kern_size and space_size.
00633     row->space_threshold =
00634         inT32 (floor (0.5 + row->kern_size +
00635                       tosp_threshold_bias2 * (row->space_size -
00636                                               row->kern_size)));
00637   } else {
00638     /*
00639       NOTE old text ord uses (space_size + kern_size + 1)/2  as the threshold
00640     and holds this in a float. The use is with a >= test
00641     NEW textord uses an integer threshold and a > test
00642     It comes to the same thing.
00643       (Though there is a difference in that old textor has integer space_size
00644       and kern_size.)
00645     */
00646     row->space_threshold =
00647         inT32 (floor ((row->space_size + row->kern_size) / 2));
00648   }
00649 
00650   // Apply the same logic and ratios as in row_spacing_stats to
00651   // restrict relative values of the row's space_size, kern_size, and
00652   // space_threshold
        // Unlike row_spacing_stats, this adjusts kern_size (downwards, when it
        // exceeds 2.5) rather than space_size, then recomputes the threshold.
00653   if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 &&
00654       ((row->space_size <
00655         tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5)) ||
00656        ((row->space_size - row->kern_size) <
00657         tosp_silly_kn_sp_gap * row->xheight))) {
00658     if (row->kern_size > 2.5)
00659       row->kern_size = row->space_size / tosp_min_sane_kn_sp;
00660     row->space_threshold = inT32 (floor ((row->space_size + row->kern_size) /
00661                                          tosp_old_sp_kn_th_factor));
00662   }
00663 }
00664 
00665 
00666 /*************************************************************************
00667  * isolated_row_stats()
00668  * Set values for min_space, max_non_space based on row stats only
00669  *************************************************************************/
00670 BOOL8 Textord::isolated_row_stats(TO_ROW *row,
00671                                   GAPMAP *gapmap,
00672                                   STATS *all_gap_stats,
00673                                   BOOL8 suspected_table,
00674                                   inT16 block_idx,
00675                                   inT16 row_idx) {
00676   float kern_estimate;
00677   float crude_threshold_estimate;
00678   inT16 small_gaps_count;
00679   inT16 total;
00680   //iterator
00681   BLOBNBOX_IT blob_it = row->blob_list ();
00682   STATS cert_space_gap_stats (0, MAXSPACING);
00683   STATS all_space_gap_stats (0, MAXSPACING);
00684   STATS small_gap_stats (0, MAXSPACING);
00685   TBOX blob_box;
00686   TBOX prev_blob_box;
00687   inT16 gap_width;
00688   inT32 end_of_row;
00689   inT32 row_length;
00690 
00691   kern_estimate = all_gap_stats->median ();
00692   crude_threshold_estimate = MAX (tosp_init_guess_kn_mult * kern_estimate,
00693     tosp_init_guess_xht_mult * row->xheight);
00694   small_gaps_count = stats_count_under (all_gap_stats,
00695     (inT16)
00696     ceil (crude_threshold_estimate));
00697   total = all_gap_stats->get_total ();
00698 
00699   if ((total <= tosp_redo_kern_limit) ||
00700     ((small_gaps_count / (float) total) < tosp_enough_small_gaps) ||
00701   (total - small_gaps_count < 1)) {
00702     if (tosp_debug_level > 5)
00703       tprintf ("B:%d R:%d -- Cant do isolated row stats.\n",
00704         block_idx, row_idx);
00705     return FALSE;
00706   }
00707   blob_it.set_to_list (row->blob_list ());
00708   blob_it.mark_cycle_pt ();
00709   end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
00710   if (tosp_use_pre_chopping)
00711     blob_box = box_next_pre_chopped (&blob_it);
00712   else if (tosp_stats_use_xht_gaps)
00713     blob_box = reduced_box_next (row, &blob_it);
00714   else
00715     blob_box = box_next (&blob_it);
00716   row_length = end_of_row - blob_box.left ();
00717   prev_blob_box = blob_box;
00718   while (!blob_it.cycled_list ()) {
00719     if (tosp_use_pre_chopping)
00720       blob_box = box_next_pre_chopped (&blob_it);
00721     else if (tosp_stats_use_xht_gaps)
00722       blob_box = reduced_box_next (row, &blob_it);
00723     else
00724       blob_box = box_next (&blob_it);
00725     gap_width = blob_box.left () - prev_blob_box.right ();
00726     if (!ignore_big_gap (row, row_length, gapmap,
00727       prev_blob_box.right (), blob_box.left ()) &&
00728     (gap_width > crude_threshold_estimate)) {
00729       if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
00730         ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
00731         (!tosp_narrow_blobs_not_cert ||
00732         (!narrow_blob (row, prev_blob_box) &&
00733         !narrow_blob (row, blob_box)))) ||
00734         (wide_blob (row, prev_blob_box) && wide_blob (row, blob_box)))
00735         cert_space_gap_stats.add (gap_width, 1);
00736       all_space_gap_stats.add (gap_width, 1);
00737     }
00738     if (gap_width < crude_threshold_estimate)
00739       small_gap_stats.add (gap_width, 1);
00740 
00741     prev_blob_box = blob_box;
00742   }
00743   if (cert_space_gap_stats.get_total () >=
00744     tosp_enough_space_samples_for_median)
00745                                  //median
00746     row->space_size = cert_space_gap_stats.median ();
00747   else if (suspected_table && (cert_space_gap_stats.get_total () > 0))
00748                                  //to avoid spaced
00749     row->space_size = cert_space_gap_stats.mean ();
00750   //      1's in tables
00751   else if (all_space_gap_stats.get_total () >=
00752     tosp_enough_space_samples_for_median)
00753                                  //median
00754     row->space_size = all_space_gap_stats.median ();
00755   else
00756     row->space_size = all_space_gap_stats.mean ();
00757 
00758   if (tosp_only_small_gaps_for_kern)
00759     row->kern_size = small_gap_stats.median ();
00760   else
00761     row->kern_size = all_gap_stats->median ();
00762   row->space_threshold =
00763     inT32 (floor ((row->space_size + row->kern_size) / 2));
00764   /* Sanity check */
00765   if ((row->kern_size >= row->space_threshold) ||
00766     (row->space_threshold >= row->space_size) ||
00767   (row->space_threshold <= 0)) {
00768     if (tosp_debug_level > 5)
00769       tprintf ("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n",
00770         block_idx, row_idx,
00771         row->kern_size, row->space_threshold, row->space_size);
00772     row->kern_size = 0.0f;
00773     row->space_threshold = 0;
00774     row->space_size = 0.0f;
00775     return FALSE;
00776   }
00777 
00778   if (tosp_debug_level > 5)
00779     tprintf ("B:%d R:%d -- Isolated row stats: %f %d %f\n",
00780       block_idx, row_idx,
00781       row->kern_size, row->space_threshold, row->space_size);
00782   return TRUE;
00783 }
00784 
00785 inT16 Textord::stats_count_under(STATS *stats, inT16 threshold) {
00786   inT16 index;
00787   inT16 total = 0;
00788 
00789   for (index = 0; index < threshold; index++)
00790     total += stats->pile_count (index);
00791   return total;
00792 }
00793 
00794 
00795 /*************************************************************************
00796  * improve_row_threshold()
00797  *    Try to recognise a "normal line" -
00798  *           > 25 gaps
00799  *     &&    space > 3 * kn  && space > 10
00800  *              (I.e. reasonably large space and kn:sp ratio)
00801  *     &&    > 3/4 # gaps < kn + (sp - kn)/3
00802  *              (I.e. most gaps are well away from space estimate)
00803  *     &&    a gap of max( 3, (sp - kn)/3 ) empty histogram positions is found
00804  *           somewhere in the histogram between kn and sp
00805  *     THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
00806  *          NO!!!!! the bristol line has "11" with a gap of 12 between the 1's!!!
00807  *          try moving the default threshold to within this band but leave the
00808  *          fuzzy limit calculation as at present.
00809  *************************************************************************/
00810 void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
00811   float sp = row->space_size;
00812   float kn = row->kern_size;
00813   inT16 reqd_zero_width = 0;
00814   inT16 zero_width = 0;
00815   inT16 zero_start = 0;
00816   inT16 index = 0;
00817 
00818   if (tosp_debug_level > 10)
00819     tprintf ("Improve row threshold 0");
00820   if ((all_gap_stats->get_total () <= 25) ||
00821     (sp <= 10) ||
00822     (sp <= 3 * kn) ||
00823     (stats_count_under (all_gap_stats,
00824     (inT16) ceil (kn + (sp - kn) / 3 + 0.5)) <
00825     (0.75 * all_gap_stats->get_total ())))
00826     return;
00827   if (tosp_debug_level > 10)
00828     tprintf (" 1");
00829   /*
00830   Look for the first region of all 0's in the histogram which is wider than
00831   max( 3, (sp - kn)/3 ) and starts between kn and sp. If found, and current
00832   threshold is not within it, move the threshold so that is is just inside it.
00833   */
00834   reqd_zero_width = (inT16) floor ((sp - kn) / 3 + 0.5);
00835   if (reqd_zero_width < 3)
00836     reqd_zero_width = 3;
00837 
00838   for (index = inT16 (ceil (kn)); index < inT16 (floor (sp)); index++) {
00839     if (all_gap_stats->pile_count (index) == 0) {
00840       if (zero_width == 0)
00841         zero_start = index;
00842       zero_width++;
00843     }
00844     else {
00845       if (zero_width >= reqd_zero_width)
00846         break;
00847       else {
00848         zero_width = 0;
00849       }
00850     }
00851   }
00852   index--;
00853   if (tosp_debug_level > 10)
00854     tprintf (" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n",
00855       reqd_zero_width, zero_width, zero_start, row->space_threshold);
00856   if ((zero_width < reqd_zero_width) ||
00857     ((row->space_threshold >= zero_start) &&
00858     (row->space_threshold <= index)))
00859     return;
00860   if (tosp_debug_level > 10)
00861     tprintf (" 2");
00862   if (row->space_threshold < zero_start) {
00863     if (tosp_debug_level > 5)
00864       tprintf
00865         ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\n",
00866         kn, sp, zero_start, index, row->space_threshold, zero_start);
00867     row->space_threshold = zero_start;
00868   }
00869   if (row->space_threshold > index) {
00870     if (tosp_debug_level > 5)
00871       tprintf
00872         ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d  thresh:%d -> %d\n",
00873         kn, sp, zero_start, index, row->space_threshold, index);
00874     row->space_threshold = index;
00875   }
00876 }
00877 
00878 
00879 /**********************************************************************
00880  * make_prop_words
00881  *
00882  * Convert a TO_BLOCK to a BLOCK.
00883  **********************************************************************/
00884 ROW *Textord::make_prop_words(
00885     TO_ROW *row,     // row to make
00886     FCOORD rotation  // for drawing
00887                               ) {
00888   BOOL8 bol;                     //start of line
00889   /* prev_ values are for start of word being built. non prev_ values are for
00890   the gap between the word being built and the next one. */
00891   BOOL8 prev_fuzzy_sp;           //probably space
00892   BOOL8 prev_fuzzy_non;          //probably not
00893   uinT8 prev_blanks;             //in front of word
00894   BOOL8 fuzzy_sp;                //probably space
00895   BOOL8 fuzzy_non;               //probably not
00896   uinT8 blanks;                  //in front of word
00897   BOOL8 prev_gap_was_a_space = FALSE;
00898   BOOL8 break_at_next_gap = FALSE;
00899   ROW *real_row;                 //output row
00900   C_OUTLINE_IT cout_it;
00901   C_BLOB_LIST cblobs;
00902   C_BLOB_IT cblob_it = &cblobs;
00903   WERD_LIST words;
00904   WERD_IT word_it;               //new words
00905   WERD *word;                    //new word
00906   WERD_IT rep_char_it;           //repeated char words
00907   inT32 next_rep_char_word_right = MAX_INT32;
00908   float repetition_spacing;      //gap between repetitions
00909   inT32 xstarts[2];              //row ends
00910   inT32 prev_x;                  //end of prev blob
00911   BLOBNBOX *bblob;               //current blob
00912   TBOX blob_box;                  //bounding box
00913   BLOBNBOX_IT box_it;            //iterator
00914   TBOX prev_blob_box;
00915   TBOX next_blob_box;
00916   inT16 prev_gap = MAX_INT16;
00917   inT16 current_gap = MAX_INT16;
00918   inT16 next_gap = MAX_INT16;
00919   inT16 prev_within_xht_gap = MAX_INT16;
00920   inT16 current_within_xht_gap = MAX_INT16;
00921   inT16 next_within_xht_gap = MAX_INT16;
00922   inT16 word_count = 0;
00923 
00924   rep_char_it.set_to_list (&(row->rep_words));
00925   if (!rep_char_it.empty ()) {
00926     next_rep_char_word_right =
00927       rep_char_it.data ()->bounding_box ().right ();
00928   }
00929 
00930   prev_x = -MAX_INT16;
00931   cblob_it.set_to_list (&cblobs);
00932   box_it.set_to_list (row->blob_list ());
00933   word_it.set_to_list (&words);
00934   bol = TRUE;
00935   prev_blanks = 0;
00936   prev_fuzzy_sp = FALSE;
00937   prev_fuzzy_non = FALSE;
00938   if (!box_it.empty ()) {
00939     xstarts[0] = box_it.data ()->bounding_box ().left ();
00940     if (xstarts[0] > next_rep_char_word_right) {
00941       /* We need to insert a repeated char word at the start of the row */
00942       word = rep_char_it.extract ();
00943       word_it.add_after_then_move (word);
00944       /* Set spaces before repeated char word */
00945       word->set_flag (W_BOL, TRUE);
00946       bol = FALSE;
00947       word->set_blanks (0);
00948                                  //NO uncertainty
00949       word->set_flag (W_FUZZY_SP, FALSE);
00950       word->set_flag (W_FUZZY_NON, FALSE);
00951       xstarts[0] = word->bounding_box ().left ();
00952       /* Set spaces after repeated char word (and leave current word set) */
00953       repetition_spacing = find_mean_blob_spacing (word);
00954       current_gap = box_it.data ()->bounding_box ().left () -
00955         next_rep_char_word_right;
00956       current_within_xht_gap = current_gap;
00957       if (current_gap > tosp_rep_space * repetition_spacing) {
00958         prev_blanks = (uinT8) floor (current_gap / row->space_size);
00959         if (prev_blanks < 1)
00960           prev_blanks = 1;
00961       }
00962       else
00963         prev_blanks = 0;
00964       if (tosp_debug_level > 5)
00965         tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f;  Rgap:%d  ",
00966           box_it.data ()->bounding_box ().left (),
00967           box_it.data ()->bounding_box ().bottom (),
00968           repetition_spacing, current_gap);
00969       prev_fuzzy_sp = FALSE;
00970       prev_fuzzy_non = FALSE;
00971       if (rep_char_it.empty ()) {
00972         next_rep_char_word_right = MAX_INT32;
00973       }
00974       else {
00975         rep_char_it.forward ();
00976         next_rep_char_word_right =
00977           rep_char_it.data ()->bounding_box ().right ();
00978       }
00979     }
00980 
00981     peek_at_next_gap(row,
00982                      box_it,
00983                      next_blob_box,
00984                      next_gap,
00985                      next_within_xht_gap);
00986     do {
00987       bblob = box_it.data ();
00988       blob_box = bblob->bounding_box ();
00989       if (bblob->joined_to_prev ()) {
00990         if (bblob->cblob () != NULL) {
00991           cout_it.set_to_list (cblob_it.data ()->out_list ());
00992           cout_it.move_to_last ();
00993           cout_it.add_list_after (bblob->cblob ()->out_list ());
00994           delete bblob->cblob ();
00995         }
00996       } else {
00997         if (bblob->cblob() != NULL)
00998           cblob_it.add_after_then_move (bblob->cblob ());
00999         prev_x = blob_box.right ();
01000       }
01001       box_it.forward ();         //next one
01002       bblob = box_it.data ();
01003       blob_box = bblob->bounding_box ();
01004 
01005       if (!bblob->joined_to_prev() && bblob->cblob() != NULL) {
01006         /* Real Blob - not multiple outlines or pre-chopped */
01007         prev_gap = current_gap;
01008         prev_within_xht_gap = current_within_xht_gap;
01009         prev_blob_box = next_blob_box;
01010         current_gap = next_gap;
01011         current_within_xht_gap = next_within_xht_gap;
01012         peek_at_next_gap(row,
01013                          box_it,
01014                          next_blob_box,
01015                          next_gap,
01016                          next_within_xht_gap);
01017 
01018         inT16 prev_gap_arg = prev_gap;
01019         inT16 next_gap_arg = next_gap;
01020         if (tosp_only_use_xht_gaps) {
01021           prev_gap_arg = prev_within_xht_gap;
01022           next_gap_arg = next_within_xht_gap;
01023         }
01024         // Decide if a word-break should be inserted
01025         if (blob_box.left () > next_rep_char_word_right ||
01026             make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box,
01027                               current_gap, current_within_xht_gap,
01028                               next_blob_box, next_gap_arg,
01029                               blanks, fuzzy_sp, fuzzy_non,
01030                               prev_gap_was_a_space,
01031                               break_at_next_gap) ||
01032             box_it.at_first()) {
01033           /* Form a new word out of the blobs collected */
01034           word = new WERD (&cblobs, prev_blanks, NULL);
01035           word_count++;
01036           word_it.add_after_then_move (word);
01037           if (bol) {
01038             word->set_flag (W_BOL, TRUE);
01039             bol = FALSE;
01040           }
01041           if (prev_fuzzy_sp)
01042                                  //probably space
01043             word->set_flag (W_FUZZY_SP, TRUE);
01044           else if (prev_fuzzy_non)
01045             word->set_flag (W_FUZZY_NON, TRUE);
01046           //probably not
01047 
01048           if (blob_box.left () > next_rep_char_word_right) {
01049             /* We need to insert a repeated char word */
01050             word = rep_char_it.extract ();
01051             word_it.add_after_then_move (word);
01052 
01053             /* Set spaces before repeated char word */
01054             repetition_spacing = find_mean_blob_spacing (word);
01055             current_gap = word->bounding_box ().left () - prev_x;
01056             current_within_xht_gap = current_gap;
01057             if (current_gap > tosp_rep_space * repetition_spacing) {
01058               blanks =
01059                 (uinT8) floor (current_gap / row->space_size);
01060               if (blanks < 1)
01061                 blanks = 1;
01062             }
01063             else
01064               blanks = 0;
01065             if (tosp_debug_level > 5)
01066               tprintf
01067                 ("Repch wd (%d,%d) rep gap %5.2f;  Lgap:%d (%d blanks);",
01068                 word->bounding_box ().left (),
01069                 word->bounding_box ().bottom (),
01070                 repetition_spacing, current_gap, blanks);
01071             word->set_blanks (blanks);
01072                                  //NO uncertainty
01073             word->set_flag (W_FUZZY_SP, FALSE);
01074             word->set_flag (W_FUZZY_NON, FALSE);
01075 
01076             /* Set spaces after repeated char word (and leave current word set) */
01077             current_gap =
01078               blob_box.left () - next_rep_char_word_right;
01079             if (current_gap > tosp_rep_space * repetition_spacing) {
01080               blanks = (uinT8) (current_gap / row->space_size);
01081               if (blanks < 1)
01082                 blanks = 1;
01083             }
01084             else
01085               blanks = 0;
01086             if (tosp_debug_level > 5)
01087               tprintf (" Rgap:%d (%d blanks)\n",
01088                 current_gap, blanks);
01089             fuzzy_sp = FALSE;
01090             fuzzy_non = FALSE;
01091 
01092             if (rep_char_it.empty ()) {
01093               next_rep_char_word_right = MAX_INT32;
01094             }
01095             else {
01096               rep_char_it.forward ();
01097               next_rep_char_word_right =
01098                 rep_char_it.data ()->bounding_box ().right ();
01099             }
01100           }
01101 
01102           if (box_it.at_first () && rep_char_it.empty ()) {
01103                                  //at end of line
01104             word->set_flag (W_EOL, TRUE);
01105             xstarts[1] = prev_x;
01106           }
01107           else {
01108             prev_blanks = blanks;
01109             prev_fuzzy_sp = fuzzy_sp;
01110             prev_fuzzy_non = fuzzy_non;
01111           }
01112         }
01113       }
01114     }
01115     while (!box_it.at_first ()); //until back at start
01116 
01117     /* Insert any further repeated char words */
01118     while (!rep_char_it.empty ()) {
01119       word = rep_char_it.extract ();
01120       word_it.add_after_then_move (word);
01121 
01122       /* Set spaces before repeated char word */
01123       repetition_spacing = find_mean_blob_spacing (word);
01124       current_gap = word->bounding_box ().left () - prev_x;
01125       if (current_gap > tosp_rep_space * repetition_spacing) {
01126         blanks = (uinT8) floor (current_gap / row->space_size);
01127         if (blanks < 1)
01128           blanks = 1;
01129       }
01130       else
01131         blanks = 0;
01132       if (tosp_debug_level > 5)
01133         tprintf
01134           ("Repch wd at EOL (%d,%d). rep spacing %d; Lgap:%d (%d blanks)\n",
01135           word->bounding_box ().left (), word->bounding_box ().bottom (),
01136           repetition_spacing, current_gap, blanks);
01137       word->set_blanks (blanks);
01138                                  //NO uncertainty
01139       word->set_flag (W_FUZZY_SP, FALSE);
01140       word->set_flag (W_FUZZY_NON, FALSE);
01141       prev_x = word->bounding_box ().right ();
01142       if (rep_char_it.empty ()) {
01143                                  //at end of line
01144         word->set_flag (W_EOL, TRUE);
01145         xstarts[1] = prev_x;
01146       }
01147       else {
01148         rep_char_it.forward ();
01149       }
01150     }
01151     real_row = new ROW (row,
01152       (inT16) row->kern_size, (inT16) row->space_size);
01153     word_it.set_to_list (real_row->word_list ());
01154                                  //put words in row
01155     word_it.add_list_after (&words);
01156     real_row->recalc_bounding_box ();
01157 
01158     if (tosp_debug_level > 4) {
01159       tprintf ("Row: Made %d words in row ((%d,%d)(%d,%d))\n",
01160         word_count,
01161         real_row->bounding_box ().left (),
01162         real_row->bounding_box ().bottom (),
01163         real_row->bounding_box ().right (),
01164         real_row->bounding_box ().top ());
01165     }
01166     return real_row;
01167   }
01168   return NULL;
01169 }
01170 
01171 /**********************************************************************
01172  * make_blob_words
01173  *
01174  * Converts words into blobs so that each blob is a single character.
01175  *  Used for chopper test.
01176  **********************************************************************/
01177 ROW *Textord::make_blob_words(
01178     TO_ROW *row,     // row to make
01179     FCOORD rotation  // for drawing
01180                               ) {
01181   bool bol;                      // start of line
01182   ROW *real_row;                 // output row
01183   C_OUTLINE_IT cout_it;
01184   C_BLOB_LIST cblobs;
01185   C_BLOB_IT cblob_it = &cblobs;
01186   WERD_LIST words;
01187   WERD_IT word_it;               // new words
01188   WERD *word;                    // new word
01189   BLOBNBOX *bblob;               // current blob
01190   TBOX blob_box;                 // bounding box
01191   BLOBNBOX_IT box_it;            // iterator
01192   inT16 word_count = 0;
01193 
01194   cblob_it.set_to_list(&cblobs);
01195   box_it.set_to_list(row->blob_list());
01196   word_it.set_to_list(&words);
01197   bol = TRUE;
01198   if (!box_it.empty()) {
01199 
01200     do {
01201       bblob = box_it.data();
01202       blob_box = bblob->bounding_box();
01203       if (bblob->joined_to_prev()) {
01204         if (bblob->cblob() != NULL) {
01205           cout_it.set_to_list(cblob_it.data()->out_list());
01206           cout_it.move_to_last();
01207           cout_it.add_list_after(bblob->cblob()->out_list());
01208           delete bblob->cblob();
01209         }
01210       } else {
01211         if (bblob->cblob() != NULL)
01212           cblob_it.add_after_then_move(bblob->cblob());
01213       }
01214       box_it.forward();         // next one
01215       bblob = box_it.data();
01216       blob_box = bblob->bounding_box();
01217 
01218       if (!bblob->joined_to_prev() && !cblobs.empty()) {
01219         word = new WERD(&cblobs, 1, NULL);
01220         word_count++;
01221         word_it.add_after_then_move(word);
01222         if (bol) {
01223           word->set_flag(W_BOL, TRUE);
01224           bol = FALSE;
01225         }
01226         if (box_it.at_first()) { // at end of line
01227           word->set_flag(W_EOL, TRUE);
01228         }
01229       }
01230     }
01231     while (!box_it.at_first()); // until back at start
01232     /* Setup the row with created words. */
01233     real_row = new ROW(row, (inT16) row->kern_size, (inT16) row->space_size);
01234     word_it.set_to_list(real_row->word_list());
01235                                  //put words in row
01236     word_it.add_list_after(&words);
01237     real_row->recalc_bounding_box();
01238     if (tosp_debug_level > 4) {
01239       tprintf ("Row:Made %d words in row ((%d,%d)(%d,%d))\n",
01240         word_count,
01241         real_row->bounding_box().left(),
01242         real_row->bounding_box().bottom(),
01243         real_row->bounding_box().right(),
01244         real_row->bounding_box().top());
01245     }
01246     return real_row;
01247   }
01248   return NULL;
01249 }
01250 
01251 BOOL8 Textord::make_a_word_break(
01252     TO_ROW *row,   // row being made
01253     TBOX blob_box, // for next_blob // how many blanks?
01254     inT16 prev_gap,
01255     TBOX prev_blob_box,
01256     inT16 real_current_gap,
01257     inT16 within_xht_current_gap,
01258     TBOX next_blob_box,
01259     inT16 next_gap,
01260     uinT8 &blanks,
01261     BOOL8 &fuzzy_sp,
01262     BOOL8 &fuzzy_non,
01263     BOOL8& prev_gap_was_a_space,
01264     BOOL8& break_at_next_gap) {
01265   BOOL8 space;
01266   inT16 current_gap;
01267   float fuzzy_sp_to_kn_limit;
01268 
01269   if (break_at_next_gap) {
01270     break_at_next_gap = FALSE;
01271     return TRUE;
01272   }
01273   /* Inhibit using the reduced gap if
01274     The kerning is large - chars are not kerned and reducing "f"s can cause
01275     erroneous blanks
01276   OR  The real gap is less than 0
01277   OR  The real gap is less than the kerning estimate
01278   */
01279   if ((row->kern_size > tosp_large_kerning * row->xheight) ||
01280       ((tosp_dont_fool_with_small_kerns >= 0) &&
01281        (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size)))
01282                                  //Ignore the difference
01283     within_xht_current_gap = real_current_gap;
01284 
01285   if (tosp_use_xht_gaps && tosp_only_use_xht_gaps)
01286     current_gap = within_xht_current_gap;
01287   else
01288     current_gap = real_current_gap;
01289 
01290   if (tosp_old_to_method) {
01291                                  //Boring old method
01292     space = current_gap > row->max_nonspace;
01293     if (space && (current_gap < MAX_INT16)) {
01294       if (current_gap < row->min_space) {
01295         if (current_gap > row->space_threshold) {
01296           blanks = 1;
01297           fuzzy_sp = TRUE;
01298           fuzzy_non = FALSE;
01299         }
01300         else {
01301           blanks = 0;
01302           fuzzy_sp = FALSE;
01303           fuzzy_non = TRUE;
01304         }
01305       }
01306       else {
01307         blanks = (uinT8) (current_gap / row->space_size);
01308         if (blanks < 1)
01309           blanks = 1;
01310         fuzzy_sp = FALSE;
01311         fuzzy_non = FALSE;
01312       }
01313     }
01314     return space;
01315   }
01316   else {
01317   /* New exciting heuristic method */
01318     if (prev_blob_box.null_box ())  // Beginning of row
01319       prev_gap_was_a_space = TRUE;
01320 
01321                                  //Default as old TO
01322     space = current_gap > row->space_threshold;
01323 
01324     /* Set defaults for the word break incase we find one.  Currently there are
01325     no fuzzy spaces. Depending on the reliability of the different heuristics
01326     we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
01327     be used if the function returns TRUE - ie the word is to be broken.
01328     */
01329     blanks = (uinT8) (current_gap / row->space_size);
01330     if (blanks < 1)
01331       blanks = 1;
01332     fuzzy_sp = FALSE;
01333     fuzzy_non = FALSE;
01334     /*
01335     If xht measure causes gap to flip one of the 3 thresholds act accordingly -
01336     despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
01337     context.
01338     */
01339     if (tosp_use_xht_gaps &&
01340       (real_current_gap <= row->max_nonspace) &&
01341     (within_xht_current_gap > row->max_nonspace)) {
01342       space = TRUE;
01343       fuzzy_non = TRUE;
01344 #ifndef GRAPHICS_DISABLED
01345       mark_gap (blob_box, 20,
01346         prev_gap, prev_blob_box.width (),
01347         current_gap, next_blob_box.width (), next_gap);
01348 #endif
01349     }
01350     else if (tosp_use_xht_gaps &&
01351       (real_current_gap <= row->space_threshold) &&
01352     (within_xht_current_gap > row->space_threshold)) {
01353       space = TRUE;
01354       if (tosp_flip_fuzz_kn_to_sp)
01355         fuzzy_sp = TRUE;
01356       else
01357         fuzzy_non = TRUE;
01358 #ifndef GRAPHICS_DISABLED
01359       mark_gap (blob_box, 21,
01360         prev_gap, prev_blob_box.width (),
01361         current_gap, next_blob_box.width (), next_gap);
01362 #endif
01363     }
01364     else if (tosp_use_xht_gaps &&
01365       (real_current_gap < row->min_space) &&
01366     (within_xht_current_gap >= row->min_space)) {
01367       space = TRUE;
01368 #ifndef GRAPHICS_DISABLED
01369       mark_gap (blob_box, 22,
01370         prev_gap, prev_blob_box.width (),
01371         current_gap, next_blob_box.width (), next_gap);
01372 #endif
01373     }
01374     else if (tosp_force_wordbreak_on_punct &&
01375              !suspected_punct_blob(row, prev_blob_box) &&
01376              suspected_punct_blob(row, blob_box)) {
01377       break_at_next_gap = TRUE;
01378     }
01379     /* Now continue with normal heuristics */
01380     else if ((current_gap < row->min_space) &&
01381     (current_gap > row->space_threshold)) {
01382       /* Heuristics to turn dubious spaces to kerns */
01383       if (tosp_pass_wide_fuzz_sp_to_context > 0)
01384         fuzzy_sp_to_kn_limit = row->kern_size +
01385           tosp_pass_wide_fuzz_sp_to_context *
01386           (row->space_size - row->kern_size);
01387       else
01388         fuzzy_sp_to_kn_limit = 99999.0f;
01389 
01390       /* If current gap is significantly smaller than the previous space the other
01391       side of a narrow blob then this gap is a kern. */
01392       if ((prev_blob_box.width () > 0) &&
01393         narrow_blob (row, prev_blob_box) &&
01394         prev_gap_was_a_space &&
01395       (current_gap <= tosp_gap_factor * prev_gap)) {
01396         if ((tosp_all_flips_fuzzy) ||
01397         (current_gap > fuzzy_sp_to_kn_limit)) {
01398           if (tosp_flip_fuzz_sp_to_kn)
01399             fuzzy_non = TRUE;
01400           else
01401             fuzzy_sp = TRUE;
01402         }
01403         else
01404           space = FALSE;
01405 #ifndef GRAPHICS_DISABLED
01406         mark_gap (blob_box, 1,
01407           prev_gap, prev_blob_box.width (),
01408           current_gap, next_blob_box.width (), next_gap);
01409 #endif
01410       }
01411       /* If current gap not much bigger than the previous kern the other side of a
01412       narrow blob then this gap is a kern as well */
01413       else if ((prev_blob_box.width () > 0) &&
01414         narrow_blob (row, prev_blob_box) &&
01415         !prev_gap_was_a_space &&
01416       (current_gap * tosp_gap_factor <= prev_gap)) {
01417         if ((tosp_all_flips_fuzzy) ||
01418         (current_gap > fuzzy_sp_to_kn_limit)) {
01419           if (tosp_flip_fuzz_sp_to_kn)
01420             fuzzy_non = TRUE;
01421           else
01422             fuzzy_sp = TRUE;
01423         }
01424         else
01425           space = FALSE;
01426 #ifndef GRAPHICS_DISABLED
01427         mark_gap (blob_box, 2,
01428           prev_gap, prev_blob_box.width (),
01429           current_gap, next_blob_box.width (), next_gap);
01430 #endif
01431       }
01432       else if ((next_blob_box.width () > 0) &&
01433         narrow_blob (row, next_blob_box) &&
01434         (next_gap > row->space_threshold) &&
01435       (current_gap <= tosp_gap_factor * next_gap)) {
01436         if ((tosp_all_flips_fuzzy) ||
01437         (current_gap > fuzzy_sp_to_kn_limit)) {
01438           if (tosp_flip_fuzz_sp_to_kn)
01439             fuzzy_non = TRUE;
01440           else
01441             fuzzy_sp = TRUE;
01442         }
01443         else
01444           space = FALSE;
01445 #ifndef GRAPHICS_DISABLED
01446         mark_gap (blob_box, 3,
01447           prev_gap, prev_blob_box.width (),
01448           current_gap, next_blob_box.width (), next_gap);
01449 #endif
01450       }
01451       else if ((next_blob_box.width () > 0) &&
01452         narrow_blob (row, next_blob_box) &&
01453         (next_gap <= row->space_threshold) &&
01454       (current_gap * tosp_gap_factor <= next_gap)) {
01455         if ((tosp_all_flips_fuzzy) ||
01456         (current_gap > fuzzy_sp_to_kn_limit)) {
01457           if (tosp_flip_fuzz_sp_to_kn)
01458             fuzzy_non = TRUE;
01459           else
01460             fuzzy_sp = TRUE;
01461         }
01462         else
01463           space = FALSE;
01464 #ifndef GRAPHICS_DISABLED
01465         mark_gap (blob_box, 4,
01466           prev_gap, prev_blob_box.width (),
01467           current_gap, next_blob_box.width (), next_gap);
01468 #endif
01469       }
01470       else if ((((next_blob_box.width () > 0) &&
01471         narrow_blob (row, next_blob_box)) ||
01472         ((prev_blob_box.width () > 0) &&
01473       narrow_blob (row, prev_blob_box)))) {
01474         fuzzy_sp = TRUE;
01475 #ifndef GRAPHICS_DISABLED
01476         mark_gap (blob_box, 6,
01477           prev_gap, prev_blob_box.width (),
01478           current_gap, next_blob_box.width (), next_gap);
01479 #endif
01480       }
01481     }
01482     else if ((current_gap > row->max_nonspace) &&
01483              (current_gap <= row->space_threshold)) {
01484 
01485       /* Heuristics to turn dubious kerns to spaces */
01486       /* TRIED THIS BUT IT MADE THINGS WORSE
01487           if ( prev_gap == MAX_INT16 )
01488             prev_gap = 0;  // start of row
01489           if ( next_gap == MAX_INT16 )
01490             next_gap = 0;  // end of row
01491       */
01492       if ((prev_blob_box.width () > 0) &&
01493         (next_blob_box.width () > 0) &&
01494         (current_gap >=
01495         tosp_kern_gap_factor1 * MAX (prev_gap, next_gap)) &&
01496         wide_blob (row, prev_blob_box) &&
01497       wide_blob (row, next_blob_box)) {
01498 
01499         space = TRUE;
01500         /*
01501         tosp_flip_caution is an attempt to stop the default changing in cases
01502         where there is a large difference between the kern and space estimates.
01503           See problem in 'chiefs' where "have" gets split in the quotation.
01504         */
01505         if ((tosp_flip_fuzz_kn_to_sp) &&
01506           ((tosp_flip_caution <= 0) ||
01507           (tosp_flip_caution * row->kern_size > row->space_size)))
01508           fuzzy_sp = TRUE;
01509         else
01510           fuzzy_non = TRUE;
01511 #ifndef GRAPHICS_DISABLED
01512         mark_gap (blob_box, 7,
01513           prev_gap, prev_blob_box.width (),
01514           current_gap, next_blob_box.width (), next_gap);
01515 #endif
01516       } else if (prev_blob_box.width() > 0 &&
01517                  next_blob_box.width() > 0 &&
01518                  current_gap > 5 &&  // Rule 9 handles small gap, big ratio.
01519                  current_gap >=
01520                    tosp_kern_gap_factor2 * MAX(prev_gap, next_gap) &&
01521                  !(narrow_blob(row, prev_blob_box) ||
01522                    suspected_punct_blob(row, prev_blob_box)) &&
01523                  !(narrow_blob(row, next_blob_box) ||
01524                    suspected_punct_blob(row, next_blob_box))) {
01525         space = TRUE;
01526         fuzzy_non = TRUE;
01527 #ifndef GRAPHICS_DISABLED
01528         mark_gap (blob_box, 8,
01529           prev_gap, prev_blob_box.width (),
01530           current_gap, next_blob_box.width (), next_gap);
01531 #endif
01532       }
01533       else if ((tosp_kern_gap_factor3 > 0) &&
01534                (prev_blob_box.width () > 0) &&
01535                (next_blob_box.width () > 0) &&
01536                (current_gap >= tosp_kern_gap_factor3 * MAX (prev_gap, next_gap)) &&
01537                (!tosp_rule_9_test_punct ||
01538                 (!suspected_punct_blob (row, prev_blob_box) &&
01539                  !suspected_punct_blob (row, next_blob_box)))) {
01540         space = TRUE;
01541         fuzzy_non = TRUE;
01542 #ifndef GRAPHICS_DISABLED
01543         mark_gap (blob_box, 9,
01544           prev_gap, prev_blob_box.width (),
01545           current_gap, next_blob_box.width (), next_gap);
01546 #endif
01547       }
01548     }
01549     if (tosp_debug_level > 10)
01550       tprintf("word break = %d current_gap = %d, prev_gap = %d, "
01551               "next_gap = %d\n", space ? 1 : 0, current_gap,
01552               prev_gap, next_gap);
01553     prev_gap_was_a_space = space && !(fuzzy_non);
01554     return space;
01555   }
01556 }
01557 
01558 BOOL8 Textord::narrow_blob(TO_ROW *row, TBOX blob_box) {
01559   BOOL8 result;
01560   result = ((blob_box.width () <= tosp_narrow_fraction * row->xheight) ||
01561     (((float) blob_box.width () / blob_box.height ()) <=
01562     tosp_narrow_aspect_ratio));
01563   return result;
01564 }
01565 
01566 BOOL8 Textord::wide_blob(TO_ROW *row, TBOX blob_box) {
01567   BOOL8 result;
01568   if (tosp_wide_fraction > 0) {
01569     if (tosp_wide_aspect_ratio > 0)
01570       result = ((blob_box.width () >= tosp_wide_fraction * row->xheight) &&
01571         (((float) blob_box.width () / blob_box.height ()) >
01572         tosp_wide_aspect_ratio));
01573     else
01574       result = (blob_box.width () >= tosp_wide_fraction * row->xheight);
01575   }
01576   else
01577     result = !narrow_blob (row, blob_box);
01578   return result;
01579 }
01580 
01581 BOOL8 Textord::suspected_punct_blob(TO_ROW *row, TBOX box) {
01582   BOOL8 result;
01583   float baseline;
01584   float blob_x_centre;
01585   /* Find baseline of centre of blob */
01586   blob_x_centre = (box.right () + box.left ()) / 2.0;
01587   baseline = row->baseline.y (blob_x_centre);
01588 
01589   result = (box.height () <= 0.66 * row->xheight) ||
01590            (box.top () < baseline + row->xheight / 2.0) ||
01591            (box.bottom () > baseline + row->xheight / 2.0);
01592   return result;
01593 }
01594 
01595 
01596 void Textord::peek_at_next_gap(TO_ROW *row,
01597                                BLOBNBOX_IT box_it,
01598                                TBOX &next_blob_box,
01599                                inT16 &next_gap,
01600                                inT16 &next_within_xht_gap) {
01601   TBOX next_reduced_blob_box;
01602   TBOX bit_beyond;
01603   BLOBNBOX_IT reduced_box_it = box_it;
01604 
01605   next_blob_box = box_next (&box_it);
01606   next_reduced_blob_box = reduced_box_next (row, &reduced_box_it);
01607   if (box_it.at_first ()) {
01608     next_gap = MAX_INT16;
01609     next_within_xht_gap = MAX_INT16;
01610   }
01611   else {
01612     bit_beyond = box_it.data ()->bounding_box ();
01613     next_gap = bit_beyond.left () - next_blob_box.right ();
01614     bit_beyond = reduced_box_next (row, &reduced_box_it);
01615     next_within_xht_gap =
01616       bit_beyond.left () - next_reduced_blob_box.right ();
01617   }
01618 }
01619 
01620 
01621 #ifndef GRAPHICS_DISABLED
01622 void Textord::mark_gap(
01623     TBOX blob,   // blob following gap
01624     inT16 rule,  // heuristic id
01625     inT16 prev_gap,
01626     inT16 prev_blob_width,
01627     inT16 current_gap,
01628     inT16 next_blob_width,
01629     inT16 next_gap) {
01630   ScrollView::Color col;                    //of ellipse marking flipped gap
01631 
01632   switch (rule) {
01633     case 1:
01634       col = ScrollView::RED;
01635       break;
01636     case 2:
01637       col = ScrollView::CYAN;
01638       break;
01639     case 3:
01640       col = ScrollView::GREEN;
01641       break;
01642     case 4:
01643       col = ScrollView::BLACK;
01644       break;
01645     case 5:
01646       col = ScrollView::MAGENTA;
01647       break;
01648     case 6:
01649       col = ScrollView::BLUE;
01650       break;
01651 
01652     case 7:
01653       col = ScrollView::WHITE;
01654       break;
01655     case 8:
01656       col = ScrollView::YELLOW;
01657       break;
01658     case 9:
01659       col = ScrollView::BLACK;
01660       break;
01661 
01662     case 20:
01663       col = ScrollView::CYAN;
01664       break;
01665     case 21:
01666       col = ScrollView::GREEN;
01667       break;
01668     case 22:
01669       col = ScrollView::MAGENTA;
01670       break;
01671     default:
01672       col = ScrollView::BLACK;
01673   }
01674   if (textord_show_initial_words) {
01675     to_win->Pen(col);
01676   /*  if (rule < 20)
01677       //interior_style(to_win, INT_SOLID, FALSE);
01678     else
01679       //interior_style(to_win, INT_HOLLOW, TRUE);*/
01680                                  //x radius
01681     to_win->Ellipse (current_gap / 2.0f,
01682       blob.height () / 2.0f,     //y radius
01683                                  //x centre
01684       blob.left () - current_gap / 2.0f,
01685                                  //y centre
01686       blob.bottom () + blob.height () / 2.0f);
01687  }
01688   if (tosp_debug_level > 5)
01689     tprintf ("  (%d,%d) Sp<->Kn Rule %d %d %d %d %d\n",
01690       blob.left () - current_gap / 2, blob.bottom (), rule,
01691       prev_gap, prev_blob_width, current_gap,
01692       next_blob_width, next_gap);
01693 }
01694 #endif
01695 
01696 float Textord::find_mean_blob_spacing(WERD *word) {
01697   C_BLOB_IT cblob_it;
01698   TBOX blob_box;
01699   inT32 gap_sum = 0;
01700   inT16 gap_count = 0;
01701   inT16 prev_right;
01702 
01703   cblob_it.set_to_list (word->cblob_list ());
01704   if (!cblob_it.empty ()) {
01705     cblob_it.mark_cycle_pt ();
01706     prev_right = cblob_it.data ()->bounding_box ().right ();
01707     //first blob
01708     cblob_it.forward ();
01709     for (; !cblob_it.cycled_list (); cblob_it.forward ()) {
01710       blob_box = cblob_it.data ()->bounding_box ();
01711       gap_sum += blob_box.left () - prev_right;
01712       gap_count++;
01713       prev_right = blob_box.right ();
01714     }
01715   }
01716   if (gap_count > 0)
01717     return (gap_sum / (float) gap_count);
01718   else
01719     return 0.0f;
01720 }
01721 
01722 
01723 BOOL8 Textord::ignore_big_gap(TO_ROW *row,
01724                               inT32 row_length,
01725                               GAPMAP *gapmap,
01726                               inT16 left,
01727                               inT16 right) {
01728   inT16 gap = right - left + 1;
01729 
01730   if (tosp_ignore_big_gaps > 999)
01731     return FALSE;                //Dont ignore
01732   if (tosp_ignore_big_gaps > 0)
01733     return (gap > tosp_ignore_big_gaps * row->xheight);
01734   if (gap > tosp_ignore_very_big_gaps * row->xheight)
01735     return TRUE;
01736   if (tosp_ignore_big_gaps == 0) {
01737     if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight))
01738       return TRUE;
01739     if ((gap > 1.75 * row->xheight) &&
01740       ((row_length > 35 * row->xheight) ||
01741       gapmap->table_gap (left, right)))
01742       return TRUE;
01743   }
01744   else {
01745   /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table */
01746     if ((gap > gapmap_big_gaps * row->xheight) &&
01747       gapmap->table_gap (left, right))
01748       return TRUE;
01749   }
01750   return FALSE;
01751 }
01752 
01753 
01754 /**********************************************************************
01755  * reduced_box_next
01756  *
01757  * Compute the bounding box of this blob with merging of x overlaps
01758  * but no pre-chopping.
01759  * Then move the iterator on to the start of the next blob.
01760  * DONT reduce the box for small things - eg punctuation.
01761  **********************************************************************/
01762 TBOX Textord::reduced_box_next(
01763     TO_ROW *row,     // current row
01764     BLOBNBOX_IT *it  // iterator to blobds
01765                                ) {
01766   BLOBNBOX *blob;                //current blob
01767   BLOBNBOX *head_blob;           //place to store box
01768   TBOX full_box;                  //full blob boundg box
01769   TBOX reduced_box;               //box of significant part
01770   inT16 left_above_xht;          //ABOVE xht left limit
01771   inT16 new_left_above_xht;      //ABOVE xht left limit
01772 
01773   blob = it->data ();
01774   if (blob->red_box_set ()) {
01775     reduced_box = blob->reduced_box ();
01776     do {
01777       it->forward();
01778       blob = it->data();
01779     }
01780     while (blob->cblob() == NULL || blob->joined_to_prev());
01781     return reduced_box;
01782   }
01783   head_blob = blob;
01784   full_box = blob->bounding_box ();
01785   reduced_box = reduced_box_for_blob (blob, row, &left_above_xht);
01786   do {
01787     it->forward ();
01788     blob = it->data ();
01789     if (blob->cblob() == NULL)
01790                                  //was pre-chopped
01791       full_box += blob->bounding_box ();
01792     else if (blob->joined_to_prev ()) {
01793       reduced_box +=
01794         reduced_box_for_blob(blob, row, &new_left_above_xht);
01795       left_above_xht = MIN (left_above_xht, new_left_above_xht);
01796     }
01797   }
01798                                  //until next real blob
01799   while (blob->cblob() == NULL || blob->joined_to_prev());
01800 
01801   if ((reduced_box.width () > 0) &&
01802     ((reduced_box.left () + tosp_near_lh_edge * reduced_box.width ())
01803   < left_above_xht) && (reduced_box.height () > 0.7 * row->xheight)) {
01804 #ifndef GRAPHICS_DISABLED
01805     if (textord_show_initial_words)
01806       reduced_box.plot (to_win, ScrollView::YELLOW, ScrollView::YELLOW);
01807 #endif
01808   }
01809   else
01810     reduced_box = full_box;
01811   head_blob->set_reduced_box (reduced_box);
01812   return reduced_box;
01813 }
01814 
01815 
01816 /*************************************************************************
01817  * reduced_box_for_blob()
01818  * Find box for blob which is the same height and y position as the whole blob,
01819  * but whose left limit is the left most position of the blob ABOVE the
01820  * baseline and whose right limit is the right most position of the blob BELOW
01821  * the xheight.
01822  *
01823  *
01824  * !!!!!!! WONT WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
01825  *         "home".  Perhaps we need something which say if the width ABOVE the
01826  *         xht alone includes the whole of the reduced width, then use the full
01827  *         blob box - Might still fail on italic F
01828  *
01829  *         Alternatively we could be a little less severe and only reduce the
01830  *         left and right edges by half the difference between the full box and
01831  *         the reduced box.
01832  *
01833  * NOTE that we need to rotate all the coordinates as
01834  * find_blob_limits finds the y min and max within a specified x band
01835  *************************************************************************/
01836 TBOX Textord::reduced_box_for_blob(
01837     BLOBNBOX *blob,
01838     TO_ROW *row,
01839     inT16 *left_above_xht) {
01840   float baseline;
01841   float blob_x_centre;
01842   float left_limit;
01843   float right_limit;
01844   float junk;
01845   TBOX blob_box;
01846 
01847   /* Find baseline of centre of blob */
01848 
01849   blob_box = blob->bounding_box ();
01850   blob_x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
01851   baseline = row->baseline.y (blob_x_centre);
01852 
01853   /*
01854   Find LH limit of blob ABOVE the xht. This is so that we can detect certain
01855   caps ht chars which should NOT have their box reduced: T, Y, V, W etc
01856   */
01857   left_limit = (float) MAX_INT32;
01858   junk = (float) -MAX_INT32;
01859   find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight),
01860                      static_cast<float>(MAX_INT16), left_limit, junk);
01861   if (left_limit > junk)
01862     *left_above_xht = MAX_INT16; //No area above xht
01863   else
01864     *left_above_xht = (inT16) floor (left_limit);
01865   /*
01866   Find reduced LH limit of blob - the left extent of the region ABOVE the
01867   baseline.
01868   */
01869   left_limit = (float) MAX_INT32;
01870   junk = (float) -MAX_INT32;
01871   find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(MAX_INT16),
01872                      left_limit, junk);
01873 
01874   if (left_limit > junk)
01875     return TBOX ();               //no area within xht so return empty box
01876   /*
01877   Find reduced RH limit of blob - the right extent of the region BELOW the xht.
01878   */
01879   junk = (float) MAX_INT32;
01880   right_limit = (float) -MAX_INT32;
01881   find_cblob_hlimits(blob->cblob(), static_cast<float>(-MAX_INT16),
01882                      (baseline + row->xheight), junk, right_limit);
01883   if (junk > right_limit)
01884     return TBOX ();               //no area within xht so return empty box
01885 
01886   return TBOX (ICOORD ((inT16) floor (left_limit), blob_box.bottom ()),
01887     ICOORD ((inT16) ceil (right_limit), blob_box.top ()));
01888 }
01889 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines