tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/textord/makerow.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        makerow.cpp  (Formerly makerows.c)
00003  * Description: Code to arrange blobs into rows of text.
00004  * Author:              Ray Smith
00005  * Created:             Mon Sep 21 14:34:48 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef __UNIX__
00021 #include          <assert.h>
00022 #endif
00023 #include          "stderr.h"
00024 #include          "blobbox.h"
00025 #include          "ccstruct.h"
00026 #include          "detlinefit.h"
00027 #include          "statistc.h"
00028 #include          "drawtord.h"
00029 #include          "blkocc.h"
00030 #include          "sortflts.h"
00031 #include          "oldbasel.h"
00032 #include          "textord.h"
00033 #include          "tordmain.h"
00034 #include          "underlin.h"
00035 #include          "makerow.h"
00036 #include          "tprintf.h"
00037 #include          "tovars.h"
00038 
00039 // Include automatically generated configuration file if running autoconf.
00040 #ifdef HAVE_CONFIG_H
00041 #include "config_auto.h"
00042 #endif
00043 
00044 BOOL_VAR(textord_heavy_nr, FALSE, "Vigorously remove noise");
00045 BOOL_VAR(textord_show_initial_rows, FALSE, "Display row accumulation");
00046 BOOL_VAR(textord_show_parallel_rows, FALSE, "Display page correlated rows");
00047 BOOL_VAR(textord_show_expanded_rows, FALSE, "Display rows after expanding");
00048 BOOL_VAR(textord_show_final_rows, FALSE, "Display rows after final fitting");
00049 BOOL_VAR(textord_show_final_blobs, FALSE, "Display blob bounds after pre-ass");
00050 BOOL_VAR(textord_test_landscape, FALSE, "Tests refer to land/port");
00051 BOOL_VAR(textord_parallel_baselines, TRUE, "Force parallel baselines");
00052 BOOL_VAR(textord_straight_baselines, FALSE, "Force straight baselines");
00053 BOOL_VAR(textord_old_baselines, TRUE, "Use old baseline algorithm");
00054 BOOL_VAR(textord_old_xheight, FALSE, "Use old xheight algorithm");
00055 BOOL_VAR(textord_fix_xheight_bug, TRUE, "Use spline baseline");
00056 BOOL_VAR(textord_fix_makerow_bug, TRUE, "Prevent multiple baselines");
00057 BOOL_VAR(textord_debug_xheights, FALSE, "Test xheight algorithms");
00058 BOOL_VAR(textord_biased_skewcalc, TRUE, "Bias skew estimates with line length");
00059 BOOL_VAR(textord_interpolating_skew, TRUE, "Interpolate across gaps");
00060 INT_VAR(textord_skewsmooth_offset, 4, "For smooth factor");
00061 INT_VAR(textord_skewsmooth_offset2, 1, "For smooth factor");
00062 INT_VAR(textord_test_x, -MAX_INT32, "coord of test pt");
00063 INT_VAR(textord_test_y, -MAX_INT32, "coord of test pt");
00064 INT_VAR(textord_min_blobs_in_row, 4, "Min blobs before gradient counted");
00065 INT_VAR(textord_spline_minblobs, 8, "Min blobs in each spline segment");
00066 INT_VAR(textord_spline_medianwin, 6, "Size of window for spline segmentation");
00067 INT_VAR(textord_max_blob_overlaps, 4,
00068         "Max number of blobs a big blob can overlap");
00069 INT_VAR(textord_min_xheight, 10, "Min credible pixel xheight");
00070 double_VAR(textord_spline_shift_fraction, 0.02,
00071            "Fraction of line spacing for quad");
00072 double_VAR(textord_spline_outlier_fraction, 0.1,
00073            "Fraction of line spacing for outlier");
00074 double_VAR(textord_skew_ile, 0.5, "Ile of gradients for page skew");
00075 double_VAR(textord_skew_lag, 0.02, "Lag for skew on row accumulation");
00076 double_VAR(textord_linespace_iqrlimit, 0.2, "Max iqr/median for linespace");
00077 double_VAR(textord_width_limit, 8, "Max width of blobs to make rows");
00078 double_VAR(textord_chop_width, 1.5, "Max width before chopping");
00079 double_VAR(textord_expansion_factor, 1.0,
00080            "Factor to expand rows by in expand_rows");
00081 double_VAR(textord_overlap_x, 0.375, "Fraction of linespace for good overlap");
00082 double_VAR(textord_minxh, 0.25, "fraction of linesize for min xheight");
00083 double_VAR(textord_min_linesize, 1.25, "* blob height for initial linesize");
00084 double_VAR(textord_excess_blobsize, 1.3,
00085            "New row made if blob makes row this big");
00086 double_VAR(textord_occupancy_threshold, 0.4, "Fraction of neighbourhood");
00087 double_VAR(textord_underline_width, 2.0, "Multiple of line_size for underline");
00088 double_VAR(textord_min_blob_height_fraction, 0.75,
00089            "Min blob height/top to include blob top into xheight stats");
00090 double_VAR(textord_xheight_mode_fraction, 0.4,
00091            "Min pile height to make xheight");
00092 double_VAR(textord_ascheight_mode_fraction, 0.08,
00093            "Min pile height to make ascheight");
00094 double_VAR(textord_descheight_mode_fraction, 0.08,
00095            "Min pile height to make descheight");
00096 double_VAR(textord_ascx_ratio_min, 1.25, "Min cap/xheight");
00097 double_VAR(textord_ascx_ratio_max, 1.8, "Max cap/xheight");
00098 double_VAR(textord_descx_ratio_min, 0.25, "Min desc/xheight");
00099 double_VAR(textord_descx_ratio_max, 0.6, "Max desc/xheight");
00100 double_VAR(textord_xheight_error_margin, 0.1, "Accepted variation");
00101 INT_VAR(textord_lms_line_trials, 12, "Number of linew fits to do");
00102 BOOL_VAR(textord_new_initial_xheight, TRUE, "Use test xheight mechanism");
00103 BOOL_VAR(textord_debug_blob, FALSE, "Print test blob information");
00104 
// NOTE(review): neither of these two constants is referenced in the part of
// the file visible here; their meaning is inferred from the names only.
const int kMinLeaderCount = 5;

#define MAX_HEIGHT_MODES  12
00108 
00109 // Factored-out helper to build a single row from a list of blobs.
00110 // Returns the mean blob size.
00111 static float MakeRowFromBlobs(float line_size,
00112                               BLOBNBOX_IT* blob_it, TO_ROW_IT* row_it) {
00113   blob_it->sort(blob_x_order);
00114   blob_it->move_to_first();
00115   TO_ROW* row = NULL;
00116   float total_size = 0.0f;
00117   int blob_count = 0;
00118   // Add all the blobs to a single TO_ROW.
00119   for (; !blob_it->empty(); blob_it->forward()) {
00120     BLOBNBOX* blob = blob_it->extract();
00121     int top = blob->bounding_box().top();
00122     int bottom = blob->bounding_box().bottom();
00123     if (row == NULL) {
00124       row = new TO_ROW(blob, top, bottom, line_size);
00125       row_it->add_before_then_move(row);
00126     } else {
00127       row->add_blob(blob, top, bottom, line_size);
00128     }
00129     total_size += top - bottom;
00130     ++blob_count;
00131   }
00132   return blob_count > 0 ? total_size / blob_count : total_size;
00133 }
00134 
00135 // Helper to make a row using the children of a single blob.
00136 // Returns the mean size of the blobs created.
00137 float MakeRowFromSubBlobs(TO_BLOCK* block, C_BLOB* blob, TO_ROW_IT* row_it) {
00138   // The blobs made from the children will go in the small_blobs list.
00139   BLOBNBOX_IT bb_it(&block->small_blobs);
00140   C_OUTLINE_IT ol_it(blob->out_list());
00141   // Get the children.
00142   ol_it.set_to_list(ol_it.data()->child());
00143   if (ol_it.empty())
00144     return 0.0f;
00145   for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
00146     // Deep copy the child outline and use that to make a blob.
00147     C_BLOB* blob = new C_BLOB(C_OUTLINE::deep_copy(ol_it.data()));
00148     // Correct direction as needed.
00149     blob->CheckInverseFlagAndDirection();
00150     BLOBNBOX* bbox = new BLOBNBOX(blob);
00151     bb_it.add_after_then_move(bbox);
00152   }
00153   // Now we can make a row from the blobs.
00154   return MakeRowFromBlobs(block->line_size, &bb_it, row_it);
00155 }
00156 
00164 float make_single_row(ICOORD page_tr, TO_BLOCK* block, TO_BLOCK_LIST* blocks) {
00165   BLOBNBOX_IT blob_it = &block->blobs;
00166   TO_ROW_IT row_it = block->get_rows();
00167 
00168   // Include all the small blobs and large blobs.
00169   blob_it.add_list_after(&block->small_blobs);
00170   blob_it.add_list_after(&block->noise_blobs);
00171   blob_it.add_list_after(&block->large_blobs);
00172   if (block->blobs.singleton()) {
00173     blob_it.move_to_first();
00174     float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it);
00175     if (size > block->line_size)
00176       block->line_size = size;
00177   }
00178   MakeRowFromBlobs(block->line_size, &blob_it, &row_it);
00179   // Fit an LMS line to the rows.
00180   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward())
00181     fit_lms_line(row_it.data());
00182   float gradient;
00183   float fit_error;
00184   // Compute the skew based on the fitted line.
00185   compute_page_skew(blocks, gradient, fit_error);
00186   return gradient;
00187 }
00188 
00194 float make_rows(ICOORD page_tr, TO_BLOCK_LIST *port_blocks) {
00195   float port_m;                  // global skew
00196   float port_err;                // global noise
00197   TO_BLOCK_IT block_it;          // iterator
00198 
00199   block_it.set_to_list(port_blocks);
00200   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00201        block_it.forward())
00202   make_initial_textrows(page_tr, block_it.data(), FCOORD(1.0f, 0.0f),
00203       !(BOOL8) textord_test_landscape);
00204                                  // compute globally
00205   compute_page_skew(port_blocks, port_m, port_err);
00206   block_it.set_to_list(port_blocks);
00207   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
00208     cleanup_rows_making(page_tr, block_it.data(), port_m, FCOORD(1.0f, 0.0f),
00209                  block_it.data()->block->bounding_box().left(),
00210                  !(BOOL8)textord_test_landscape);
00211   }
00212   return port_m;                 // global skew
00213 }
00214 
00220 void make_initial_textrows(                  //find lines
00221                            ICOORD page_tr,
00222                            TO_BLOCK *block,  //block to do
00223                            FCOORD rotation,  //for drawing
00224                            BOOL8 testing_on  //correct orientation
00225                           ) {
00226   TO_ROW_IT row_it = block->get_rows ();
00227 
00228 #ifndef GRAPHICS_DISABLED
00229   ScrollView::Color colour;                 //of row
00230 
00231   if (textord_show_initial_rows && testing_on) {
00232     if (to_win == NULL)
00233       create_to_win(page_tr);
00234   }
00235 #endif
00236                                  //guess skew
00237   assign_blobs_to_rows (block, NULL, 0, TRUE, TRUE, textord_show_initial_rows && testing_on);
00238   row_it.move_to_first ();
00239   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
00240     fit_lms_line (row_it.data ());
00241 #ifndef GRAPHICS_DISABLED
00242   if (textord_show_initial_rows && testing_on) {
00243     colour = ScrollView::RED;
00244     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00245       plot_to_row (row_it.data (), colour, rotation);
00246       colour = (ScrollView::Color) (colour + 1);
00247       if (colour > ScrollView::MAGENTA)
00248         colour = ScrollView::RED;
00249     }
00250   }
00251 #endif
00252 }
00253 
00254 
00260 void fit_lms_line(TO_ROW *row) {
00261   float m, c;                    // fitted line
00262   tesseract::DetLineFit lms;
00263   BLOBNBOX_IT blob_it = row->blob_list();
00264 
00265   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00266     const TBOX& box = blob_it.data()->bounding_box();
00267     lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom()));
00268   }
00269   double error = lms.Fit(&m, &c);
00270   row->set_line(m, c, error);
00271 }
00272 
00273 
00280 void compute_page_skew(                        //get average gradient
00281                        TO_BLOCK_LIST *blocks,  //list of blocks
00282                        float &page_m,          //average gradient
00283                        float &page_err         //average error
00284                       ) {
00285   inT32 row_count;               //total rows
00286   inT32 blob_count;              //total_blobs
00287   inT32 row_err;                 //integer error
00288   float *gradients;              //of rows
00289   float *errors;                 //of rows
00290   inT32 row_index;               //of total
00291   TO_ROW *row;                   //current row
00292   TO_BLOCK_IT block_it = blocks; //iterator
00293   TO_ROW_IT row_it;
00294 
00295   row_count = 0;
00296   blob_count = 0;
00297   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00298        block_it.forward ()) {
00299     POLY_BLOCK* pb = block_it.data()->block->poly_block();
00300     if (pb != NULL && !pb->IsText())
00301       continue;  // Pretend non-text blocks don't exist.
00302     row_count += block_it.data ()->get_rows ()->length ();
00303     //count up rows
00304     row_it.set_to_list (block_it.data ()->get_rows ());
00305     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
00306       blob_count += row_it.data ()->blob_list ()->length ();
00307   }
00308   if (row_count == 0) {
00309     page_m = 0.0f;
00310     page_err = 0.0f;
00311     return;
00312   }
00313   gradients = (float *) alloc_mem (blob_count * sizeof (float));
00314   //get mem
00315   errors = (float *) alloc_mem (blob_count * sizeof (float));
00316   if (gradients == NULL || errors == NULL)
00317     MEMORY_OUT.error ("compute_page_skew", ABORT, NULL);
00318 
00319   row_index = 0;
00320   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00321        block_it.forward ()) {
00322     POLY_BLOCK* pb = block_it.data()->block->poly_block();
00323     if (pb != NULL && !pb->IsText())
00324       continue;  // Pretend non-text blocks don't exist.
00325     row_it.set_to_list (block_it.data ()->get_rows ());
00326     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00327       row = row_it.data ();
00328       blob_count = row->blob_list ()->length ();
00329       row_err = (inT32) ceil (row->line_error ());
00330       if (row_err <= 0)
00331         row_err = 1;
00332       if (textord_biased_skewcalc) {
00333         blob_count /= row_err;
00334         for (blob_count /= row_err; blob_count > 0; blob_count--) {
00335           gradients[row_index] = row->line_m ();
00336           errors[row_index] = row->line_error ();
00337           row_index++;
00338         }
00339       }
00340       else if (blob_count >= textord_min_blobs_in_row) {
00341                                  //get gradient
00342         gradients[row_index] = row->line_m ();
00343         errors[row_index] = row->line_error ();
00344         row_index++;
00345       }
00346     }
00347   }
00348   if (row_index == 0) {
00349                                  //desperate
00350     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00351          block_it.forward ()) {
00352       POLY_BLOCK* pb = block_it.data()->block->poly_block();
00353       if (pb != NULL && !pb->IsText())
00354         continue;  // Pretend non-text blocks don't exist.
00355       row_it.set_to_list (block_it.data ()->get_rows ());
00356       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
00357            row_it.forward ()) {
00358         row = row_it.data ();
00359         gradients[row_index] = row->line_m ();
00360         errors[row_index] = row->line_error ();
00361         row_index++;
00362       }
00363     }
00364   }
00365   row_count = row_index;
00366   row_index = choose_nth_item ((inT32) (row_count * textord_skew_ile),
00367     gradients, row_count);
00368   page_m = gradients[row_index];
00369   row_index = choose_nth_item ((inT32) (row_count * textord_skew_ile),
00370     errors, row_count);
00371   page_err = errors[row_index];
00372   free_mem(gradients);
00373   free_mem(errors);
00374 }
00375 
// Blobs shorter than this many pixels are left out of the per-row height
// statistics used to estimate xheight in vigorous_noise_removal.
const int kMinSize = 8;
// Blobs shorter than this fraction of the estimated xheight are candidates
// for removal as noise.
const double kNoiseSize = 0.5;
00378 
00383 static bool dot_of_i(BLOBNBOX* dot, BLOBNBOX* i, TO_ROW* row) {
00384   const TBOX& ibox = i->bounding_box();
00385   const TBOX& dotbox = dot->bounding_box();
00386 
00387   // Must overlap horizontally by enough and be high enough.
00388   int overlap = MIN(dotbox.right(), ibox.right()) -
00389                 MAX(dotbox.left(), ibox.left());
00390   if (ibox.height() <= 2 * dotbox.height() ||
00391       (overlap * 2 < ibox.width() && overlap < dotbox.width()))
00392     return false;
00393 
00394   // If the i is tall and thin then it is good.
00395   if (ibox.height() > ibox.width() * 2)
00396     return true;  // The i or ! must be tall and thin.
00397 
00398   // It might still be tall and thin, but it might be joined to something.
00399   // So search the outline for a piece of large height close to the edges
00400   // of the dot.
00401   const double kHeightFraction = 0.6;
00402   double target_height = MIN(dotbox.bottom(), ibox.top());
00403   target_height -= row->line_m()*dotbox.left() + row->line_c();
00404   target_height *= kHeightFraction;
00405   int left_min = dotbox.left() - dotbox.width();
00406   int middle = (dotbox.left() + dotbox.right())/2;
00407   int right_max = dotbox.right() + dotbox.width();
00408   int left_miny = 0;
00409   int left_maxy = 0;
00410   int right_miny = 0;
00411   int right_maxy = 0;
00412   bool found_left = false;
00413   bool found_right = false;
00414   bool in_left = false;
00415   bool in_right = false;
00416   C_BLOB* blob = i->cblob();
00417   C_OUTLINE_IT o_it = blob->out_list();
00418   for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) {
00419     C_OUTLINE* outline = o_it.data();
00420     int length = outline->pathlength();
00421     ICOORD pos = outline->start_pos();
00422     for (int step = 0; step < length; pos += outline->step(step++)) {
00423       int x = pos.x();
00424       int y = pos.y();
00425       if (x >= left_min && x < middle && !found_left) {
00426         // We are in the left part so find min and max y.
00427         if (in_left) {
00428           if (y > left_maxy) left_maxy = y;
00429           if (y < left_miny) left_miny = y;
00430         } else {
00431           left_maxy = left_miny = y;
00432           in_left = true;
00433         }
00434       } else if (in_left) {
00435         // We just left the left so look for size.
00436         if (left_maxy - left_miny > target_height) {
00437           if (found_right)
00438             return true;
00439           found_left = true;
00440         }
00441         in_left = false;
00442       }
00443       if (x <= right_max && x > middle && !found_right) {
00444         // We are in the right part so find min and max y.
00445         if (in_right) {
00446           if (y > right_maxy) right_maxy = y;
00447           if (y < right_miny) right_miny = y;
00448         } else {
00449           right_maxy = right_miny = y;
00450           in_right = true;
00451         }
00452       } else if (in_right) {
00453         // We just left the right so look for size.
00454         if (right_maxy - right_miny > target_height) {
00455           if (found_left)
00456             return true;
00457           found_right = true;
00458         }
00459         in_right = false;
00460       }
00461     }
00462   }
00463   return false;
00464 }
00465 
00466 void vigorous_noise_removal(TO_BLOCK* block) {
00467   TO_ROW_IT row_it = block->get_rows ();
00468   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00469     TO_ROW* row = row_it.data();
00470     BLOBNBOX_IT b_it = row->blob_list();
00471     // Estimate the xheight on the row.
00472     int max_height = 0;
00473     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00474       BLOBNBOX* blob = b_it.data();
00475       if (blob->bounding_box().height() > max_height)
00476         max_height = blob->bounding_box().height();
00477     }
00478     STATS hstats(0, max_height + 1);
00479     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00480       BLOBNBOX* blob = b_it.data();
00481       int height = blob->bounding_box().height();
00482       if (height >= kMinSize)
00483         hstats.add(blob->bounding_box().height(), 1);
00484     }
00485     float xheight = hstats.median();
00486     // Delete small objects.
00487     BLOBNBOX* prev = NULL;
00488     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00489       BLOBNBOX* blob = b_it.data();
00490       const TBOX& box = blob->bounding_box();
00491       if (box.height() < kNoiseSize * xheight) {
00492         // Small so delete unless it looks like an i dot.
00493         if (prev != NULL) {
00494           if (dot_of_i(blob, prev, row))
00495             continue;  // Looks OK.
00496         }
00497         if (!b_it.at_last()) {
00498           BLOBNBOX* next = b_it.data_relative(1);
00499           if (dot_of_i(blob, next, row))
00500             continue;  // Looks OK.
00501         }
00502         // It might be noise so get rid of it.
00503         if (blob->cblob() != NULL)
00504           delete blob->cblob();
00505         delete b_it.extract();
00506       } else {
00507         prev = blob;
00508       }
00509     }
00510   }
00511 }
00512 
00518 void cleanup_rows_making(                   //find lines
00519                   ICOORD page_tr,    //top right
00520                   TO_BLOCK *block,   //block to do
00521                   float gradient,    //gradient to fit
00522                   FCOORD rotation,   //for drawing
00523                   inT32 block_edge,  //edge of block
00524                   BOOL8 testing_on  //correct orientation
00525                  ) {
00526                                  //iterators
00527   BLOBNBOX_IT blob_it = &block->blobs;
00528   TO_ROW_IT row_it = block->get_rows ();
00529 
00530 #ifndef GRAPHICS_DISABLED
00531   if (textord_show_parallel_rows && testing_on) {
00532     if (to_win == NULL)
00533       create_to_win(page_tr);
00534   }
00535 #endif
00536                                  //get row coords
00537   fit_parallel_rows(block,
00538                     gradient,
00539                     rotation,
00540                     block_edge,
00541                     textord_show_parallel_rows &&testing_on);
00542   delete_non_dropout_rows(block,
00543                           gradient,
00544                           rotation,
00545                           block_edge,
00546                           textord_show_parallel_rows &&testing_on);
00547   expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on);
00548   blob_it.set_to_list (&block->blobs);
00549   row_it.set_to_list (block->get_rows ());
00550   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
00551     blob_it.add_list_after (row_it.data ()->blob_list ());
00552   //give blobs back
00553   assign_blobs_to_rows (block, &gradient, 1, FALSE, FALSE, FALSE);
00554   //now new rows must be genuine
00555   blob_it.set_to_list (&block->blobs);
00556   blob_it.add_list_after (&block->large_blobs);
00557   assign_blobs_to_rows (block, &gradient, 2, TRUE, TRUE, FALSE);
00558   //safe to use big ones now
00559   blob_it.set_to_list (&block->blobs);
00560                                  //throw all blobs in
00561   blob_it.add_list_after (&block->noise_blobs);
00562   blob_it.add_list_after (&block->small_blobs);
00563   assign_blobs_to_rows (block, &gradient, 3, FALSE, FALSE, FALSE);
00564 }
00565 
00571 void delete_non_dropout_rows(                   //find lines
00572                              TO_BLOCK *block,   //block to do
00573                              float gradient,    //global skew
00574                              FCOORD rotation,   //deskew vector
00575                              inT32 block_edge,  //left edge
00576                              BOOL8 testing_on   //correct orientation
00577                             ) {
00578   TBOX block_box;                 //deskewed block
00579   inT32 *deltas;                 //change in occupation
00580   inT32 *occupation;             //of pixel coords
00581   inT32 max_y;                   //in block
00582   inT32 min_y;
00583   inT32 line_index;              //of scan line
00584   inT32 line_count;              //no of scan lines
00585   inT32 distance;                //to drop-out
00586   inT32 xleft;                   //of block
00587   inT32 ybottom;                 //of block
00588   TO_ROW *row;                   //current row
00589   TO_ROW_IT row_it = block->get_rows ();
00590   BLOBNBOX_IT blob_it = &block->blobs;
00591 
00592   if (row_it.length () == 0)
00593     return;                      //empty block
00594   block_box = deskew_block_coords (block, gradient);
00595   xleft = block->block->bounding_box ().left ();
00596   ybottom = block->block->bounding_box ().bottom ();
00597   min_y = block_box.bottom () - 1;
00598   max_y = block_box.top () + 1;
00599   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00600     line_index = (inT32) floor (row_it.data ()->intercept ());
00601     if (line_index <= min_y)
00602       min_y = line_index - 1;
00603     if (line_index >= max_y)
00604       max_y = line_index + 1;
00605   }
00606   line_count = max_y - min_y + 1;
00607   if (line_count <= 0)
00608     return;                      //empty block
00609   deltas = (inT32 *) alloc_mem (line_count * sizeof (inT32));
00610   occupation = (inT32 *) alloc_mem (line_count * sizeof (inT32));
00611   if (deltas == NULL || occupation == NULL)
00612     MEMORY_OUT.error ("compute_line_spacing", ABORT, NULL);
00613 
00614   compute_line_occupation(block, gradient, min_y, max_y, occupation, deltas);
00615   compute_occupation_threshold ((inT32)
00616     ceil (block->line_spacing *
00617     (tesseract::CCStruct::kDescenderFraction +
00618     tesseract::CCStruct::kAscenderFraction)),
00619     (inT32) ceil (block->line_spacing *
00620     (tesseract::CCStruct::kXHeightFraction +
00621     tesseract::CCStruct::kAscenderFraction)),
00622     max_y - min_y + 1, occupation, deltas);
00623 #ifndef GRAPHICS_DISABLED
00624   if (testing_on) {
00625     draw_occupation(xleft, ybottom, min_y, max_y, occupation, deltas);
00626   }
00627 #endif
00628   compute_dropout_distances(occupation, deltas, line_count);
00629   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00630     row = row_it.data ();
00631     line_index = (inT32) floor (row->intercept ());
00632     distance = deltas[line_index - min_y];
00633     if (find_best_dropout_row (row, distance, block->line_spacing / 2,
00634     line_index, &row_it, testing_on)) {
00635 #ifndef GRAPHICS_DISABLED
00636       if (testing_on)
00637         plot_parallel_row(row, gradient, block_edge,
00638                           ScrollView::WHITE, rotation);
00639 #endif
00640       blob_it.add_list_after (row_it.data ()->blob_list ());
00641       delete row_it.extract ();  //too far away
00642     }
00643   }
00644   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00645     blob_it.add_list_after (row_it.data ()->blob_list ());
00646   }
00647 
00648   free_mem(deltas);
00649   free_mem(occupation);
00650 }
00651 
00652 
00659 BOOL8 find_best_dropout_row(                    //find neighbours
00660                             TO_ROW *row,        //row to test
00661                             inT32 distance,     //dropout dist
00662                             float dist_limit,   //threshold distance
00663                             inT32 line_index,   //index of row
00664                             TO_ROW_IT *row_it,  //current position
00665                             BOOL8 testing_on    //correct orientation
00666                            ) {
00667   inT32 next_index;              //of neigbouring row
00668   inT32 row_offset;              //from current row
00669   inT32 abs_dist;                //absolute distance
00670   inT8 row_inc;                  //increment to row_index
00671   TO_ROW *next_row;              //nextious row
00672 
00673   if (testing_on)
00674     tprintf ("Row at %g(%g), dropout dist=%d,",
00675       row->intercept (), row->parallel_c (), distance);
00676   if (distance < 0) {
00677     row_inc = 1;
00678     abs_dist = -distance;
00679   }
00680   else {
00681     row_inc = -1;
00682     abs_dist = distance;
00683   }
00684   if (abs_dist > dist_limit) {
00685     if (testing_on) {
00686       tprintf (" too far - deleting\n");
00687     }
00688     return TRUE;
00689   }
00690   if ((distance < 0 && !row_it->at_last ())
00691   || (distance >= 0 && !row_it->at_first ())) {
00692     row_offset = row_inc;
00693     do {
00694       next_row = row_it->data_relative (row_offset);
00695       next_index = (inT32) floor (next_row->intercept ());
00696       if ((distance < 0
00697         && next_index < line_index
00698         && next_index > line_index + distance + distance)
00699         || (distance >= 0
00700         && next_index > line_index
00701       && next_index < line_index + distance + distance)) {
00702         if (testing_on) {
00703           tprintf (" nearer neighbour (%d) at %g\n",
00704             line_index + distance - next_index,
00705             next_row->intercept ());
00706         }
00707         return TRUE;             //other is nearer
00708       }
00709       else if (next_index == line_index
00710       || next_index == line_index + distance + distance) {
00711         if (row->believability () <= next_row->believability ()) {
00712           if (testing_on) {
00713             tprintf (" equal but more believable at %g (%g/%g)\n",
00714               next_row->intercept (),
00715               row->believability (),
00716               next_row->believability ());
00717           }
00718           return TRUE;           //other is more believable
00719         }
00720       }
00721       row_offset += row_inc;
00722     }
00723     while ((next_index == line_index
00724       || next_index == line_index + distance + distance)
00725       && row_offset < row_it->length ());
00726     if (testing_on)
00727       tprintf (" keeping\n");
00728   }
00729   return FALSE;
00730 }
00731 
00732 
00739 TBOX deskew_block_coords(                  //block box
00740                         TO_BLOCK *block,  //block to do
00741                         float gradient    //global skew
00742                        ) {
00743   TBOX result;                    //block bounds
00744   TBOX blob_box;                  //of block
00745   FCOORD rotation;               //deskew vector
00746   float length;                  //of gradient vector
00747   TO_ROW_IT row_it = block->get_rows ();
00748   TO_ROW *row;                   //current row
00749   BLOBNBOX *blob;                //current blob
00750   BLOBNBOX_IT blob_it;           //iterator
00751 
00752   length = sqrt (gradient * gradient + 1);
00753   rotation = FCOORD (1 / length, -gradient / length);
00754   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00755     row = row_it.data ();
00756     blob_it.set_to_list (row->blob_list ());
00757     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00758     blob_it.forward ()) {
00759       blob = blob_it.data ();
00760       blob_box = blob->bounding_box ();
00761       blob_box.rotate (rotation);//de-skew it
00762       result += blob_box;
00763     }
00764   }
00765   return result;
00766 }
00767 
00768 
00775 void compute_line_occupation(                    //project blobs
00776                              TO_BLOCK *block,    //block to do
00777                              float gradient,     //global skew
00778                              inT32 min_y,        //min coord in block
00779                              inT32 max_y,        //in block
00780                              inT32 *occupation,  //output projection
00781                              inT32 *deltas       //derivative
00782                             ) {
00783   inT32 line_count;              //maxy-miny+1
00784   inT32 line_index;              //of scan line
00785   int index;                     //array index for daft compilers
00786   float top, bottom;             //coords of blob
00787   inT32 width;                   //of blob
00788   TO_ROW *row;                   //current row
00789   TO_ROW_IT row_it = block->get_rows ();
00790   BLOBNBOX *blob;                //current blob
00791   BLOBNBOX_IT blob_it;           //iterator
00792   float length;                  //of skew vector
00793   TBOX blob_box;                  //bounding box
00794   FCOORD rotation;               //inverse of skew
00795 
00796   line_count = max_y - min_y + 1;
00797   length = sqrt (gradient * gradient + 1);
00798   rotation = FCOORD (1 / length, -gradient / length);
00799   for (line_index = 0; line_index < line_count; line_index++)
00800     deltas[line_index] = 0;
00801   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00802     row = row_it.data ();
00803     blob_it.set_to_list (row->blob_list ());
00804     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00805     blob_it.forward ()) {
00806       blob = blob_it.data ();
00807       blob_box = blob->bounding_box ();
00808       blob_box.rotate (rotation);//de-skew it
00809       top = blob_box.top ();
00810       bottom = blob_box.bottom ();
00811       width =
00812         (inT32) floor ((FLOAT32) (blob_box.right () - blob_box.left ()));
00813       if ((inT32) floor (bottom) < min_y
00814         || (inT32) floor (bottom) - min_y >= line_count)
00815         fprintf (stderr,
00816           "Bad y coord of bottom, " INT32FORMAT "(" INT32FORMAT ","
00817           INT32FORMAT ")\n", (inT32) floor (bottom), min_y, max_y);
00818                                  //count transitions
00819       index = (inT32) floor (bottom) - min_y;
00820       deltas[index] += width;
00821       if ((inT32) floor (top) < min_y
00822         || (inT32) floor (top) - min_y >= line_count)
00823         fprintf (stderr,
00824           "Bad y coord of top, " INT32FORMAT "(" INT32FORMAT ","
00825           INT32FORMAT ")\n", (inT32) floor (top), min_y, max_y);
00826       index = (inT32) floor (top) - min_y;
00827       deltas[index] -= width;
00828     }
00829   }
00830   occupation[0] = deltas[0];
00831   for (line_index = 1; line_index < line_count; line_index++)
00832     occupation[line_index] = occupation[line_index - 1] + deltas[line_index];
00833 }
00834 
00835 
// Computes a threshold for every scan line from the occupation projection.
// Each threshold is (sum - min_occ) / divisor + min_occ, where sum and
// min_occ are the total and the minimum of occupation[] over a window of
// low_window + high_window entries, and
// divisor = ceil((low_window + high_window) / textord_occupancy_threshold).
// When the window is not smaller than line_count, a single threshold
// computed over the whole array is written to every entry instead.
// NOTE(review): divisor is 0 when low_window + high_window is 0, which would
// make the integer divisions below divide by zero - confirm callers never
// pass that.
00841 void compute_occupation_threshold(                    //project blobs
00842                                   inT32 low_window,   //below result point
00843                                   inT32 high_window,  //above result point
00844                                   inT32 line_count,   //array sizes
00845                                   inT32 *occupation,  //input projection
00846                                   inT32 *thresholds   //output thresholds
00847                                  ) {
00848   inT32 line_index;              //of thresholds line
00849   inT32 low_index;               //in occupation
00850   inT32 high_index;              //in occupation
00851   inT32 sum;                     //current average
00852   inT32 divisor;                 //to get thresholds
00853   inT32 min_index;               //of min occ
00854   inT32 min_occ;                 //min in locality
00855   inT32 test_index;              //for finding min
00856 
00857   divisor =
00858     (inT32) ceil ((low_window + high_window) / textord_occupancy_threshold);
00859   if (low_window + high_window < line_count) {
        // Prime sum with the first low_window + high_window entries; on exit
        // high_index is one past the end of that initial window.
00860     for (sum = 0, high_index = 0; high_index < low_window; high_index++)
00861       sum += occupation[high_index];
00862     for (low_index = 0; low_index < high_window; low_index++, high_index++)
00863       sum += occupation[high_index];
        // Minimum of the initial window; <= keeps the highest index on ties.
00864     min_occ = occupation[0];
00865     min_index = 0;
00866     for (test_index = 1; test_index < high_index; test_index++) {
00867       if (occupation[test_index] <= min_occ) {
00868         min_occ = occupation[test_index];
00869         min_index = test_index;  //find min in region
00870       }
00871     }
        // The first low_window lines all share the initial window's threshold.
00872     for (line_index = 0; line_index < low_window; line_index++)
00873       thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
00874     //same out to end
        // Slide the window up one entry per output line: drop
        // occupation[low_index], take in occupation[high_index].
00875     for (low_index = 0; high_index < line_count; low_index++, high_index++) {
00876       sum -= occupation[low_index];
00877       sum += occupation[high_index];
00878       if (occupation[high_index] <= min_occ) {
00879                                  //find min in region
00880         min_occ = occupation[high_index];
00881         min_index = high_index;
00882       }
00883                                  //lost min from region
00884       if (min_index <= low_index) {
            // The old minimum just left the window: rescan the entries
            // low_index + 1 .. high_index for the new one.
00885         min_occ = occupation[low_index + 1];
00886         min_index = low_index + 1;
00887         for (test_index = low_index + 2; test_index <= high_index;
00888         test_index++) {
00889           if (occupation[test_index] <= min_occ) {
00890             min_occ = occupation[test_index];
00891                                  //find min in region
00892             min_index = test_index;
00893           }
00894         }
00895       }
00896       thresholds[line_index++] = (sum - min_occ) / divisor + min_occ;
00897     }
00898   }
00899   else {
        // Window covers the whole array: take sum and minimum of everything.
        // min_index is assigned here but not read again on this path.
00900     min_occ = occupation[0];
00901     min_index = 0;
00902     for (sum = 0, low_index = 0; low_index < line_count; low_index++) {
00903       if (occupation[low_index] < min_occ) {
00904         min_occ = occupation[low_index];
00905         min_index = low_index;
00906       }
00907       sum += occupation[low_index];
00908     }
00909     line_index = 0;
00910   }
      // Every line not yet written reuses the last sum and minimum.
00911   for (; line_index < line_count; line_index++)
00912     thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
00913   //same out to end
00914 }
00915 
00916 
// Overwrites thresholds[] in place with signed line distances.
// A "crossing" is a line where occupation reaches its threshold
// (occupation >= threshold) while the previous line was below its own
// original threshold.  Each crossing line receives 0 and the lines after it
// receive -1, -2, ... counting away from it.  Lines leading up to a crossing
// are then back-filled with 1, 2, ... (distance to the crossing ahead) for as
// long as that positive distance is smaller than the magnitude of the
// negative value already stored.  Before the first crossing the negative
// count starts from -line_count, so those entries are at most
// -(line_count + 1) unless back-filled.
// occupation[] is only read.  Both arrays hold line_count entries.
00922 void compute_dropout_distances(                    //project blobs
00923                                inT32 *occupation,  //input projection
00924                                inT32 *thresholds,  //output thresholds
00925                                inT32 line_count    //array sizes
00926                               ) {
00927   inT32 line_index;              //of thresholds line
00928   inT32 distance;                //from prev dropout
00929   inT32 next_dist;               //to next dropout
00930   inT32 back_index;              //for back filling
00931   inT32 prev_threshold;          //before overwrite
00932 
00933   distance = -line_count;
00934   line_index = 0;
00935   do {
00936     do {
00937       distance--;
          // Save the original threshold first: the loop condition below still
          // needs it for the previous line after the slot has been reused.
00938       prev_threshold = thresholds[line_index];
00939                                  //distance from prev
00940       thresholds[line_index] = distance;
00941       line_index++;
00942     }
        // Keep going until line_index is a crossing: at or above its
        // threshold while the line before it was below its own.
00943     while (line_index < line_count
00944       && (occupation[line_index] < thresholds[line_index]
00945       || occupation[line_index - 1] >= prev_threshold));
00946     if (line_index < line_count) {
          // Walk back from the crossing, replacing negative distances with
          // the (smaller) positive distance to this crossing.
00947       back_index = line_index - 1;
00948       next_dist = 1;
00949       while (next_dist < -distance && back_index >= 0) {
00950         thresholds[back_index] = next_dist;
00951         back_index--;
00952         next_dist++;
00953         distance++;
00954       }
          // Pre-set so that the decrement at the top of the inner loop stores
          // 0 on the crossing line itself.
00955       distance = 1;
00956     }
00957   }
00958   while (line_index < line_count);
00959 }
00960 
00961 
// Re-fits the rows of a block and then widens each row's y limits.
//
// Sequence: adjust_row_limits(), optionally compute_row_stats() (before or
// after the fit depending on textord_new_initial_xheight),
// assign_blobs_to_rows(), fit_parallel_rows(), then one pass over the rows
// starting at the last list entry and stepping backward.
//
// For each row the allowed band is
//   y_bottom = intercept - line_size * textord_expansion_factor * kDescenderFraction
//   y_top    = intercept + line_size * textord_expansion_factor *
//                          (kXHeightFraction + kAscenderFraction).
// If the row's current limits are inside that band they are pushed out to it.
// A neighbouring row lying wholly inside the band has its blob list appended
// to this row's and is deleted; a neighbour that only reaches into the band
// shortens or cancels the expansion.
//
// Returns early, without expanding, whenever the block has no rows.
// page_tr, rotation and block_edge are only used for the debug window and
// plotting, plus being forwarded to fit_parallel_rows(); testing_on together
// with textord_show_expanded_rows enables debug output.
00969 void expand_rows(                   //find lines
00970                  ICOORD page_tr,    //top right
00971                  TO_BLOCK *block,   //block to do
00972                  float gradient,    //gradient to fit
00973                  FCOORD rotation,   //for drawing
00974                  inT32 block_edge,  //edge of block
00975                  BOOL8 testing_on   //correct orientation
00976                 ) {
00977   BOOL8 swallowed_row;           //eaten a neighbour
00978   float y_max, y_min;            //new row limits
00979   float y_bottom, y_top;         //allowed limits
00980   TO_ROW *test_row;              //next row
00981   TO_ROW *row;                   //current row
00982                                  //iterators
00983   BLOBNBOX_IT blob_it = &block->blobs;
00984   TO_ROW_IT row_it = block->get_rows ();
00985 
00986 #ifndef GRAPHICS_DISABLED
00987   if (textord_show_expanded_rows && testing_on) {
00988     if (to_win == NULL)
00989       create_to_win(page_tr);
00990   }
00991 #endif
00992 
00993   adjust_row_limits(block);  //shift min,max.
00994   if (textord_new_initial_xheight) {
00995     if (block->get_rows ()->length () == 0)
00996       return;
00997     compute_row_stats(block, textord_show_expanded_rows &&testing_on);
00998   }
00999   assign_blobs_to_rows (block, &gradient, 4, TRUE, FALSE, FALSE);
01000   //get real membership
01001   if (block->get_rows ()->length () == 0)
01002     return;
01003   fit_parallel_rows(block,
01004                     gradient,
01005                     rotation,
01006                     block_edge,
01007                     textord_show_expanded_rows &&testing_on);
01008   if (!textord_new_initial_xheight)
01009     compute_row_stats(block, textord_show_expanded_rows &&testing_on);
      // Visit every row once: start at the last entry and step backward
      // until the iterator wraps round to the last entry again.
01010   row_it.move_to_last ();
01011   do {
01012     row = row_it.data ();
01013     y_max = row->max_y ();       //get current limits
01014     y_min = row->min_y ();
01015     y_bottom = row->intercept () - block->line_size * textord_expansion_factor *
01016       tesseract::CCStruct::kDescenderFraction;
01017     y_top = row->intercept () + block->line_size * textord_expansion_factor *
01018         (tesseract::CCStruct::kXHeightFraction +
01019          tesseract::CCStruct::kAscenderFraction);
01020     if (y_min > y_bottom) {      //expansion allowed
01021       if (textord_show_expanded_rows && testing_on)
01022         tprintf("Expanding bottom of row at %f from %f to %f\n",
01023                 row->intercept(), y_min, y_bottom);
01024                                  //expandable
          // Repeat while a row was swallowed, since the entry after it may
          // now also fall inside the band.
01025       swallowed_row = TRUE;
01026       while (swallowed_row && !row_it.at_last ()) {
01027         swallowed_row = FALSE;
01028                                  //get next one
01029         test_row = row_it.data_relative (1);
01030                                  //overlaps space
01031         if (test_row->max_y () > y_bottom) {
01032           if (test_row->min_y () > y_bottom) {
01033             if (textord_show_expanded_rows && testing_on)
01034               tprintf("Eating row below at %f\n", test_row->intercept());
                // Step onto the neighbour so it can be extracted, then step
                // back to the current row afterwards.
01035             row_it.forward ();
01036 #ifndef GRAPHICS_DISABLED
01037             if (textord_show_expanded_rows && testing_on)
01038               plot_parallel_row(test_row,
01039                                 gradient,
01040                                 block_edge,
01041                                 ScrollView::WHITE,
01042                                 rotation);
01043 #endif
01044             blob_it.set_to_list (row->blob_list ());
01045             blob_it.add_list_after (test_row->blob_list ());
01046                                  //swallow complete row
01047             delete row_it.extract ();
01048             row_it.backward ();
01049             swallowed_row = TRUE;
01050           }
01051           else if (test_row->max_y () < y_min) {
01052                                  //shorter limit
01053             y_bottom = test_row->max_y ();
01054             if (textord_show_expanded_rows && testing_on)
01055               tprintf("Truncating limit to %f due to touching row at %f\n",
01056                       y_bottom, test_row->intercept());
01057           }
01058           else {
01059             y_bottom = y_min;    //can't expand it
01060             if (textord_show_expanded_rows && testing_on)
01061               tprintf("Not expanding limit beyond %f due to touching row at %f\n",
01062                       y_bottom, test_row->intercept());
01063           }
01064         }
01065       }
01066       y_min = y_bottom;          //expand it
01067     }
01068     if (y_max < y_top) {         //expansion allowed
01069       if (textord_show_expanded_rows && testing_on)
01070         tprintf("Expanding top of row at %f from %f to %f\n",
01071                 row->intercept(), y_max, y_top);
01072       swallowed_row = TRUE;
01073       while (swallowed_row && !row_it.at_first ()) {
01074         swallowed_row = FALSE;
01075                                  //get one above
01076         test_row = row_it.data_relative (-1);
01077         if (test_row->min_y () < y_top) {
01078           if (test_row->max_y () < y_top) {
01079             if (textord_show_expanded_rows && testing_on)
01080               tprintf("Eating row above at %f\n", test_row->intercept());
                // Mirror of the bottom case: step back onto the neighbour,
                // extract it, then step forward to the current row.
01081             row_it.backward ();
01082             blob_it.set_to_list (row->blob_list ());
01083 #ifndef GRAPHICS_DISABLED
01084             if (textord_show_expanded_rows && testing_on)
01085               plot_parallel_row(test_row,
01086                                 gradient,
01087                                 block_edge,
01088                                 ScrollView::WHITE,
01089                                 rotation);
01090 #endif
01091             blob_it.add_list_after (test_row->blob_list ());
01092                                  //swallow complete row
01093             delete row_it.extract ();
01094             row_it.forward ();
01095             swallowed_row = TRUE;
01096           }
              // NOTE(review): the bottom-side counterpart of this test is
              // "test_row->max_y () < y_min" (neighbour clear of the current
              // limit); the mirrored test here would be
              // "test_row->min_y () > y_max", but the code uses "< y_max".
              // Confirm whether the asymmetry is intended before changing it.
01097           else if (test_row->min_y () < y_max) {
01098                                  //shorter limit
01099             y_top = test_row->min_y ();
01100             if (textord_show_expanded_rows && testing_on)
01101               tprintf("Truncating limit to %f due to touching row at %f\n",
01102                       y_top, test_row->intercept());
01103           }
01104           else {
01105             y_top = y_max;       //can't expand it
01106             if (textord_show_expanded_rows && testing_on)
01107               tprintf("Not expanding limit beyond %f due to touching row at %f\n",
01108                       y_top, test_row->intercept());
01109           }
01110         }
01111       }
01112       y_max = y_top;
01113     }
01114                                  //new limits
01115     row->set_limits (y_min, y_max);
01116     row_it.backward ();
01117   }
01118   while (!row_it.at_last ());
01119 }
01120 
01121 
01127 void adjust_row_limits(                 //tidy limits
01128                        TO_BLOCK *block  //block to do
01129                       ) {
01130   TO_ROW *row;                   //current row
01131   float size;                    //size of row
01132   float ymax;                    //top of row
01133   float ymin;                    //bottom of row
01134   TO_ROW_IT row_it = block->get_rows ();
01135 
01136   if (textord_show_expanded_rows)
01137     tprintf("Adjusting row limits for block(%d,%d)\n",
01138             block->block->bounding_box().left(),
01139             block->block->bounding_box().top());
01140   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01141     row = row_it.data ();
01142     size = row->max_y () - row->min_y ();
01143     if (textord_show_expanded_rows)
01144       tprintf("Row at %f has min %f, max %f, size %f\n",
01145               row->intercept(), row->min_y(), row->max_y(), size);
01146     size /= tesseract::CCStruct::kXHeightFraction +
01147         tesseract::CCStruct::kAscenderFraction +
01148         tesseract::CCStruct::kDescenderFraction;
01149     ymax = size * (tesseract::CCStruct::kXHeightFraction +
01150                    tesseract::CCStruct::kAscenderFraction);
01151     ymin = -size * tesseract::CCStruct::kDescenderFraction;
01152     row->set_limits (row->intercept () + ymin, row->intercept () + ymax);
01153     row->merged = FALSE;
01154   }
01155 }
01156 
01157 
01163 void compute_row_stats(                  //find lines
01164                        TO_BLOCK *block,  //block to do
01165                        BOOL8 testing_on  //correct orientation
01166                       ) {
01167   inT32 row_index;               //of median
01168   TO_ROW *row;                   //current row
01169   TO_ROW *prev_row;              //previous row
01170   float iqr;                     //inter quartile range
01171   TO_ROW_IT row_it = block->get_rows ();
01172                                  //number of rows
01173   inT16 rowcount = row_it.length ();
01174   TO_ROW **rows;                 //for choose nth
01175 
01176   rows = (TO_ROW **) alloc_mem (rowcount * sizeof (TO_ROW *));
01177   if (rows == NULL)
01178     MEMORY_OUT.error ("compute_row_stats", ABORT, NULL);
01179   rowcount = 0;
01180   prev_row = NULL;
01181   row_it.move_to_last ();        //start at bottom
01182   do {
01183     row = row_it.data ();
01184     if (prev_row != NULL) {
01185       rows[rowcount++] = prev_row;
01186       prev_row->spacing = row->intercept () - prev_row->intercept ();
01187       if (testing_on)
01188         tprintf ("Row at %g yields spacing of %g\n",
01189           row->intercept (), prev_row->spacing);
01190     }
01191     prev_row = row;
01192     row_it.backward ();
01193   }
01194   while (!row_it.at_last ());
01195   block->key_row = prev_row;
01196   block->baseline_offset =
01197     fmod (prev_row->parallel_c (), block->line_spacing);
01198   if (testing_on)
01199     tprintf ("Blob based spacing=(%g,%g), offset=%g",
01200       block->line_size, block->line_spacing, block->baseline_offset);
01201   if (rowcount > 0) {
01202     row_index = choose_nth_item (rowcount * 3 / 4, rows, rowcount,
01203       sizeof (TO_ROW *), row_spacing_order);
01204     iqr = rows[row_index]->spacing;
01205     row_index = choose_nth_item (rowcount / 4, rows, rowcount,
01206       sizeof (TO_ROW *), row_spacing_order);
01207     iqr -= rows[row_index]->spacing;
01208     row_index = choose_nth_item (rowcount / 2, rows, rowcount,
01209       sizeof (TO_ROW *), row_spacing_order);
01210     block->key_row = rows[row_index];
01211     if (testing_on)
01212       tprintf (" row based=%g(%g)", rows[row_index]->spacing, iqr);
01213     if (rowcount > 2
01214     && iqr < rows[row_index]->spacing * textord_linespace_iqrlimit) {
01215       if (!textord_new_initial_xheight) {
01216         if (rows[row_index]->spacing < block->line_spacing
01217           && rows[row_index]->spacing > block->line_size)
01218           //within range
01219           block->line_size = rows[row_index]->spacing;
01220         //spacing=size
01221         else if (rows[row_index]->spacing > block->line_spacing)
01222           block->line_size = block->line_spacing;
01223         //too big so use max
01224       }
01225       else {
01226         if (rows[row_index]->spacing < block->line_spacing)
01227           block->line_size = rows[row_index]->spacing;
01228         else
01229           block->line_size = block->line_spacing;
01230         //too big so use max
01231       }
01232       if (block->line_size < textord_min_xheight)
01233         block->line_size = (float) textord_min_xheight;
01234       block->line_spacing = rows[row_index]->spacing;
01235       block->max_blob_size =
01236         block->line_spacing * textord_excess_blobsize;
01237     }
01238     block->baseline_offset = fmod (rows[row_index]->intercept (),
01239       block->line_spacing);
01240   }
01241   if (testing_on)
01242     tprintf ("\nEstimate line size=%g, spacing=%g, offset=%g\n",
01243       block->line_size, block->line_spacing, block->baseline_offset);
01244   free_mem(rows);
01245 }
01246 
01247 
01277 namespace tesseract {
01278 void Textord::compute_block_xheight(TO_BLOCK *block, float gradient) {
01279   TO_ROW *row;                          // current row
01280   float asc_frac_xheight = CCStruct::kAscenderFraction /
01281       CCStruct::kXHeightFraction;
01282   float desc_frac_xheight = CCStruct::kDescenderFraction /
01283       CCStruct::kXHeightFraction;
01284   inT32 min_height, max_height;         // limits on xheight
01285   TO_ROW_IT row_it = block->get_rows();
01286   if (row_it.empty()) return;  // no rows
01287 
01288   // Compute the best guess of xheight of each row individually.
01289   // Use xheight and ascrise values of the rows where ascenders were found.
01290   get_min_max_xheight(block->line_size, &min_height, &max_height);
01291   STATS row_asc_xheights(min_height, max_height + 1);
01292   STATS row_asc_ascrise(static_cast<int>(min_height * asc_frac_xheight),
01293                         static_cast<int>(max_height * asc_frac_xheight) + 1);
01294   int min_desc_height = static_cast<int>(min_height * desc_frac_xheight);
01295   int max_desc_height = static_cast<int>(max_height * desc_frac_xheight);
01296   STATS row_asc_descdrop(min_desc_height, max_desc_height + 1);
01297   STATS row_desc_xheights(min_height, max_height + 1);
01298   STATS row_desc_descdrop(min_desc_height, max_desc_height + 1);
01299   STATS row_cap_xheights(min_height, max_height + 1);
01300   STATS row_cap_floating_xheights(min_height, max_height + 1);
01301   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
01302     row = row_it.data();
01303     // Compute the xheight of this row if it has not been computed before.
01304     if (row->xheight <= 0.0) {
01305       compute_row_xheight(row, block->block->classify_rotation(),
01306                           gradient, block->line_size);
01307     }
01308     ROW_CATEGORY row_category = get_row_category(row);
01309     if (row_category == ROW_ASCENDERS_FOUND) {
01310       row_asc_xheights.add(static_cast<inT32>(row->xheight),
01311                            row->xheight_evidence);
01312       row_asc_ascrise.add(static_cast<inT32>(row->ascrise),
01313                           row->xheight_evidence);
01314       row_asc_descdrop.add(static_cast<inT32>(-row->descdrop),
01315                            row->xheight_evidence);
01316     } else if (row_category == ROW_DESCENDERS_FOUND) {
01317       row_desc_xheights.add(static_cast<inT32>(row->xheight),
01318                             row->xheight_evidence);
01319       row_desc_descdrop.add(static_cast<inT32>(-row->descdrop),
01320                             row->xheight_evidence);
01321     } else if (row_category == ROW_UNKNOWN) {
01322       fill_heights(row, gradient, min_height, max_height,
01323                    &row_cap_xheights, &row_cap_floating_xheights);
01324     }
01325   }
01326 
01327   float xheight = 0.0;
01328   float ascrise = 0.0;
01329   float descdrop = 0.0;
01330   // Compute our best guess of xheight of this block.
01331   if (row_asc_xheights.get_total() > 0) {
01332     // Determine xheight from rows where ascenders were found.
01333     xheight = row_asc_xheights.median();
01334     ascrise = row_asc_ascrise.median();
01335     descdrop = -row_asc_descdrop.median();
01336   } else if (row_desc_xheights.get_total() > 0) {
01337     // Determine xheight from rows where descenders were found.
01338     xheight = row_desc_xheights.median();
01339     descdrop = -row_desc_descdrop.median();
01340   } else if (row_cap_xheights.get_total() > 0) {
01341     // All the rows in the block were (a/de)scenderless.
01342     // Try to search for two modes in row_cap_heights that could
01343     // be the xheight and the capheight (e.g. some of the rows
01344     // were lowercase, but did not have enough (a/de)scenders.
01345     // If such two modes can not be found, this block is most
01346     // likely all caps (or all small caps, in which case the code
01347     // still works as intended).
01348     compute_xheight_from_modes(&row_cap_xheights, &row_cap_floating_xheights,
01349                                textord_single_height_mode &&
01350                                block->block->classify_rotation().y() == 0.0,
01351                                min_height, max_height, &(xheight), &(ascrise));
01352     if (ascrise == 0) {  // assume only caps in the whole block
01353       xheight = row_cap_xheights.median() * CCStruct::kXHeightCapRatio;
01354     }
01355   } else {  // default block sizes
01356     xheight = block->line_size * CCStruct::kXHeightFraction;
01357   }
01358   // Correct xheight, ascrise and descdrop if necessary.
01359   bool corrected_xheight = false;
01360   if (xheight < textord_min_xheight) {
01361     xheight = static_cast<float>(textord_min_xheight);
01362     corrected_xheight = true;
01363   }
01364   if (corrected_xheight || ascrise <= 0.0) {
01365     ascrise = xheight * asc_frac_xheight;
01366   }
01367   if (corrected_xheight || descdrop >= 0.0) {
01368     descdrop = -(xheight * desc_frac_xheight);
01369   }
01370   block->xheight = xheight;
01371 
01372   if (textord_debug_xheights) {
01373     tprintf("Block average xheight=%.4f, ascrise=%.4f, descdrop=%.4f\n",
01374             xheight, ascrise, descdrop);
01375   }
01376   // Correct xheight, ascrise, descdrop of rows based on block averages.
01377   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
01378     correct_row_xheight(row_it.data(), xheight, ascrise, descdrop);
01379   }
01380 }
01381 
01390 void Textord::compute_row_xheight(TO_ROW *row,          // row to do
01391                                   const FCOORD& rotation,
01392                                   float gradient,       // global skew
01393                                   int block_line_size) {
01394   // Find blobs representing repeated characters in rows and mark them.
01395   // This information is used for computing row xheight and at a later
01396   // stage when words are formed by make_words.
01397   if (!row->rep_chars_marked()) {
01398     mark_repeated_chars(row);
01399   }
01400 
01401   int min_height, max_height;
01402   get_min_max_xheight(block_line_size, &min_height, &max_height);
01403   STATS heights(min_height, max_height + 1);
01404   STATS floating_heights(min_height, max_height + 1);
01405   fill_heights(row, gradient, min_height, max_height,
01406                &heights, &floating_heights);
01407   row->ascrise = 0.0f;
01408   row->xheight = 0.0f;
01409   row->xheight_evidence =
01410     compute_xheight_from_modes(&heights, &floating_heights,
01411                                textord_single_height_mode &&
01412                                rotation.y() == 0.0,
01413                                min_height, max_height,
01414                                &(row->xheight), &(row->ascrise));
01415   row->descdrop = 0.0f;
01416   if (row->xheight > 0.0) {
01417     row->descdrop = static_cast<float>(
01418         compute_row_descdrop(row, gradient, row->xheight_evidence, &heights));
01419   }
01420 }
01421 
01422 }  // namespace tesseract.
01423 
01430 void fill_heights(TO_ROW *row, float gradient, int min_height,
01431                   int max_height, STATS *heights, STATS *floating_heights) {
01432   float xcentre;                 // centre of blob
01433   float top;                     // top y coord of blob
01434   float height;                  // height of blob
01435   BLOBNBOX *blob;                // current blob
01436   int repeated_set;
01437   BLOBNBOX_IT blob_it = row->blob_list();
01438   if (blob_it.empty()) return;  // no blobs in this row
01439   bool has_rep_chars =
01440     row->rep_chars_marked() && row->num_repeated_sets() > 0;
01441   do {
01442     blob = blob_it.data();
01443     if (!blob->joined_to_prev()) {
01444       xcentre = (blob->bounding_box().left() +
01445                  blob->bounding_box().right()) / 2.0f;
01446       top = blob->bounding_box().top();
01447       height = blob->bounding_box().height();
01448       if (textord_fix_xheight_bug)
01449         top -= row->baseline.y(xcentre);
01450       else
01451         top -= gradient * xcentre + row->parallel_c();
01452       if (top >= min_height && top <= max_height) {
01453         heights->add(static_cast<inT32>(floor(top + 0.5)), 1);
01454         if (height / top < textord_min_blob_height_fraction) {
01455           floating_heights->add(static_cast<inT32>(floor(top + 0.5)), 1);
01456         }
01457       }
01458     }
01459     // Skip repeated chars, since they are likely to skew the height stats.
01460     if (has_rep_chars && blob->repeated_set() != 0) {
01461       repeated_set = blob->repeated_set();
01462       blob_it.forward();
01463       while (!blob_it.at_first() &&
01464              blob_it.data()->repeated_set() == repeated_set) {
01465         blob_it.forward();
01466         if (textord_debug_xheights)
01467           tprintf("Skipping repeated char when computing xheight\n");
01468       }
01469     } else {
01470       blob_it.forward();
01471     }
01472   } while (!blob_it.at_first());
01473 }
01474 
01491 int compute_xheight_from_modes(
01492     STATS *heights, STATS *floating_heights, bool cap_only, int min_height,
01493     int max_height, float *xheight, float *ascrise) {
01494   int blob_index = heights->mode();  // find mode
01495   int blob_count = heights->pile_count(blob_index);  // get count of mode
01496   if (textord_debug_xheights) {
01497     tprintf("min_height=%d, max_height=%d, mode=%d, count=%d, total=%d\n",
01498             min_height, max_height, blob_index, blob_count,
01499             heights->get_total());
01500     heights->print();
01501     floating_heights->print();
01502   }
01503   if (blob_count == 0) return 0;
01504   int modes[MAX_HEIGHT_MODES];  // biggest piles
01505   bool in_best_pile = FALSE;
01506   int prev_size = -MAX_INT32;
01507   int best_count = 0;
01508   int mode_count = compute_height_modes(heights, min_height, max_height,
01509                                         modes, MAX_HEIGHT_MODES);
01510   if (cap_only && mode_count > 1)
01511     mode_count = 1;
01512   int x;
01513   if (textord_debug_xheights) {
01514     tprintf("found %d modes: ", mode_count);
01515     for (x = 0; x < mode_count; x++) tprintf("%d ", modes[x]);
01516     tprintf("\n");
01517   }
01518 
01519   for (x = 0; x < mode_count - 1; x++) {
01520     if (modes[x] != prev_size + 1)
01521       in_best_pile = FALSE;    // had empty height
01522     int modes_x_count = heights->pile_count(modes[x]) -
01523       floating_heights->pile_count(modes[x]);
01524     if ((modes_x_count >= blob_count * textord_xheight_mode_fraction) &&
01525         (in_best_pile || modes_x_count > best_count)) {
01526       for (int asc = x + 1; asc < mode_count; asc++) {
01527         float ratio =
01528           static_cast<float>(modes[asc]) / static_cast<float>(modes[x]);
01529         if (textord_ascx_ratio_min < ratio &&
01530             ratio < textord_ascx_ratio_max &&
01531             (heights->pile_count(modes[asc]) >=
01532              blob_count * textord_ascheight_mode_fraction)) {
01533           if (modes_x_count > best_count) {
01534             in_best_pile = true;
01535             best_count = modes_x_count;
01536           }
01537           if (textord_debug_xheights) {
01538             tprintf("X=%d, asc=%d, count=%d, ratio=%g\n",
01539                     modes[x], modes[asc]-modes[x], modes_x_count, ratio);
01540           }
01541           prev_size = modes[x];
01542           *xheight = static_cast<float>(modes[x]);
01543           *ascrise = static_cast<float>(modes[asc] - modes[x]);
01544         }
01545       }
01546     }
01547   }
01548   if (*xheight == 0) {  // single mode
01549     // Remove counts of the "floating" blobs (the one whose height is too
01550     // small in relation to it's top end of the bounding box) from heights
01551     // before computing the single-mode xheight.
01552     // Restore the counts in heights after the mode is found, since
01553     // floating blobs might be useful for determining potential ascenders
01554     // in compute_row_descdrop().
01555     if (floating_heights->get_total() > 0) {
01556       for (x = min_height; x < max_height; ++x) {
01557         heights->add(x, -(floating_heights->pile_count(x)));
01558       }
01559       blob_index = heights->mode();  // find the modified mode
01560       for (x = min_height; x < max_height; ++x) {
01561         heights->add(x, floating_heights->pile_count(x));
01562       }
01563     }
01564     *xheight = static_cast<float>(blob_index);
01565     *ascrise = 0.0f;
01566     best_count = heights->pile_count(blob_index);
01567     if (textord_debug_xheights)
01568       tprintf("Single mode xheight set to %g\n", *xheight);
01569   } else if (textord_debug_xheights) {
01570     tprintf("Multi-mode xheight set to %g, asc=%g\n", *xheight, *ascrise);
01571   }
01572   return best_count;
01573 }
01574 
01587 inT32 compute_row_descdrop(TO_ROW *row, float gradient,
01588                            int xheight_blob_count, STATS *asc_heights) {
01589   // Count how many potential ascenders are in this row.
01590   int i_min = asc_heights->min_bucket();
01591   if ((i_min / row->xheight) < textord_ascx_ratio_min) {
01592     i_min = static_cast<int>(
01593         floor(row->xheight * textord_ascx_ratio_min + 0.5));
01594   }
01595   int i_max = asc_heights->max_bucket();
01596   if ((i_max / row->xheight) > textord_ascx_ratio_max) {
01597     i_max = static_cast<int>(floor(row->xheight * textord_ascx_ratio_max));
01598   }
01599   int num_potential_asc = 0;
01600   for (int i = i_min; i <= i_max; ++i) {
01601     num_potential_asc += asc_heights->pile_count(i);
01602   }
01603   inT32 min_height =
01604     static_cast<inT32>(floor(row->xheight * textord_descx_ratio_min + 0.5));
01605   inT32 max_height =
01606     static_cast<inT32>(floor(row->xheight * textord_descx_ratio_max));
01607   float xcentre;                 // centre of blob
01608   float height;                  // height of blob
01609   BLOBNBOX_IT blob_it = row->blob_list();
01610   BLOBNBOX *blob;                // current blob
01611   STATS heights (min_height, max_height + 1);
01612   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
01613     blob = blob_it.data();
01614     if (!blob->joined_to_prev()) {
01615       xcentre = (blob->bounding_box().left() +
01616                  blob->bounding_box().right()) / 2.0f;
01617       height = (gradient * xcentre + row->parallel_c() -
01618                 blob->bounding_box().bottom());
01619       if (height >= min_height && height <= max_height)
01620         heights.add(static_cast<int>(floor(height + 0.5)), 1);
01621     }
01622   }
01623   int blob_index = heights.mode();  // find mode
01624   int blob_count = heights.pile_count(blob_index);  // get count of mode
01625   float total_fraction =
01626     (textord_descheight_mode_fraction + textord_ascheight_mode_fraction);
01627   if (static_cast<float>(blob_count + num_potential_asc) <
01628       xheight_blob_count * total_fraction) {
01629     blob_count = 0;
01630   }
01631   int descdrop = blob_count > 0 ? -blob_index : 0;
01632   if (textord_debug_xheights) {
01633     tprintf("Descdrop: %d (potential ascenders %d, descenders %d)\n",
01634             descdrop, num_potential_asc, blob_count);
01635     heights.print();
01636   }
01637   return descdrop;
01638 }
01639 
01640 
01647 inT32 compute_height_modes(STATS *heights,    // stats to search
01648                            inT32 min_height,  // bottom of range
01649                            inT32 max_height,  // top of range
01650                            inT32 *modes,      // output array
01651                            inT32 maxmodes) {  // size of modes
01652   inT32 pile_count;              // no in source pile
01653   inT32 src_count;               // no of source entries
01654   inT32 src_index;               // current entry
01655   inT32 least_count;             // height of smalllest
01656   inT32 least_index;             // index of least
01657   inT32 dest_count;              // index in modes
01658 
01659   src_count = max_height + 1 - min_height;
01660   dest_count = 0;
01661   least_count = MAX_INT32;
01662   least_index = -1;
01663   for (src_index = 0; src_index < src_count; src_index++) {
01664     pile_count = heights->pile_count(min_height + src_index);
01665     if (pile_count > 0) {
01666       if (dest_count < maxmodes) {
01667         if (pile_count < least_count) {
01668           // find smallest in array
01669           least_count = pile_count;
01670           least_index = dest_count;
01671         }
01672         modes[dest_count++] = min_height + src_index;
01673       } else if (pile_count >= least_count) {
01674         while (least_index < maxmodes - 1) {
01675           modes[least_index] = modes[least_index + 1];
01676           // shuffle up
01677           least_index++;
01678         }
01679         // new one on end
01680         modes[maxmodes - 1] = min_height + src_index;
01681         if (pile_count == least_count) {
01682           // new smallest
01683           least_index = maxmodes - 1;
01684         } else {
01685           least_count = heights->pile_count(modes[0]);
01686           least_index = 0;
01687           for (dest_count = 1; dest_count < maxmodes; dest_count++) {
01688             pile_count = heights->pile_count(modes[dest_count]);
01689             if (pile_count < least_count) {
01690               // find smallest
01691               least_count = pile_count;
01692               least_index = dest_count;
01693             }
01694           }
01695         }
01696       }
01697     }
01698   }
01699   return dest_count;
01700 }
01701 
01702 
01709 void correct_row_xheight(TO_ROW *row, float xheight,
01710                          float ascrise, float descdrop) {
01711   ROW_CATEGORY row_category = get_row_category(row);
01712   if (textord_debug_xheights) {
01713     tprintf("correcting row xheight: row->xheight %.4f"
01714             ", row->acrise %.4f row->descdrop %.4f\n",
01715             row->xheight, row->ascrise, row->descdrop);
01716   }
01717   bool normal_xheight =
01718     within_error_margin(row->xheight, xheight, textord_xheight_error_margin);
01719   bool cap_xheight =
01720     within_error_margin(row->xheight, xheight + ascrise,
01721                         textord_xheight_error_margin);
01722   // Use the average xheight/ascrise for the following cases:
01723   // -- the xheight of the row could not be determined at all
01724   // -- the row has descenders (e.g. "many groups", "ISBN 12345 p.3")
01725   //    and its xheight is close to either cap height or average xheight
01726   // -- the row does not have ascenders or descenders, but its xheight
01727   //    is close to the average block xheight (e.g. row with "www.mmm.com")
01728   if (row_category == ROW_ASCENDERS_FOUND) {
01729     if (row->descdrop >= 0.0) {
01730       row->descdrop = row->xheight * (descdrop / xheight);
01731     }
01732   } else if (row_category == ROW_INVALID ||
01733              (row_category == ROW_DESCENDERS_FOUND &&
01734               (normal_xheight || cap_xheight)) ||
01735               (row_category == ROW_UNKNOWN && normal_xheight)) {
01736     if (textord_debug_xheights) tprintf("using average xheight\n");
01737     row->xheight = xheight;
01738     row->ascrise = ascrise;
01739     row->descdrop = descdrop;
01740   } else if (row_category == ROW_DESCENDERS_FOUND) {
01741     // Assume this is a row with mostly lowercase letters and it's xheight
01742     // is computed correctly (unfortunately there is no way to distinguish
01743     // this from the case when descenders are found, but the most common
01744     // height is capheight).
01745     if (textord_debug_xheights) tprintf("lowercase, corrected ascrise\n");
01746     row->ascrise = row->xheight * (ascrise / xheight);
01747   } else if (row_category == ROW_UNKNOWN) {
01748   // Otherwise assume this row is an all-caps or small-caps row
01749   // and adjust xheight and ascrise of the row.
01750 
01751     row->all_caps = true;
01752     if (cap_xheight) { // regular all caps
01753       if (textord_debug_xheights) tprintf("all caps\n");
01754       row->xheight = xheight;
01755       row->ascrise = ascrise;
01756       row->descdrop = descdrop;
01757     } else {  // small caps or caps with an odd xheight
01758       if (textord_debug_xheights) {
01759         if (row->xheight < xheight + ascrise && row->xheight > xheight) {
01760           tprintf("small caps\n");
01761         } else {
01762           tprintf("all caps with irregular xheight\n");
01763         }
01764       }
01765       row->ascrise = row->xheight * (ascrise / (xheight + ascrise));
01766       row->xheight -= row->ascrise;
01767       row->descdrop = row->xheight * (descdrop / xheight);
01768     }
01769   }
01770   if (textord_debug_xheights) {
01771     tprintf("corrected row->xheight = %.4f, row->acrise = %.4f, row->descdrop"
01772             " = %.4f\n", row->xheight, row->ascrise, row->descdrop);
01773   }
01774 }
01775 
01776 static int CountOverlaps(const TBOX& box, int min_height,
01777                          BLOBNBOX_LIST* blobs) {
01778   int overlaps = 0;
01779   BLOBNBOX_IT blob_it(blobs);
01780   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
01781     BLOBNBOX* blob = blob_it.data();
01782     TBOX blob_box = blob->bounding_box();
01783     if (blob_box.height() >= min_height && box.major_overlap(blob_box)) {
01784       ++overlaps;
01785     }
01786   }
01787   return overlaps;
01788 }
01789 
01796 void separate_underlines(TO_BLOCK *block,  // block to do
01797                          float gradient,   // skew angle
01798                          FCOORD rotation,  // inverse landscape
01799                          BOOL8 testing_on) {  // correct orientation
01800   BLOBNBOX *blob;                // current blob
01801   C_BLOB *rotated_blob;          // rotated blob
01802   TO_ROW *row;                   // current row
01803   float length;                  // of g_vec
01804   TBOX blob_box;
01805   FCOORD blob_rotation;          // inverse of rotation
01806   FCOORD g_vec;                  // skew rotation
01807   BLOBNBOX_IT blob_it;           // iterator
01808                                  // iterator
01809   BLOBNBOX_IT under_it = &block->underlines;
01810   BLOBNBOX_IT large_it = &block->large_blobs;
01811   TO_ROW_IT row_it = block->get_rows();
01812   int min_blob_height = static_cast<int>(textord_min_blob_height_fraction *
01813                                          block->line_size + 0.5);
01814 
01815                                  // length of vector
01816   length = sqrt(1 + gradient * gradient);
01817   g_vec = FCOORD(1 / length, -gradient / length);
01818   blob_rotation = FCOORD(rotation.x(), -rotation.y());
01819   blob_rotation.rotate(g_vec);  // undoing everything
01820   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
01821     row = row_it.data();
01822                                  // get blobs
01823     blob_it.set_to_list(row->blob_list());
01824     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
01825          blob_it.forward()) {
01826       blob = blob_it.data();
01827       blob_box = blob->bounding_box();
01828       if (blob_box.width() > block->line_size * textord_underline_width) {
01829         ASSERT_HOST(blob->cblob() != NULL);
01830         rotated_blob = crotate_cblob (blob->cblob(),
01831           blob_rotation);
01832         if (test_underline(
01833             testing_on && textord_show_final_rows,
01834             rotated_blob, static_cast<inT16>(row->intercept()),
01835             static_cast<inT16>(
01836                 block->line_size *
01837                 (tesseract::CCStruct::kXHeightFraction +
01838                  tesseract::CCStruct::kAscenderFraction / 2.0f)))) {
01839           under_it.add_after_then_move(blob_it.extract());
01840           if (testing_on && textord_show_final_rows) {
01841             tprintf("Underlined blob at:");
01842               rotated_blob->bounding_box().print();
01843             tprintf("Was:");
01844               blob_box.print();
01845           }
01846         } else if (CountOverlaps(blob->bounding_box(), min_blob_height,
01847                                  row->blob_list()) >
01848                    textord_max_blob_overlaps) {
01849           large_it.add_after_then_move(blob_it.extract());
01850           if (testing_on && textord_show_final_rows) {
01851             tprintf("Large blob overlaps %d blobs at:",
01852                     CountOverlaps(blob_box, min_blob_height,
01853                                   row->blob_list()));
01854             blob_box.print();
01855           }
01856         }
01857         delete rotated_blob;
01858       }
01859     }
01860   }
01861 }
01862 
01863 
01869 void pre_associate_blobs(                  //make rough chars
01870                          ICOORD page_tr,   //top right
01871                          TO_BLOCK *block,  //block to do
01872                          FCOORD rotation,  //inverse landscape
01873                          BOOL8 testing_on  //correct orientation
01874                         ) {
01875 #ifndef GRAPHICS_DISABLED
01876   ScrollView::Color colour;                 //of boxes
01877 #endif
01878   BLOBNBOX *blob;                //current blob
01879   BLOBNBOX *nextblob;            //next in list
01880   TBOX blob_box;
01881   FCOORD blob_rotation;          //inverse of rotation
01882   BLOBNBOX_IT blob_it;           //iterator
01883   BLOBNBOX_IT start_it;          //iterator
01884   TO_ROW_IT row_it = block->get_rows ();
01885 
01886 #ifndef GRAPHICS_DISABLED
01887   colour = ScrollView::RED;
01888 #endif
01889 
01890   blob_rotation = FCOORD (rotation.x (), -rotation.y ());
01891   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01892                                  //get blobs
01893     blob_it.set_to_list (row_it.data ()->blob_list ());
01894     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
01895     blob_it.forward ()) {
01896       blob = blob_it.data ();
01897       blob_box = blob->bounding_box ();
01898       start_it = blob_it;        //save start point
01899       //                      if (testing_on && textord_show_final_blobs)
01900       //                      {
01901       //                              tprintf("Blob at (%d,%d)->(%d,%d), addr=%x, count=%d\n",
01902       //                                      blob_box.left(),blob_box.bottom(),
01903       //                                      blob_box.right(),blob_box.top(),
01904       //                                      (void*)blob,blob_it.length());
01905       //                      }
01906       bool overlap;
01907       do {
01908         overlap = false;
01909         if (!blob_it.at_last ()) {
01910           nextblob = blob_it.data_relative(1);
01911           overlap = blob_box.major_x_overlap(nextblob->bounding_box());
01912           if (overlap) {
01913             blob->merge(nextblob); // merge new blob
01914             blob_box = blob->bounding_box(); // get bigger box
01915             blob_it.forward();
01916           }
01917         }
01918       }
01919       while (overlap);
01920       blob->chop (&start_it, &blob_it,
01921         blob_rotation,
01922         block->line_size * tesseract::CCStruct::kXHeightFraction *
01923         textord_chop_width);
01924       //attempt chop
01925     }
01926 #ifndef GRAPHICS_DISABLED
01927     if (testing_on && textord_show_final_blobs) {
01928       if (to_win == NULL)
01929         create_to_win(page_tr);
01930       to_win->Pen(colour);
01931       for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
01932       blob_it.forward ()) {
01933         blob = blob_it.data ();
01934         blob_box = blob->bounding_box ();
01935         blob_box.rotate (rotation);
01936         if (!blob->joined_to_prev ()) {
01937           to_win->Rectangle (blob_box.left (), blob_box.bottom (),
01938             blob_box.right (), blob_box.top ());
01939         }
01940       }
01941       colour = (ScrollView::Color) (colour + 1);
01942       if (colour > ScrollView::MAGENTA)
01943         colour = ScrollView::RED;
01944     }
01945 #endif
01946   }
01947 }
01948 
01949 
01955 void fit_parallel_rows(                   //find lines
01956                        TO_BLOCK *block,   //block to do
01957                        float gradient,    //gradient to fit
01958                        FCOORD rotation,   //for drawing
01959                        inT32 block_edge,  //edge of block
01960                        BOOL8 testing_on   //correct orientation
01961                       ) {
01962 #ifndef GRAPHICS_DISABLED
01963   ScrollView::Color colour;                 //of row
01964 #endif
01965   TO_ROW_IT row_it = block->get_rows ();
01966 
01967   row_it.move_to_first ();
01968   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01969     if (row_it.data ()->blob_list ()->empty ())
01970       delete row_it.extract ();  //nothing in it
01971     else
01972       fit_parallel_lms (gradient, row_it.data ());
01973   }
01974 #ifndef GRAPHICS_DISABLED
01975   if (testing_on) {
01976     colour = ScrollView::RED;
01977     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01978       plot_parallel_row (row_it.data (), gradient,
01979         block_edge, colour, rotation);
01980       colour = (ScrollView::Color) (colour + 1);
01981       if (colour > ScrollView::MAGENTA)
01982         colour = ScrollView::RED;
01983     }
01984   }
01985 #endif
01986   row_it.sort (row_y_order);     //may have gone out of order
01987 }
01988 
01989 
01997 void fit_parallel_lms(float gradient, TO_ROW *row) {
01998   float c;                       // fitted line
01999   int blobcount;                 // no of blobs
02000    tesseract::DetLineFit lms;
02001   BLOBNBOX_IT blob_it = row->blob_list();
02002 
02003   blobcount = 0;
02004   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
02005     if (!blob_it.data()->joined_to_prev()) {
02006       const TBOX& box = blob_it.data()->bounding_box();
02007       lms.Add(ICOORD((box.left() + box.right()) / 2, box.bottom()));
02008       blobcount++;
02009     }
02010   }
02011   double error = lms.ConstrainedFit(gradient, &c);
02012   row->set_parallel_line(gradient, c, error);
02013   if (textord_straight_baselines && blobcount > textord_lms_line_trials) {
02014     error = lms.Fit(&gradient, &c);
02015   }
02016                                  //set the other too
02017   row->set_line(gradient, c, error);
02018 }
02019 
02020 
02026 namespace tesseract {
02027 void Textord::make_spline_rows(TO_BLOCK *block,   // block to do
02028                                float gradient,    // gradient to fit
02029                                BOOL8 testing_on) {
02030 #ifndef GRAPHICS_DISABLED
02031   ScrollView::Color colour;       //of row
02032 #endif
02033   TO_ROW_IT row_it = block->get_rows ();
02034 
02035   row_it.move_to_first ();
02036   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
02037     if (row_it.data ()->blob_list ()->empty ())
02038       delete row_it.extract ();  //nothing in it
02039     else
02040       make_baseline_spline (row_it.data (), block);
02041   }
02042   if (textord_old_baselines) {
02043 #ifndef GRAPHICS_DISABLED
02044     if (testing_on) {
02045       colour = ScrollView::RED;
02046       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
02047       row_it.forward ()) {
02048         row_it.data ()->baseline.plot (to_win, colour);
02049         colour = (ScrollView::Color) (colour + 1);
02050         if (colour > ScrollView::MAGENTA)
02051           colour = ScrollView::RED;
02052       }
02053     }
02054 #endif
02055     make_old_baselines(block, testing_on, gradient);
02056   }
02057 #ifndef GRAPHICS_DISABLED
02058   if (testing_on) {
02059     colour = ScrollView::RED;
02060     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
02061       row_it.data ()->baseline.plot (to_win, colour);
02062       colour = (ScrollView::Color) (colour + 1);
02063       if (colour > ScrollView::MAGENTA)
02064         colour = ScrollView::RED;
02065     }
02066   }
02067 #endif
02068 }
02069 
02070 }  // namespace tesseract.
02071 
02072 
02080 void make_baseline_spline(TO_ROW *row,     //row to fit
02081                           TO_BLOCK *block) {
02082   inT32 *xstarts;                // spline boundaries
02083   double *coeffs;                // quadratic coeffs
02084   inT32 segments;                // no of segments
02085 
02086   xstarts =
02087     (inT32 *) alloc_mem((row->blob_list()->length() + 1) * sizeof(inT32));
02088   if (segment_baseline(row, block, segments, xstarts)
02089   && !textord_straight_baselines && !textord_parallel_baselines) {
02090     coeffs = linear_spline_baseline(row, block, segments, xstarts);
02091   } else {
02092     xstarts[1] = xstarts[segments];
02093     segments = 1;
02094     coeffs = (double *) alloc_mem (3 * sizeof (double));
02095     coeffs[0] = 0;
02096     coeffs[1] = row->line_m ();
02097     coeffs[2] = row->line_c ();
02098   }
02099   row->baseline = QSPLINE (segments, xstarts, coeffs);
02100   free_mem(coeffs);
02101   free_mem(xstarts);
02102 }
02103 
02104 
// Divides the baseline of a row into segments and reports whether the row
// needs a curved baseline.
//
// Two iterators walk the row's blobs: new_it runs ahead and feeds a window
// of textord_spline_medianwin vertical shifts (blob bottom minus the row's
// fitted line at the blob's x-centre) into yshifts; blob_it trails behind
// and supplies the x coordinate at which a new segment starts. The window
// entry at index textord_spline_medianwin / 2 is classified as above (1),
// below (-1) or on (0) the line, using
// textord_spline_shift_fraction * block->line_size as the threshold.
// A new segment starts when that state changes and more than
// textord_spline_minblobs blobs have passed since the last boundary.
//
// Outputs: segments receives the number of segments and xstarts[0..segments]
// their boundaries. On the early exits only xstarts[0] and xstarts[1] are
// written and segments is 1.
//
// Returns TRUE if any window was classified as off the line, FALSE otherwise
// or when the row has too few blobs to be segmented.
//
// NOTE(review): yshifts[i] presumably yields the i-th smallest stored value,
// making the tested shift the window median -- confirm against SORTED_FLOATS.
02112 BOOL8
02113 segment_baseline (               //split baseline
02114 TO_ROW * row,                    //row to fit
02115 TO_BLOCK * block,                //block it came from
02116 inT32 & segments,                //out: number of segments
02117 inT32 xstarts[]                  //out: coords of segment boundaries
02118 ) {
02119   BOOL8 needs_curve;             //needs curved line
02120   int blobcount;                 //no of blobs; later reused as a counter
02121   int blobindex;                 //current blob, key of its yshifts entry
02122   int last_state;                //above, on , below
02123   int state;                     //of current blob
02124   float yshift;                  //from baseline
02125   TBOX box;                       //blob box
02126   TBOX new_box;                   //new_it box
02127   float middle;                  //xcentre of blob
02128                                  //blobs
02129   BLOBNBOX_IT blob_it = row->blob_list ();
02130   BLOBNBOX_IT new_it = blob_it;  //front end
02131   SORTED_FLOATS yshifts;         //shifts from baseline
02132 
02133   needs_curve = FALSE;
02134   box = box_next_pre_chopped (&blob_it);
02135   xstarts[0] = box.left ();
02136   segments = 1;
02137   blobcount = row->blob_list ()->length ();
02138   if (textord_oldbl_debug)
02139     tprintf ("Segmenting baseline of %d blobs at (%d,%d)\n",
02140       blobcount, box.left (), box.bottom ());
        // Too few blobs to fill a window or to form a segment: one segment
        // ending at the right edge of the last blob.
02141   if (blobcount <= textord_spline_medianwin
02142   || blobcount < textord_spline_minblobs) {
02143     blob_it.move_to_last ();
02144     box = blob_it.data ()->bounding_box ();
02145     xstarts[1] = box.right ();
02146     return FALSE;
02147   }
02148   last_state = 0;
02149   new_it.mark_cycle_pt ();
        // Prime the window with the first textord_spline_medianwin shifts.
02150   for (blobindex = 0; blobindex < textord_spline_medianwin; blobindex++) {
02151     new_box = box_next_pre_chopped (&new_it);
02152     middle = (new_box.left () + new_box.right ()) / 2.0;
02153     yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
02154                                  //record shift
02155     yshifts.add (yshift, blobindex);
          // The list was exhausted while priming: give up with one segment.
02156     if (new_it.cycled_list ()) {
02157       xstarts[1] = new_box.right ();
02158       return FALSE;
02159     }
02160   }
        // Advance the trailing iterator by half a window. blobcount is reused
        // as the loop counter and from here on counts blobs since the last
        // segment boundary.
02161   for (blobcount = 0; blobcount < textord_spline_medianwin / 2; blobcount++)
02162     box = box_next_pre_chopped (&blob_it);
02163   do {
02164     new_box = box_next_pre_chopped (&new_it);
02165                                  //get middle one
02166     yshift = yshifts[textord_spline_medianwin / 2];
02167     if (yshift > textord_spline_shift_fraction * block->line_size)
02168       state = 1;
02169     else if (-yshift > textord_spline_shift_fraction * block->line_size)
02170       state = -1;
02171     else
02172       state = 0;
02173     if (state != 0)
02174       needs_curve = TRUE;
02175     //              tprintf("State=%d, prev=%d, shift=%g\n",
02176     //                      state,last_state,yshift);
          // Start a new segment at the trailing blob when the state changes
          // and the current segment is long enough.
02177     if (state != last_state && blobcount > textord_spline_minblobs) {
02178       xstarts[segments++] = box.left ();
02179       blobcount = 0;
02180     }
02181     last_state = state;
          // Slide the window: remove the entry keyed by the oldest blob index
          // and add the shift of the blob just read from new_it.
02182     yshifts.remove (blobindex - textord_spline_medianwin);
02183     box = box_next_pre_chopped (&blob_it);
02184     middle = (new_box.left () + new_box.right ()) / 2.0;
02185     yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
02186     yshifts.add (yshift, blobindex);
02187     blobindex++;
02188     blobcount++;
02189   }
02190   while (!new_it.cycled_list ());
        // Close the final segment. If it is too short and is not the only
        // one, its boundary is overwritten so it merges into the previous one.
02191   if (blobcount > textord_spline_minblobs || segments == 1) {
02192     xstarts[segments] = new_box.right ();
02193   }
02194   else {
02195     xstarts[--segments] = new_box.right ();
02196   }
02197   if (textord_oldbl_debug)
02198     tprintf ("Made %d segments on row at (%d,%d)\n",
02199       segments, box.right (), box.bottom ());
02200   return needs_curve;
02201 }
02202 
02203 
// Fits a separate straight line to each of several overlapping runs of blobs
// in a row and returns the coefficients.
//
// The blobs are first counted by stepping blob_it with
// box_next_pre_chopped() until it is back at the first element. The count is
// divided into blobcount / textord_spline_medianwin segments (at least one).
// Two iterators are used alternately: blob_it supplies the points for
// segments 1, 3, 5, ... and new_it, which starts blobs_per_segment / 2 blobs
// ahead, supplies the points for segments 2, 4, 6, ... Each point is the
// x-centre and bottom of a blob box. The last segment also takes all blobs
// that remain.
//
// Outputs: segments receives the number of segments; xstarts[0] is the left
// edge of the first blob and xstarts[segment] is set inside each fit loop.
//
// Returns an array of segments * 3 doubles obtained from alloc_mem(); each
// triple is (0, b, c) with b and c as produced by DetLineFit::Fit. In this
// file the caller, make_baseline_spline(), releases it with free_mem().
02211 double *
02212 linear_spline_baseline (         //split baseline
02213 TO_ROW * row,                    //row to fit
02214 TO_BLOCK * block,                //block it came from
02215 inT32 & segments,                //out: number of segments
02216 inT32 xstarts[]                  //out: coords of segment boundaries
02217 ) {
02218   int blobcount;                 //no of blobs
02219   int blobindex;                 //current blob
02220   int index1, index2;            //blobs consumed from blob_it / new_it
02221   int blobs_per_segment;         //blobs in each
02222   TBOX box;                       //blob box
02223   TBOX new_box;                   //new_it box
02224                                  //blobs
02225   BLOBNBOX_IT blob_it = row->blob_list ();
02226   BLOBNBOX_IT new_it = blob_it;  //front end
02227   float b, c;                    //fitted curve
02228   tesseract::DetLineFit lms;
02229   double *coeffs;                //quadratic coeffs
02230   inT32 segment;                 //current segment
02231 
02232   box = box_next_pre_chopped (&blob_it);
02233   xstarts[0] = box.left ();
02234   blobcount = 1;
        // Count the blobs by walking once round the list.
02235   while (!blob_it.at_first ()) {
02236     blobcount++;
02237     box = box_next_pre_chopped (&blob_it);
02238   }
02239   segments = blobcount / textord_spline_medianwin;
02240   if (segments < 1)
02241     segments = 1;
02242   blobs_per_segment = blobcount / segments;
02243   coeffs = (double *) alloc_mem (segments * 3 * sizeof (double));
02244   if (textord_oldbl_debug)
02245     tprintf
02246       ("Linear splining baseline of %d blobs at (%d,%d), into %d segments of %d blobs\n",
02247       blobcount, box.left (), box.bottom (), segments, blobs_per_segment);
02248   segment = 1;
        // Put new_it half a segment ahead of blob_it.
02249   for (index2 = 0; index2 < blobs_per_segment / 2; index2++)
02250     box_next_pre_chopped(&new_it);
02251   index1 = 0;
02252   blobindex = index2;
02253   do {
          // Fit this segment from the points supplied by blob_it.
02254     blobindex += blobs_per_segment;
02255     lms.Clear();
02256     while (index1 < blobindex || (segment == segments && index1 < blobcount)) {
02257       box = box_next_pre_chopped (&blob_it);
02258       int middle = (box.left() + box.right()) / 2;
02259       lms.Add(ICOORD(middle, box.bottom()));
02260       index1++;
            // Record the boundary half a segment before the end of this run,
            // or at the last-but-one blob of the row.
02261       if (index1 == blobindex - blobs_per_segment / 2
02262       || index1 == blobcount - 1) {
02263         xstarts[segment] = box.left ();
02264       }
02265     }
02266     lms.Fit(&b, &c);
          // Coefficient triple of this segment: (0, b, c).
02267     coeffs[segment * 3 - 3] = 0;
02268     coeffs[segment * 3 - 2] = b;
02269     coeffs[segment * 3 - 1] = c;
02270     segment++;
02271     if (segment > segments)
02272       break;
02273 
          // Fit the following segment from the points supplied by new_it.
02274     blobindex += blobs_per_segment;
02275     lms.Clear();
02276     while (index2 < blobindex || (segment == segments && index2 < blobcount)) {
02277       new_box = box_next_pre_chopped (&new_it);
02278       int middle = (new_box.left() + new_box.right()) / 2;
02279       lms.Add(ICOORD (middle, new_box.bottom()));
02280       index2++;
02281       if (index2 == blobindex - blobs_per_segment / 2
02282       || index2 == blobcount - 1) {
02283         xstarts[segment] = new_box.left ();
02284       }
02285     }
02286     lms.Fit(&b, &c);
02287     coeffs[segment * 3 - 3] = 0;
02288     coeffs[segment * 3 - 2] = b;
02289     coeffs[segment * 3 - 1] = c;
02290     segment++;
02291   }
02292   while (segment <= segments);
02293   return coeffs;
02294 }
02295 
02296 
02303 void assign_blobs_to_rows(                      //find lines
02304                           TO_BLOCK *block,      //block to do
02305                           float *gradient,      //block skew
02306                           int pass,             //identification
02307                           BOOL8 reject_misses,  //chuck big ones out
02308                           BOOL8 make_new_rows,  //add rows for unmatched
02309                           BOOL8 drawing_skew    //draw smoothed skew
02310                          ) {
02311   OVERLAP_STATE overlap_result;  //what to do with it
02312   float ycoord;                  //current y
02313   float top, bottom;             //of blob
02314   float g_length = 1.0f;         //from gradient
02315   inT16 row_count;               //no of rows
02316   inT16 left_x;                  //left edge
02317   inT16 last_x;                  //previous edge
02318   float block_skew;              //y delta
02319   float smooth_factor;           //for new coords
02320   float near_dist;               //dist to nearest row
02321   ICOORD testpt;                 //testing only
02322   BLOBNBOX *blob;                //current blob
02323   TO_ROW *row;                   //current row
02324   TO_ROW *dest_row = NULL;       //row to put blob in
02325                                  //iterators
02326   BLOBNBOX_IT blob_it = &block->blobs;
02327   TO_ROW_IT row_it = block->get_rows ();
02328 
02329   ycoord =
02330     (block->block->bounding_box ().bottom () +
02331     block->block->bounding_box ().top ()) / 2.0f;
02332   if (gradient != NULL)
02333     g_length = sqrt (1 + *gradient * *gradient);
02334 #ifndef GRAPHICS_DISABLED
02335   if (drawing_skew)
02336     to_win->SetCursor(block->block->bounding_box ().left (), ycoord);
02337 #endif
02338   testpt = ICOORD (textord_test_x, textord_test_y);
02339   blob_it.sort (blob_x_order);
02340   smooth_factor = 1.0;
02341   block_skew = 0.0f;
02342   row_count = row_it.length ();  //might have rows
02343   if (!blob_it.empty ()) {
02344     left_x = blob_it.data ()->bounding_box ().left ();
02345   }
02346   else {
02347     left_x = block->block->bounding_box ().left ();
02348   }
02349   last_x = left_x;
02350   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
02351     blob = blob_it.data ();
02352     if (gradient != NULL) {
02353       block_skew = (1 - 1 / g_length) * blob->bounding_box ().bottom ()
02354         + *gradient / g_length * blob->bounding_box ().left ();
02355     }
02356     else if (blob->bounding_box ().left () - last_x > block->line_size / 2
02357       && last_x - left_x > block->line_size * 2
02358     && textord_interpolating_skew) {
02359       //                      tprintf("Interpolating skew from %g",block_skew);
02360       block_skew *= (float) (blob->bounding_box ().left () - left_x)
02361         / (last_x - left_x);
02362       //                      tprintf("to %g\n",block_skew);
02363     }
02364     last_x = blob->bounding_box ().left ();
02365     top = blob->bounding_box ().top () - block_skew;
02366     bottom = blob->bounding_box ().bottom () - block_skew;
02367 #ifndef GRAPHICS_DISABLED
02368     if (drawing_skew)
02369       to_win->DrawTo(blob->bounding_box ().left (), ycoord + block_skew);
02370 #endif
02371     if (!row_it.empty ()) {
02372       for (row_it.move_to_first ();
02373         !row_it.at_last () && row_it.data ()->min_y () > top;
02374         row_it.forward ());
02375       row = row_it.data ();
02376       if (row->min_y () <= top && row->max_y () >= bottom) {
02377       //any overlap
02378         dest_row = row;
02379         overlap_result = most_overlapping_row (&row_it, dest_row,
02380           top, bottom,
02381           block->line_size,
02382           blob->bounding_box ().
02383           contains (testpt));
02384         if (overlap_result == NEW_ROW && !reject_misses)
02385           overlap_result = ASSIGN;
02386       }
02387       else {
02388         overlap_result = NEW_ROW;
02389         if (!make_new_rows) {
02390           near_dist = row_it.data_relative (-1)->min_y () - top;
02391                                  //below bottom
02392           if (bottom < row->min_y ()) {
02393             if (row->min_y () - bottom <=
02394               (block->line_spacing -
02395             block->line_size) * tesseract::CCStruct::kDescenderFraction) {
02396                                  //done it
02397               overlap_result = ASSIGN;
02398               dest_row = row;
02399             }
02400           }
02401           else if (near_dist > 0
02402           && near_dist < bottom - row->max_y ()) {
02403             row_it.backward ();
02404             dest_row = row_it.data ();
02405             if (dest_row->min_y () - bottom <=
02406               (block->line_spacing -
02407             block->line_size) * tesseract::CCStruct::kDescenderFraction) {
02408                                  //done it
02409               overlap_result = ASSIGN;
02410             }
02411           }
02412           else {
02413             if (top - row->max_y () <=
02414               (block->line_spacing -
02415               block->line_size) * (textord_overlap_x +
02416             tesseract::CCStruct::kAscenderFraction)) {
02417                                  //done it
02418               overlap_result = ASSIGN;
02419               dest_row = row;
02420             }
02421           }
02422         }
02423       }
02424       if (overlap_result == ASSIGN)
02425         dest_row->add_blob (blob_it.extract (), top, bottom,
02426           block->line_size);
02427       if (overlap_result == NEW_ROW) {
02428         if (make_new_rows && top - bottom < block->max_blob_size) {
02429           dest_row =
02430             new TO_ROW (blob_it.extract (), top, bottom,
02431             block->line_size);
02432           row_count++;
02433           if (bottom > row_it.data ()->min_y ())
02434             row_it.add_before_then_move (dest_row);
02435           //insert in right place
02436           else
02437             row_it.add_after_then_move (dest_row);
02438           smooth_factor =
02439             1.0 / (row_count * textord_skew_lag +
02440             textord_skewsmooth_offset);
02441         }
02442         else
02443           overlap_result = REJECT;
02444       }
02445     }
02446     else if (make_new_rows && top - bottom < block->max_blob_size) {
02447       overlap_result = NEW_ROW;
02448       dest_row =
02449         new TO_ROW(blob_it.extract(), top, bottom, block->line_size);
02450       row_count++;
02451       row_it.add_after_then_move(dest_row);
02452       smooth_factor = 1.0 / (row_count * textord_skew_lag +
02453                              textord_skewsmooth_offset2);
02454     }
02455     else
02456       overlap_result = REJECT;
02457     if (blob->bounding_box ().contains(testpt) && textord_debug_blob) {
02458       if (overlap_result != REJECT) {
02459         tprintf("Test blob assigned to row at (%g,%g) on pass %d\n",
02460           dest_row->min_y(), dest_row->max_y(), pass);
02461       }
02462       else {
02463         tprintf("Test blob assigned to no row on pass %d\n", pass);
02464       }
02465     }
02466     if (overlap_result != REJECT) {
02467       while (!row_it.at_first() &&
02468              row_it.data()->min_y() > row_it.data_relative(-1)->min_y()) {
02469         row = row_it.extract();
02470         row_it.backward();
02471         row_it.add_before_then_move(row);
02472       }
02473       while (!row_it.at_last() &&
02474              row_it.data ()->min_y() < row_it.data_relative (1)->min_y()) {
02475         row = row_it.extract();
02476         row_it.forward();
02477                                  // Keep rows in order.
02478         row_it.add_after_then_move(row);
02479       }
02480       BLOBNBOX_IT added_blob_it(dest_row->blob_list());
02481       added_blob_it.move_to_last();
02482       TBOX prev_box = added_blob_it.data_relative(-1)->bounding_box();
02483       if (dest_row->blob_list()->singleton() ||
02484           !prev_box.major_x_overlap(blob->bounding_box())) {
02485         block_skew = (1 - smooth_factor) * block_skew
02486             + smooth_factor * (blob->bounding_box().bottom() -
02487             dest_row->initial_min_y());
02488       }
02489     }
02490   }
02491   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
02492     if (row_it.data()->blob_list()->empty())
02493       delete row_it.extract();  // Discard empty rows.
02494   }
02495 }
02496 
02497 
02503 OVERLAP_STATE most_overlapping_row(                    //find best row
02504                                    TO_ROW_IT *row_it,  //iterator
02505                                    TO_ROW *&best_row,  //output row
02506                                    float top,          //top of blob
02507                                    float bottom,       //bottom of blob
02508                                    float rowsize,      //max row size
02509                                    BOOL8 testing_blob  //test stuff
02510                                   ) {
02511   OVERLAP_STATE result;          //result of tests
02512   float overlap;                 //of blob & row
02513   float bestover;                //nearest row
02514   float merge_top, merge_bottom; //size of merged row
02515   ICOORD testpt;                 //testing only
02516   TO_ROW *row;                   //current row
02517   TO_ROW *test_row;              //for multiple overlaps
02518   BLOBNBOX_IT blob_it;           //for merging rows
02519 
02520   result = ASSIGN;
02521   row = row_it->data ();
02522   bestover = top - bottom;
02523   if (top > row->max_y ())
02524     bestover -= top - row->max_y ();
02525   if (bottom < row->min_y ())
02526                                  //compute overlap
02527     bestover -= row->min_y () - bottom;
02528   if (testing_blob && textord_debug_blob) {
02529     tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f\n",
02530             bottom, top, row->min_y(), row->max_y(), rowsize, bestover);
02531   }
02532   test_row = row;
02533   do {
02534     if (!row_it->at_last ()) {
02535       row_it->forward ();
02536       test_row = row_it->data ();
02537       if (test_row->min_y () <= top && test_row->max_y () >= bottom) {
02538         merge_top =
02539           test_row->max_y () >
02540           row->max_y ()? test_row->max_y () : row->max_y ();
02541         merge_bottom =
02542           test_row->min_y () <
02543           row->min_y ()? test_row->min_y () : row->min_y ();
02544         if (merge_top - merge_bottom <= rowsize) {
02545           if (testing_blob) {
02546             tprintf ("Merging rows at (%g,%g), (%g,%g)\n",
02547               row->min_y (), row->max_y (),
02548               test_row->min_y (), test_row->max_y ());
02549           }
02550           test_row->set_limits (merge_bottom, merge_top);
02551           blob_it.set_to_list (test_row->blob_list ());
02552           blob_it.add_list_after (row->blob_list ());
02553           blob_it.sort (blob_x_order);
02554           row_it->backward ();
02555           delete row_it->extract ();
02556           row_it->forward ();
02557           bestover = -1.0f;      //force replacement
02558         }
02559         overlap = top - bottom;
02560         if (top > test_row->max_y ())
02561           overlap -= top - test_row->max_y ();
02562         if (bottom < test_row->min_y ())
02563           overlap -= test_row->min_y () - bottom;
02564         if (bestover >= rowsize - 1 && overlap >= rowsize - 1) {
02565           result = REJECT;
02566         }
02567         if (overlap > bestover) {
02568           bestover = overlap;    //find biggest overlap
02569           row = test_row;
02570         }
02571         if (testing_blob && textord_debug_blob) {
02572           tprintf("Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f->%f\n",
02573                   bottom, top, test_row->min_y(), test_row->max_y(),
02574                   rowsize, overlap, bestover);
02575         }
02576       }
02577     }
02578   }
02579   while (!row_it->at_last ()
02580     && test_row->min_y () <= top && test_row->max_y () >= bottom);
02581   while (row_it->data () != row)
02582     row_it->backward ();         //make it point to row
02583                                  //doesn't overlap much
02584   if (top - bottom - bestover > rowsize * textord_overlap_x &&
02585       (!textord_fix_makerow_bug || bestover < rowsize * textord_overlap_x)
02586     && result == ASSIGN)
02587     result = NEW_ROW;            //doesn't overlap enough
02588   best_row = row;
02589   return result;
02590 }
02591 
02592 
02598 int blob_x_order(                    //sort function
02599                  const void *item1,  //items to compare
02600                  const void *item2) {
02601                                  //converted ptr
02602   BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
02603                                  //converted ptr
02604   BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
02605 
02606   if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
02607     return -1;
02608   else if (blob1->bounding_box ().left () > blob2->bounding_box ().left ())
02609     return 1;
02610   else
02611     return 0;
02612 }
02613 
02614 
02620 int row_y_order(                    //sort function
02621                 const void *item1,  //items to compare
02622                 const void *item2) {
02623                                  //converted ptr
02624   TO_ROW *row1 = *(TO_ROW **) item1;
02625                                  //converted ptr
02626   TO_ROW *row2 = *(TO_ROW **) item2;
02627 
02628   if (row1->parallel_c () > row2->parallel_c ())
02629     return -1;
02630   else if (row1->parallel_c () < row2->parallel_c ())
02631     return 1;
02632   else
02633     return 0;
02634 }
02635 
02636 
02642 int row_spacing_order(                    //sort function
02643                       const void *item1,  //items to compare
02644                       const void *item2) {
02645                                  //converted ptr
02646   TO_ROW *row1 = *(TO_ROW **) item1;
02647                                  //converted ptr
02648   TO_ROW *row2 = *(TO_ROW **) item2;
02649 
02650   if (row1->spacing < row2->spacing)
02651     return -1;
02652   else if (row1->spacing > row2->spacing)
02653     return 1;
02654   else
02655     return 0;
02656 }
02657 
02664 void mark_repeated_chars(TO_ROW *row) {
02665   BLOBNBOX_IT box_it(row->blob_list());            // Iterator.
02666   int num_repeated_sets = 0;
02667   if (!box_it.empty()) {
02668     do {
02669       BLOBNBOX* bblob = box_it.data();
02670       int repeat_length = 0;
02671       if (bblob->flow() == BTFT_LEADER &&
02672           !bblob->joined_to_prev() && bblob->cblob() != NULL) {
02673         BLOBNBOX_IT test_it(box_it);
02674         for (test_it.forward(); !test_it.at_first(); test_it.forward()) {
02675           bblob = test_it.data();
02676           if (bblob->flow() != BTFT_LEADER)
02677             break;
02678           if (bblob->joined_to_prev() || bblob->cblob() == NULL) {
02679             repeat_length = 0;
02680             break;
02681           }
02682           ++repeat_length;
02683         }
02684       }
02685       if (repeat_length >= kMinLeaderCount) {
02686         num_repeated_sets++;
02687         for (; repeat_length > 0; box_it.forward(), --repeat_length) {
02688           bblob = box_it.data();
02689           bblob->set_repeated_set(num_repeated_sets);
02690         }
02691         if (!box_it.at_first())
02692           bblob->set_repeated_set(0);
02693      } else {
02694         box_it.forward();
02695         bblob->set_repeated_set(0);
02696       }
02697     } while (!box_it.at_first());  // until all done
02698   }
02699   row->set_num_repeated_sets(num_repeated_sets);
02700 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines