tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/textord/gap_map.cpp
Go to the documentation of this file.
00001 #include          "statistc.h"
00002 #include          "gap_map.h"
00003 
00004 #define EXTERN
00005 EXTERN BOOL_VAR (gapmap_debug, FALSE, "Say which blocks have tables");
00006 EXTERN BOOL_VAR (gapmap_use_ends, FALSE,
00007 "Use large space at start and end of rows");
00008 EXTERN BOOL_VAR (gapmap_no_isolated_quanta, FALSE,
00009 "Ensure gaps not less than 2quanta wide");
00010 EXTERN double_VAR (gapmap_big_gaps, 1.75, "xht multiplier");
00011 
00012 /*************************************************************************
00013  * A block gap map is a quantised histogram of whitespace regions in the
00014  * block. It is a vertical projection of wide gaps WITHIN lines
00015  *
00016  * The map is held as an array of counts of rows which have a wide gap
00017  * covering that region of the row. Each bucket in the map represents a width
00018  * of about half an xheight - (The median of the xhts in the rows is used.)
00019  *
00020  * The block is considered RECTANGULAR - delimited by the left and right
00021  * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are
00022  * counted.
00023  *
00024  *************************************************************************/
00025 
00026 GAPMAP::GAPMAP(                 //Constructor
00027                TO_BLOCK *block  //block
00028               ) {
00029   TO_ROW_IT row_it;              //row iterator
00030   TO_ROW *row;                   //current row
00031   BLOBNBOX_IT blob_it;           //iterator
00032   TBOX blob_box;
00033   TBOX prev_blob_box;
00034   inT16 gap_width;
00035   inT16 start_of_row;
00036   inT16 end_of_row;
00037   STATS xht_stats (0, 128);
00038   inT16 min_quantum;
00039   inT16 max_quantum;
00040   inT16 i;
00041 
00042   row_it.set_to_list (block->get_rows ());
00043   /*
00044     Find left and right extremes and bucket size
00045   */
00046   map = NULL;
00047   min_left = MAX_INT16;
00048   max_right = -MAX_INT16;
00049   total_rows = 0;
00050   any_tabs = FALSE;
00051   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00052     row = row_it.data ();
00053     if (!row->blob_list ()->empty ()) {
00054       total_rows++;
00055       xht_stats.add ((inT16) floor (row->xheight + 0.5), 1);
00056       blob_it.set_to_list (row->blob_list ());
00057       start_of_row = blob_it.data ()->bounding_box ().left ();
00058       end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
00059       if (min_left > start_of_row)
00060         min_left = start_of_row;
00061       if (max_right < end_of_row)
00062         max_right = end_of_row;
00063     }
00064   }
00065   if ((total_rows < 3) || (min_left >= max_right)) {
00066     total_rows = 0;
00067     min_left = max_right = 0;
00068     return;
00069   }
00070   bucket_size = (inT16) floor (xht_stats.median () + 0.5) / 2;
00071   map_max = (max_right - min_left) / bucket_size;
00072   map = (inT16 *) alloc_mem ((map_max + 1) * sizeof (inT16));
00073   for (i = 0; i <= map_max; i++)
00074     map[i] = 0;
00075 
00076   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00077     row = row_it.data ();
00078     if (!row->blob_list ()->empty ()) {
00079       blob_it.set_to_list (row->blob_list ());
00080       blob_it.mark_cycle_pt ();
00081       blob_box = box_next (&blob_it);
00082       prev_blob_box = blob_box;
00083       if (gapmap_use_ends) {
00084         /* Leading space */
00085         gap_width = blob_box.left () - min_left;
00086         if ((gap_width > gapmap_big_gaps * row->xheight)
00087         && gap_width > 2) {
00088           max_quantum = (blob_box.left () - min_left) / bucket_size;
00089           if (max_quantum > map_max) max_quantum = map_max;
00090             for (i = 0; i <= max_quantum; i++)
00091             map[i]++;
00092         }
00093       }
00094       while (!blob_it.cycled_list ()) {
00095         blob_box = box_next (&blob_it);
00096         gap_width = blob_box.left () - prev_blob_box.right ();
00097         if ((gap_width > gapmap_big_gaps * row->xheight)
00098         && gap_width > 2) {
00099           min_quantum =
00100             (prev_blob_box.right () - min_left) / bucket_size;
00101           max_quantum = (blob_box.left () - min_left) / bucket_size;
00102           if (max_quantum > map_max) max_quantum = map_max;
00103           for (i = min_quantum; i <= max_quantum; i++)
00104             map[i]++;
00105         }
00106         prev_blob_box = blob_box;
00107       }
00108       if (gapmap_use_ends) {
00109         /* Trailing space */
00110         gap_width = max_right - prev_blob_box.right ();
00111         if ((gap_width > gapmap_big_gaps * row->xheight)
00112         && gap_width > 2) {
00113           min_quantum =
00114             (prev_blob_box.right () - min_left) / bucket_size;
00115           if (min_quantum < 0) min_quantum = 0;
00116           for (i = min_quantum; i <= map_max; i++)
00117             map[i]++;
00118         }
00119       }
00120     }
00121   }
00122   for (i = 0; i <= map_max; i++) {
00123     if (map[i] > total_rows / 2) {
00124       if (gapmap_no_isolated_quanta &&
00125         (((i == 0) &&
00126         (map[i + 1] <= total_rows / 2)) ||
00127         ((i == map_max) &&
00128         (map[i - 1] <= total_rows / 2)) ||
00129         ((i > 0) &&
00130         (i < map_max) &&
00131         (map[i - 1] <= total_rows / 2) &&
00132       (map[i + 1] <= total_rows / 2)))) {
00133         map[i] = 0;              //prevent isolated quantum
00134       }
00135       else
00136         any_tabs = TRUE;
00137     }
00138   }
00139   if (gapmap_debug && any_tabs)
00140     tprintf ("Table found\n");
00141 }
00142 
00143 
00144 /*************************************************************************
00145  * GAPMAP::table_gap()
00146  * Is there a bucket in the specified range where more than half the rows in the
00147  * block have a wide gap?
00148  *************************************************************************/
00149 
00150 BOOL8 GAPMAP::table_gap(             //Is gap a table?
00151                         inT16 left,  //From here
00152                         inT16 right  //To here
00153                        ) {
00154   inT16 min_quantum;
00155   inT16 max_quantum;
00156   inT16 i;
00157   BOOL8 tab_found = FALSE;
00158 
00159   if (!any_tabs)
00160     return FALSE;
00161 
00162   min_quantum = (left - min_left) / bucket_size;
00163   max_quantum = (right - min_left) / bucket_size;
00164   // Clip to the bounds of the array. In some circumstances (big blob followed
00165   // by small blob) max_quantum can exceed the map_max bounds, but we clip
00166   // here instead, as it provides better long-term safety.
00167   if (min_quantum < 0) min_quantum = 0;
00168   if (max_quantum > map_max) max_quantum = map_max;
00169   for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++)
00170     if (map[i] > total_rows / 2)
00171       tab_found = TRUE;
00172   return tab_found;
00173 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines