tesseract
3.03
|
00001 #include "statistc.h" 00002 #include "gap_map.h" 00003 00004 #define EXTERN 00005 EXTERN BOOL_VAR (gapmap_debug, FALSE, "Say which blocks have tables"); 00006 EXTERN BOOL_VAR (gapmap_use_ends, FALSE, 00007 "Use large space at start and end of rows"); 00008 EXTERN BOOL_VAR (gapmap_no_isolated_quanta, FALSE, 00009 "Ensure gaps not less than 2quanta wide"); 00010 EXTERN double_VAR (gapmap_big_gaps, 1.75, "xht multiplier"); 00011 00012 /************************************************************************* 00013 * A block gap map is a quantised histogram of whitespace regions in the 00014 * block. It is a vertical projection of wide gaps WITHIN lines 00015 * 00016 * The map is held as an array of counts of rows which have a wide gap 00017 * covering that region of the row. Each bucket in the map represents a width 00018 * of about half an xheight - (The median of the xhts in the rows is used.) 00019 * 00020 * The block is considered RECTANGULAR - delimited by the left and right 00021 * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are 00022 * counted. 00023 * 00024 *************************************************************************/ 00025 00026 GAPMAP::GAPMAP( //Constructor 00027 TO_BLOCK *block //block 00028 ) { 00029 TO_ROW_IT row_it; //row iterator 00030 TO_ROW *row; //current row 00031 BLOBNBOX_IT blob_it; //iterator 00032 TBOX blob_box; 00033 TBOX prev_blob_box; 00034 inT16 gap_width; 00035 inT16 start_of_row; 00036 inT16 end_of_row; 00037 STATS xht_stats (0, 128); 00038 inT16 min_quantum; 00039 inT16 max_quantum; 00040 inT16 i; 00041 00042 row_it.set_to_list (block->get_rows ()); 00043 /* 00044 Find left and right extremes and bucket size 00045 */ 00046 map = NULL; 00047 min_left = MAX_INT16; 00048 max_right = -MAX_INT16; 00049 total_rows = 0; 00050 any_tabs = FALSE; 00051 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00052 row = row_it.data (); 00053 if (!row->blob_list ()->empty ()) { 00054 total_rows++; 00055 xht_stats.add ((inT16) floor (row->xheight + 0.5), 1); 00056 blob_it.set_to_list (row->blob_list ()); 00057 start_of_row = blob_it.data ()->bounding_box ().left (); 00058 end_of_row = blob_it.data_relative (-1)->bounding_box ().right (); 00059 if (min_left > start_of_row) 00060 min_left = start_of_row; 00061 if (max_right < end_of_row) 00062 max_right = end_of_row; 00063 } 00064 } 00065 if ((total_rows < 3) || (min_left >= max_right)) { 00066 total_rows = 0; 00067 min_left = max_right = 0; 00068 return; 00069 } 00070 bucket_size = (inT16) floor (xht_stats.median () + 0.5) / 2; 00071 map_max = (max_right - min_left) / bucket_size; 00072 map = (inT16 *) alloc_mem ((map_max + 1) * sizeof (inT16)); 00073 for (i = 0; i <= map_max; i++) 00074 map[i] = 0; 00075 00076 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00077 row = row_it.data (); 00078 if (!row->blob_list ()->empty ()) { 00079 blob_it.set_to_list (row->blob_list ()); 00080 blob_it.mark_cycle_pt (); 00081 blob_box = box_next (&blob_it); 00082 prev_blob_box = blob_box; 00083 if (gapmap_use_ends) { 00084 /* Leading space */ 00085 gap_width = blob_box.left () - min_left; 00086 if ((gap_width > gapmap_big_gaps * row->xheight) 00087 && gap_width > 2) { 00088 max_quantum = (blob_box.left () - min_left) / bucket_size; 00089 if (max_quantum > map_max) max_quantum = map_max; 00090 for (i = 0; i <= max_quantum; i++) 00091 map[i]++; 00092 } 00093 } 00094 while (!blob_it.cycled_list ()) { 00095 blob_box = box_next (&blob_it); 00096 gap_width = blob_box.left () - prev_blob_box.right (); 00097 if ((gap_width > gapmap_big_gaps * row->xheight) 00098 && gap_width > 2) { 00099 min_quantum = 00100 (prev_blob_box.right () - min_left) / bucket_size; 00101 max_quantum = (blob_box.left () - min_left) / bucket_size; 00102 if (max_quantum > map_max) max_quantum = map_max; 00103 for (i = min_quantum; i <= max_quantum; i++) 00104 map[i]++; 00105 } 00106 prev_blob_box = blob_box; 00107 } 00108 if (gapmap_use_ends) { 00109 /* Trailing space */ 00110 gap_width = max_right - prev_blob_box.right (); 00111 if ((gap_width > gapmap_big_gaps * row->xheight) 00112 && gap_width > 2) { 00113 min_quantum = 00114 (prev_blob_box.right () - min_left) / bucket_size; 00115 if (min_quantum < 0) min_quantum = 0; 00116 for (i = min_quantum; i <= map_max; i++) 00117 map[i]++; 00118 } 00119 } 00120 } 00121 } 00122 for (i = 0; i <= map_max; i++) { 00123 if (map[i] > total_rows / 2) { 00124 if (gapmap_no_isolated_quanta && 00125 (((i == 0) && 00126 (map[i + 1] <= total_rows / 2)) || 00127 ((i == map_max) && 00128 (map[i - 1] <= total_rows / 2)) || 00129 ((i > 0) && 00130 (i < map_max) && 00131 (map[i - 1] <= total_rows / 2) && 00132 (map[i + 1] <= total_rows / 2)))) { 00133 map[i] = 0; //prevent isolated quantum 00134 } 00135 else 00136 any_tabs = TRUE; 00137 } 00138 } 00139 if (gapmap_debug && any_tabs) 00140 tprintf ("Table found\n"); 00141 } 00142 00143 00144 /************************************************************************* 00145 * GAPMAP::table_gap() 00146 * Is there a bucket in the specified range where more than half the rows in the 00147 * block have a wide gap? 00148 *************************************************************************/ 00149 00150 BOOL8 GAPMAP::table_gap( //Is gap a table? 00151 inT16 left, //From here 00152 inT16 right //To here 00153 ) { 00154 inT16 min_quantum; 00155 inT16 max_quantum; 00156 inT16 i; 00157 BOOL8 tab_found = FALSE; 00158 00159 if (!any_tabs) 00160 return FALSE; 00161 00162 min_quantum = (left - min_left) / bucket_size; 00163 max_quantum = (right - min_left) / bucket_size; 00164 // Clip to the bounds of the array. In some circumstances (big blob followed 00165 // by small blob) max_quantum can exceed the map_max bounds, but we clip 00166 // here instead, as it provides better long-term safety. 00167 if (min_quantum < 0) min_quantum = 0; 00168 if (max_quantum > map_max) max_quantum = map_max; 00169 for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++) 00170 if (map[i] > total_rows / 2) 00171 tab_found = TRUE; 00172 return tab_found; 00173 }