tesseract
3.03
|
00001 00002 // File: tablefind.cpp 00003 // Description: Helper classes to find tables from ColPartitions. 00004 // Author: Faisal Shafait (faisal.shafait@dfki.de) 00005 // Created: Tue Jan 06 11:13:01 PST 2009 00006 // 00007 // (C) Copyright 2009, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #ifdef HAVE_CONFIG_H 00025 #include "config_auto.h" 00026 #endif 00027 00028 #include "tablefind.h" 00029 #include <math.h> 00030 00031 #include "allheaders.h" 00032 00033 #include "colpartitionset.h" 00034 #include "tablerecog.h" 00035 00036 namespace tesseract { 00037 00038 // These numbers are used to calculate the global median stats. 00039 // They just set an upper bound on the stats objects. 00040 // Maximum vertical spacing between neighbor partitions. 00041 const int kMaxVerticalSpacing = 500; 00042 // Maximum width of a blob in a partition. 00043 const int kMaxBlobWidth = 500; 00044 00045 // Minimum whitespace size to split a partition (measured as a multiple 00046 // of a partition's median width). 00047 const double kSplitPartitionSize = 2.0; 00048 // To insert text, the partition must satisfy these size constraints 00049 // in AllowTextPartition(). The idea is to filter noise partitions 00050 // determined by the size compared to the global medians. 
// TODO(nbeato): Need to find good numbers again.
const double kAllowTextHeight = 0.5;
const double kAllowTextWidth = 0.6;
const double kAllowTextArea = 0.8;
// The same thing applies to blobs (to filter noise).
// TODO(nbeato): These numbers are a shot in the dark...
// height and width are 0.5 * gridsize() in colfind.cpp
// area is a rough guess for the size of a period.
const double kAllowBlobHeight = 0.3;
const double kAllowBlobWidth = 0.4;
const double kAllowBlobArea = 0.05;

// Minimum number of components in a text partition. A partition having fewer
// components than that is more likely a data partition and is a candidate
// table cell.
const int kMinBoxesInTextPartition = 10;

// Maximum number of components that a data partition can have.
const int kMaxBoxesInDataPartition = 20;

// Maximum allowed gap in a text partition as a multiple of its median size.
const double kMaxGapInTextPartition = 4.0;

// Minimum value that the maximum gap in a text partition should have as a
// factor of its median size.
const double kMinMaxGapInTextPartition = 0.5;

// The amount of overlap that is "normal" for adjacent blobs in a text
// partition. This is used to calculate gap between overlapping blobs.
const double kMaxBlobOverlapFactor = 4.0;

// Maximum x-height a table partition can have as a multiple of global
// median x-height.
const double kMaxTableCellXheight = 2.0;

// Maximum line spacing between a table column header and column contents
// for merging the two (as a multiple of the partition's median_size).
const int kMaxColumnHeaderDistance = 4;

// Minimum ratio of num_table_partitions to num_text_partitions in a column
// block for it to be called a table column.
const double kTableColumnThreshold = 3.0;

// Search for horizontal ruling lines within the vertical margin as a
// multiple of grid size.
const int kRulingVerticalMargin = 3;

// Minimum overlap that a colpartition must have with a table region
// to become part of that table.
const double kMinOverlapWithTable = 0.6;

// Maximum side space (distance from column boundary) that a typical
// text-line in flowing text should have as a multiple of its x-height
// (Median size).
const int kSideSpaceMargin = 10;

// Fraction of the peak of x-projection of a table region to set the
// threshold for the x-projection histogram.
const double kSmallTableProjectionThreshold = 0.35;
const double kLargeTableProjectionThreshold = 0.45;
// Minimum number of rows required to look for more rows in the projection.
const int kLargeTableRowCount = 6;

// Minimum number of rows in a table.
const int kMinRowsInTable = 3;

// The number of "whitespace blobs" that should appear between the
// ColPartition's bounding box and the column tab stops to the left/right
// when looking for center justified tab stops.
const double kRequiredFullJustifiedSpacing = 4.0;

// The amount of padding (multiplied by global_median_xheight_ during use)
// that is vertically added to the search adjacent leader search during
// ColPartition marking.
const int kAdjacentLeaderSearchPadding = 2;

// Used when filtering false positives. When finding the last line
// of a paragraph (typically left-aligned), the previous line should have
// its center to the right of the last line by this scaled amount.
00130 const double kParagraphEndingPreviousLineRatio = 1.3; 00131 00132 // The maximum amount of whitespace allowed left of a paragraph ending. 00133 // Do not filter a ColPartition with more than this space left of it. 00134 const double kMaxParagraphEndingLeftSpaceMultiple = 3.0; 00135 00136 // Used when filtering false positives. The last line of a paragraph 00137 // should be preceded by a line that is predominantly text. This is the 00138 // ratio of text to whitespace (to the right of the text) that is required 00139 // for the previous line to be a text. 00140 const double kMinParagraphEndingTextToWhitespaceRatio = 3.0; 00141 00142 // When counting table columns, this is the required gap between two columns 00143 // (it is multiplied by global_median_xheight_). 00144 const double kMaxXProjectionGapFactor = 2.0; 00145 00146 // Used for similarity in partitions using stroke width. Values copied 00147 // from ColFind.cpp in Ray's CL. 00148 const double kStrokeWidthFractionalTolerance = 0.25; 00149 const double kStrokeWidthConstantTolerance = 2.0; 00150 00151 BOOL_VAR(textord_dump_table_images, false, "Paint table detection output"); 00152 BOOL_VAR(textord_show_tables, false, "Show table regions"); 00153 BOOL_VAR(textord_tablefind_show_mark, false, 00154 "Debug table marking steps in detail"); 00155 BOOL_VAR(textord_tablefind_show_stats, false, 00156 "Show page stats used in table finding"); 00157 BOOL_VAR(textord_tablefind_recognize_tables, false, 00158 "Enables the table recognizer for table layout and filtering."); 00159 00160 ELISTIZE(ColSegment) 00161 CLISTIZE(ColSegment) 00162 00163 // Templated helper function used to create destructor callbacks for the 00164 // BBGrid::ClearGridData() method. 
00165 template <typename T> void DeleteObject(T *object) { 00166 delete object; 00167 } 00168 00169 TableFinder::TableFinder() 00170 : resolution_(0), 00171 global_median_xheight_(0), 00172 global_median_blob_width_(0), 00173 global_median_ledding_(0), 00174 left_to_right_language_(true) { 00175 } 00176 00177 TableFinder::~TableFinder() { 00178 // ColPartitions and ColSegments created by this class for storage in grids 00179 // need to be deleted explicitly. 00180 clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>); 00181 leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>); 00182 fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>); 00183 col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>); 00184 table_grid_.ClearGridData(&DeleteObject<ColSegment>); 00185 } 00186 00187 void TableFinder::set_left_to_right_language(bool order) { 00188 left_to_right_language_ = order; 00189 } 00190 00191 void TableFinder::Init(int grid_size, const ICOORD& bottom_left, 00192 const ICOORD& top_right) { 00193 // Initialize clean partitions list and grid 00194 clean_part_grid_.Init(grid_size, bottom_left, top_right); 00195 leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right); 00196 fragmented_text_grid_.Init(grid_size, bottom_left, top_right); 00197 col_seg_grid_.Init(grid_size, bottom_left, top_right); 00198 table_grid_.Init(grid_size, bottom_left, top_right); 00199 } 00200 00201 // Copy cleaned partitions from part_grid_ to clean_part_grid_ and 00202 // insert leaders and rulers into the leader_and_ruling_grid_ 00203 void TableFinder::InsertCleanPartitions(ColPartitionGrid* grid, 00204 TO_BLOCK* block) { 00205 // Calculate stats. This lets us filter partitions in AllowTextPartition() 00206 // and filter blobs in AllowBlob(). 00207 SetGlobalSpacings(grid); 00208 00209 // Iterate the ColPartitions in the grid. 
00210 ColPartitionGridSearch gsearch(grid); 00211 gsearch.SetUniqueMode(true); 00212 gsearch.StartFullSearch(); 00213 ColPartition* part = NULL; 00214 while ((part = gsearch.NextFullSearch()) != NULL) { 00215 // Reject partitions with nothing useful inside of them. 00216 if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0) 00217 continue; 00218 ColPartition* clean_part = part->ShallowCopy(); 00219 ColPartition* leader_part = NULL; 00220 if (part->IsLineType()) { 00221 InsertRulingPartition(clean_part); 00222 continue; 00223 } 00224 // Insert all non-text partitions to clean_parts 00225 if (!part->IsTextType()) { 00226 InsertImagePartition(clean_part); 00227 continue; 00228 } 00229 // Insert text colpartitions after removing noisy components from them 00230 // The leaders are split into a separate grid. 00231 BLOBNBOX_CLIST* part_boxes = part->boxes(); 00232 BLOBNBOX_C_IT pit(part_boxes); 00233 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) { 00234 BLOBNBOX *pblob = pit.data(); 00235 // Bad blobs... happens in UNLV set. 00236 // news.3G1, page 17 (around x=6) 00237 if (!AllowBlob(*pblob)) 00238 continue; 00239 if (pblob->flow() == BTFT_LEADER) { 00240 if (leader_part == NULL) { 00241 leader_part = part->ShallowCopy(); 00242 leader_part->set_flow(BTFT_LEADER); 00243 } 00244 leader_part->AddBox(pblob); 00245 } else if (pblob->region_type() != BRT_NOISE) { 00246 clean_part->AddBox(pblob); 00247 } 00248 } 00249 clean_part->ComputeLimits(); 00250 ColPartition* fragmented = clean_part->CopyButDontOwnBlobs(); 00251 InsertTextPartition(clean_part); 00252 SplitAndInsertFragmentedTextPartition(fragmented); 00253 if (leader_part != NULL) { 00254 // TODO(nbeato): Note that ComputeLimits does not update the column 00255 // information. So the leader may appear to span more columns than it 00256 // really does later on when IsInSameColumnAs gets called to test 00257 // for adjacent leaders. 
00258 leader_part->ComputeLimits(); 00259 InsertLeaderPartition(leader_part); 00260 } 00261 } 00262 00263 // Make the partition partners better for upper and lower neighbors. 00264 clean_part_grid_.FindPartitionPartners(); 00265 clean_part_grid_.RefinePartitionPartners(false); 00266 } 00267 00268 // High level function to perform table detection 00269 void TableFinder::LocateTables(ColPartitionGrid* grid, 00270 ColPartitionSet** all_columns, 00271 WidthCallback* width_cb, 00272 const FCOORD& reskew) { 00273 // initialize spacing, neighbors, and columns 00274 InitializePartitions(all_columns); 00275 00276 #ifndef GRAPHICS_DISABLED 00277 if (textord_show_tables) { 00278 ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors"); 00279 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); 00280 DisplayColPartitions(table_win, &leader_and_ruling_grid_, 00281 ScrollView::AQUAMARINE); 00282 DisplayColPartitionConnections(table_win, &clean_part_grid_, 00283 ScrollView::ORANGE); 00284 00285 table_win = MakeWindow(100, 300, "Fragmented Text"); 00286 DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE); 00287 } 00288 #endif // GRAPHICS_DISABLED 00289 00290 // mark, filter, and smooth candidate table partitions 00291 MarkTablePartitions(); 00292 00293 // Make single-column blocks from good_columns_ partitions. col_segments are 00294 // moved to a grid later which takes the ownership 00295 ColSegment_LIST column_blocks; 00296 GetColumnBlocks(all_columns, &column_blocks); 00297 // Set the ratio of candidate table partitions in each column 00298 SetColumnsType(&column_blocks); 00299 00300 // Move column segments to col_seg_grid_ 00301 MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_); 00302 00303 // Detect split in column layout that might have occurred due to the 00304 // presence of a table. In such a case, merge the corresponding columns. 
00305 GridMergeColumnBlocks(); 00306 00307 // Group horizontally overlapping table partitions into table columns. 00308 // table_columns created here get deleted at the end of this method. 00309 ColSegment_LIST table_columns; 00310 GetTableColumns(&table_columns); 00311 00312 // Within each column, mark the range table regions occupy based on the 00313 // table columns detected. table_regions are moved to a grid later which 00314 // takes the ownership 00315 ColSegment_LIST table_regions; 00316 GetTableRegions(&table_columns, &table_regions); 00317 00318 #ifndef GRAPHICS_DISABLED 00319 if (textord_tablefind_show_mark) { 00320 ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions"); 00321 DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE); 00322 DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW); 00323 } 00324 #endif // GRAPHICS_DISABLED 00325 00326 // Merge table regions across columns for tables spanning multiple 00327 // columns 00328 MoveColSegmentsToGrid(&table_regions, &table_grid_); 00329 GridMergeTableRegions(); 00330 00331 // Adjust table boundaries by including nearby horizontal lines and left 00332 // out column headers 00333 AdjustTableBoundaries(); 00334 GridMergeTableRegions(); 00335 00336 if (textord_tablefind_recognize_tables) { 00337 // Remove false alarms consiting of a single column 00338 DeleteSingleColumnTables(); 00339 00340 #ifndef GRAPHICS_DISABLED 00341 if (textord_show_tables) { 00342 ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations"); 00343 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); 00344 DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI); 00345 table_grid_.DisplayBoxes(table_win); 00346 } 00347 #endif // GRAPHICS_DISABLED 00348 00349 // Find table grid structure and reject tables that are malformed. 
00350 RecognizeTables(); 00351 GridMergeTableRegions(); 00352 RecognizeTables(); 00353 00354 #ifndef GRAPHICS_DISABLED 00355 if (textord_show_tables) { 00356 ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables"); 00357 DisplayColPartitions(table_win, &clean_part_grid_, 00358 ScrollView::BLUE, ScrollView::BLUE); 00359 table_grid_.DisplayBoxes(table_win); 00360 } 00361 #endif // GRAPHICS_DISABLED 00362 } else { 00363 // Remove false alarms consiting of a single column 00364 // TODO(nbeato): verify this is a NOP after structured table rejection. 00365 // Right now it isn't. If the recognize function is doing what it is 00366 // supposed to do, this function is obsolete. 00367 DeleteSingleColumnTables(); 00368 00369 #ifndef GRAPHICS_DISABLED 00370 if (textord_show_tables) { 00371 ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables"); 00372 DisplayColPartitions(table_win, &clean_part_grid_, 00373 ScrollView::BLUE, ScrollView::BLUE); 00374 table_grid_.DisplayBoxes(table_win); 00375 } 00376 #endif // GRAPHICS_DISABLED 00377 } 00378 00379 if (textord_dump_table_images) 00380 WriteToPix(reskew); 00381 00382 // Merge all colpartitions in table regions to make them a single 00383 // colpartition and revert types of isolated table cells not 00384 // assigned to any table to their original types. 00385 MakeTableBlocks(grid, all_columns, width_cb); 00386 } 00387 // All grids have the same dimensions. The clean_part_grid_ sizes are set from 00388 // the part_grid_ that is passed to InsertCleanPartitions, which was the same as 00389 // the grid that is the base of ColumnFinder. Just return the clean_part_grid_ 00390 // dimensions instead of duplicated memory. 
00391 int TableFinder::gridsize() const { 00392 return clean_part_grid_.gridsize(); 00393 } 00394 int TableFinder::gridwidth() const { 00395 return clean_part_grid_.gridwidth(); 00396 } 00397 int TableFinder::gridheight() const { 00398 return clean_part_grid_.gridheight(); 00399 } 00400 const ICOORD& TableFinder::bleft() const { 00401 return clean_part_grid_.bleft(); 00402 } 00403 const ICOORD& TableFinder::tright() const { 00404 return clean_part_grid_.tright(); 00405 } 00406 00407 void TableFinder::InsertTextPartition(ColPartition* part) { 00408 ASSERT_HOST(part != NULL); 00409 if (AllowTextPartition(*part)) { 00410 clean_part_grid_.InsertBBox(true, true, part); 00411 } else { 00412 delete part; 00413 } 00414 } 00415 void TableFinder::InsertFragmentedTextPartition(ColPartition* part) { 00416 ASSERT_HOST(part != NULL); 00417 if (AllowTextPartition(*part)) { 00418 fragmented_text_grid_.InsertBBox(true, true, part); 00419 } else { 00420 delete part; 00421 } 00422 } 00423 void TableFinder::InsertLeaderPartition(ColPartition* part) { 00424 ASSERT_HOST(part != NULL); 00425 if (!part->IsEmpty() && part->bounding_box().area() > 0) { 00426 leader_and_ruling_grid_.InsertBBox(true, true, part); 00427 } else { 00428 delete part; 00429 } 00430 } 00431 void TableFinder::InsertRulingPartition(ColPartition* part) { 00432 leader_and_ruling_grid_.InsertBBox(true, true, part); 00433 } 00434 void TableFinder::InsertImagePartition(ColPartition* part) { 00435 // NOTE: If images are placed into a different grid in the future, 00436 // the function SetPartitionSpacings needs to be updated. It should 00437 // be the only thing that cares about image partitions. 00438 clean_part_grid_.InsertBBox(true, true, part); 00439 } 00440 00441 // Splits a partition into its "words". The splits happen 00442 // at locations with wide inter-blob spacing. This is useful 00443 // because it allows the table recognize to "cut through" the 00444 // text lines on the page. 
The assumption is that a table 00445 // will have several lines with similar overlapping whitespace 00446 // whereas text will not have this type of property. 00447 // Note: The code Assumes that blobs are sorted by the left side x! 00448 // This will not work (as well) if the blobs are sorted by center/right. 00449 void TableFinder::SplitAndInsertFragmentedTextPartition(ColPartition* part) { 00450 ASSERT_HOST(part != NULL); 00451 // Bye bye empty partitions! 00452 if (part->boxes()->empty()) { 00453 delete part; 00454 return; 00455 } 00456 00457 // The AllowBlob function prevents this. 00458 ASSERT_HOST(part->median_width() > 0); 00459 const double kThreshold = part->median_width() * kSplitPartitionSize; 00460 00461 ColPartition* right_part = part; 00462 bool found_split = true; 00463 while (found_split) { 00464 found_split = false; 00465 BLOBNBOX_C_IT box_it(right_part->boxes()); 00466 // Blobs are sorted left side first. If blobs overlap, 00467 // the previous blob may have a "more right" right side. 00468 // Account for this by always keeping the largest "right" 00469 // so far. 00470 int previous_right = MIN_INT32; 00471 00472 // Look for the next split in the partition. 00473 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) { 00474 const TBOX& box = box_it.data()->bounding_box(); 00475 if (previous_right != MIN_INT32 && 00476 box.left() - previous_right > kThreshold) { 00477 // We have a split position. Split the partition in two pieces. 00478 // Insert the left piece in the grid and keep processing the right. 00479 int mid_x = (box.left() + previous_right) / 2; 00480 ColPartition* left_part = right_part; 00481 right_part = left_part->SplitAt(mid_x); 00482 00483 InsertFragmentedTextPartition(left_part); 00484 found_split = true; 00485 break; 00486 } 00487 00488 // The right side of the previous blobs. 
00489 previous_right = MAX(previous_right, box.right()); 00490 } 00491 } 00492 // When a split is not found, the right part is minimized 00493 // as much as possible, so process it. 00494 InsertFragmentedTextPartition(right_part); 00495 } 00496 00497 // Some simple criteria to filter out now. We want to make sure the 00498 // average blob size in the partition is consistent with the 00499 // global page stats. 00500 // The area metric will almost always pass for multi-blob partitions. 00501 // It is useful when filtering out noise caused by an isolated blob. 00502 bool TableFinder::AllowTextPartition(const ColPartition& part) const { 00503 const double kHeightRequired = global_median_xheight_ * kAllowTextHeight; 00504 const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth; 00505 const int median_area = global_median_xheight_ * global_median_blob_width_; 00506 const double kAreaPerBlobRequired = median_area * kAllowTextArea; 00507 // Keep comparisons strictly greater to disallow 0! 00508 return part.median_size() > kHeightRequired && 00509 part.median_width() > kWidthRequired && 00510 part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count(); 00511 } 00512 00513 // Same as above, applied to blobs. Keep in mind that 00514 // leaders, commas, and periods are important in tables. 00515 bool TableFinder::AllowBlob(const BLOBNBOX& blob) const { 00516 const TBOX& box = blob.bounding_box(); 00517 const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight; 00518 const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth; 00519 const int median_area = global_median_xheight_ * global_median_blob_width_; 00520 const double kAreaRequired = median_area * kAllowBlobArea; 00521 // Keep comparisons strictly greater to disallow 0! 
00522 return box.height() > kHeightRequired && 00523 box.width() > kWidthRequired && 00524 box.area() > kAreaRequired; 00525 } 00526 00527 // TODO(nbeato): The grid that makes the window doesn't seem to matter. 00528 // The only downside is that window messages will be caught by 00529 // clean_part_grid_ instead of a useful object. This is a temporary solution 00530 // for the debug windows created by the TableFinder. 00531 ScrollView* TableFinder::MakeWindow(int x, int y, const char* window_name) { 00532 return clean_part_grid_.MakeWindow(x, y, window_name); 00533 } 00534 00535 // Make single-column blocks from good_columns_ partitions. 00536 void TableFinder::GetColumnBlocks(ColPartitionSet** all_columns, 00537 ColSegment_LIST* column_blocks) { 00538 for (int i = 0; i < gridheight(); ++i) { 00539 ColPartitionSet* columns = all_columns[i]; 00540 if (columns != NULL) { 00541 ColSegment_LIST new_blocks; 00542 // Get boxes from the current vertical position on the grid 00543 columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks); 00544 // Merge the new_blocks boxes into column_blocks if they are well-aligned 00545 GroupColumnBlocks(&new_blocks, column_blocks); 00546 } 00547 } 00548 } 00549 00550 // Merge column segments into the current list if they are well aligned. 
00551 void TableFinder::GroupColumnBlocks(ColSegment_LIST* new_blocks, 00552 ColSegment_LIST* column_blocks) { 00553 ColSegment_IT src_it(new_blocks); 00554 ColSegment_IT dest_it(column_blocks); 00555 // iterate through the source list 00556 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { 00557 ColSegment* src_seg = src_it.data(); 00558 TBOX src_box = src_seg->bounding_box(); 00559 bool match_found = false; 00560 // iterate through the destination list to find a matching column block 00561 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) { 00562 ColSegment* dest_seg = dest_it.data(); 00563 TBOX dest_box = dest_seg->bounding_box(); 00564 if (ConsecutiveBoxes(src_box, dest_box)) { 00565 // If matching block is found, insert the current block into it 00566 // and delete the soure block 00567 dest_seg->InsertBox(src_box); 00568 match_found = true; 00569 delete src_it.extract(); 00570 break; 00571 } 00572 } 00573 // If no match is found, just append the source block to column_blocks 00574 if (!match_found) { 00575 dest_it.add_after_then_move(src_it.extract()); 00576 } 00577 } 00578 } 00579 00580 // are the two boxes immediate neighbors along the vertical direction 00581 bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) { 00582 int x_margin = 20; 00583 int y_margin = 5; 00584 return (abs(b1.left() - b2.left()) < x_margin) && 00585 (abs(b1.right() - b2.right()) < x_margin) && 00586 (abs(b1.top()-b2.bottom()) < y_margin || 00587 abs(b2.top()-b1.bottom()) < y_margin); 00588 } 00589 00590 // Set up info for clean_part_grid_ partitions to be valid during detection 00591 // code. 00592 void TableFinder::InitializePartitions(ColPartitionSet** all_columns) { 00593 FindNeighbors(); 00594 SetPartitionSpacings(&clean_part_grid_, all_columns); 00595 SetGlobalSpacings(&clean_part_grid_); 00596 } 00597 00598 // Set left, right and top, bottom spacings of each colpartition. 
00599 void TableFinder::SetPartitionSpacings(ColPartitionGrid* grid, 00600 ColPartitionSet** all_columns) { 00601 // Iterate the ColPartitions in the grid. 00602 ColPartitionGridSearch gsearch(grid); 00603 gsearch.StartFullSearch(); 00604 ColPartition* part = NULL; 00605 while ((part = gsearch.NextFullSearch()) != NULL) { 00606 ColPartitionSet* columns = all_columns[gsearch.GridY()]; 00607 TBOX box = part->bounding_box(); 00608 int y = part->MidY(); 00609 ColPartition* left_column = columns->ColumnContaining(box.left(), y); 00610 ColPartition* right_column = columns->ColumnContaining(box.right(), y); 00611 // set distance from left column as space to the left 00612 if (left_column) { 00613 int left_space = MAX(0, box.left() - left_column->LeftAtY(y)); 00614 part->set_space_to_left(left_space); 00615 } 00616 // set distance from right column as space to the right 00617 if (right_column) { 00618 int right_space = MAX(0, right_column->RightAtY(y) - box.right()); 00619 part->set_space_to_right(right_space); 00620 } 00621 00622 // Look for images that may be closer. 
00623 // NOTE: used to be part_grid_, might cause issues now 00624 ColPartitionGridSearch hsearch(grid); 00625 hsearch.StartSideSearch(box.left(), box.bottom(), box.top()); 00626 ColPartition* neighbor = NULL; 00627 while ((neighbor = hsearch.NextSideSearch(true)) != NULL) { 00628 if (neighbor->type() == PT_PULLOUT_IMAGE || 00629 neighbor->type() == PT_FLOWING_IMAGE || 00630 neighbor->type() == PT_HEADING_IMAGE) { 00631 int right = neighbor->bounding_box().right(); 00632 if (right < box.left()) { 00633 int space = MIN(box.left() - right, part->space_to_left()); 00634 part->set_space_to_left(space); 00635 } 00636 } 00637 } 00638 hsearch.StartSideSearch(box.left(), box.bottom(), box.top()); 00639 neighbor = NULL; 00640 while ((neighbor = hsearch.NextSideSearch(false)) != NULL) { 00641 if (neighbor->type() == PT_PULLOUT_IMAGE || 00642 neighbor->type() == PT_FLOWING_IMAGE || 00643 neighbor->type() == PT_HEADING_IMAGE) { 00644 int left = neighbor->bounding_box().left(); 00645 if (left > box.right()) { 00646 int space = MIN(left - box.right(), part->space_to_right()); 00647 part->set_space_to_right(space); 00648 } 00649 } 00650 } 00651 00652 ColPartition* upper_part = part->SingletonPartner(true); 00653 if (upper_part) { 00654 int space = MAX(0, upper_part->bounding_box().bottom() - 00655 part->bounding_box().bottom()); 00656 part->set_space_above(space); 00657 } else { 00658 // TODO(nbeato): What constitutes a good value? 00659 // 0 is the default value when not set, explicitly noting it needs to 00660 // be something else. 00661 part->set_space_above(MAX_INT32); 00662 } 00663 00664 ColPartition* lower_part = part->SingletonPartner(false); 00665 if (lower_part) { 00666 int space = MAX(0, part->bounding_box().bottom() - 00667 lower_part->bounding_box().bottom()); 00668 part->set_space_below(space); 00669 } else { 00670 // TODO(nbeato): What constitutes a good value? 00671 // 0 is the default value when not set, explicitly noting it needs to 00672 // be something else. 
00673 part->set_space_below(MAX_INT32); 00674 } 00675 } 00676 } 00677 00678 // Set spacing and closest neighbors above and below a given colpartition. 00679 void TableFinder::SetVerticalSpacing(ColPartition* part) { 00680 TBOX box = part->bounding_box(); 00681 int top_range = MIN(box.top() + kMaxVerticalSpacing, tright().y()); 00682 int bottom_range = MAX(box.bottom() - kMaxVerticalSpacing, bleft().y()); 00683 box.set_top(top_range); 00684 box.set_bottom(bottom_range); 00685 00686 TBOX part_box = part->bounding_box(); 00687 // Start a rect search 00688 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 00689 rectsearch(&clean_part_grid_); 00690 rectsearch.StartRectSearch(box); 00691 ColPartition* neighbor; 00692 int min_space_above = kMaxVerticalSpacing; 00693 int min_space_below = kMaxVerticalSpacing; 00694 ColPartition* above_neighbor = NULL; 00695 ColPartition* below_neighbor = NULL; 00696 while ((neighbor = rectsearch.NextRectSearch()) != NULL) { 00697 if (neighbor == part) 00698 continue; 00699 TBOX neighbor_box = neighbor->bounding_box(); 00700 if (neighbor_box.major_x_overlap(part_box)) { 00701 int gap = abs(part->median_bottom() - neighbor->median_bottom()); 00702 // If neighbor is below current partition 00703 if (neighbor_box.top() < part_box.bottom() && 00704 gap < min_space_below) { 00705 min_space_below = gap; 00706 below_neighbor = neighbor; 00707 } // If neighbor is above current partition 00708 else if (part_box.top() < neighbor_box.bottom() && 00709 gap < min_space_above) { 00710 min_space_above = gap; 00711 above_neighbor = neighbor; 00712 } 00713 } 00714 } 00715 part->set_space_above(min_space_above); 00716 part->set_space_below(min_space_below); 00717 part->set_nearest_neighbor_above(above_neighbor); 00718 part->set_nearest_neighbor_below(below_neighbor); 00719 } 00720 00721 // Set global spacing and x-height estimates 00722 void TableFinder::SetGlobalSpacings(ColPartitionGrid* grid) { 00723 STATS xheight_stats(0, 
kMaxVerticalSpacing + 1); 00724 STATS width_stats(0, kMaxBlobWidth + 1); 00725 STATS ledding_stats(0, kMaxVerticalSpacing + 1); 00726 // Iterate the ColPartitions in the grid. 00727 ColPartitionGridSearch gsearch(grid); 00728 gsearch.SetUniqueMode(true); 00729 gsearch.StartFullSearch(); 00730 ColPartition* part = NULL; 00731 while ((part = gsearch.NextFullSearch()) != NULL) { 00732 // TODO(nbeato): HACK HACK HACK! medians are equal to partition length. 00733 // ComputeLimits needs to get called somewhere outside of TableFinder 00734 // to make sure the partitions are properly initialized. 00735 // When this is called, SmoothPartitionPartners dies in an assert after 00736 // table find runs. Alternative solution. 00737 // part->ComputeLimits(); 00738 if (part->IsTextType()) { 00739 // xheight_stats.add(part->median_size(), part->boxes_count()); 00740 // width_stats.add(part->median_width(), part->boxes_count()); 00741 00742 // This loop can be removed when above issues are fixed. 00743 // Replace it with the 2 lines commented out above. 
00744 BLOBNBOX_C_IT it(part->boxes()); 00745 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00746 xheight_stats.add(it.data()->bounding_box().height(), 1); 00747 width_stats.add(it.data()->bounding_box().width(), 1); 00748 } 00749 00750 ledding_stats.add(part->space_above(), 1); 00751 ledding_stats.add(part->space_below(), 1); 00752 } 00753 } 00754 // Set estimates based on median of statistics obtained 00755 set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5)); 00756 set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5)); 00757 set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5)); 00758 #ifndef GRAPHICS_DISABLED 00759 if (textord_tablefind_show_stats) { 00760 const char* kWindowName = "X-height (R), X-width (G), and ledding (B)"; 00761 ScrollView* stats_win = MakeWindow(500, 10, kWindowName); 00762 xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED); 00763 width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN); 00764 ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE); 00765 } 00766 #endif // GRAPHICS_DISABLED 00767 } 00768 00769 void TableFinder::set_global_median_xheight(int xheight) { 00770 global_median_xheight_ = xheight; 00771 } 00772 void TableFinder::set_global_median_blob_width(int width) { 00773 global_median_blob_width_ = width; 00774 } 00775 void TableFinder::set_global_median_ledding(int ledding) { 00776 global_median_ledding_ = ledding; 00777 } 00778 00779 void TableFinder::FindNeighbors() { 00780 ColPartitionGridSearch gsearch(&clean_part_grid_); 00781 gsearch.StartFullSearch(); 00782 ColPartition* part = NULL; 00783 while ((part = gsearch.NextFullSearch()) != NULL) { 00784 // TODO(nbeato): Rename this function, meaning is different now. 
00785 // IT is finding nearest neighbors its own way 00786 //SetVerticalSpacing(part); 00787 00788 ColPartition* upper = part->SingletonPartner(true); 00789 if (upper) 00790 part->set_nearest_neighbor_above(upper); 00791 00792 ColPartition* lower = part->SingletonPartner(false); 00793 if (lower) 00794 part->set_nearest_neighbor_below(lower); 00795 } 00796 } 00797 00798 // High level interface. Input is an unmarked ColPartitionGrid 00799 // (namely, clean_part_grid_). Partitions are identified using local 00800 // information and filter/smoothed. The function exit should contain 00801 // a good sampling of the table partitions. 00802 void TableFinder::MarkTablePartitions() { 00803 MarkPartitionsUsingLocalInformation(); 00804 if (textord_tablefind_show_mark) { 00805 ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions"); 00806 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); 00807 DisplayColPartitions(table_win, &leader_and_ruling_grid_, 00808 ScrollView::AQUAMARINE); 00809 } 00810 FilterFalseAlarms(); 00811 if (textord_tablefind_show_mark) { 00812 ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions"); 00813 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); 00814 DisplayColPartitions(table_win, &leader_and_ruling_grid_, 00815 ScrollView::AQUAMARINE); 00816 } 00817 SmoothTablePartitionRuns(); 00818 if (textord_tablefind_show_mark) { 00819 ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions"); 00820 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); 00821 DisplayColPartitions(table_win, &leader_and_ruling_grid_, 00822 ScrollView::AQUAMARINE); 00823 } 00824 FilterFalseAlarms(); 00825 if (textord_tablefind_show_mark || textord_show_tables) { 00826 ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions"); 00827 DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE); 00828 DisplayColPartitions(table_win, &leader_and_ruling_grid_, 
00829 ScrollView::AQUAMARINE); 00830 } 00831 } 00832 00833 // These types of partitions are marked as table partitions: 00834 // 1- Partitions that have at lease one large gap between words 00835 // 2- Partitions that consist of only one word (no significant gap 00836 // between components) 00837 // 3- Partitions that vertically overlap with other partitions within the 00838 // same column. 00839 // 4- Partitions with leaders before/after them. 00840 void TableFinder::MarkPartitionsUsingLocalInformation() { 00841 // Iterate the ColPartitions in the grid. 00842 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 00843 gsearch(&clean_part_grid_); 00844 gsearch.StartFullSearch(); 00845 ColPartition* part = NULL; 00846 while ((part = gsearch.NextFullSearch()) != NULL) { 00847 if (!part->IsTextType()) // Only consider text partitions 00848 continue; 00849 // Only consider partitions in dominant font size or smaller 00850 if (part->median_size() > kMaxTableCellXheight * global_median_xheight_) 00851 continue; 00852 // Mark partitions with a large gap, or no significant gap as 00853 // table partitions. 00854 // Comments: It produces several false alarms at: 00855 // - last line of a paragraph (fixed) 00856 // - single word section headings 00857 // - page headers and footers 00858 // - numbered equations 00859 // - line drawing regions 00860 // TODO(faisal): detect and fix above-mentioned cases 00861 if (HasWideOrNoInterWordGap(part) || 00862 HasLeaderAdjacent(*part)) { 00863 part->set_table_type(); 00864 } 00865 } 00866 } 00867 00868 // Check if the partition has at least one large gap between words or no 00869 // significant gap at all 00870 bool TableFinder::HasWideOrNoInterWordGap(ColPartition* part) const { 00871 // Should only get text partitions. 
00872 ASSERT_HOST(part->IsTextType()); 00873 // Blob access 00874 BLOBNBOX_CLIST* part_boxes = part->boxes(); 00875 BLOBNBOX_C_IT it(part_boxes); 00876 // Check if this is a relatively small partition (such as a single word) 00877 if (part->bounding_box().width() < 00878 kMinBoxesInTextPartition * part->median_size() && 00879 part_boxes->length() < kMinBoxesInTextPartition) 00880 return true; 00881 00882 // Variables used to compute inter-blob spacing. 00883 int current_x0 = -1; 00884 int current_x1 = -1; 00885 int previous_x1 = -1; 00886 // Stores the maximum gap detected. 00887 int largest_partition_gap_found = -1; 00888 // Text partition gap limits. If this is text (and not a table), 00889 // there should be at least one gap larger than min_gap and no gap 00890 // larger than max_gap. 00891 const double max_gap = kMaxGapInTextPartition * part->median_size(); 00892 const double min_gap = kMinMaxGapInTextPartition * part->median_size(); 00893 00894 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00895 BLOBNBOX* blob = it.data(); 00896 current_x0 = blob->bounding_box().left(); 00897 current_x1 = blob->bounding_box().right(); 00898 if (previous_x1 != -1) { 00899 int gap = current_x0 - previous_x1; 00900 00901 // TODO(nbeato): Boxes may overlap? Huh? 00902 // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors 00903 // on the top right of the page are filtered out with this line. 00904 // Note 2: Iterating over blobs in a partition, so we are looking for 00905 // spacing between the words. 00906 if (gap < 0) { 00907 // More likely case, the blobs slightly overlap. This can happen 00908 // with diacritics (accents) or broken alphabet symbols (characters). 00909 // Merge boxes together by taking max of right sides. 00910 if (-gap < part->median_size() * kMaxBlobOverlapFactor) { 00911 previous_x1 = MAX(previous_x1, current_x1); 00912 continue; 00913 } 00914 // Extreme case, blobs overlap significantly in the same partition... 
00915 // This should not happen often (if at all), but it does. 00916 // TODO(nbeato): investigate cases when this happens. 00917 else { 00918 // The behavior before was to completely ignore this case. 00919 } 00920 } 00921 00922 // If a large enough gap is found, mark it as a table cell (return true) 00923 if (gap > max_gap) 00924 return true; 00925 if (gap > largest_partition_gap_found) 00926 largest_partition_gap_found = gap; 00927 } 00928 previous_x1 = current_x1; 00929 } 00930 // Since no large gap was found, return false if the partition is too 00931 // long to be a data cell 00932 if (part->bounding_box().width() > 00933 kMaxBoxesInDataPartition * part->median_size() || 00934 part_boxes->length() > kMaxBoxesInDataPartition) 00935 return false; 00936 00937 // A partition may be a single blob. In this case, it's an isolated symbol 00938 // or non-text (such as a ruling or image). 00939 // Detect these as table partitions? Shouldn't this be case by case? 00940 // The behavior before was to ignore this, making max_partition_gap < 0 00941 // and implicitly return true. Just making it explicit. 00942 if (largest_partition_gap_found == -1) 00943 return true; 00944 00945 // return true if the maximum gap found is smaller than the minimum allowed 00946 // max_gap in a text partition. This indicates that there is no signficant 00947 // space in the partition, hence it is likely a single word. 00948 return largest_partition_gap_found < min_gap; 00949 } 00950 00951 // A criteria for possible tables is that a table may have leaders 00952 // between data cells. An aggressive solution to find such tables is to 00953 // explicitly mark partitions that have adjacent leaders. 00954 // Note that this includes overlapping leaders. However, it does not 00955 // include leaders in different columns on the page. 00956 // Possible false-positive will include lists, such as a table of contents. 
00957 // As these arise, the agressive nature of this search may need to be 00958 // trimmed down. 00959 bool TableFinder::HasLeaderAdjacent(const ColPartition& part) { 00960 if (part.flow() == BTFT_LEADER) 00961 return true; 00962 // Search range is left and right bounded by an offset of the 00963 // median xheight. This offset is to allow some tolerance to the 00964 // the leaders on the page in the event that the alignment is still 00965 // a bit off. 00966 const TBOX& box = part.bounding_box(); 00967 const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_; 00968 const int top = box.top() + search_size; 00969 const int bottom = box.bottom() - search_size; 00970 ColPartitionGridSearch hsearch(&leader_and_ruling_grid_); 00971 for (int direction = 0; direction < 2; ++direction) { 00972 bool right_to_left = (direction == 0); 00973 int x = right_to_left ? box.right() : box.left(); 00974 hsearch.StartSideSearch(x, bottom, top); 00975 ColPartition* leader = NULL; 00976 while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) { 00977 // This should not happen, they are in different grids. 00978 ASSERT_HOST(&part != leader); 00979 // The leader could be a horizontal ruling in the grid. 00980 // Make sure it is actually a leader. 00981 if (leader->flow() != BTFT_LEADER) 00982 continue; 00983 // Make sure the leader shares a page column with the partition, 00984 // otherwise we are spreading across columns. 00985 if (!part.IsInSameColumnAs(*leader)) 00986 break; 00987 // There should be a significant vertical overlap 00988 if (!leader->VSignificantCoreOverlap(part)) 00989 continue; 00990 // Leader passed all tests, so it is adjacent. 00991 return true; 00992 } 00993 } 00994 // No leaders are adjacent to the given partition. 00995 return false; 00996 } 00997 00998 // Filter individual text partitions marked as table partitions 00999 // consisting of paragraph endings, small section headings, and 01000 // headers and footers. 
01001 void TableFinder::FilterFalseAlarms() { 01002 FilterParagraphEndings(); 01003 FilterHeaderAndFooter(); 01004 // TODO(nbeato): Fully justified text as non-table? 01005 } 01006 01007 void TableFinder::FilterParagraphEndings() { 01008 // Detect last line of paragraph 01009 // Iterate the ColPartitions in the grid. 01010 ColPartitionGridSearch gsearch(&clean_part_grid_); 01011 gsearch.StartFullSearch(); 01012 ColPartition* part = NULL; 01013 while ((part = gsearch.NextFullSearch()) != NULL) { 01014 if (part->type() != PT_TABLE) 01015 continue; // Consider only table partitions 01016 01017 // Paragraph ending should have flowing text above it. 01018 ColPartition* upper_part = part->nearest_neighbor_above(); 01019 if (!upper_part) 01020 continue; 01021 if (upper_part->type() != PT_FLOWING_TEXT) 01022 continue; 01023 if (upper_part->bounding_box().width() < 01024 2 * part->bounding_box().width()) 01025 continue; 01026 // Check if its the last line of a paragraph. 01027 // In most cases, a paragraph ending should be left-aligned to text line 01028 // above it. Sometimes, it could be a 2 line paragraph, in which case 01029 // the line above it is indented. 01030 // To account for that, check if the partition center is to 01031 // the left of the one above it. 01032 int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2; 01033 int upper_mid = (upper_part->bounding_box().left() + 01034 upper_part->bounding_box().right()) / 2; 01035 int current_spacing = 0; // spacing of the current line to margin 01036 int upper_spacing = 0; // spacing of the previous line to the margin 01037 if (left_to_right_language_) { 01038 // Left to right languages, use mid - left to figure out the distance 01039 // the middle is from the left margin. 
01040 int left = MIN(part->bounding_box().left(), 01041 upper_part->bounding_box().left()); 01042 current_spacing = mid - left; 01043 upper_spacing = upper_mid - left; 01044 } else { 01045 // Right to left languages, use right - mid to figure out the distance 01046 // the middle is from the right margin. 01047 int right = MAX(part->bounding_box().right(), 01048 upper_part->bounding_box().right()); 01049 current_spacing = right - mid; 01050 upper_spacing = right - upper_mid; 01051 } 01052 if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing) 01053 continue; 01054 01055 // Paragraphs should have similar fonts. 01056 if (!part->MatchingSizes(*upper_part) || 01057 !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance, 01058 kStrokeWidthConstantTolerance)) { 01059 continue; 01060 } 01061 01062 // The last line of a paragraph should be left aligned. 01063 // TODO(nbeato): This would be untrue if the text was right aligned. 01064 // How often is that? 01065 if (part->space_to_left() > 01066 kMaxParagraphEndingLeftSpaceMultiple * part->median_size()) 01067 continue; 01068 // The line above it should be right aligned (assuming justified format). 01069 // Since we can't assume justified text, we compare whitespace to text. 01070 // The above line should have majority spanning text (or the current 01071 // line could have fit on the previous line). So compare 01072 // whitespace to text. 01073 if (upper_part->bounding_box().width() < 01074 kMinParagraphEndingTextToWhitespaceRatio * upper_part->space_to_right()) 01075 continue; 01076 01077 // Ledding above the line should be less than ledding below 01078 if (part->space_above() >= part->space_below() || 01079 part->space_above() > 2 * global_median_ledding_) 01080 continue; 01081 01082 // If all checks failed, it is probably text. 
01083 part->clear_table_type(); 01084 } 01085 } 01086 01087 void TableFinder::FilterHeaderAndFooter() { 01088 // Consider top-most text colpartition as header and bottom most as footer 01089 ColPartition* header = NULL; 01090 ColPartition* footer = NULL; 01091 int max_top = MIN_INT32; 01092 int min_bottom = MAX_INT32; 01093 ColPartitionGridSearch gsearch(&clean_part_grid_); 01094 gsearch.StartFullSearch(); 01095 ColPartition* part = NULL; 01096 while ((part = gsearch.NextFullSearch()) != NULL) { 01097 if (!part->IsTextType()) 01098 continue; // Consider only text partitions 01099 int top = part->bounding_box().top(); 01100 int bottom = part->bounding_box().bottom(); 01101 if (top > max_top) { 01102 max_top = top; 01103 header = part; 01104 } 01105 if (bottom < min_bottom) { 01106 min_bottom = bottom; 01107 footer = part; 01108 } 01109 } 01110 if (header) 01111 header->clear_table_type(); 01112 if (footer) 01113 footer->clear_table_type(); 01114 } 01115 01116 // Mark all ColPartitions as table cells that have a table cell above 01117 // and below them 01118 // TODO(faisal): This is too aggressive at the moment. The method needs to 01119 // consider spacing and alignment as well. Detection of false alarm table cells 01120 // should also be done as part of it. 01121 void TableFinder::SmoothTablePartitionRuns() { 01122 // Iterate the ColPartitions in the grid. 
01123 ColPartitionGridSearch gsearch(&clean_part_grid_); 01124 gsearch.StartFullSearch(); 01125 ColPartition* part = NULL; 01126 while ((part = gsearch.NextFullSearch()) != NULL) { 01127 if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN) 01128 continue; // Consider only text partitions 01129 ColPartition* upper_part = part->nearest_neighbor_above(); 01130 ColPartition* lower_part = part->nearest_neighbor_below(); 01131 if (!upper_part || !lower_part) 01132 continue; 01133 if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE) 01134 part->set_table_type(); 01135 } 01136 01137 // Pass 2, do the opposite. If both the upper and lower neighbors 01138 // exist and are not tables, this probably shouldn't be a table. 01139 gsearch.StartFullSearch(); 01140 part = NULL; 01141 while ((part = gsearch.NextFullSearch()) != NULL) { 01142 if (part->type() != PT_TABLE) 01143 continue; // Consider only text partitions 01144 ColPartition* upper_part = part->nearest_neighbor_above(); 01145 ColPartition* lower_part = part->nearest_neighbor_below(); 01146 01147 // table can't be by itself 01148 if ((upper_part && upper_part->type() != PT_TABLE) && 01149 (lower_part && lower_part->type() != PT_TABLE)) { 01150 part->clear_table_type(); 01151 } 01152 } 01153 } 01154 01155 // Set the type of a column segment based on the ratio of table to text cells 01156 void TableFinder::SetColumnsType(ColSegment_LIST* column_blocks) { 01157 ColSegment_IT it(column_blocks); 01158 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 01159 ColSegment* seg = it.data(); 01160 TBOX box = seg->bounding_box(); 01161 int num_table_cells = 0; 01162 int num_text_cells = 0; 01163 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 01164 rsearch(&clean_part_grid_); 01165 rsearch.SetUniqueMode(true); 01166 rsearch.StartRectSearch(box); 01167 ColPartition* part = NULL; 01168 while ((part = rsearch.NextRectSearch()) != NULL) { 01169 if (part->type() == PT_TABLE) { 01170 
num_table_cells++; 01171 } else if (part->type() == PT_FLOWING_TEXT) { 01172 num_text_cells++; 01173 } 01174 } 01175 // If a column block has no text or table partition in it, it is not needed 01176 // for table detection. 01177 if (!num_table_cells && !num_text_cells) { 01178 delete it.extract(); 01179 } else { 01180 seg->set_num_table_cells(num_table_cells); 01181 seg->set_num_text_cells(num_text_cells); 01182 // set column type based on the ratio of table to text cells 01183 seg->set_type(); 01184 } 01185 } 01186 } 01187 01188 // Move column blocks to grid 01189 void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments, 01190 ColSegmentGrid *col_seg_grid) { 01191 ColSegment_IT it(segments); 01192 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 01193 ColSegment* seg = it.extract(); 01194 col_seg_grid->InsertBBox(true, true, seg); 01195 } 01196 } 01197 01198 // Merge column blocks if a split is detected due to the presence of a 01199 // table. A text block is considered split if it has multiple 01200 // neighboring blocks above/below it, and at least one of the 01201 // neighboring blocks is of table type (has a high density of table 01202 // partitions). In this case neighboring blocks in the direction 01203 // (above/below) of the table block are merged with the text block. 01204 01205 // Comment: This method does not handle split due to a full page table 01206 // since table columns in this case do not have a text column on which 01207 // split decision can be based. 01208 void TableFinder::GridMergeColumnBlocks() { 01209 int margin = gridsize(); 01210 01211 // Iterate the Column Blocks in the grid. 
01212 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 01213 gsearch(&col_seg_grid_); 01214 gsearch.StartFullSearch(); 01215 ColSegment* seg; 01216 while ((seg = gsearch.NextFullSearch()) != NULL) { 01217 if (seg->type() != COL_TEXT) 01218 continue; // only consider text blocks for split detection 01219 bool neighbor_found = false; 01220 bool modified = false; // Modified at least once 01221 // keep expanding current box as long as neighboring table columns 01222 // are found above or below it. 01223 do { 01224 TBOX box = seg->bounding_box(); 01225 // slightly expand the search region vertically 01226 int top_range = MIN(box.top() + margin, tright().y()); 01227 int bottom_range = MAX(box.bottom() - margin, bleft().y()); 01228 box.set_top(top_range); 01229 box.set_bottom(bottom_range); 01230 neighbor_found = false; 01231 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 01232 rectsearch(&col_seg_grid_); 01233 rectsearch.StartRectSearch(box); 01234 ColSegment* neighbor = NULL; 01235 while ((neighbor = rectsearch.NextRectSearch()) != NULL) { 01236 if (neighbor == seg) 01237 continue; 01238 const TBOX& neighbor_box = neighbor->bounding_box(); 01239 // If the neighbor box significantly overlaps with the current 01240 // box (due to the expansion of the current box in the 01241 // previous iteration of this loop), remove the neighbor box 01242 // and expand the current box to include it. 
01243 if (neighbor_box.overlap_fraction(box) >= 0.9) { 01244 seg->InsertBox(neighbor_box); 01245 modified = true; 01246 rectsearch.RemoveBBox(); 01247 gsearch.RepositionIterator(); 01248 delete neighbor; 01249 continue; 01250 } 01251 // Only expand if the neighbor box is of table type 01252 if (neighbor->type() != COL_TABLE) 01253 continue; 01254 // Insert the neighbor box into the current column block 01255 if (neighbor_box.major_x_overlap(box) && 01256 !box.contains(neighbor_box)) { 01257 seg->InsertBox(neighbor_box); 01258 neighbor_found = true; 01259 modified = true; 01260 rectsearch.RemoveBBox(); 01261 gsearch.RepositionIterator(); 01262 delete neighbor; 01263 } 01264 } 01265 } while (neighbor_found); 01266 if (modified) { 01267 // Because the box has changed, it has to be removed first. 01268 gsearch.RemoveBBox(); 01269 col_seg_grid_.InsertBBox(true, true, seg); 01270 gsearch.RepositionIterator(); 01271 } 01272 } 01273 } 01274 01275 // Group horizontally overlapping table partitions into table columns. 01276 // TODO(faisal): This is too aggressive at the moment. The method should 01277 // consider more attributes to group table partitions together. Some common 01278 // errors are: 01279 // 1- page number is merged with a table column above it even 01280 // if there is a large vertical gap between them. 01281 // 2- column headers go on to catch one of the columns arbitrarily 01282 // 3- an isolated noise blob near page top or bottom merges with the table 01283 // column below/above it 01284 // 4- cells from two vertically adjacent tables merge together to make a 01285 // single column resulting in merging of the two tables 01286 void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) { 01287 ColSegment_IT it(table_columns); 01288 // Iterate the ColPartitions in the grid. 
01289 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 01290 gsearch(&clean_part_grid_); 01291 gsearch.StartFullSearch(); 01292 ColPartition* part; 01293 while ((part = gsearch.NextFullSearch()) != NULL) { 01294 if (part->inside_table_column() || part->type() != PT_TABLE) 01295 continue; // prevent a partition to be assigned to multiple columns 01296 const TBOX& box = part->bounding_box(); 01297 ColSegment* col = new ColSegment(); 01298 col->InsertBox(box); 01299 part->set_inside_table_column(true); 01300 // Start a search below the current cell to find bottom neighbours 01301 // Note: a full search will always process things above it first, so 01302 // this should be starting at the highest cell and working its way down. 01303 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 01304 vsearch(&clean_part_grid_); 01305 vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom()); 01306 ColPartition* neighbor = NULL; 01307 bool found_neighbours = false; 01308 while ((neighbor = vsearch.NextVerticalSearch(true)) != NULL) { 01309 // only consider neighbors not assigned to any column yet 01310 if (neighbor->inside_table_column()) 01311 continue; 01312 // Horizontal lines should not break the flow 01313 if (neighbor->IsHorizontalLine()) 01314 continue; 01315 // presence of a non-table neighbor marks the end of current 01316 // table column 01317 if (neighbor->type() != PT_TABLE) 01318 break; 01319 // add the neighbor partition to the table column 01320 const TBOX& neighbor_box = neighbor->bounding_box(); 01321 col->InsertBox(neighbor_box); 01322 neighbor->set_inside_table_column(true); 01323 found_neighbours = true; 01324 } 01325 if (found_neighbours) { 01326 it.add_after_then_move(col); 01327 } else { 01328 part->set_inside_table_column(false); 01329 delete col; 01330 } 01331 } 01332 } 01333 01334 // Mark regions in a column that are x-bounded by the column boundaries and 01335 // y-bounded by the table columns' projection on the y-axis 
as table regions 01336 void TableFinder::GetTableRegions(ColSegment_LIST* table_columns, 01337 ColSegment_LIST* table_regions) { 01338 ColSegment_IT cit(table_columns); 01339 ColSegment_IT rit(table_regions); 01340 // Iterate through column blocks 01341 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 01342 gsearch(&col_seg_grid_); 01343 gsearch.StartFullSearch(); 01344 ColSegment* part; 01345 int page_height = tright().y() - bleft().y(); 01346 ASSERT_HOST(page_height > 0); 01347 // create a bool array to hold projection on y-axis 01348 bool* table_region = new bool[page_height]; 01349 while ((part = gsearch.NextFullSearch()) != NULL) { 01350 TBOX part_box = part->bounding_box(); 01351 // reset the projection array 01352 for (int i = 0; i < page_height; i++) { 01353 table_region[i] = false; 01354 } 01355 // iterate through all table columns to find regions in the current 01356 // page column block 01357 cit.move_to_first(); 01358 for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) { 01359 TBOX col_box = cit.data()->bounding_box(); 01360 // find intersection region of table column and page column 01361 TBOX intersection_box = col_box.intersection(part_box); 01362 // project table column on the y-axis 01363 for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) { 01364 table_region[i - bleft().y()] = true; 01365 } 01366 } 01367 // set x-limits of table regions to page column width 01368 TBOX current_table_box; 01369 current_table_box.set_left(part_box.left()); 01370 current_table_box.set_right(part_box.right()); 01371 // go through the y-axis projection to find runs of table 01372 // regions. Each run makes one table region. 01373 for (int i = 1; i < page_height; i++) { 01374 // detect start of a table region 01375 if (!table_region[i - 1] && table_region[i]) { 01376 current_table_box.set_bottom(i + bleft().y()); 01377 } 01378 // TODO(nbeato): Is it guaranteed that the last row is not a table region? 
01379 // detect end of a table region 01380 if (table_region[i - 1] && !table_region[i]) { 01381 current_table_box.set_top(i + bleft().y()); 01382 if (!current_table_box.null_box()) { 01383 ColSegment* seg = new ColSegment(); 01384 seg->InsertBox(current_table_box); 01385 rit.add_after_then_move(seg); 01386 } 01387 } 01388 } 01389 } 01390 delete[] table_region; 01391 } 01392 01393 // Merge table regions corresponding to tables spanning multiple columns if 01394 // there is a colpartition (horizontal ruling line or normal text) that 01395 // touches both regions. 01396 // TODO(faisal): A rare error occurs if there are two horizontally adjacent 01397 // tables with aligned ruling lines. In this case, line finder returns a 01398 // single line and hence the tables get merged together 01399 void TableFinder::GridMergeTableRegions() { 01400 // Iterate the table regions in the grid. 01401 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 01402 gsearch(&table_grid_); 01403 gsearch.StartFullSearch(); 01404 ColSegment* seg = NULL; 01405 while ((seg = gsearch.NextFullSearch()) != NULL) { 01406 bool neighbor_found = false; 01407 bool modified = false; // Modified at least once 01408 do { 01409 // Start a rectangle search x-bounded by the image and y by the table 01410 const TBOX& box = seg->bounding_box(); 01411 TBOX search_region(box); 01412 search_region.set_left(bleft().x()); 01413 search_region.set_right(tright().x()); 01414 neighbor_found = false; 01415 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 01416 rectsearch(&table_grid_); 01417 rectsearch.StartRectSearch(search_region); 01418 ColSegment* neighbor = NULL; 01419 while ((neighbor = rectsearch.NextRectSearch()) != NULL) { 01420 if (neighbor == seg) 01421 continue; 01422 const TBOX& neighbor_box = neighbor->bounding_box(); 01423 // Check if a neighbor box has a large overlap with the table 01424 // region. This may happen as a result of merging two table 01425 // regions in the previous iteration. 
01426 if (neighbor_box.overlap_fraction(box) >= 0.9) { 01427 seg->InsertBox(neighbor_box); 01428 rectsearch.RemoveBBox(); 01429 gsearch.RepositionIterator(); 01430 delete neighbor; 01431 modified = true; 01432 continue; 01433 } 01434 // Check if two table regions belong together based on a common 01435 // horizontal ruling line 01436 if (BelongToOneTable(box, neighbor_box)) { 01437 seg->InsertBox(neighbor_box); 01438 neighbor_found = true; 01439 modified = true; 01440 rectsearch.RemoveBBox(); 01441 gsearch.RepositionIterator(); 01442 delete neighbor; 01443 } 01444 } 01445 } while (neighbor_found); 01446 if (modified) { 01447 // Because the box has changed, it has to be removed first. 01448 gsearch.RemoveBBox(); 01449 table_grid_.InsertBBox(true, true, seg); 01450 gsearch.RepositionIterator(); 01451 } 01452 } 01453 } 01454 01455 // Decide if two table regions belong to one table based on a common 01456 // horizontal ruling line or another colpartition 01457 bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) { 01458 // Check the obvious case. Most likely not true because overlapping boxes 01459 // should already be merged, but seems like a good thing to do in case things 01460 // change. 
01461 if (box1.overlap(box2)) 01462 return true; 01463 // Check for ColPartitions spanning both table regions 01464 TBOX bbox = box1.bounding_union(box2); 01465 // Start a rect search on bbox 01466 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 01467 rectsearch(&clean_part_grid_); 01468 rectsearch.StartRectSearch(bbox); 01469 ColPartition* part = NULL; 01470 while ((part = rectsearch.NextRectSearch()) != NULL) { 01471 const TBOX& part_box = part->bounding_box(); 01472 // return true if a colpartition spanning both table regions is found 01473 if (part_box.overlap(box1) && part_box.overlap(box2) && 01474 !part->IsImageType()) 01475 return true; 01476 } 01477 return false; 01478 } 01479 01480 // Adjust table boundaries by: 01481 // - building a tight bounding box around all ColPartitions contained in it. 01482 // - expanding table boundaries to include all colpartitions that overlap the 01483 // table by more than half of their area 01484 // - expanding table boundaries to include nearby horizontal rule lines 01485 // - expanding table vertically to include left out column headers 01486 // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually 01487 // makes following errors: 01488 // 1- horizontal lines consisting of underlines are included in the table if 01489 // they are close enough 01490 // 2- horizontal lines originating from noise tend to get merged with a table 01491 // near the top of the page 01492 // 3- the criteria for including horizontal lines is very generous. 
Many times 01493 // horizontal lines separating headers and footers get merged with a 01494 // single-column table in a multi-column page thereby including text 01495 // from the neighboring column inside the table 01496 // 4- the criteria for including left out column headers also tends to 01497 // occasionally include text-lines above the tables, typically from 01498 // table caption 01499 void TableFinder::AdjustTableBoundaries() { 01500 // Iterate the table regions in the grid 01501 ColSegment_CLIST adjusted_tables; 01502 ColSegment_C_IT it(&adjusted_tables); 01503 ColSegmentGridSearch gsearch(&table_grid_); 01504 gsearch.StartFullSearch(); 01505 ColSegment* table = NULL; 01506 while ((table = gsearch.NextFullSearch()) != NULL) { 01507 const TBOX& table_box = table->bounding_box(); 01508 TBOX grown_box = table_box; 01509 GrowTableBox(table_box, &grown_box); 01510 // To prevent a table from expanding again, do not insert the 01511 // modified box back to the grid. Instead move it to a list and 01512 // and remove it from the grid. The list is moved later back to the grid. 01513 if (!grown_box.null_box()) { 01514 ColSegment* col = new ColSegment(); 01515 col->InsertBox(grown_box); 01516 it.add_after_then_move(col); 01517 } 01518 gsearch.RemoveBBox(); 01519 delete table; 01520 } 01521 // clear table grid to move final tables in it 01522 // TODO(nbeato): table_grid_ should already be empty. The above loop 01523 // removed everything. Maybe just assert it is empty? 01524 table_grid_.Clear(); 01525 it.move_to_first(); 01526 // move back final tables to table_grid_ 01527 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 01528 ColSegment* seg = it.extract(); 01529 table_grid_.InsertBBox(true, true, seg); 01530 } 01531 } 01532 01533 void TableFinder::GrowTableBox(const TBOX& table_box, TBOX* result_box) { 01534 // TODO(nbeato): The growing code is a bit excessive right now. 
01535 // By removing these lines, the partitions considered need 01536 // to have some overlap or be special cases. These lines could 01537 // be added again once a check is put in place to make sure that 01538 // growing tables don't stomp on a lot of non-table partitions. 01539 01540 // search for horizontal ruling lines within the vertical margin 01541 // int vertical_margin = kRulingVerticalMargin * gridsize(); 01542 TBOX search_box = table_box; 01543 // int top = MIN(search_box.top() + vertical_margin, tright().y()); 01544 // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y()); 01545 // search_box.set_top(top); 01546 // search_box.set_bottom(bottom); 01547 01548 GrowTableToIncludePartials(table_box, search_box, result_box); 01549 GrowTableToIncludeLines(table_box, search_box, result_box); 01550 IncludeLeftOutColumnHeaders(result_box); 01551 } 01552 01553 // Grow a table by increasing the size of the box to include 01554 // partitions with significant overlap with the table. 01555 void TableFinder::GrowTableToIncludePartials(const TBOX& table_box, 01556 const TBOX& search_range, 01557 TBOX* result_box) { 01558 // Rulings are in a different grid, so search 2 grids for rulings, text, 01559 // and table partitions that are not entirely within the new box. 01560 for (int i = 0; i < 2; ++i) { 01561 ColPartitionGrid* grid = (i == 0) ? &fragmented_text_grid_ : 01562 &leader_and_ruling_grid_; 01563 ColPartitionGridSearch rectsearch(grid); 01564 rectsearch.StartRectSearch(search_range); 01565 ColPartition* part = NULL; 01566 while ((part = rectsearch.NextRectSearch()) != NULL) { 01567 // Only include text and table types. 
01568 if (part->IsImageType()) 01569 continue; 01570 const TBOX& part_box = part->bounding_box(); 01571 // Include partition in the table if more than half of it 01572 // is covered by the table 01573 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) { 01574 *result_box = result_box->bounding_union(part_box); 01575 continue; 01576 } 01577 } 01578 } 01579 } 01580 01581 // Grow a table by expanding to the extents of significantly 01582 // overlapping lines. 01583 void TableFinder::GrowTableToIncludeLines(const TBOX& table_box, 01584 const TBOX& search_range, 01585 TBOX* result_box) { 01586 ColPartitionGridSearch rsearch(&leader_and_ruling_grid_); 01587 rsearch.SetUniqueMode(true); 01588 rsearch.StartRectSearch(search_range); 01589 ColPartition* part = NULL; 01590 while ((part = rsearch.NextRectSearch()) != NULL) { 01591 // TODO(nbeato) This should also do vertical, but column 01592 // boundaries are breaking things. This function needs to be 01593 // updated to allow vertical lines as well. 01594 if (!part->IsLineType()) 01595 continue; 01596 // Avoid the following function call if the result of the 01597 // function is irrelevant. 01598 const TBOX& part_box = part->bounding_box(); 01599 if (result_box->contains(part_box)) 01600 continue; 01601 // Include a partially overlapping horizontal line only if the 01602 // extra ColPartitions that will be included due to expansion 01603 // have large side spacing w.r.t. columns containing them. 
01604 if (HLineBelongsToTable(*part, table_box)) 01605 *result_box = result_box->bounding_union(part_box); 01606 // TODO(nbeato): Vertical 01607 } 01608 } 01609 01610 // Checks whether the horizontal line belong to the table by looking at the 01611 // side spacing of extra ColParitions that will be included in the table 01612 // due to expansion 01613 bool TableFinder::HLineBelongsToTable(const ColPartition& part, 01614 const TBOX& table_box) { 01615 if (!part.IsHorizontalLine()) 01616 return false; 01617 const TBOX& part_box = part.bounding_box(); 01618 if (!part_box.major_x_overlap(table_box)) 01619 return false; 01620 // Do not consider top-most horizontal line since it usually 01621 // originates from noise. 01622 // TODO(nbeato): I had to comment this out because the ruling grid doesn't 01623 // have neighbors solved. 01624 // if (!part.nearest_neighbor_above()) 01625 // return false; 01626 const TBOX bbox = part_box.bounding_union(table_box); 01627 // In the "unioned table" box (the table extents expanded by the line), 01628 // keep track of how many partitions have significant padding to the left 01629 // and right. If more than half of the partitions covered by the new table 01630 // have significant spacing, the line belongs to the table and the table 01631 // grows to include all of the partitions. 01632 int num_extra_partitions = 0; 01633 int extra_space_to_right = 0; 01634 int extra_space_to_left = 0; 01635 // Rulings are in a different grid, so search 2 grids for rulings, text, 01636 // and table partitions that are introduced by the new box. 01637 for (int i = 0; i < 2; ++i) { 01638 ColPartitionGrid* grid = (i == 0) ? 
&clean_part_grid_ : 01639 &leader_and_ruling_grid_; 01640 // Start a rect search on bbox 01641 ColPartitionGridSearch rectsearch(grid); 01642 rectsearch.SetUniqueMode(true); 01643 rectsearch.StartRectSearch(bbox); 01644 ColPartition* extra_part = NULL; 01645 while ((extra_part = rectsearch.NextRectSearch()) != NULL) { 01646 // ColPartition already in table 01647 const TBOX& extra_part_box = extra_part->bounding_box(); 01648 if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable) 01649 continue; 01650 // Non-text ColPartitions do not contribute 01651 if (extra_part->IsImageType()) 01652 continue; 01653 // Consider this partition. 01654 num_extra_partitions++; 01655 // presence of a table cell is a strong hint, so just increment the scores 01656 // without looking at the spacing. 01657 if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) { 01658 extra_space_to_right++; 01659 extra_space_to_left++; 01660 continue; 01661 } 01662 int space_threshold = kSideSpaceMargin * part.median_size(); 01663 if (extra_part->space_to_right() > space_threshold) 01664 extra_space_to_right++; 01665 if (extra_part->space_to_left() > space_threshold) 01666 extra_space_to_left++; 01667 } 01668 } 01669 // tprintf("%d %d %d\n", 01670 // num_extra_partitions,extra_space_to_right,extra_space_to_left); 01671 return (extra_space_to_right > num_extra_partitions / 2) || 01672 (extra_space_to_left > num_extra_partitions / 2); 01673 } 01674 01675 // Look for isolated column headers above the given table box and 01676 // include them in the table 01677 void TableFinder::IncludeLeftOutColumnHeaders(TBOX* table_box) { 01678 // Start a search above the current table to look for column headers 01679 ColPartitionGridSearch vsearch(&clean_part_grid_); 01680 vsearch.StartVerticalSearch(table_box->left(), table_box->right(), 01681 table_box->top()); 01682 ColPartition* neighbor = NULL; 01683 ColPartition* previous_neighbor = NULL; 01684 while ((neighbor = 
vsearch.NextVerticalSearch(false)) != NULL) { 01685 // Max distance to find a table heading. 01686 const int max_distance = kMaxColumnHeaderDistance * 01687 neighbor->median_size(); 01688 int table_top = table_box->top(); 01689 const TBOX& box = neighbor->bounding_box(); 01690 // Do not continue if the next box is way above 01691 if (box.bottom() - table_top > max_distance) 01692 break; 01693 // Unconditionally include partitions of type TABLE or LINE 01694 // TODO(faisal): add some reasonable conditions here 01695 if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) { 01696 table_box->set_top(box.top()); 01697 previous_neighbor = NULL; 01698 continue; 01699 } 01700 // If there are two text partitions, one above the other, without a table 01701 // cell on their left or right side, consider them a barrier and quit 01702 if (previous_neighbor == NULL) { 01703 previous_neighbor = neighbor; 01704 } else { 01705 const TBOX& previous_box = previous_neighbor->bounding_box(); 01706 if (!box.major_y_overlap(previous_box)) 01707 break; 01708 } 01709 } 01710 } 01711 01712 // Remove false alarms consiting of a single column based on their 01713 // projection on the x-axis. Projection of a real table on the x-axis 01714 // should have at least one zero-valley larger than the global median 01715 // x-height of the page. 
01716 void TableFinder::DeleteSingleColumnTables() { 01717 int page_width = tright().x() - bleft().x(); 01718 ASSERT_HOST(page_width > 0); 01719 // create an integer array to hold projection on x-axis 01720 int* table_xprojection = new int[page_width]; 01721 // Iterate through all tables in the table grid 01722 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 01723 table_search(&table_grid_); 01724 table_search.StartFullSearch(); 01725 ColSegment* table; 01726 while ((table = table_search.NextFullSearch()) != NULL) { 01727 TBOX table_box = table->bounding_box(); 01728 // reset the projection array 01729 for (int i = 0; i < page_width; i++) { 01730 table_xprojection[i] = 0; 01731 } 01732 // Start a rect search on table_box 01733 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 01734 rectsearch(&clean_part_grid_); 01735 rectsearch.SetUniqueMode(true); 01736 rectsearch.StartRectSearch(table_box); 01737 ColPartition* part; 01738 while ((part = rectsearch.NextRectSearch()) != NULL) { 01739 if (!part->IsTextType()) 01740 continue; // Do not consider non-text partitions 01741 if (part->flow() == BTFT_LEADER) 01742 continue; // Assume leaders are in tables 01743 TBOX part_box = part->bounding_box(); 01744 // Do not consider partitions partially covered by the table 01745 if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable) 01746 continue; 01747 BLOBNBOX_CLIST* part_boxes = part->boxes(); 01748 BLOBNBOX_C_IT pit(part_boxes); 01749 01750 // Make sure overlapping blobs don't artificially inflate the number 01751 // of rows in the table. This happens frequently with things such as 01752 // decimals and split characters. Do this by assuming the column 01753 // partition is sorted mostly left to right and just clip 01754 // bounding boxes by the previous box's extent. 
01755 int next_position_to_write = 0; 01756 01757 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) { 01758 BLOBNBOX *pblob = pit.data(); 01759 // ignore blob height for the purpose of projection since we 01760 // are only interested in finding valleys 01761 int xstart = pblob->bounding_box().left(); 01762 int xend = pblob->bounding_box().right(); 01763 01764 xstart = MAX(xstart, next_position_to_write); 01765 for (int i = xstart; i < xend; i++) 01766 table_xprojection[i - bleft().x()]++; 01767 next_position_to_write = xend; 01768 } 01769 } 01770 // Find largest valley between two reasonable peaks in the table 01771 if (!GapInXProjection(table_xprojection, page_width)) { 01772 table_search.RemoveBBox(); 01773 delete table; 01774 } 01775 } 01776 delete[] table_xprojection; 01777 } 01778 01779 // Return true if at least one gap larger than the global x-height 01780 // exists in the horizontal projection 01781 bool TableFinder::GapInXProjection(int* xprojection, int length) { 01782 // Find peak value of the histogram 01783 int peak_value = 0; 01784 for (int i = 0; i < length; i++) { 01785 if (xprojection[i] > peak_value) { 01786 peak_value = xprojection[i]; 01787 } 01788 } 01789 // Peak value represents the maximum number of horizontally 01790 // overlapping colpartitions, so this can be considered as the 01791 // number of rows in the table 01792 if (peak_value < kMinRowsInTable) 01793 return false; 01794 double projection_threshold = kSmallTableProjectionThreshold * peak_value; 01795 if (peak_value >= kLargeTableRowCount) 01796 projection_threshold = kLargeTableProjectionThreshold * peak_value; 01797 // Threshold the histogram 01798 for (int i = 0; i < length; i++) { 01799 xprojection[i] = (xprojection[i] >= projection_threshold) ? 
1 : 0; 01800 } 01801 // Find the largest run of zeros between two ones 01802 int largest_gap = 0; 01803 int run_start = -1; 01804 for (int i = 1; i < length; i++) { 01805 // detect start of a run of zeros 01806 if (xprojection[i - 1] && !xprojection[i]) { 01807 run_start = i; 01808 } 01809 // detect end of a run of zeros and update the value of largest gap 01810 if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) { 01811 int gap = i - run_start; 01812 if (gap > largest_gap) 01813 largest_gap = gap; 01814 run_start = -1; 01815 } 01816 } 01817 return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_; 01818 } 01819 01820 // Given the location of a table "guess", try to overlay a cellular 01821 // grid in the location, adjusting the boundaries. 01822 // TODO(nbeato): Falsely introduces: 01823 // -headers/footers (not any worse, too much overlap destroys cells) 01824 // -page numbers (not worse, included because maximize margins) 01825 // -equations (nicely fit into a celluar grid, but more sparsely) 01826 // -figures (random text box, also sparse) 01827 // -small left-aligned text areas with overlapping positioned whitespace 01828 // (rejected before) 01829 // Overall, this just needs some more work. 01830 void TableFinder::RecognizeTables() { 01831 ScrollView* table_win = NULL; 01832 if (textord_show_tables) { 01833 table_win = MakeWindow(0, 0, "Table Structure"); 01834 DisplayColPartitions(table_win, &fragmented_text_grid_, 01835 ScrollView::BLUE, ScrollView::LIGHT_BLUE); 01836 // table_grid_.DisplayBoxes(table_win); 01837 } 01838 01839 01840 TableRecognizer recognizer; 01841 recognizer.Init(); 01842 recognizer.set_line_grid(&leader_and_ruling_grid_); 01843 recognizer.set_text_grid(&fragmented_text_grid_); 01844 recognizer.set_max_text_height(global_median_xheight_ * 2.0); 01845 recognizer.set_min_height(1.5 * gridheight()); 01846 // Loop over all of the tables and try to fit them. 01847 // Store the good tables here. 
01848 ColSegment_CLIST good_tables; 01849 ColSegment_C_IT good_it(&good_tables); 01850 01851 ColSegmentGridSearch gsearch(&table_grid_); 01852 gsearch.StartFullSearch(); 01853 ColSegment* found_table = NULL; 01854 while ((found_table = gsearch.NextFullSearch()) != NULL) { 01855 gsearch.RemoveBBox(); 01856 01857 // The goal is to make the tables persistent in a list. 01858 // When that happens, this will move into the search loop. 01859 const TBOX& found_box = found_table->bounding_box(); 01860 StructuredTable* table_structure = recognizer.RecognizeTable(found_box); 01861 01862 // Process a table. Good tables are inserted into the grid again later on 01863 // We can't change boxes in the grid while it is running a search. 01864 if (table_structure != NULL) { 01865 if (textord_show_tables) { 01866 table_structure->Display(table_win, ScrollView::LIME_GREEN); 01867 } 01868 found_table->set_bounding_box(table_structure->bounding_box()); 01869 delete table_structure; 01870 good_it.add_after_then_move(found_table); 01871 } else { 01872 delete found_table; 01873 } 01874 } 01875 // TODO(nbeato): MERGE!! There is awesome info now available for merging. 01876 01877 // At this point, the grid is empty. We can safely insert the good tables 01878 // back into grid. 01879 for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward()) 01880 table_grid_.InsertBBox(true, true, good_it.extract()); 01881 } 01882 01883 // Displays the column segments in some window. 
01884 void TableFinder::DisplayColSegments(ScrollView* win, 01885 ColSegment_LIST *segments, 01886 ScrollView::Color color) { 01887 #ifndef GRAPHICS_DISABLED 01888 win->Pen(color); 01889 win->Brush(ScrollView::NONE); 01890 ColSegment_IT it(segments); 01891 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 01892 ColSegment* col = it.data(); 01893 const TBOX& box = col->bounding_box(); 01894 int left_x = box.left(); 01895 int right_x = box.right(); 01896 int top_y = box.top(); 01897 int bottom_y = box.bottom(); 01898 win->Rectangle(left_x, bottom_y, right_x, top_y); 01899 } 01900 win->UpdateWindow(); 01901 #endif 01902 } 01903 01904 void TableFinder::DisplayColSegmentGrid(ScrollView* win, ColSegmentGrid* grid, 01905 ScrollView::Color color) { 01906 #ifndef GRAPHICS_DISABLED 01907 // Iterate the ColPartitions in the grid. 01908 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 01909 gsearch(grid); 01910 gsearch.StartFullSearch(); 01911 ColSegment* seg = NULL; 01912 while ((seg = gsearch.NextFullSearch()) != NULL) { 01913 const TBOX& box = seg->bounding_box(); 01914 int left_x = box.left(); 01915 int right_x = box.right(); 01916 int top_y = box.top(); 01917 int bottom_y = box.bottom(); 01918 win->Brush(ScrollView::NONE); 01919 win->Pen(color); 01920 win->Rectangle(left_x, bottom_y, right_x, top_y); 01921 } 01922 win->UpdateWindow(); 01923 #endif 01924 } 01925 01926 // Displays the colpartitions using a new coloring on an existing window. 01927 // Note: This method is only for debug purpose during development and 01928 // would not be part of checked in code 01929 void TableFinder::DisplayColPartitions(ScrollView* win, 01930 ColPartitionGrid* grid, 01931 ScrollView::Color default_color, 01932 ScrollView::Color table_color) { 01933 #ifndef GRAPHICS_DISABLED 01934 ScrollView::Color color = default_color; 01935 // Iterate the ColPartitions in the grid. 
01936 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 01937 gsearch(grid); 01938 gsearch.StartFullSearch(); 01939 ColPartition* part = NULL; 01940 while ((part = gsearch.NextFullSearch()) != NULL) { 01941 color = default_color; 01942 if (part->type() == PT_TABLE) 01943 color = table_color; 01944 01945 const TBOX& box = part->bounding_box(); 01946 int left_x = box.left(); 01947 int right_x = box.right(); 01948 int top_y = box.top(); 01949 int bottom_y = box.bottom(); 01950 win->Brush(ScrollView::NONE); 01951 win->Pen(color); 01952 win->Rectangle(left_x, bottom_y, right_x, top_y); 01953 } 01954 win->UpdateWindow(); 01955 #endif 01956 } 01957 void TableFinder::DisplayColPartitions(ScrollView* win, 01958 ColPartitionGrid* grid, 01959 ScrollView::Color default_color) { 01960 DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW); 01961 } 01962 01963 void TableFinder::DisplayColPartitionConnections( 01964 ScrollView* win, 01965 ColPartitionGrid* grid, 01966 ScrollView::Color color) { 01967 #ifndef GRAPHICS_DISABLED 01968 // Iterate the ColPartitions in the grid. 
01969 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 01970 gsearch(grid); 01971 gsearch.StartFullSearch(); 01972 ColPartition* part = NULL; 01973 while ((part = gsearch.NextFullSearch()) != NULL) { 01974 const TBOX& box = part->bounding_box(); 01975 int left_x = box.left(); 01976 int right_x = box.right(); 01977 int top_y = box.top(); 01978 int bottom_y = box.bottom(); 01979 01980 ColPartition* upper_part = part->nearest_neighbor_above(); 01981 if (upper_part) { 01982 TBOX upper_box = upper_part->bounding_box(); 01983 int mid_x = (left_x + right_x) / 2; 01984 int mid_y = (top_y + bottom_y) / 2; 01985 int other_x = (upper_box.left() + upper_box.right()) / 2; 01986 int other_y = (upper_box.top() + upper_box.bottom()) / 2; 01987 win->Brush(ScrollView::NONE); 01988 win->Pen(color); 01989 win->Line(mid_x, mid_y, other_x, other_y); 01990 } 01991 ColPartition* lower_part = part->nearest_neighbor_below(); 01992 if (lower_part) { 01993 TBOX lower_box = lower_part->bounding_box(); 01994 int mid_x = (left_x + right_x) / 2; 01995 int mid_y = (top_y + bottom_y) / 2; 01996 int other_x = (lower_box.left() + lower_box.right()) / 2; 01997 int other_y = (lower_box.top() + lower_box.bottom()) / 2; 01998 win->Brush(ScrollView::NONE); 01999 win->Pen(color); 02000 win->Line(mid_x, mid_y, other_x, other_y); 02001 } 02002 } 02003 win->UpdateWindow(); 02004 #endif 02005 } 02006 02007 02008 // Write debug image and text file. 
02009 // Note: This method is only for debug purpose during development and 02010 // would not be part of checked in code 02011 void TableFinder::WriteToPix(const FCOORD& reskew) { 02012 // Input file must be named test1.tif 02013 PIX* pix = pixRead("test1.tif"); 02014 if (!pix) { 02015 tprintf("Input file test1.tif not found.\n"); 02016 return; 02017 } 02018 int img_height = pixGetHeight(pix); 02019 int img_width = pixGetWidth(pix); 02020 // Maximum number of text or table partitions 02021 int num_boxes = 10; 02022 BOXA* text_box_array = boxaCreate(num_boxes); 02023 BOXA* table_box_array = boxaCreate(num_boxes); 02024 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 02025 gsearch(&clean_part_grid_); 02026 gsearch.StartFullSearch(); 02027 ColPartition* part; 02028 // load colpartitions into text_box_array and table_box_array 02029 while ((part = gsearch.NextFullSearch()) != NULL) { 02030 TBOX box = part->bounding_box(); 02031 box.rotate_large(reskew); 02032 BOX* lept_box = boxCreate(box.left(), img_height - box.top(), 02033 box.right() - box.left(), 02034 box.top() - box.bottom()); 02035 if (part->type() == PT_TABLE) 02036 boxaAddBox(table_box_array, lept_box, L_INSERT); 02037 else 02038 boxaAddBox(text_box_array, lept_box, L_INSERT); 02039 } 02040 // draw colpartitions on the output image 02041 PIX* out = pixDrawBoxa(pix, text_box_array, 3, 0xff000000); 02042 out = pixDrawBoxa(out, table_box_array, 3, 0x0000ff00); 02043 02044 BOXA* table_array = boxaCreate(num_boxes); 02045 // text file containing detected table bounding boxes 02046 FILE* fptr = fopen("tess-table.txt", "wb"); 02047 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 02048 table_search(&table_grid_); 02049 table_search.StartFullSearch(); 02050 ColSegment* table; 02051 // load table boxes to table_array and write them to text file as well 02052 while ((table = table_search.NextFullSearch()) != NULL) { 02053 TBOX box = table->bounding_box(); 02054 box.rotate_large(reskew); 02055 
// Since deskewing introduces negative coordinates, reskewing 02056 // might not completely recover from that since both steps enlarge 02057 // the actual box. Hence a box that undergoes deskewing/reskewing 02058 // may go out of image boundaries. Crop a table box if needed to 02059 // contain it inside the image dimensions. 02060 box = box.intersection(TBOX(0, 0, img_width - 1, img_height - 1)); 02061 BOX* lept_box = boxCreate(box.left(), img_height - box.top(), 02062 box.right() - box.left(), 02063 box.top() - box.bottom()); 02064 boxaAddBox(table_array, lept_box, L_INSERT); 02065 fprintf(fptr, "%d %d %d %d TABLE\n", box.left(), 02066 img_height - box.top(), box.right(), img_height - box.bottom()); 02067 } 02068 fclose(fptr); 02069 // paint table boxes on the debug image 02070 out = pixDrawBoxa(out, table_array, 5, 0x7fff0000); 02071 02072 pixWrite("out.png", out, IFF_PNG); 02073 // memory cleanup 02074 boxaDestroy(&text_box_array); 02075 boxaDestroy(&table_box_array); 02076 boxaDestroy(&table_array); 02077 pixDestroy(&pix); 02078 pixDestroy(&out); 02079 } 02080 02081 // Merge all colpartitions in table regions to make them a single 02082 // colpartition and revert types of isolated table cells not 02083 // assigned to any table to their original types. 
02084 void TableFinder::MakeTableBlocks(ColPartitionGrid* grid, 02085 ColPartitionSet** all_columns, 02086 WidthCallback* width_cb) { 02087 // Since we have table blocks already, remove table tags from all 02088 // colpartitions 02089 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 02090 gsearch(grid); 02091 gsearch.StartFullSearch(); 02092 ColPartition* part = NULL; 02093 02094 while ((part = gsearch.NextFullSearch()) != NULL) { 02095 if (part->type() == PT_TABLE) { 02096 part->clear_table_type(); 02097 } 02098 } 02099 // Now make a single colpartition out of each table block and remove 02100 // all colpartitions contained within a table 02101 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> 02102 table_search(&table_grid_); 02103 table_search.StartFullSearch(); 02104 ColSegment* table; 02105 while ((table = table_search.NextFullSearch()) != NULL) { 02106 TBOX table_box = table->bounding_box(); 02107 // Start a rect search on table_box 02108 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> 02109 rectsearch(grid); 02110 rectsearch.StartRectSearch(table_box); 02111 ColPartition* part; 02112 ColPartition* table_partition = NULL; 02113 while ((part = rectsearch.NextRectSearch()) != NULL) { 02114 // Do not consider image partitions 02115 if (!part->IsTextType()) 02116 continue; 02117 TBOX part_box = part->bounding_box(); 02118 // Include partition in the table if more than half of it 02119 // is covered by the table 02120 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) { 02121 rectsearch.RemoveBBox(); 02122 if (table_partition) { 02123 table_partition->Absorb(part, width_cb); 02124 } else { 02125 table_partition = part; 02126 } 02127 } 02128 } 02129 // Insert table colpartition back to part_grid_ 02130 if (table_partition) { 02131 // To match the columns used when transforming to blocks, the new table 02132 // partition must have its first and last column set at the grid y that 02133 // corresponds to its bottom. 
02134 const TBOX& table_box = table_partition->bounding_box(); 02135 int grid_x, grid_y; 02136 grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y); 02137 table_partition->SetPartitionType(resolution_, all_columns[grid_y]); 02138 table_partition->set_table_type(); 02139 table_partition->set_blob_type(BRT_TEXT); 02140 table_partition->set_flow(BTFT_CHAIN); 02141 table_partition->SetBlobTypes(); 02142 grid->InsertBBox(true, true, table_partition); 02143 } 02144 } 02145 } 02146 02149 ColSegment::ColSegment() 02150 : ELIST_LINK(), 02151 num_table_cells_(0), 02152 num_text_cells_(0), 02153 type_(COL_UNKNOWN) { 02154 } 02155 ColSegment::~ColSegment() { 02156 } 02157 02158 // Provides a color for BBGrid to draw the rectangle. 02159 ScrollView::Color ColSegment::BoxColor() const { 02160 const ScrollView::Color kBoxColors[PT_COUNT] = { 02161 ScrollView::YELLOW, 02162 ScrollView::BLUE, 02163 ScrollView::YELLOW, 02164 ScrollView::MAGENTA, 02165 }; 02166 return kBoxColors[type_]; 02167 } 02168 02169 // Insert a box into this column segment 02170 void ColSegment::InsertBox(const TBOX& other) { 02171 bounding_box_ = bounding_box_.bounding_union(other); 02172 } 02173 02174 // Set column segment type based on the ratio of text and table partitions 02175 // in it. 02176 void ColSegment::set_type() { 02177 if (num_table_cells_ > kTableColumnThreshold * num_text_cells_) 02178 type_ = COL_TABLE; 02179 else if (num_text_cells_ > num_table_cells_) 02180 type_ = COL_TEXT; 02181 else 02182 type_ = COL_MIXED; 02183 } 02184 02185 } // namespace tesseract.