tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/textord/tablefind.cpp
Go to the documentation of this file.
00001 
00002 // File:        tablefind.cpp
00003 // Description: Helper classes to find tables from ColPartitions.
00004 // Author:      Faisal Shafait (faisal.shafait@dfki.de)
00005 // Created:     Tue Jan 06 11:13:01 PST 2009
00006 //
00007 // (C) Copyright 2009, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023 
00024 #ifdef HAVE_CONFIG_H
00025 #include "config_auto.h"
00026 #endif
00027 
00028 #include "tablefind.h"
00029 #include <math.h>
00030 
00031 #include "allheaders.h"
00032 
00033 #include "colpartitionset.h"
00034 #include "tablerecog.h"
00035 
00036 namespace tesseract {
00037 
00038 // These numbers are used to calculate the global median stats.
00039 // They just set an upper bound on the stats objects.
00040 // Maximum vertical spacing between neighbor partitions.
00041 const int kMaxVerticalSpacing = 500;
00042 // Maximum width of a blob in a partition.
00043 const int kMaxBlobWidth = 500;
00044 
00045 // Minimum whitespace size to split a partition (measured as a multiple
00046 // of a partition's median width).
00047 const double kSplitPartitionSize = 2.0;
00048 // To insert text, the partition must satisfy these size constraints
00049 // in AllowTextPartition(). The idea is to filter noise partitions
00050 // determined by the size compared to the global medians.
00051 // TODO(nbeato): Need to find good numbers again.
00052 const double kAllowTextHeight = 0.5;
00053 const double kAllowTextWidth = 0.6;
00054 const double kAllowTextArea = 0.8;
00055 // The same thing applies to blobs (to filter noise).
00056 // TODO(nbeato): These numbers are a shot in the dark...
00057 // height and width are 0.5 * gridsize() in colfind.cpp
00058 // area is a rough guess for the size of a period.
00059 const double kAllowBlobHeight = 0.3;
00060 const double kAllowBlobWidth = 0.4;
00061 const double kAllowBlobArea = 0.05;
00062 
00063 // Minimum number of components in a text partition. A partition having fewer
00064 // components than that is more likely a data partition and is a candidate
00065 // table cell.
00066 const int kMinBoxesInTextPartition = 10;
00067 
00068 // Maximum number of components that a data partition can have
00069 const int kMaxBoxesInDataPartition = 20;
00070 
// Maximum allowed gap in a text partition as a multiple of its median size.
00072 const double kMaxGapInTextPartition = 4.0;
00073 
00074 // Minimum value that the maximum gap in a text partition should have as a
00075 // factor of its median size.
00076 const double kMinMaxGapInTextPartition = 0.5;
00077 
00078 // The amount of overlap that is "normal" for adjacent blobs in a text
00079 // partition. This is used to calculate gap between overlapping blobs.
00080 const double kMaxBlobOverlapFactor = 4.0;
00081 
00082 // Maximum x-height a table partition can have as a multiple of global
00083 // median x-height
00084 const double kMaxTableCellXheight = 2.0;
00085 
00086 // Maximum line spacing between a table column header and column contents
00087 // for merging the two (as a multiple of the partition's median_size).
00088 const int kMaxColumnHeaderDistance = 4;
00089 
// Minimum ratio of num_table_partitions to num_text_partitions in a column
// block for it to be called a table column.
00092 const double kTableColumnThreshold = 3.0;
00093 
00094 // Search for horizontal ruling lines within the vertical margin as a
00095 // multiple of grid size
00096 const int kRulingVerticalMargin = 3;
00097 
00098 // Minimum overlap that a colpartition must have with a table region
00099 // to become part of that table
00100 const double kMinOverlapWithTable = 0.6;
00101 
00102 // Maximum side space (distance from column boundary) that a typical
00103 // text-line in flowing text should have as a multiple of its x-height
00104 // (Median size).
00105 const int kSideSpaceMargin = 10;
00106 
00107 // Fraction of the peak of x-projection of a table region to set the
00108 // threshold for the x-projection histogram
00109 const double kSmallTableProjectionThreshold = 0.35;
00110 const double kLargeTableProjectionThreshold = 0.45;
00111 // Minimum number of rows required to look for more rows in the projection.
00112 const int kLargeTableRowCount = 6;
00113 
00114 // Minimum number of rows in a table
00115 const int kMinRowsInTable = 3;
00116 
00117 // The number of "whitespace blobs" that should appear between the
00118 // ColPartition's bounding box and the column tab stops to the left/right
00119 // when looking for center justified tab stops.
00120 const double kRequiredFullJustifiedSpacing = 4.0;
00121 
// The amount of padding (multiplied by global_median_xheight_ during use)
// that is added vertically to the adjacent leader search during
// ColPartition marking.
00125 const int kAdjacentLeaderSearchPadding = 2;
00126 
00127 // Used when filtering false positives. When finding the last line
00128 // of a paragraph (typically left-aligned), the previous line should have
00129 // its center to the right of the last line by this scaled amount.
00130 const double kParagraphEndingPreviousLineRatio = 1.3;
00131 
00132 // The maximum amount of whitespace allowed left of a paragraph ending.
00133 // Do not filter a ColPartition with more than this space left of it.
00134 const double kMaxParagraphEndingLeftSpaceMultiple = 3.0;
00135 
00136 // Used when filtering false positives. The last line of a paragraph
00137 // should be preceded by a line that is predominantly text. This is the
00138 // ratio of text to whitespace (to the right of the text) that is required
00139 // for the previous line to be a text.
00140 const double kMinParagraphEndingTextToWhitespaceRatio = 3.0;
00141 
00142 // When counting table columns, this is the required gap between two columns
00143 // (it is multiplied by global_median_xheight_).
00144 const double kMaxXProjectionGapFactor = 2.0;
00145 
00146 // Used for similarity in partitions using stroke width. Values copied
00147 // from ColFind.cpp in Ray's CL.
00148 const double kStrokeWidthFractionalTolerance = 0.25;
00149 const double kStrokeWidthConstantTolerance = 2.0;
00150 
00151 BOOL_VAR(textord_dump_table_images, false, "Paint table detection output");
00152 BOOL_VAR(textord_show_tables, false, "Show table regions");
00153 BOOL_VAR(textord_tablefind_show_mark, false,
00154          "Debug table marking steps in detail");
00155 BOOL_VAR(textord_tablefind_show_stats, false,
00156          "Show page stats used in table finding");
00157 BOOL_VAR(textord_tablefind_recognize_tables, false,
00158          "Enables the table recognizer for table layout and filtering.");
00159 
00160 ELISTIZE(ColSegment)
00161 CLISTIZE(ColSegment)
00162 
// Generic deleter: destroys a single heap-allocated object of type T.
// Its address (e.g. &DeleteObject<ColPartition>) is handed to
// BBGrid::ClearGridData() as the per-element destructor callback.
template <typename T>
void DeleteObject(T* object) {
  delete object;
}
00168 
00169 TableFinder::TableFinder()
00170     : resolution_(0),
00171       global_median_xheight_(0),
00172       global_median_blob_width_(0),
00173       global_median_ledding_(0),
00174       left_to_right_language_(true) {
00175 }
00176 
00177 TableFinder::~TableFinder() {
00178   // ColPartitions and ColSegments created by this class for storage in grids
00179   // need to be deleted explicitly.
00180   clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
00181   leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
00182   fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
00183   col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
00184   table_grid_.ClearGridData(&DeleteObject<ColSegment>);
00185 }
00186 
00187 void TableFinder::set_left_to_right_language(bool order) {
00188   left_to_right_language_ = order;
00189 }
00190 
00191 void TableFinder::Init(int grid_size, const ICOORD& bottom_left,
00192                        const ICOORD& top_right) {
00193   // Initialize clean partitions list and grid
00194   clean_part_grid_.Init(grid_size, bottom_left, top_right);
00195   leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
00196   fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
00197   col_seg_grid_.Init(grid_size, bottom_left, top_right);
00198   table_grid_.Init(grid_size, bottom_left, top_right);
00199 }
00200 
00201 // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
00202 // insert leaders and rulers into the leader_and_ruling_grid_
00203 void TableFinder::InsertCleanPartitions(ColPartitionGrid* grid,
00204                                         TO_BLOCK* block) {
00205   // Calculate stats. This lets us filter partitions in AllowTextPartition()
00206   // and filter blobs in AllowBlob().
00207   SetGlobalSpacings(grid);
00208 
00209   // Iterate the ColPartitions in the grid.
00210   ColPartitionGridSearch gsearch(grid);
00211   gsearch.SetUniqueMode(true);
00212   gsearch.StartFullSearch();
00213   ColPartition* part = NULL;
00214   while ((part = gsearch.NextFullSearch()) != NULL) {
00215     // Reject partitions with nothing useful inside of them.
00216     if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0)
00217       continue;
00218     ColPartition* clean_part = part->ShallowCopy();
00219     ColPartition* leader_part = NULL;
00220     if (part->IsLineType()) {
00221       InsertRulingPartition(clean_part);
00222       continue;
00223     }
00224     // Insert all non-text partitions to clean_parts
00225     if (!part->IsTextType()) {
00226       InsertImagePartition(clean_part);
00227       continue;
00228     }
00229     // Insert text colpartitions after removing noisy components from them
00230     // The leaders are split into a separate grid.
00231     BLOBNBOX_CLIST* part_boxes = part->boxes();
00232     BLOBNBOX_C_IT pit(part_boxes);
00233     for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
00234       BLOBNBOX *pblob = pit.data();
00235       // Bad blobs... happens in UNLV set.
00236       // news.3G1, page 17 (around x=6)
00237       if (!AllowBlob(*pblob))
00238         continue;
00239       if (pblob->flow() == BTFT_LEADER) {
00240         if (leader_part == NULL) {
00241           leader_part = part->ShallowCopy();
00242           leader_part->set_flow(BTFT_LEADER);
00243         }
00244         leader_part->AddBox(pblob);
00245       } else if (pblob->region_type() != BRT_NOISE) {
00246         clean_part->AddBox(pblob);
00247       }
00248     }
00249     clean_part->ComputeLimits();
00250     ColPartition* fragmented = clean_part->CopyButDontOwnBlobs();
00251     InsertTextPartition(clean_part);
00252     SplitAndInsertFragmentedTextPartition(fragmented);
00253     if (leader_part != NULL) {
00254       // TODO(nbeato): Note that ComputeLimits does not update the column
00255       // information. So the leader may appear to span more columns than it
00256       // really does later on when IsInSameColumnAs gets called to test
00257       // for adjacent leaders.
00258       leader_part->ComputeLimits();
00259       InsertLeaderPartition(leader_part);
00260     }
00261   }
00262 
00263   // Make the partition partners better for upper and lower neighbors.
00264   clean_part_grid_.FindPartitionPartners();
00265   clean_part_grid_.RefinePartitionPartners(false);
00266 }
00267 
00268 // High level function to perform table detection
00269 void TableFinder::LocateTables(ColPartitionGrid* grid,
00270                                ColPartitionSet** all_columns,
00271                                WidthCallback* width_cb,
00272                                const FCOORD& reskew) {
00273   // initialize spacing, neighbors, and columns
00274   InitializePartitions(all_columns);
00275 
00276 #ifndef GRAPHICS_DISABLED
00277   if (textord_show_tables) {
00278     ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
00279     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
00280     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
00281                          ScrollView::AQUAMARINE);
00282     DisplayColPartitionConnections(table_win, &clean_part_grid_,
00283                                    ScrollView::ORANGE);
00284 
00285     table_win = MakeWindow(100, 300, "Fragmented Text");
00286     DisplayColPartitions(table_win, &fragmented_text_grid_, ScrollView::BLUE);
00287   }
00288 #endif  // GRAPHICS_DISABLED
00289 
00290   // mark, filter, and smooth candidate table partitions
00291   MarkTablePartitions();
00292 
00293   // Make single-column blocks from good_columns_ partitions. col_segments are
00294   // moved to a grid later which takes the ownership
00295   ColSegment_LIST column_blocks;
00296   GetColumnBlocks(all_columns, &column_blocks);
00297   // Set the ratio of candidate table partitions in each column
00298   SetColumnsType(&column_blocks);
00299 
00300   // Move column segments to col_seg_grid_
00301   MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
00302 
00303   // Detect split in column layout that might have occurred due to the
00304   // presence of a table. In such a case, merge the corresponding columns.
00305   GridMergeColumnBlocks();
00306 
00307   // Group horizontally overlapping table partitions into table columns.
00308   // table_columns created here get deleted at the end of this method.
00309   ColSegment_LIST table_columns;
00310   GetTableColumns(&table_columns);
00311 
00312   // Within each column, mark the range table regions occupy based on the
00313   // table columns detected. table_regions are moved to a grid later which
00314   // takes the ownership
00315   ColSegment_LIST table_regions;
00316   GetTableRegions(&table_columns, &table_regions);
00317 
00318 #ifndef GRAPHICS_DISABLED
00319   if (textord_tablefind_show_mark) {
00320     ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions");
00321     DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
00322     DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
00323   }
00324 #endif  // GRAPHICS_DISABLED
00325 
00326   // Merge table regions across columns for tables spanning multiple
00327   // columns
00328   MoveColSegmentsToGrid(&table_regions, &table_grid_);
00329   GridMergeTableRegions();
00330 
00331   // Adjust table boundaries by including nearby horizontal lines and left
00332   // out column headers
00333   AdjustTableBoundaries();
00334   GridMergeTableRegions();
00335 
00336   if (textord_tablefind_recognize_tables) {
00337     // Remove false alarms consiting of a single column
00338     DeleteSingleColumnTables();
00339 
00340 #ifndef GRAPHICS_DISABLED
00341     if (textord_show_tables) {
00342       ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations");
00343       DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
00344       DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
00345       table_grid_.DisplayBoxes(table_win);
00346     }
00347 #endif  // GRAPHICS_DISABLED
00348 
00349     // Find table grid structure and reject tables that are malformed.
00350     RecognizeTables();
00351     GridMergeTableRegions();
00352     RecognizeTables();
00353 
00354 #ifndef GRAPHICS_DISABLED
00355     if (textord_show_tables) {
00356       ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables");
00357       DisplayColPartitions(table_win, &clean_part_grid_,
00358                            ScrollView::BLUE, ScrollView::BLUE);
00359       table_grid_.DisplayBoxes(table_win);
00360     }
00361 #endif  // GRAPHICS_DISABLED
00362   } else {
00363     // Remove false alarms consiting of a single column
00364     // TODO(nbeato): verify this is a NOP after structured table rejection.
00365     // Right now it isn't. If the recognize function is doing what it is
00366     // supposed to do, this function is obsolete.
00367     DeleteSingleColumnTables();
00368 
00369 #ifndef GRAPHICS_DISABLED
00370     if (textord_show_tables) {
00371       ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
00372       DisplayColPartitions(table_win, &clean_part_grid_,
00373                            ScrollView::BLUE, ScrollView::BLUE);
00374       table_grid_.DisplayBoxes(table_win);
00375     }
00376 #endif  // GRAPHICS_DISABLED
00377   }
00378 
00379   if (textord_dump_table_images)
00380     WriteToPix(reskew);
00381 
00382   // Merge all colpartitions in table regions to make them a single
00383   // colpartition and revert types of isolated table cells not
00384   // assigned to any table to their original types.
00385   MakeTableBlocks(grid, all_columns, width_cb);
00386 }
00387 // All grids have the same dimensions. The clean_part_grid_ sizes are set from
00388 // the part_grid_ that is passed to InsertCleanPartitions, which was the same as
00389 // the grid that is the base of ColumnFinder. Just return the clean_part_grid_
00390 // dimensions instead of duplicated memory.
00391 int TableFinder::gridsize() const {
00392   return clean_part_grid_.gridsize();
00393 }
00394 int TableFinder::gridwidth() const {
00395   return clean_part_grid_.gridwidth();
00396 }
00397 int TableFinder::gridheight() const {
00398   return clean_part_grid_.gridheight();
00399 }
00400 const ICOORD& TableFinder::bleft() const {
00401   return clean_part_grid_.bleft();
00402 }
00403 const ICOORD& TableFinder::tright() const {
00404   return clean_part_grid_.tright();
00405 }
00406 
00407 void TableFinder::InsertTextPartition(ColPartition* part) {
00408   ASSERT_HOST(part != NULL);
00409   if (AllowTextPartition(*part)) {
00410     clean_part_grid_.InsertBBox(true, true, part);
00411   } else {
00412     delete part;
00413   }
00414 }
00415 void TableFinder::InsertFragmentedTextPartition(ColPartition* part) {
00416   ASSERT_HOST(part != NULL);
00417   if (AllowTextPartition(*part)) {
00418     fragmented_text_grid_.InsertBBox(true, true, part);
00419   } else {
00420     delete part;
00421   }
00422 }
00423 void TableFinder::InsertLeaderPartition(ColPartition* part) {
00424   ASSERT_HOST(part != NULL);
00425   if (!part->IsEmpty() && part->bounding_box().area() > 0) {
00426     leader_and_ruling_grid_.InsertBBox(true, true, part);
00427   } else {
00428     delete part;
00429   }
00430 }
00431 void TableFinder::InsertRulingPartition(ColPartition* part) {
00432   leader_and_ruling_grid_.InsertBBox(true, true, part);
00433 }
00434 void TableFinder::InsertImagePartition(ColPartition* part) {
00435   // NOTE: If images are placed into a different grid in the future,
00436   // the function SetPartitionSpacings needs to be updated. It should
00437   // be the only thing that cares about image partitions.
00438   clean_part_grid_.InsertBBox(true, true, part);
00439 }
00440 
00441 // Splits a partition into its "words". The splits happen
00442 // at locations with wide inter-blob spacing. This is useful
00443 // because it allows the table recognize to "cut through" the
00444 // text lines on the page. The assumption is that a table
00445 // will have several lines with similar overlapping whitespace
00446 // whereas text will not have this type of property.
00447 // Note: The code Assumes that blobs are sorted by the left side x!
00448 // This will not work (as well) if the blobs are sorted by center/right.
00449 void TableFinder::SplitAndInsertFragmentedTextPartition(ColPartition* part) {
00450   ASSERT_HOST(part != NULL);
00451   // Bye bye empty partitions!
00452   if (part->boxes()->empty()) {
00453     delete part;
00454     return;
00455   }
00456 
00457   // The AllowBlob function prevents this.
00458   ASSERT_HOST(part->median_width() > 0);
00459   const double kThreshold = part->median_width() * kSplitPartitionSize;
00460 
00461   ColPartition* right_part = part;
00462   bool found_split = true;
00463   while (found_split) {
00464     found_split = false;
00465     BLOBNBOX_C_IT box_it(right_part->boxes());
00466     // Blobs are sorted left side first. If blobs overlap,
00467     // the previous blob may have a "more right" right side.
00468     // Account for this by always keeping the largest "right"
00469     // so far.
00470     int previous_right = MIN_INT32;
00471 
00472     // Look for the next split in the partition.
00473     for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
00474       const TBOX& box = box_it.data()->bounding_box();
00475       if (previous_right != MIN_INT32 &&
00476           box.left() - previous_right > kThreshold) {
00477         // We have a split position. Split the partition in two pieces.
00478         // Insert the left piece in the grid and keep processing the right.
00479         int mid_x = (box.left() + previous_right) / 2;
00480         ColPartition* left_part = right_part;
00481         right_part = left_part->SplitAt(mid_x);
00482 
00483         InsertFragmentedTextPartition(left_part);
00484         found_split = true;
00485         break;
00486       }
00487 
00488       // The right side of the previous blobs.
00489       previous_right = MAX(previous_right, box.right());
00490     }
00491   }
00492   // When a split is not found, the right part is minimized
00493   // as much as possible, so process it.
00494   InsertFragmentedTextPartition(right_part);
00495 }
00496 
00497 // Some simple criteria to filter out now. We want to make sure the
00498 // average blob size in the partition is consistent with the
00499 // global page stats.
00500 // The area metric will almost always pass for multi-blob partitions.
00501 // It is useful when filtering out noise caused by an isolated blob.
00502 bool TableFinder::AllowTextPartition(const ColPartition& part) const {
00503   const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
00504   const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
00505   const int median_area = global_median_xheight_ * global_median_blob_width_;
00506   const double kAreaPerBlobRequired = median_area * kAllowTextArea;
00507   // Keep comparisons strictly greater to disallow 0!
00508   return part.median_size() > kHeightRequired &&
00509          part.median_width() > kWidthRequired &&
00510          part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
00511 }
00512 
00513 // Same as above, applied to blobs. Keep in mind that
00514 // leaders, commas, and periods are important in tables.
00515 bool TableFinder::AllowBlob(const BLOBNBOX& blob) const {
00516   const TBOX& box = blob.bounding_box();
00517   const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
00518   const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
00519   const int median_area = global_median_xheight_ * global_median_blob_width_;
00520   const double kAreaRequired = median_area * kAllowBlobArea;
00521   // Keep comparisons strictly greater to disallow 0!
00522   return box.height() > kHeightRequired &&
00523          box.width() > kWidthRequired &&
00524          box.area() > kAreaRequired;
00525 }
00526 
00527 // TODO(nbeato): The grid that makes the window doesn't seem to matter.
00528 // The only downside is that window messages will be caught by
00529 // clean_part_grid_ instead of a useful object. This is a temporary solution
00530 // for the debug windows created by the TableFinder.
00531 ScrollView* TableFinder::MakeWindow(int x, int y, const char* window_name) {
00532   return clean_part_grid_.MakeWindow(x, y, window_name);
00533 }
00534 
00535 // Make single-column blocks from good_columns_ partitions.
00536 void TableFinder::GetColumnBlocks(ColPartitionSet** all_columns,
00537                                   ColSegment_LIST* column_blocks) {
00538   for (int i = 0; i < gridheight(); ++i) {
00539     ColPartitionSet* columns = all_columns[i];
00540     if (columns != NULL) {
00541       ColSegment_LIST new_blocks;
00542       // Get boxes from the current vertical position on the grid
00543       columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks);
00544       // Merge the new_blocks boxes into column_blocks if they are well-aligned
00545       GroupColumnBlocks(&new_blocks, column_blocks);
00546     }
00547   }
00548 }
00549 
00550 // Merge column segments into the current list if they are well aligned.
00551 void TableFinder::GroupColumnBlocks(ColSegment_LIST* new_blocks,
00552                                     ColSegment_LIST* column_blocks) {
00553   ColSegment_IT src_it(new_blocks);
00554   ColSegment_IT dest_it(column_blocks);
00555   // iterate through the source list
00556   for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
00557     ColSegment* src_seg = src_it.data();
00558     TBOX src_box = src_seg->bounding_box();
00559     bool match_found = false;
00560     // iterate through the destination list to find a matching column block
00561     for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
00562       ColSegment* dest_seg = dest_it.data();
00563       TBOX dest_box = dest_seg->bounding_box();
00564       if (ConsecutiveBoxes(src_box, dest_box)) {
00565         // If matching block is found, insert the current block into it
00566         // and delete the soure block
00567         dest_seg->InsertBox(src_box);
00568         match_found = true;
00569         delete src_it.extract();
00570         break;
00571       }
00572     }
00573     // If no match is found, just append the source block to column_blocks
00574     if (!match_found) {
00575       dest_it.add_after_then_move(src_it.extract());
00576     }
00577   }
00578 }
00579 
00580 // are the two boxes immediate neighbors along the vertical direction
00581 bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {
00582   int x_margin = 20;
00583   int y_margin = 5;
00584   return (abs(b1.left() - b2.left()) < x_margin) &&
00585       (abs(b1.right() - b2.right()) < x_margin) &&
00586       (abs(b1.top()-b2.bottom()) < y_margin ||
00587        abs(b2.top()-b1.bottom()) < y_margin);
00588 }
00589 
00590 // Set up info for clean_part_grid_ partitions to be valid during detection
00591 // code.
00592 void TableFinder::InitializePartitions(ColPartitionSet** all_columns) {
00593   FindNeighbors();
00594   SetPartitionSpacings(&clean_part_grid_, all_columns);
00595   SetGlobalSpacings(&clean_part_grid_);
00596 }
00597 
00598 // Set left, right and top, bottom spacings of each colpartition.
00599 void TableFinder::SetPartitionSpacings(ColPartitionGrid* grid,
00600                                        ColPartitionSet** all_columns) {
00601   // Iterate the ColPartitions in the grid.
00602   ColPartitionGridSearch gsearch(grid);
00603   gsearch.StartFullSearch();
00604   ColPartition* part = NULL;
00605   while ((part = gsearch.NextFullSearch()) != NULL) {
00606     ColPartitionSet* columns = all_columns[gsearch.GridY()];
00607     TBOX box = part->bounding_box();
00608     int y = part->MidY();
00609     ColPartition* left_column = columns->ColumnContaining(box.left(), y);
00610     ColPartition* right_column = columns->ColumnContaining(box.right(), y);
00611     // set distance from left column as space to the left
00612     if (left_column) {
00613       int left_space = MAX(0, box.left() - left_column->LeftAtY(y));
00614       part->set_space_to_left(left_space);
00615     }
00616     // set distance from right column as space to the right
00617     if (right_column) {
00618       int right_space = MAX(0, right_column->RightAtY(y) - box.right());
00619       part->set_space_to_right(right_space);
00620     }
00621 
00622     // Look for images that may be closer.
00623     // NOTE: used to be part_grid_, might cause issues now
00624     ColPartitionGridSearch hsearch(grid);
00625     hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
00626     ColPartition* neighbor = NULL;
00627     while ((neighbor = hsearch.NextSideSearch(true)) != NULL) {
00628       if (neighbor->type() == PT_PULLOUT_IMAGE ||
00629           neighbor->type() == PT_FLOWING_IMAGE ||
00630           neighbor->type() == PT_HEADING_IMAGE) {
00631         int right = neighbor->bounding_box().right();
00632         if (right < box.left()) {
00633           int space = MIN(box.left() - right, part->space_to_left());
00634           part->set_space_to_left(space);
00635         }
00636       }
00637     }
00638     hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
00639     neighbor = NULL;
00640     while ((neighbor = hsearch.NextSideSearch(false)) != NULL) {
00641       if (neighbor->type() == PT_PULLOUT_IMAGE ||
00642           neighbor->type() == PT_FLOWING_IMAGE ||
00643           neighbor->type() == PT_HEADING_IMAGE) {
00644         int left = neighbor->bounding_box().left();
00645         if (left > box.right()) {
00646           int space = MIN(left - box.right(), part->space_to_right());
00647           part->set_space_to_right(space);
00648         }
00649       }
00650     }
00651 
00652     ColPartition* upper_part = part->SingletonPartner(true);
00653     if (upper_part) {
00654       int space = MAX(0, upper_part->bounding_box().bottom() -
00655                          part->bounding_box().bottom());
00656       part->set_space_above(space);
00657     } else {
00658       // TODO(nbeato): What constitutes a good value?
00659       // 0 is the default value when not set, explicitly noting it needs to
00660       // be something else.
00661       part->set_space_above(MAX_INT32);
00662     }
00663 
00664     ColPartition* lower_part = part->SingletonPartner(false);
00665     if (lower_part) {
00666       int space = MAX(0, part->bounding_box().bottom() -
00667                          lower_part->bounding_box().bottom());
00668       part->set_space_below(space);
00669     } else {
00670       // TODO(nbeato): What constitutes a good value?
00671       // 0 is the default value when not set, explicitly noting it needs to
00672       // be something else.
00673       part->set_space_below(MAX_INT32);
00674     }
00675   }
00676 }
00677 
00678 // Set spacing and closest neighbors above and below a given colpartition.
00679 void TableFinder::SetVerticalSpacing(ColPartition* part) {
00680   TBOX box = part->bounding_box();
00681   int top_range = MIN(box.top() + kMaxVerticalSpacing, tright().y());
00682   int bottom_range = MAX(box.bottom() - kMaxVerticalSpacing, bleft().y());
00683   box.set_top(top_range);
00684   box.set_bottom(bottom_range);
00685 
00686   TBOX part_box = part->bounding_box();
00687   // Start a rect search
00688   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
00689       rectsearch(&clean_part_grid_);
00690   rectsearch.StartRectSearch(box);
00691   ColPartition* neighbor;
00692   int min_space_above = kMaxVerticalSpacing;
00693   int min_space_below = kMaxVerticalSpacing;
00694   ColPartition* above_neighbor = NULL;
00695   ColPartition* below_neighbor = NULL;
00696   while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
00697     if (neighbor == part)
00698       continue;
00699     TBOX neighbor_box = neighbor->bounding_box();
00700     if (neighbor_box.major_x_overlap(part_box)) {
00701       int gap = abs(part->median_bottom() - neighbor->median_bottom());
00702       // If neighbor is below current partition
00703       if (neighbor_box.top() < part_box.bottom() &&
00704           gap < min_space_below) {
00705         min_space_below = gap;
00706         below_neighbor = neighbor;
00707       }  // If neighbor is above current partition
00708       else if (part_box.top() < neighbor_box.bottom() &&
00709                gap < min_space_above) {
00710         min_space_above = gap;
00711         above_neighbor = neighbor;
00712        }
00713     }
00714   }
00715   part->set_space_above(min_space_above);
00716   part->set_space_below(min_space_below);
00717   part->set_nearest_neighbor_above(above_neighbor);
00718   part->set_nearest_neighbor_below(below_neighbor);
00719 }
00720 
00721 // Set global spacing and x-height estimates
00722 void TableFinder::SetGlobalSpacings(ColPartitionGrid* grid) {
00723   STATS xheight_stats(0, kMaxVerticalSpacing + 1);
00724   STATS width_stats(0, kMaxBlobWidth + 1);
00725   STATS ledding_stats(0, kMaxVerticalSpacing + 1);
00726   // Iterate the ColPartitions in the grid.
00727   ColPartitionGridSearch gsearch(grid);
00728   gsearch.SetUniqueMode(true);
00729   gsearch.StartFullSearch();
00730   ColPartition* part = NULL;
00731   while ((part = gsearch.NextFullSearch()) != NULL) {
00732     // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
00733     // ComputeLimits needs to get called somewhere outside of TableFinder
00734     // to make sure the partitions are properly initialized.
00735     // When this is called, SmoothPartitionPartners dies in an assert after
00736     // table find runs. Alternative solution.
00737     // part->ComputeLimits();
00738     if (part->IsTextType()) {
00739       // xheight_stats.add(part->median_size(), part->boxes_count());
00740       // width_stats.add(part->median_width(), part->boxes_count());
00741 
00742       // This loop can be removed when above issues are fixed.
00743       // Replace it with the 2 lines commented out above.
00744       BLOBNBOX_C_IT it(part->boxes());
00745       for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00746         xheight_stats.add(it.data()->bounding_box().height(), 1);
00747         width_stats.add(it.data()->bounding_box().width(), 1);
00748       }
00749 
00750       ledding_stats.add(part->space_above(), 1);
00751       ledding_stats.add(part->space_below(), 1);
00752     }
00753   }
00754   // Set estimates based on median of statistics obtained
00755   set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
00756   set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
00757   set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
00758   #ifndef GRAPHICS_DISABLED
00759   if (textord_tablefind_show_stats) {
00760     const char* kWindowName = "X-height (R), X-width (G), and ledding (B)";
00761     ScrollView* stats_win = MakeWindow(500, 10, kWindowName);
00762     xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
00763     width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
00764     ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
00765   }
00766   #endif  // GRAPHICS_DISABLED
00767 }
00768 
00769 void TableFinder::set_global_median_xheight(int xheight) {
00770   global_median_xheight_ = xheight;
00771 }
00772 void TableFinder::set_global_median_blob_width(int width) {
00773   global_median_blob_width_ = width;
00774 }
00775 void TableFinder::set_global_median_ledding(int ledding) {
00776   global_median_ledding_ = ledding;
00777 }
00778 
00779 void TableFinder::FindNeighbors() {
00780   ColPartitionGridSearch gsearch(&clean_part_grid_);
00781   gsearch.StartFullSearch();
00782   ColPartition* part = NULL;
00783   while ((part = gsearch.NextFullSearch()) != NULL) {
00784     // TODO(nbeato): Rename this function, meaning is different now.
00785     // IT is finding nearest neighbors its own way
00786     //SetVerticalSpacing(part);
00787 
00788     ColPartition* upper = part->SingletonPartner(true);
00789     if (upper)
00790       part->set_nearest_neighbor_above(upper);
00791 
00792     ColPartition* lower = part->SingletonPartner(false);
00793     if (lower)
00794       part->set_nearest_neighbor_below(lower);
00795   }
00796 }
00797 
00798 // High level interface. Input is an unmarked ColPartitionGrid
00799 // (namely, clean_part_grid_). Partitions are identified using local
00800 // information and filter/smoothed. The function exit should contain
00801 // a good sampling of the table partitions.
00802 void TableFinder::MarkTablePartitions() {
00803   MarkPartitionsUsingLocalInformation();
00804   if (textord_tablefind_show_mark) {
00805     ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions");
00806     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
00807     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
00808                          ScrollView::AQUAMARINE);
00809   }
00810   FilterFalseAlarms();
00811   if (textord_tablefind_show_mark) {
00812     ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions");
00813     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
00814     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
00815                          ScrollView::AQUAMARINE);
00816   }
00817   SmoothTablePartitionRuns();
00818   if (textord_tablefind_show_mark) {
00819     ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
00820     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
00821     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
00822                          ScrollView::AQUAMARINE);
00823   }
00824   FilterFalseAlarms();
00825   if (textord_tablefind_show_mark || textord_show_tables) {
00826     ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions");
00827     DisplayColPartitions(table_win, &clean_part_grid_, ScrollView::BLUE);
00828     DisplayColPartitions(table_win, &leader_and_ruling_grid_,
00829                          ScrollView::AQUAMARINE);
00830   }
00831 }
00832 
00833 // These types of partitions are marked as table partitions:
00834 //  1- Partitions that have at lease one large gap between words
00835 //  2- Partitions that consist of only one word (no significant gap
00836 //     between components)
00837 //  3- Partitions that vertically overlap with other partitions within the
00838 //     same column.
00839 //  4- Partitions with leaders before/after them.
00840 void TableFinder::MarkPartitionsUsingLocalInformation() {
00841   // Iterate the ColPartitions in the grid.
00842   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
00843     gsearch(&clean_part_grid_);
00844   gsearch.StartFullSearch();
00845   ColPartition* part = NULL;
00846   while ((part = gsearch.NextFullSearch()) != NULL) {
00847     if (!part->IsTextType())  // Only consider text partitions
00848       continue;
00849     // Only consider partitions in dominant font size or smaller
00850     if (part->median_size() > kMaxTableCellXheight * global_median_xheight_)
00851       continue;
00852     // Mark partitions with a large gap, or no significant gap as
00853     // table partitions.
00854     // Comments: It produces several false alarms at:
00855     //  - last line of a paragraph (fixed)
00856     //  - single word section headings
00857     //  - page headers and footers
00858     //  - numbered equations
00859     //  - line drawing regions
00860     // TODO(faisal): detect and fix above-mentioned cases
00861     if (HasWideOrNoInterWordGap(part) ||
00862         HasLeaderAdjacent(*part)) {
00863       part->set_table_type();
00864     }
00865   }
00866 }
00867 
00868 // Check if the partition has at least one large gap between words or no
00869 // significant gap at all
00870 bool TableFinder::HasWideOrNoInterWordGap(ColPartition* part) const {
00871   // Should only get text partitions.
00872   ASSERT_HOST(part->IsTextType());
00873   // Blob access
00874   BLOBNBOX_CLIST* part_boxes = part->boxes();
00875   BLOBNBOX_C_IT it(part_boxes);
00876   // Check if this is a relatively small partition (such as a single word)
00877   if (part->bounding_box().width() <
00878       kMinBoxesInTextPartition * part->median_size() &&
00879       part_boxes->length() < kMinBoxesInTextPartition)
00880     return true;
00881 
00882   // Variables used to compute inter-blob spacing.
00883   int current_x0 = -1;
00884   int current_x1 = -1;
00885   int previous_x1 = -1;
00886   // Stores the maximum gap detected.
00887   int largest_partition_gap_found = -1;
00888   // Text partition gap limits. If this is text (and not a table),
00889   // there should be at least one gap larger than min_gap and no gap
00890   // larger than max_gap.
00891   const double max_gap = kMaxGapInTextPartition * part->median_size();
00892   const double min_gap = kMinMaxGapInTextPartition * part->median_size();
00893 
00894   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00895     BLOBNBOX* blob = it.data();
00896     current_x0 = blob->bounding_box().left();
00897     current_x1 = blob->bounding_box().right();
00898     if (previous_x1 != -1) {
00899       int gap = current_x0 - previous_x1;
00900 
00901       // TODO(nbeato): Boxes may overlap? Huh?
00902       // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
00903       // on the top right of the page are filtered out with this line.
00904       // Note 2: Iterating over blobs in a partition, so we are looking for
00905       // spacing between the words.
00906       if (gap < 0) {
00907         // More likely case, the blobs slightly overlap. This can happen
00908         // with diacritics (accents) or broken alphabet symbols (characters).
00909         // Merge boxes together by taking max of right sides.
00910         if (-gap < part->median_size() * kMaxBlobOverlapFactor) {
00911           previous_x1 = MAX(previous_x1, current_x1);
00912           continue;
00913         }
00914         // Extreme case, blobs overlap significantly in the same partition...
00915         // This should not happen often (if at all), but it does.
00916         // TODO(nbeato): investigate cases when this happens.
00917         else {
00918           // The behavior before was to completely ignore this case.
00919         }
00920       }
00921 
00922       // If a large enough gap is found, mark it as a table cell (return true)
00923       if (gap > max_gap)
00924         return true;
00925       if (gap > largest_partition_gap_found)
00926         largest_partition_gap_found = gap;
00927     }
00928     previous_x1 = current_x1;
00929   }
00930   // Since no large gap was found, return false if the partition is too
00931   // long to be a data cell
00932   if (part->bounding_box().width() >
00933       kMaxBoxesInDataPartition * part->median_size() ||
00934       part_boxes->length() > kMaxBoxesInDataPartition)
00935     return false;
00936 
00937   // A partition may be a single blob. In this case, it's an isolated symbol
00938   // or non-text (such as a ruling or image).
00939   // Detect these as table partitions? Shouldn't this be case by case?
00940   // The behavior before was to ignore this, making max_partition_gap < 0
00941   // and implicitly return true. Just making it explicit.
00942   if (largest_partition_gap_found == -1)
00943     return true;
00944 
00945   // return true if the maximum gap found is smaller than the minimum allowed
00946   // max_gap in a text partition. This indicates that there is no signficant
00947   // space in the partition, hence it is likely a single word.
00948   return largest_partition_gap_found < min_gap;
00949 }
00950 
00951 // A criteria for possible tables is that a table may have leaders
00952 // between data cells. An aggressive solution to find such tables is to
00953 // explicitly mark partitions that have adjacent leaders.
00954 // Note that this includes overlapping leaders. However, it does not
00955 // include leaders in different columns on the page.
00956 // Possible false-positive will include lists, such as a table of contents.
00957 // As these arise, the agressive nature of this search may need to be
00958 // trimmed down.
00959 bool TableFinder::HasLeaderAdjacent(const ColPartition& part) {
00960   if (part.flow() == BTFT_LEADER)
00961     return true;
00962   // Search range is left and right bounded by an offset of the
00963   // median xheight. This offset is to allow some tolerance to the
00964   // the leaders on the page in the event that the alignment is still
00965   // a bit off.
00966   const TBOX& box = part.bounding_box();
00967   const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
00968   const int top = box.top() + search_size;
00969   const int bottom = box.bottom() - search_size;
00970   ColPartitionGridSearch hsearch(&leader_and_ruling_grid_);
00971   for (int direction = 0; direction < 2; ++direction) {
00972     bool right_to_left = (direction == 0);
00973     int x = right_to_left ? box.right() : box.left();
00974     hsearch.StartSideSearch(x, bottom, top);
00975     ColPartition* leader = NULL;
00976     while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) {
00977       // This should not happen, they are in different grids.
00978       ASSERT_HOST(&part != leader);
00979       // The leader could be a horizontal ruling in the grid.
00980       // Make sure it is actually a leader.
00981       if (leader->flow() != BTFT_LEADER)
00982         continue;
00983       // Make sure the leader shares a page column with the partition,
00984       // otherwise we are spreading across columns.
00985       if (!part.IsInSameColumnAs(*leader))
00986         break;
00987       // There should be a significant vertical overlap
00988       if (!leader->VSignificantCoreOverlap(part))
00989         continue;
00990       // Leader passed all tests, so it is adjacent.
00991       return true;
00992     }
00993   }
00994   // No leaders are adjacent to the given partition.
00995   return false;
00996 }
00997 
00998 // Filter individual text partitions marked as table partitions
00999 // consisting of paragraph endings, small section headings, and
01000 // headers and footers.
01001 void TableFinder::FilterFalseAlarms() {
01002   FilterParagraphEndings();
01003   FilterHeaderAndFooter();
01004   // TODO(nbeato): Fully justified text as non-table?
01005 }
01006 
01007 void TableFinder::FilterParagraphEndings() {
01008   // Detect last line of paragraph
01009   // Iterate the ColPartitions in the grid.
01010   ColPartitionGridSearch gsearch(&clean_part_grid_);
01011   gsearch.StartFullSearch();
01012   ColPartition* part = NULL;
01013   while ((part = gsearch.NextFullSearch()) != NULL) {
01014     if (part->type() != PT_TABLE)
01015       continue;  // Consider only table partitions
01016 
01017     // Paragraph ending should have flowing text above it.
01018     ColPartition* upper_part = part->nearest_neighbor_above();
01019     if (!upper_part)
01020       continue;
01021     if (upper_part->type() != PT_FLOWING_TEXT)
01022       continue;
01023     if (upper_part->bounding_box().width() <
01024         2 * part->bounding_box().width())
01025       continue;
01026     // Check if its the last line of a paragraph.
01027     // In most cases, a paragraph ending should be left-aligned to text line
01028     // above it. Sometimes, it could be a 2 line paragraph, in which case
01029     // the line above it is indented.
01030     // To account for that, check if the partition center is to
01031     // the left of the one above it.
01032     int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
01033     int upper_mid = (upper_part->bounding_box().left() +
01034                      upper_part->bounding_box().right()) / 2;
01035     int current_spacing = 0;  // spacing of the current line to margin
01036     int upper_spacing = 0;    // spacing of the previous line to the margin
01037     if (left_to_right_language_) {
01038       // Left to right languages, use mid - left to figure out the distance
01039       // the middle is from the left margin.
01040       int left = MIN(part->bounding_box().left(),
01041                      upper_part->bounding_box().left());
01042       current_spacing = mid - left;
01043       upper_spacing = upper_mid - left;
01044     } else {
01045       // Right to left languages, use right - mid to figure out the distance
01046       // the middle is from the right margin.
01047       int right = MAX(part->bounding_box().right(),
01048                       upper_part->bounding_box().right());
01049       current_spacing = right - mid;
01050       upper_spacing = right - upper_mid;
01051     }
01052     if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing)
01053       continue;
01054 
01055     // Paragraphs should have similar fonts.
01056     if (!part->MatchingSizes(*upper_part) ||
01057         !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,
01058                                    kStrokeWidthConstantTolerance)) {
01059       continue;
01060     }
01061 
01062     // The last line of a paragraph should be left aligned.
01063     // TODO(nbeato): This would be untrue if the text was right aligned.
01064     // How often is that?
01065     if (part->space_to_left() >
01066         kMaxParagraphEndingLeftSpaceMultiple * part->median_size())
01067       continue;
01068     // The line above it should be right aligned (assuming justified format).
01069     // Since we can't assume justified text, we compare whitespace to text.
01070     // The above line should have majority spanning text (or the current
01071     // line could have fit on the previous line). So compare
01072     // whitespace to text.
01073     if (upper_part->bounding_box().width() <
01074         kMinParagraphEndingTextToWhitespaceRatio * upper_part->space_to_right())
01075       continue;
01076 
01077     // Ledding above the line should be less than ledding below
01078     if (part->space_above() >= part->space_below() ||
01079         part->space_above() > 2 * global_median_ledding_)
01080       continue;
01081 
01082     // If all checks failed, it is probably text.
01083     part->clear_table_type();
01084   }
01085 }
01086 
01087 void TableFinder::FilterHeaderAndFooter() {
01088   // Consider top-most text colpartition as header and bottom most as footer
01089   ColPartition* header = NULL;
01090   ColPartition* footer = NULL;
01091   int max_top = MIN_INT32;
01092   int min_bottom = MAX_INT32;
01093   ColPartitionGridSearch gsearch(&clean_part_grid_);
01094   gsearch.StartFullSearch();
01095   ColPartition* part = NULL;
01096   while ((part = gsearch.NextFullSearch()) != NULL) {
01097     if (!part->IsTextType())
01098       continue;  // Consider only text partitions
01099     int top = part->bounding_box().top();
01100     int bottom = part->bounding_box().bottom();
01101     if (top > max_top) {
01102       max_top = top;
01103       header = part;
01104     }
01105     if (bottom < min_bottom) {
01106       min_bottom = bottom;
01107       footer = part;
01108     }
01109   }
01110   if (header)
01111     header->clear_table_type();
01112   if (footer)
01113     footer->clear_table_type();
01114 }
01115 
01116 // Mark all ColPartitions as table cells that have a table cell above
01117 // and below them
01118 // TODO(faisal): This is too aggressive at the moment. The method needs to
01119 // consider spacing and alignment as well. Detection of false alarm table cells
01120 // should also be done as part of it.
01121 void TableFinder::SmoothTablePartitionRuns() {
01122   // Iterate the ColPartitions in the grid.
01123   ColPartitionGridSearch gsearch(&clean_part_grid_);
01124   gsearch.StartFullSearch();
01125   ColPartition* part = NULL;
01126   while ((part = gsearch.NextFullSearch()) != NULL) {
01127     if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN)
01128       continue;  // Consider only text partitions
01129     ColPartition* upper_part = part->nearest_neighbor_above();
01130     ColPartition* lower_part = part->nearest_neighbor_below();
01131     if (!upper_part || !lower_part)
01132       continue;
01133     if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE)
01134       part->set_table_type();
01135   }
01136 
01137   // Pass 2, do the opposite. If both the upper and lower neighbors
01138   // exist and are not tables, this probably shouldn't be a table.
01139   gsearch.StartFullSearch();
01140   part = NULL;
01141   while ((part = gsearch.NextFullSearch()) != NULL) {
01142     if (part->type() != PT_TABLE)
01143       continue;  // Consider only text partitions
01144     ColPartition* upper_part = part->nearest_neighbor_above();
01145     ColPartition* lower_part = part->nearest_neighbor_below();
01146 
01147     // table can't be by itself
01148     if ((upper_part && upper_part->type() != PT_TABLE) &&
01149         (lower_part && lower_part->type() != PT_TABLE)) {
01150       part->clear_table_type();
01151     }
01152   }
01153 }
01154 
01155 // Set the type of a column segment based on the ratio of table to text cells
01156 void TableFinder::SetColumnsType(ColSegment_LIST* column_blocks) {
01157   ColSegment_IT it(column_blocks);
01158   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01159     ColSegment* seg = it.data();
01160     TBOX box = seg->bounding_box();
01161     int num_table_cells = 0;
01162     int num_text_cells = 0;
01163     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01164         rsearch(&clean_part_grid_);
01165     rsearch.SetUniqueMode(true);
01166     rsearch.StartRectSearch(box);
01167     ColPartition* part = NULL;
01168     while ((part = rsearch.NextRectSearch()) != NULL) {
01169       if (part->type() == PT_TABLE) {
01170         num_table_cells++;
01171       } else if (part->type() == PT_FLOWING_TEXT) {
01172         num_text_cells++;
01173       }
01174     }
01175     // If a column block has no text or table partition in it, it is not needed
01176     // for table detection.
01177     if (!num_table_cells && !num_text_cells) {
01178       delete it.extract();
01179     } else {
01180       seg->set_num_table_cells(num_table_cells);
01181       seg->set_num_text_cells(num_text_cells);
01182       // set column type based on the ratio of table to text cells
01183       seg->set_type();
01184     }
01185   }
01186 }
01187 
01188 // Move column blocks to grid
01189 void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments,
01190                                          ColSegmentGrid *col_seg_grid) {
01191   ColSegment_IT it(segments);
01192   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01193     ColSegment* seg = it.extract();
01194     col_seg_grid->InsertBBox(true, true, seg);
01195   }
01196 }
01197 
01198 // Merge column blocks if a split is detected due to the presence of a
01199 // table. A text block is considered split if it has multiple
01200 // neighboring blocks above/below it, and at least one of the
01201 // neighboring blocks is of table type (has a high density of table
01202 // partitions). In this case neighboring blocks in the direction
01203 // (above/below) of the table block are merged with the text block.
01204 
01205 // Comment: This method does not handle split due to a full page table
01206 // since table columns in this case do not have a text column on which
01207 // split decision can be based.
01208 void TableFinder::GridMergeColumnBlocks() {
01209   int margin = gridsize();
01210 
01211   // Iterate the Column Blocks in the grid.
01212   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
01213     gsearch(&col_seg_grid_);
01214   gsearch.StartFullSearch();
01215   ColSegment* seg;
01216   while ((seg = gsearch.NextFullSearch()) != NULL) {
01217     if (seg->type() != COL_TEXT)
01218       continue;  // only consider text blocks for split detection
01219     bool neighbor_found = false;
01220     bool modified = false;  // Modified at least once
01221     // keep expanding current box as long as neighboring table columns
01222     // are found above or below it.
01223     do {
01224       TBOX box = seg->bounding_box();
01225       // slightly expand the search region vertically
01226       int top_range = MIN(box.top() + margin, tright().y());
01227       int bottom_range = MAX(box.bottom() - margin, bleft().y());
01228       box.set_top(top_range);
01229       box.set_bottom(bottom_range);
01230       neighbor_found = false;
01231       GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
01232           rectsearch(&col_seg_grid_);
01233       rectsearch.StartRectSearch(box);
01234       ColSegment* neighbor = NULL;
01235       while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
01236         if (neighbor == seg)
01237           continue;
01238         const TBOX& neighbor_box = neighbor->bounding_box();
01239         // If the neighbor box significantly overlaps with the current
01240         // box (due to the expansion of the current box in the
01241         // previous iteration of this loop), remove the neighbor box
01242         // and expand the current box to include it.
01243         if (neighbor_box.overlap_fraction(box) >= 0.9) {
01244           seg->InsertBox(neighbor_box);
01245           modified = true;
01246           rectsearch.RemoveBBox();
01247           gsearch.RepositionIterator();
01248           delete neighbor;
01249           continue;
01250         }
01251         // Only expand if the neighbor box is of table type
01252         if (neighbor->type() != COL_TABLE)
01253           continue;
01254         // Insert the neighbor box into the current column block
01255         if (neighbor_box.major_x_overlap(box) &&
01256             !box.contains(neighbor_box)) {
01257           seg->InsertBox(neighbor_box);
01258           neighbor_found = true;
01259           modified = true;
01260           rectsearch.RemoveBBox();
01261           gsearch.RepositionIterator();
01262           delete neighbor;
01263         }
01264       }
01265     } while (neighbor_found);
01266     if (modified) {
01267       // Because the box has changed, it has to be removed first.
01268       gsearch.RemoveBBox();
01269       col_seg_grid_.InsertBBox(true, true, seg);
01270       gsearch.RepositionIterator();
01271     }
01272   }
01273 }
01274 
01275 // Group horizontally overlapping table partitions into table columns.
01276 // TODO(faisal): This is too aggressive at the moment. The method should
01277 // consider more attributes to group table partitions together. Some common
01278 // errors are:
01279 //  1- page number is merged with a table column above it even
01280 //      if there is a large vertical gap between them.
01281 //  2- column headers go on to catch one of the columns arbitrarily
01282 //  3- an isolated noise blob near page top or bottom merges with the table
01283 //     column below/above it
01284 //  4- cells from two vertically adjacent tables merge together to make a
01285 //     single column resulting in merging of the two tables
01286 void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) {
01287   ColSegment_IT it(table_columns);
01288   // Iterate the ColPartitions in the grid.
01289   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01290     gsearch(&clean_part_grid_);
01291   gsearch.StartFullSearch();
01292   ColPartition* part;
01293   while ((part = gsearch.NextFullSearch()) != NULL) {
01294     if (part->inside_table_column() || part->type() != PT_TABLE)
01295       continue;  // prevent a partition to be assigned to multiple columns
01296     const TBOX& box = part->bounding_box();
01297     ColSegment* col = new ColSegment();
01298     col->InsertBox(box);
01299     part->set_inside_table_column(true);
01300     // Start a search below the current cell to find bottom neighbours
01301     // Note: a full search will always process things above it first, so
01302     // this should be starting at the highest cell and working its way down.
01303     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01304         vsearch(&clean_part_grid_);
01305     vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
01306     ColPartition* neighbor = NULL;
01307     bool found_neighbours = false;
01308     while ((neighbor = vsearch.NextVerticalSearch(true)) != NULL) {
01309       // only consider neighbors not assigned to any column yet
01310       if (neighbor->inside_table_column())
01311         continue;
01312       // Horizontal lines should not break the flow
01313       if (neighbor->IsHorizontalLine())
01314         continue;
01315       // presence of a non-table neighbor marks the end of current
01316       // table column
01317       if (neighbor->type() != PT_TABLE)
01318         break;
01319       // add the neighbor partition to the table column
01320       const TBOX& neighbor_box = neighbor->bounding_box();
01321       col->InsertBox(neighbor_box);
01322       neighbor->set_inside_table_column(true);
01323       found_neighbours = true;
01324     }
01325     if (found_neighbours) {
01326       it.add_after_then_move(col);
01327     } else {
01328       part->set_inside_table_column(false);
01329       delete col;
01330     }
01331   }
01332 }
01333 
01334 // Mark regions in a column that are x-bounded by the column boundaries and
01335 // y-bounded by the table columns' projection on the y-axis as table regions
01336 void TableFinder::GetTableRegions(ColSegment_LIST* table_columns,
01337                                   ColSegment_LIST* table_regions) {
01338   ColSegment_IT cit(table_columns);
01339   ColSegment_IT rit(table_regions);
01340   // Iterate through column blocks
01341   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
01342       gsearch(&col_seg_grid_);
01343   gsearch.StartFullSearch();
01344   ColSegment* part;
01345   int page_height = tright().y() - bleft().y();
01346   ASSERT_HOST(page_height > 0);
01347   // create a bool array to hold projection on y-axis
01348   bool* table_region = new bool[page_height];
01349   while ((part = gsearch.NextFullSearch()) != NULL) {
01350     TBOX part_box = part->bounding_box();
01351     // reset the projection array
01352     for (int i = 0; i < page_height; i++) {
01353       table_region[i] = false;
01354     }
01355     // iterate through all table columns to find regions in the current
01356     // page column block
01357     cit.move_to_first();
01358     for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
01359       TBOX col_box = cit.data()->bounding_box();
01360       // find intersection region of table column and page column
01361       TBOX intersection_box = col_box.intersection(part_box);
01362       // project table column on the y-axis
01363       for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
01364         table_region[i - bleft().y()] = true;
01365       }
01366     }
01367     // set x-limits of table regions to page column width
01368     TBOX current_table_box;
01369     current_table_box.set_left(part_box.left());
01370     current_table_box.set_right(part_box.right());
01371     // go through the y-axis projection to find runs of table
01372     // regions. Each run makes one table region.
01373     for (int i = 1; i < page_height; i++) {
01374       // detect start of a table region
01375       if (!table_region[i - 1] && table_region[i]) {
01376         current_table_box.set_bottom(i + bleft().y());
01377       }
01378       // TODO(nbeato): Is it guaranteed that the last row is not a table region?
01379       // detect end of a table region
01380       if (table_region[i - 1] && !table_region[i]) {
01381         current_table_box.set_top(i + bleft().y());
01382         if (!current_table_box.null_box()) {
01383           ColSegment* seg = new ColSegment();
01384           seg->InsertBox(current_table_box);
01385           rit.add_after_then_move(seg);
01386         }
01387       }
01388     }
01389   }
01390   delete[] table_region;
01391 }
01392 
01393 // Merge table regions corresponding to tables spanning multiple columns if
01394 // there is a colpartition (horizontal ruling line or normal text) that
01395 // touches both regions.
01396 // TODO(faisal): A rare error occurs if there are two horizontally adjacent
01397 // tables with aligned ruling lines. In this case, line finder returns a
01398 // single line and hence the tables get merged together
01399 void TableFinder::GridMergeTableRegions() {
01400   // Iterate the table regions in the grid.
01401   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
01402       gsearch(&table_grid_);
01403   gsearch.StartFullSearch();
01404   ColSegment* seg = NULL;
01405   while ((seg = gsearch.NextFullSearch()) != NULL) {
01406     bool neighbor_found = false;
01407     bool modified = false;  // Modified at least once
01408     do {
01409       // Start a rectangle search x-bounded by the image and y by the table
01410       const TBOX& box = seg->bounding_box();
01411       TBOX search_region(box);
01412       search_region.set_left(bleft().x());
01413       search_region.set_right(tright().x());
01414       neighbor_found = false;
01415       GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
01416           rectsearch(&table_grid_);
01417       rectsearch.StartRectSearch(search_region);
01418       ColSegment* neighbor = NULL;
01419       while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
01420         if (neighbor == seg)
01421           continue;
01422         const TBOX& neighbor_box = neighbor->bounding_box();
01423         // Check if a neighbor box has a large overlap with the table
01424         // region.  This may happen as a result of merging two table
01425         // regions in the previous iteration.
01426         if (neighbor_box.overlap_fraction(box) >= 0.9) {
01427           seg->InsertBox(neighbor_box);
01428           rectsearch.RemoveBBox();
01429           gsearch.RepositionIterator();
01430           delete neighbor;
01431           modified = true;
01432           continue;
01433         }
01434         // Check if two table regions belong together based on a common
01435         // horizontal ruling line
01436         if (BelongToOneTable(box, neighbor_box)) {
01437           seg->InsertBox(neighbor_box);
01438           neighbor_found = true;
01439           modified = true;
01440           rectsearch.RemoveBBox();
01441           gsearch.RepositionIterator();
01442           delete neighbor;
01443         }
01444       }
01445     } while (neighbor_found);
01446     if (modified) {
01447       // Because the box has changed, it has to be removed first.
01448       gsearch.RemoveBBox();
01449       table_grid_.InsertBBox(true, true, seg);
01450       gsearch.RepositionIterator();
01451     }
01452   }
01453 }
01454 
01455 // Decide if two table regions belong to one table based on a common
01456 // horizontal ruling line or another colpartition
01457 bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) {
01458   // Check the obvious case. Most likely not true because overlapping boxes
01459   // should already be merged, but seems like a good thing to do in case things
01460   // change.
01461   if (box1.overlap(box2))
01462     return true;
01463   // Check for ColPartitions spanning both table regions
01464   TBOX bbox = box1.bounding_union(box2);
01465   // Start a rect search on bbox
01466   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01467       rectsearch(&clean_part_grid_);
01468   rectsearch.StartRectSearch(bbox);
01469   ColPartition* part = NULL;
01470   while ((part = rectsearch.NextRectSearch()) != NULL) {
01471     const TBOX& part_box = part->bounding_box();
01472     // return true if a colpartition spanning both table regions is found
01473     if (part_box.overlap(box1) && part_box.overlap(box2) &&
01474         !part->IsImageType())
01475       return true;
01476   }
01477   return false;
01478 }
01479 
01480 // Adjust table boundaries by:
01481 //  - building a tight bounding box around all ColPartitions contained in it.
01482 //  - expanding table boundaries to include all colpartitions that overlap the
01483 //    table by more than half of their area
01484 //  - expanding table boundaries to include nearby horizontal rule lines
01485 //  - expanding table vertically to include left out column headers
01486 // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually
01487 //               makes following errors:
01488 //  1- horizontal lines consisting of underlines are included in the table if
01489 //     they are close enough
01490 //  2- horizontal lines originating from noise tend to get merged with a table
01491 //     near the top of the page
01492 //  3- the criteria for including horizontal lines is very generous. Many times
01493 //     horizontal lines separating headers and footers get merged with a
01494 //     single-column table in a multi-column page thereby including text
01495 //     from the neighboring column inside the table
01496 //  4- the criteria for including left out column headers also tends to
01497 //     occasionally include text-lines above the tables, typically from
01498 //     table caption
01499 void TableFinder::AdjustTableBoundaries() {
01500   // Iterate the table regions in the grid
01501   ColSegment_CLIST adjusted_tables;
01502   ColSegment_C_IT it(&adjusted_tables);
01503   ColSegmentGridSearch gsearch(&table_grid_);
01504   gsearch.StartFullSearch();
01505   ColSegment* table = NULL;
01506   while ((table = gsearch.NextFullSearch()) != NULL) {
01507     const TBOX& table_box = table->bounding_box();
01508     TBOX grown_box = table_box;
01509     GrowTableBox(table_box, &grown_box);
01510     // To prevent a table from expanding again, do not insert the
01511     // modified box back to the grid. Instead move it to a list and
01512     // and remove it from the grid. The list is moved later back to the grid.
01513     if (!grown_box.null_box()) {
01514       ColSegment* col = new ColSegment();
01515       col->InsertBox(grown_box);
01516       it.add_after_then_move(col);
01517     }
01518     gsearch.RemoveBBox();
01519     delete table;
01520   }
01521   // clear table grid to move final tables in it
01522   // TODO(nbeato): table_grid_ should already be empty. The above loop
01523   // removed everything. Maybe just assert it is empty?
01524   table_grid_.Clear();
01525   it.move_to_first();
01526   // move back final tables to table_grid_
01527   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01528     ColSegment* seg = it.extract();
01529     table_grid_.InsertBBox(true, true, seg);
01530   }
01531 }
01532 
01533 void TableFinder::GrowTableBox(const TBOX& table_box, TBOX* result_box) {
01534   // TODO(nbeato): The growing code is a bit excessive right now.
01535   // By removing these lines, the partitions considered need
01536   // to have some overlap or be special cases. These lines could
01537   // be added again once a check is put in place to make sure that
01538   // growing tables don't stomp on a lot of non-table partitions.
01539 
01540   // search for horizontal ruling lines within the vertical margin
01541   // int vertical_margin = kRulingVerticalMargin * gridsize();
01542   TBOX search_box = table_box;
01543   // int top = MIN(search_box.top() + vertical_margin, tright().y());
01544   // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
01545   // search_box.set_top(top);
01546   // search_box.set_bottom(bottom);
01547 
01548   GrowTableToIncludePartials(table_box, search_box, result_box);
01549   GrowTableToIncludeLines(table_box, search_box, result_box);
01550   IncludeLeftOutColumnHeaders(result_box);
01551 }
01552 
01553 // Grow a table by increasing the size of the box to include
01554 // partitions with significant overlap with the table.
01555 void TableFinder::GrowTableToIncludePartials(const TBOX& table_box,
01556                                              const TBOX& search_range,
01557                                              TBOX* result_box) {
01558   // Rulings are in a different grid, so search 2 grids for rulings, text,
01559   // and table partitions that are not entirely within the new box.
01560   for (int i = 0; i < 2; ++i) {
01561     ColPartitionGrid* grid = (i == 0) ? &fragmented_text_grid_ :
01562                                         &leader_and_ruling_grid_;
01563     ColPartitionGridSearch rectsearch(grid);
01564     rectsearch.StartRectSearch(search_range);
01565     ColPartition* part = NULL;
01566     while ((part = rectsearch.NextRectSearch()) != NULL) {
01567      // Only include text and table types.
01568       if (part->IsImageType())
01569         continue;
01570       const TBOX& part_box = part->bounding_box();
01571       // Include partition in the table if more than half of it
01572       // is covered by the table
01573       if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
01574         *result_box = result_box->bounding_union(part_box);
01575         continue;
01576       }
01577     }
01578   }
01579 }
01580 
01581 // Grow a table by expanding to the extents of significantly
01582 // overlapping lines.
01583 void TableFinder::GrowTableToIncludeLines(const TBOX& table_box,
01584                                           const TBOX& search_range,
01585                                           TBOX* result_box) {
01586   ColPartitionGridSearch rsearch(&leader_and_ruling_grid_);
01587   rsearch.SetUniqueMode(true);
01588   rsearch.StartRectSearch(search_range);
01589   ColPartition* part = NULL;
01590   while ((part = rsearch.NextRectSearch()) != NULL) {
01591     // TODO(nbeato) This should also do vertical, but column
01592     // boundaries are breaking things. This function needs to be
01593     // updated to allow vertical lines as well.
01594     if (!part->IsLineType())
01595       continue;
01596     // Avoid the following function call if the result of the
01597     // function is irrelevant.
01598     const TBOX& part_box = part->bounding_box();
01599     if (result_box->contains(part_box))
01600       continue;
01601     // Include a partially overlapping horizontal line only if the
01602     // extra ColPartitions that will be included due to expansion
01603     // have large side spacing w.r.t. columns containing them.
01604     if (HLineBelongsToTable(*part, table_box))
01605       *result_box = result_box->bounding_union(part_box);
01606     // TODO(nbeato): Vertical
01607   }
01608 }
01609 
01610 // Checks whether the horizontal line belong to the table by looking at the
01611 // side spacing of extra ColParitions that will be included in the table
01612 // due to expansion
01613 bool TableFinder::HLineBelongsToTable(const ColPartition& part,
01614                                       const TBOX& table_box) {
01615   if (!part.IsHorizontalLine())
01616     return false;
01617   const TBOX& part_box = part.bounding_box();
01618   if (!part_box.major_x_overlap(table_box))
01619     return false;
01620   // Do not consider top-most horizontal line since it usually
01621   // originates from noise.
01622   // TODO(nbeato): I had to comment this out because the ruling grid doesn't
01623   // have neighbors solved.
01624   // if (!part.nearest_neighbor_above())
01625   //   return false;
01626   const TBOX bbox = part_box.bounding_union(table_box);
01627   // In the "unioned table" box (the table extents expanded by the line),
01628   // keep track of how many partitions have significant padding to the left
01629   // and right. If more than half of the partitions covered by the new table
01630   // have significant spacing, the line belongs to the table and the table
01631   // grows to include all of the partitions.
01632   int num_extra_partitions = 0;
01633   int extra_space_to_right = 0;
01634   int extra_space_to_left = 0;
01635   // Rulings are in a different grid, so search 2 grids for rulings, text,
01636   // and table partitions that are introduced by the new box.
01637   for (int i = 0; i < 2; ++i) {
01638     ColPartitionGrid* grid = (i == 0) ? &clean_part_grid_ :
01639                                         &leader_and_ruling_grid_;
01640     // Start a rect search on bbox
01641     ColPartitionGridSearch rectsearch(grid);
01642     rectsearch.SetUniqueMode(true);
01643     rectsearch.StartRectSearch(bbox);
01644     ColPartition* extra_part = NULL;
01645     while ((extra_part = rectsearch.NextRectSearch()) != NULL) {
01646       // ColPartition already in table
01647       const TBOX& extra_part_box = extra_part->bounding_box();
01648       if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable)
01649         continue;
01650       // Non-text ColPartitions do not contribute
01651       if (extra_part->IsImageType())
01652         continue;
01653       // Consider this partition.
01654       num_extra_partitions++;
01655       // presence of a table cell is a strong hint, so just increment the scores
01656       // without looking at the spacing.
01657       if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
01658         extra_space_to_right++;
01659         extra_space_to_left++;
01660         continue;
01661       }
01662       int space_threshold = kSideSpaceMargin * part.median_size();
01663       if (extra_part->space_to_right() > space_threshold)
01664         extra_space_to_right++;
01665       if (extra_part->space_to_left() > space_threshold)
01666         extra_space_to_left++;
01667     }
01668   }
01669   // tprintf("%d %d %d\n",
01670   // num_extra_partitions,extra_space_to_right,extra_space_to_left);
01671   return (extra_space_to_right > num_extra_partitions / 2) ||
01672       (extra_space_to_left > num_extra_partitions / 2);
01673 }
01674 
01675 // Look for isolated column headers above the given table box and
01676 // include them in the table
01677 void TableFinder::IncludeLeftOutColumnHeaders(TBOX* table_box) {
01678   // Start a search above the current table to look for column headers
01679   ColPartitionGridSearch vsearch(&clean_part_grid_);
01680   vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
01681                               table_box->top());
01682   ColPartition* neighbor = NULL;
01683   ColPartition* previous_neighbor = NULL;
01684   while ((neighbor = vsearch.NextVerticalSearch(false)) != NULL) {
01685     // Max distance to find a table heading.
01686     const int max_distance = kMaxColumnHeaderDistance *
01687                              neighbor->median_size();
01688     int table_top = table_box->top();
01689     const TBOX& box = neighbor->bounding_box();
01690     // Do not continue if the next box is way above
01691     if (box.bottom() - table_top > max_distance)
01692       break;
01693     // Unconditionally include partitions of type TABLE or LINE
01694     // TODO(faisal): add some reasonable conditions here
01695     if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
01696       table_box->set_top(box.top());
01697       previous_neighbor = NULL;
01698       continue;
01699     }
01700     // If there are two text partitions, one above the other, without a table
01701     // cell on their left or right side, consider them a barrier and quit
01702     if (previous_neighbor == NULL) {
01703       previous_neighbor = neighbor;
01704     } else {
01705       const TBOX& previous_box = previous_neighbor->bounding_box();
01706       if (!box.major_y_overlap(previous_box))
01707         break;
01708     }
01709   }
01710 }
01711 
01712 // Remove false alarms consiting of a single column based on their
01713 // projection on the x-axis. Projection of a real table on the x-axis
01714 // should have at least one zero-valley larger than the global median
01715 // x-height of the page.
01716 void TableFinder::DeleteSingleColumnTables() {
01717   int page_width = tright().x() - bleft().x();
01718   ASSERT_HOST(page_width > 0);
01719   // create an integer array to hold projection on x-axis
01720   int* table_xprojection = new int[page_width];
01721   // Iterate through all tables in the table grid
01722   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
01723       table_search(&table_grid_);
01724   table_search.StartFullSearch();
01725   ColSegment* table;
01726   while ((table = table_search.NextFullSearch()) != NULL) {
01727     TBOX table_box = table->bounding_box();
01728     // reset the projection array
01729     for (int i = 0; i < page_width; i++) {
01730       table_xprojection[i] = 0;
01731     }
01732     // Start a rect search on table_box
01733     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01734         rectsearch(&clean_part_grid_);
01735     rectsearch.SetUniqueMode(true);
01736     rectsearch.StartRectSearch(table_box);
01737     ColPartition* part;
01738     while ((part = rectsearch.NextRectSearch()) != NULL) {
01739       if (!part->IsTextType())
01740         continue;  // Do not consider non-text partitions
01741       if (part->flow() == BTFT_LEADER)
01742         continue;  // Assume leaders are in tables
01743       TBOX part_box = part->bounding_box();
01744       // Do not consider partitions partially covered by the table
01745       if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable)
01746         continue;
01747       BLOBNBOX_CLIST* part_boxes = part->boxes();
01748       BLOBNBOX_C_IT pit(part_boxes);
01749 
01750       // Make sure overlapping blobs don't artificially inflate the number
01751       // of rows in the table. This happens frequently with things such as
01752       // decimals and split characters. Do this by assuming the column
01753       // partition is sorted mostly left to right and just clip
01754       // bounding boxes by the previous box's extent.
01755       int next_position_to_write = 0;
01756 
01757       for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
01758         BLOBNBOX *pblob = pit.data();
01759         // ignore blob height for the purpose of projection since we
01760         // are only interested in finding valleys
01761         int xstart = pblob->bounding_box().left();
01762         int xend = pblob->bounding_box().right();
01763 
01764         xstart = MAX(xstart, next_position_to_write);
01765         for (int i = xstart; i < xend; i++)
01766           table_xprojection[i - bleft().x()]++;
01767         next_position_to_write = xend;
01768       }
01769     }
01770     // Find largest valley between two reasonable peaks in the table
01771     if (!GapInXProjection(table_xprojection, page_width)) {
01772       table_search.RemoveBBox();
01773       delete table;
01774     }
01775   }
01776   delete[] table_xprojection;
01777 }
01778 
01779 // Return true if at least one gap larger than the global x-height
01780 // exists in the horizontal projection
01781 bool TableFinder::GapInXProjection(int* xprojection, int length) {
01782   // Find peak value of the histogram
01783   int peak_value = 0;
01784   for (int i = 0; i < length; i++) {
01785     if (xprojection[i] > peak_value) {
01786       peak_value = xprojection[i];
01787     }
01788   }
01789   // Peak value represents the maximum number of horizontally
01790   // overlapping colpartitions, so this can be considered as the
01791   // number of rows in the table
01792   if (peak_value < kMinRowsInTable)
01793     return false;
01794   double projection_threshold = kSmallTableProjectionThreshold * peak_value;
01795   if (peak_value >= kLargeTableRowCount)
01796     projection_threshold = kLargeTableProjectionThreshold * peak_value;
01797   // Threshold the histogram
01798   for (int i = 0; i < length; i++) {
01799     xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
01800   }
01801   // Find the largest run of zeros between two ones
01802   int largest_gap = 0;
01803   int run_start = -1;
01804   for (int i = 1; i < length; i++) {
01805     // detect start of a run of zeros
01806     if (xprojection[i - 1] && !xprojection[i]) {
01807       run_start = i;
01808     }
01809     // detect end of a run of zeros and update the value of largest gap
01810     if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
01811       int gap = i - run_start;
01812       if (gap > largest_gap)
01813         largest_gap = gap;
01814       run_start = -1;
01815     }
01816   }
01817   return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
01818 }
01819 
01820 // Given the location of a table "guess", try to overlay a cellular
01821 // grid in the location, adjusting the boundaries.
01822 // TODO(nbeato): Falsely introduces:
01823 //   -headers/footers (not any worse, too much overlap destroys cells)
01824 //   -page numbers (not worse, included because maximize margins)
01825 //   -equations (nicely fit into a celluar grid, but more sparsely)
01826 //   -figures (random text box, also sparse)
01827 //   -small left-aligned text areas with overlapping positioned whitespace
01828 //       (rejected before)
01829 // Overall, this just needs some more work.
01830 void TableFinder::RecognizeTables() {
01831   ScrollView* table_win = NULL;
01832   if (textord_show_tables) {
01833     table_win = MakeWindow(0, 0, "Table Structure");
01834     DisplayColPartitions(table_win, &fragmented_text_grid_,
01835                          ScrollView::BLUE, ScrollView::LIGHT_BLUE);
01836     // table_grid_.DisplayBoxes(table_win);
01837   }
01838 
01839 
01840   TableRecognizer recognizer;
01841   recognizer.Init();
01842   recognizer.set_line_grid(&leader_and_ruling_grid_);
01843   recognizer.set_text_grid(&fragmented_text_grid_);
01844   recognizer.set_max_text_height(global_median_xheight_ * 2.0);
01845   recognizer.set_min_height(1.5 * gridheight());
01846   // Loop over all of the tables and try to fit them.
01847   // Store the good tables here.
01848   ColSegment_CLIST good_tables;
01849   ColSegment_C_IT good_it(&good_tables);
01850 
01851   ColSegmentGridSearch gsearch(&table_grid_);
01852   gsearch.StartFullSearch();
01853   ColSegment* found_table = NULL;
01854   while ((found_table = gsearch.NextFullSearch()) != NULL) {
01855     gsearch.RemoveBBox();
01856 
01857     // The goal is to make the tables persistent in a list.
01858     // When that happens, this will move into the search loop.
01859     const TBOX& found_box = found_table->bounding_box();
01860     StructuredTable* table_structure = recognizer.RecognizeTable(found_box);
01861 
01862     // Process a table. Good tables are inserted into the grid again later on
01863     // We can't change boxes in the grid while it is running a search.
01864     if (table_structure != NULL) {
01865       if (textord_show_tables) {
01866         table_structure->Display(table_win, ScrollView::LIME_GREEN);
01867       }
01868       found_table->set_bounding_box(table_structure->bounding_box());
01869       delete table_structure;
01870       good_it.add_after_then_move(found_table);
01871     } else {
01872       delete found_table;
01873     }
01874   }
01875   // TODO(nbeato): MERGE!! There is awesome info now available for merging.
01876 
01877   // At this point, the grid is empty. We can safely insert the good tables
01878   // back into grid.
01879   for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
01880     table_grid_.InsertBBox(true, true, good_it.extract());
01881 }
01882 
01883 // Displays the column segments in some window.
01884 void TableFinder::DisplayColSegments(ScrollView* win,
01885                                      ColSegment_LIST *segments,
01886                                      ScrollView::Color color) {
01887 #ifndef GRAPHICS_DISABLED
01888   win->Pen(color);
01889   win->Brush(ScrollView::NONE);
01890   ColSegment_IT it(segments);
01891   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01892     ColSegment* col = it.data();
01893     const TBOX& box = col->bounding_box();
01894     int left_x = box.left();
01895     int right_x = box.right();
01896     int top_y = box.top();
01897     int bottom_y = box.bottom();
01898     win->Rectangle(left_x, bottom_y, right_x, top_y);
01899   }
01900   win->UpdateWindow();
01901 #endif
01902 }
01903 
01904 void TableFinder::DisplayColSegmentGrid(ScrollView* win, ColSegmentGrid* grid,
01905                                          ScrollView::Color color) {
01906 #ifndef GRAPHICS_DISABLED
01907   // Iterate the ColPartitions in the grid.
01908   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
01909     gsearch(grid);
01910   gsearch.StartFullSearch();
01911   ColSegment* seg = NULL;
01912   while ((seg = gsearch.NextFullSearch()) != NULL) {
01913     const TBOX& box = seg->bounding_box();
01914     int left_x = box.left();
01915     int right_x = box.right();
01916     int top_y = box.top();
01917     int bottom_y = box.bottom();
01918     win->Brush(ScrollView::NONE);
01919     win->Pen(color);
01920     win->Rectangle(left_x, bottom_y, right_x, top_y);
01921   }
01922   win->UpdateWindow();
01923 #endif
01924 }
01925 
01926 // Displays the colpartitions using a new coloring on an existing window.
01927 // Note: This method is only for debug purpose during development and
01928 // would not be part of checked in code
01929 void TableFinder::DisplayColPartitions(ScrollView* win,
01930                                        ColPartitionGrid* grid,
01931                                        ScrollView::Color default_color,
01932                                        ScrollView::Color table_color) {
01933 #ifndef GRAPHICS_DISABLED
01934   ScrollView::Color color = default_color;
01935   // Iterate the ColPartitions in the grid.
01936   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01937     gsearch(grid);
01938   gsearch.StartFullSearch();
01939   ColPartition* part = NULL;
01940   while ((part = gsearch.NextFullSearch()) != NULL) {
01941     color = default_color;
01942     if (part->type() == PT_TABLE)
01943       color = table_color;
01944 
01945     const TBOX& box = part->bounding_box();
01946     int left_x = box.left();
01947     int right_x = box.right();
01948     int top_y = box.top();
01949     int bottom_y = box.bottom();
01950     win->Brush(ScrollView::NONE);
01951     win->Pen(color);
01952     win->Rectangle(left_x, bottom_y, right_x, top_y);
01953   }
01954   win->UpdateWindow();
01955 #endif
01956 }
01957 void TableFinder::DisplayColPartitions(ScrollView* win,
01958                                        ColPartitionGrid* grid,
01959                                        ScrollView::Color default_color) {
01960   DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
01961 }
01962 
01963 void TableFinder::DisplayColPartitionConnections(
01964                      ScrollView* win,
01965                      ColPartitionGrid* grid,
01966                      ScrollView::Color color) {
01967 #ifndef GRAPHICS_DISABLED
01968   // Iterate the ColPartitions in the grid.
01969   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01970     gsearch(grid);
01971   gsearch.StartFullSearch();
01972   ColPartition* part = NULL;
01973   while ((part = gsearch.NextFullSearch()) != NULL) {
01974     const TBOX& box = part->bounding_box();
01975     int left_x = box.left();
01976     int right_x = box.right();
01977     int top_y = box.top();
01978     int bottom_y = box.bottom();
01979 
01980     ColPartition* upper_part = part->nearest_neighbor_above();
01981     if (upper_part) {
01982       TBOX upper_box = upper_part->bounding_box();
01983       int mid_x = (left_x + right_x) / 2;
01984       int mid_y = (top_y + bottom_y) / 2;
01985       int other_x = (upper_box.left() + upper_box.right()) / 2;
01986       int other_y = (upper_box.top() + upper_box.bottom()) / 2;
01987       win->Brush(ScrollView::NONE);
01988       win->Pen(color);
01989       win->Line(mid_x, mid_y, other_x, other_y);
01990     }
01991     ColPartition* lower_part = part->nearest_neighbor_below();
01992     if (lower_part) {
01993       TBOX lower_box = lower_part->bounding_box();
01994       int mid_x = (left_x + right_x) / 2;
01995       int mid_y = (top_y + bottom_y) / 2;
01996       int other_x = (lower_box.left() + lower_box.right()) / 2;
01997       int other_y = (lower_box.top() + lower_box.bottom()) / 2;
01998       win->Brush(ScrollView::NONE);
01999       win->Pen(color);
02000       win->Line(mid_x, mid_y, other_x, other_y);
02001     }
02002   }
02003   win->UpdateWindow();
02004 #endif
02005 }
02006 
02007 
02008 // Write debug image and text file.
02009 // Note: This method is only for debug purpose during development and
02010 // would not be part of checked in code
02011 void TableFinder::WriteToPix(const FCOORD& reskew) {
02012   // Input file must be named test1.tif
02013   PIX* pix = pixRead("test1.tif");
02014   if (!pix) {
02015     tprintf("Input file test1.tif not found.\n");
02016     return;
02017   }
02018   int img_height = pixGetHeight(pix);
02019   int img_width = pixGetWidth(pix);
02020   // Maximum number of text or table partitions
02021   int num_boxes = 10;
02022   BOXA* text_box_array = boxaCreate(num_boxes);
02023   BOXA* table_box_array = boxaCreate(num_boxes);
02024   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
02025     gsearch(&clean_part_grid_);
02026   gsearch.StartFullSearch();
02027   ColPartition* part;
02028   // load colpartitions into text_box_array and table_box_array
02029   while ((part = gsearch.NextFullSearch()) != NULL) {
02030     TBOX box = part->bounding_box();
02031     box.rotate_large(reskew);
02032     BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
02033                               box.right() - box.left(),
02034                               box.top() - box.bottom());
02035     if (part->type() == PT_TABLE)
02036       boxaAddBox(table_box_array, lept_box, L_INSERT);
02037     else
02038       boxaAddBox(text_box_array, lept_box, L_INSERT);
02039   }
02040   // draw colpartitions on the output image
02041   PIX* out = pixDrawBoxa(pix, text_box_array, 3, 0xff000000);
02042   out = pixDrawBoxa(out, table_box_array, 3, 0x0000ff00);
02043 
02044   BOXA* table_array = boxaCreate(num_boxes);
02045   // text file containing detected table bounding boxes
02046   FILE* fptr = fopen("tess-table.txt", "wb");
02047   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
02048       table_search(&table_grid_);
02049   table_search.StartFullSearch();
02050   ColSegment* table;
02051   // load table boxes to table_array and write them to text file as well
02052   while ((table = table_search.NextFullSearch()) != NULL) {
02053     TBOX box = table->bounding_box();
02054     box.rotate_large(reskew);
02055     // Since deskewing introduces negative coordinates, reskewing
02056     // might not completely recover from that since both steps enlarge
02057     // the actual box. Hence a box that undergoes deskewing/reskewing
02058     // may go out of image boundaries. Crop a table box if needed to
02059     // contain it inside the image dimensions.
02060     box = box.intersection(TBOX(0, 0, img_width - 1, img_height - 1));
02061     BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
02062                               box.right() - box.left(),
02063                               box.top() - box.bottom());
02064     boxaAddBox(table_array, lept_box, L_INSERT);
02065     fprintf(fptr, "%d %d %d %d TABLE\n", box.left(),
02066             img_height - box.top(), box.right(), img_height - box.bottom());
02067   }
02068   fclose(fptr);
02069   // paint table boxes on the debug image
02070   out = pixDrawBoxa(out, table_array, 5, 0x7fff0000);
02071 
02072   pixWrite("out.png", out, IFF_PNG);
02073   // memory cleanup
02074   boxaDestroy(&text_box_array);
02075   boxaDestroy(&table_box_array);
02076   boxaDestroy(&table_array);
02077   pixDestroy(&pix);
02078   pixDestroy(&out);
02079 }
02080 
02081 // Merge all colpartitions in table regions to make them a single
02082 // colpartition and revert types of isolated table cells not
02083 // assigned to any table to their original types.
02084 void TableFinder::MakeTableBlocks(ColPartitionGrid* grid,
02085                                   ColPartitionSet** all_columns,
02086                                   WidthCallback* width_cb) {
02087   // Since we have table blocks already, remove table tags from all
02088   // colpartitions
02089   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
02090     gsearch(grid);
02091   gsearch.StartFullSearch();
02092   ColPartition* part = NULL;
02093 
02094   while ((part = gsearch.NextFullSearch()) != NULL) {
02095     if (part->type() == PT_TABLE) {
02096       part->clear_table_type();
02097     }
02098   }
02099   // Now make a single colpartition out of each table block and remove
02100   // all colpartitions contained within a table
02101   GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT>
02102       table_search(&table_grid_);
02103   table_search.StartFullSearch();
02104   ColSegment* table;
02105   while ((table = table_search.NextFullSearch()) != NULL) {
02106     TBOX table_box = table->bounding_box();
02107     // Start a rect search on table_box
02108     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
02109         rectsearch(grid);
02110     rectsearch.StartRectSearch(table_box);
02111     ColPartition* part;
02112     ColPartition* table_partition = NULL;
02113     while ((part = rectsearch.NextRectSearch()) != NULL) {
02114      // Do not consider image partitions
02115       if (!part->IsTextType())
02116         continue;
02117       TBOX part_box = part->bounding_box();
02118       // Include partition in the table if more than half of it
02119       // is covered by the table
02120       if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
02121         rectsearch.RemoveBBox();
02122         if (table_partition) {
02123           table_partition->Absorb(part, width_cb);
02124         } else {
02125           table_partition = part;
02126         }
02127       }
02128     }
02129     // Insert table colpartition back to part_grid_
02130     if (table_partition) {
02131       // To match the columns used when transforming to blocks, the new table
02132       // partition must have its first and last column set at the grid y that
02133       // corresponds to its bottom.
02134       const TBOX& table_box = table_partition->bounding_box();
02135       int grid_x, grid_y;
02136       grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
02137       table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
02138       table_partition->set_table_type();
02139       table_partition->set_blob_type(BRT_TEXT);
02140       table_partition->set_flow(BTFT_CHAIN);
02141       table_partition->SetBlobTypes();
02142       grid->InsertBBox(true, true, table_partition);
02143     }
02144   }
02145 }
02146 
02149 ColSegment::ColSegment()
02150     : ELIST_LINK(),
02151       num_table_cells_(0),
02152       num_text_cells_(0),
02153       type_(COL_UNKNOWN) {
02154 }
02155 ColSegment::~ColSegment() {
02156 }
02157 
02158 // Provides a color for BBGrid to draw the rectangle.
02159 ScrollView::Color  ColSegment::BoxColor() const {
02160   const ScrollView::Color kBoxColors[PT_COUNT] = {
02161     ScrollView::YELLOW,
02162     ScrollView::BLUE,
02163     ScrollView::YELLOW,
02164     ScrollView::MAGENTA,
02165   };
02166   return kBoxColors[type_];
02167 }
02168 
02169 // Insert a box into this column segment
02170 void ColSegment::InsertBox(const TBOX& other) {
02171   bounding_box_ = bounding_box_.bounding_union(other);
02172 }
02173 
02174 // Set column segment type based on the ratio of text and table partitions
02175 // in it.
02176 void ColSegment::set_type() {
02177   if (num_table_cells_ > kTableColumnThreshold * num_text_cells_)
02178     type_ = COL_TABLE;
02179   else if (num_text_cells_ > num_table_cells_)
02180     type_ = COL_TEXT;
02181   else
02182     type_ = COL_MIXED;
02183 }
02184 
02185 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines