tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/textord/tablerecog.cpp
Go to the documentation of this file.
00001 
00002 // File:        tablerecog.cpp
00003 // Description: Helper class to help structure table areas. Given a bounding
00004 //              box from TableFinder, the TableRecognizer should give a
00005 //              StructuredTable (maybe a list in the future) of "good" tables
00006 //              in that area.
00007 // Author:      Nicholas Beato
00008 // Created:     Friday, Aug. 20, 2010
00009 //
00010 // (C) Copyright 2009, Google Inc.
00011 // Licensed under the Apache License, Version 2.0 (the "License");
00012 // you may not use this file except in compliance with the License.
00013 // You may obtain a copy of the License at
00014 // http://www.apache.org/licenses/LICENSE-2.0
00015 // Unless required by applicable law or agreed to in writing, software
00016 // distributed under the License is distributed on an "AS IS" BASIS,
00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00018 // See the License for the specific language governing permissions and
00019 // limitations under the License.
00020 //
00022 
00023 #ifdef HAVE_CONFIG_H
00024 #include "config_auto.h"
00025 #endif
00026 
00027 #include "tablerecog.h"
00028 
00029 namespace tesseract {
00030 
// Gap required between the ColPartitions of two neighbouring columns of a
// non-lined table, expressed as a multiple of the median width.
const double kHorizontalSpacing = 0.30;
// Gap required between the ColPartitions of two neighbouring rows of a
// non-lined table, expressed as a multiple of the height. The value is
// negative, so FindWhitespacedRows shrinks partitions instead of padding them.
const double kVerticalSpacing = -0.2;
// How many cells a grid line is allowed to cut through.
// FindCellSplitLocations explains how the threshold is applied.
const int kCellSplitRowThreshold = 0;
const int kCellSplitColumnThreshold = 0;
// Minimum number of ruling lines for a "lined table". Currently a guess.
const int kLinedTableMinVerticalLines = 3;
const int kLinedTableMinHorizontalLines = 3;
// Required number of columns, as a fraction of the most columns found.
// None of these are tweaked at all.
const double kRequiredColumns = 0.7;
// Tolerance used when comparing the margins of potential tables.
const double kMarginFactor = 1.1;
// The first and last row should have a consistent cell height.
// This factor caps the cell height of the first and last row.
const double kMaxRowSize = 2.5;
// Number of filled columns that makes a table row "strong".
// For small tables this is an absolute number, looked up in the array below.
const double kGoodRowNumberOfColumnsSmall[] = { 2, 2, 2, 2, 2, 3, 3 };
// Element count of the array above minus one, i.e. its last valid index (6).
const int kGoodRowNumberOfColumnsSmallSize =
    sizeof(kGoodRowNumberOfColumnsSmall) /
    sizeof(kGoodRowNumberOfColumnsSmall[0]) - 1;
// For large tables the requirement is relative instead.
const double kGoodRowNumberOfColumnsLarge = 0.7;
// Fraction of a cell's area that ColPartitions must cover for the cell to
// count as "filled".
const double kMinFilledArea = 0.35;

00062 
00066 
00067 StructuredTable::StructuredTable()
00068     : text_grid_(NULL),
00069       line_grid_(NULL),
00070       is_lined_(false),
00071       space_above_(0),
00072       space_below_(0),
00073       space_left_(0),
00074       space_right_(0),
00075       median_cell_height_(0),
00076       median_cell_width_(0),
00077       max_text_height_(MAX_INT32) {
00078 }
00079 
00080 StructuredTable::~StructuredTable() {
00081 }
00082 
00083 void StructuredTable::Init() {
00084 }
00085 
00086 void StructuredTable::set_text_grid(ColPartitionGrid* text_grid) {
00087   text_grid_ = text_grid;
00088 }
00089 void StructuredTable::set_line_grid(ColPartitionGrid* line_grid) {
00090   line_grid_ = line_grid;
00091 }
00092 void StructuredTable::set_max_text_height(int height) {
00093   max_text_height_ = height;
00094 }
00095 bool StructuredTable::is_lined() const {
00096   return is_lined_;
00097 }
00098 int StructuredTable::row_count() const {
00099   return cell_y_.length() == 0 ? 0 : cell_y_.length() - 1;
00100 }
00101 int StructuredTable::column_count() const {
00102   return cell_x_.length() == 0 ? 0 : cell_x_.length() - 1;
00103 }
00104 int StructuredTable::cell_count() const {
00105   return row_count() * column_count();
00106 }
00107 void StructuredTable::set_bounding_box(const TBOX& box) {
00108   bounding_box_ = box;
00109 }
00110 const TBOX& StructuredTable::bounding_box() const {
00111   return bounding_box_;
00112 }
00113 int StructuredTable::median_cell_height() {
00114   return median_cell_height_;
00115 }
00116 int StructuredTable::median_cell_width() {
00117   return median_cell_width_;
00118 }
00119 int StructuredTable::row_height(int row) const {
00120   ASSERT_HOST(0 <= row && row < row_count());
00121   return cell_y_[row + 1] - cell_y_[row];
00122 }
00123 int StructuredTable::column_width(int column) const {
00124   ASSERT_HOST(0 <= column && column < column_count());
00125   return cell_x_[column + 1] - cell_x_[column];
00126 }
00127 int StructuredTable::space_above() const {
00128   return space_above_;
00129 }
00130 int StructuredTable::space_below() const {
00131   return space_below_;
00132 }
00133 
00134 // At this point, we know that the lines are contained
00135 // by the box (by FindLinesBoundingBox).
00136 // So try to find the cell structure and make sure it works out.
00137 // The assumption is that all lines span the table. If this
00138 // assumption fails, the VerifyLinedTable method will
00139 // abort the lined table. The TableRecognizer will fall
00140 // back on FindWhitespacedStructure.
00141 bool StructuredTable::FindLinedStructure() {
00142   ClearStructure();
00143 
00144   // Search for all of the lines in the current box.
00145   // Update the cellular structure with the exact lines.
00146   ColPartitionGridSearch box_search(line_grid_);
00147   box_search.SetUniqueMode(true);
00148   box_search.StartRectSearch(bounding_box_);
00149   ColPartition* line = NULL;
00150 
00151   while ((line = box_search.NextRectSearch()) != NULL) {
00152     if (line->IsHorizontalLine())
00153       cell_y_.push_back(line->MidY());
00154     if (line->IsVerticalLine())
00155       cell_x_.push_back(line->MidX());
00156   }
00157 
00158   // HasSignificantLines should guarantee cells.
00159   // Because that code is a different class, just gracefully
00160   // return false. This could be an assert.
00161   if (cell_x_.length() < 3 || cell_y_.length() < 3)
00162     return false;
00163 
00164   cell_x_.sort();
00165   cell_y_.sort();
00166 
00167   // Remove duplicates that may have occurred due to split lines.
00168   cell_x_.compact_sorted();
00169   cell_y_.compact_sorted();
00170 
00171   // The border should be the extents of line boxes, not middle.
00172   cell_x_[0] = bounding_box_.left();
00173   cell_x_[cell_x_.length() - 1] = bounding_box_.right();
00174   cell_y_[0] = bounding_box_.bottom();
00175   cell_y_[cell_y_.length() - 1] = bounding_box_.top();
00176 
00177   // Remove duplicates that may have occurred due to moving the borders.
00178   cell_x_.compact_sorted();
00179   cell_y_.compact_sorted();
00180 
00181   CalculateMargins();
00182   CalculateStats();
00183   is_lined_ = VerifyLinedTableCells();
00184   return is_lined_;
00185 }
00186 
00187 // Finds the cellular structure given a particular box.
00188 bool StructuredTable::FindWhitespacedStructure() {
00189   ClearStructure();
00190   FindWhitespacedColumns();
00191   FindWhitespacedRows();
00192 
00193   if (!VerifyWhitespacedTable()) {
00194     return false;
00195   } else {
00196     bounding_box_.set_left(cell_x_[0]);
00197     bounding_box_.set_right(cell_x_[cell_x_.length() - 1]);
00198     bounding_box_.set_bottom(cell_y_[0]);
00199     bounding_box_.set_top(cell_y_[cell_y_.length() - 1]);
00200     AbsorbNearbyLines();
00201     CalculateMargins();
00202     CalculateStats();
00203     return true;
00204   }
00205 }
00206 
00207 // Tests if a partition fits inside the table structure.
00208 // Partitions must fully span a grid line in order to intersect it.
00209 // This means that a partition does not intersect a line
00210 // that it "just" touches. This is mainly because the assumption
00211 // throughout the code is that "0" distance is a very very small space.
00212 bool StructuredTable::DoesPartitionFit(const ColPartition& part) const {
00213   const TBOX& box = part.bounding_box();
00214   for (int i = 0; i < cell_x_.length(); ++i)
00215     if (box.left() < cell_x_[i] && cell_x_[i] < box.right())
00216       return false;
00217   for (int i = 0; i < cell_y_.length(); ++i)
00218     if (box.bottom() < cell_y_[i] && cell_y_[i] < box.top())
00219       return false;
00220   return true;
00221 }
00222 
00223 // Checks if a sub-table has multiple data cells filled.
00224 int StructuredTable::CountFilledCells() {
00225   return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
00226 }
00227 int StructuredTable::CountFilledCellsInRow(int row) {
00228   return CountFilledCells(row, row, 0, column_count() - 1);
00229 }
00230 int StructuredTable::CountFilledCellsInColumn(int column) {
00231   return CountFilledCells(0, row_count() - 1, column, column);
00232 }
00233 int StructuredTable::CountFilledCells(int row_start, int row_end,
00234                             int column_start, int column_end) {
00235   ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count());
00236   ASSERT_HOST(0 <= column_start && column_start <= column_end &&
00237               column_end < column_count());
00238   int cell_count = 0;
00239   TBOX cell_box;
00240   for (int row = row_start; row <= row_end; ++row) {
00241     cell_box.set_bottom(cell_y_[row]);
00242     cell_box.set_top(cell_y_[row + 1]);
00243     for (int col = column_start; col <= column_end; ++col) {
00244       cell_box.set_left(cell_x_[col]);
00245       cell_box.set_right(cell_x_[col + 1]);
00246       if (CountPartitions(cell_box) > 0)
00247         ++cell_count;
00248     }
00249   }
00250   return cell_count;
00251 }
00252 
00253 // Makes sure that at least one cell in a row has substantial area filled.
00254 // This can filter out large whitespace caused by growing tables too far
00255 // and page numbers.
00256 bool StructuredTable::VerifyRowFilled(int row) {
00257   for (int i = 0; i < column_count(); ++i) {
00258     double area_filled = CalculateCellFilledPercentage(row, i);
00259     if (area_filled >= kMinFilledArea)
00260       return true;
00261   }
00262   return false;
00263 }
00264 
00265 // Finds the filled area in a cell.
00266 // Assume ColPartitions do not overlap for simplicity (even though they do).
00267 double StructuredTable::CalculateCellFilledPercentage(int row, int column) {
00268   ASSERT_HOST(0 <= row && row <= row_count());
00269   ASSERT_HOST(0 <= column && column <= column_count());
00270   const TBOX kCellBox(cell_x_[column], cell_y_[row],
00271                       cell_x_[column + 1], cell_y_[row + 1]);
00272   ASSERT_HOST(!kCellBox.null_box());
00273 
00274   ColPartitionGridSearch gsearch(text_grid_);
00275   gsearch.SetUniqueMode(true);
00276   gsearch.StartRectSearch(kCellBox);
00277   double area_covered = 0;
00278   ColPartition* text = NULL;
00279   while ((text = gsearch.NextRectSearch()) != NULL) {
00280     if (text->IsTextType())
00281       area_covered += text->bounding_box().intersection(kCellBox).area();
00282   }
00283   return MIN(1.0, area_covered / kCellBox.area());
00284 }
00285 
00286 void StructuredTable::Display(ScrollView* window, ScrollView::Color color) {
00287 #ifndef GRAPHICS_DISABLED
00288   window->Brush(ScrollView::NONE);
00289   window->Pen(color);
00290   window->Rectangle(bounding_box_.left(), bounding_box_.bottom(),
00291                     bounding_box_.right(), bounding_box_.top());
00292   for (int i = 0; i < cell_x_.length(); i++) {
00293     window->Line(cell_x_[i], bounding_box_.bottom(),
00294                  cell_x_[i], bounding_box_.top());
00295   }
00296   for (int i = 0; i < cell_y_.length(); i++) {
00297     window->Line(bounding_box_.left(), cell_y_[i],
00298                  bounding_box_.right(), cell_y_[i]);
00299   }
00300   window->UpdateWindow();
00301 #endif
00302 }
00303 
00304 // Clear structure information.
00305 void StructuredTable::ClearStructure() {
00306   cell_x_.clear();
00307   cell_y_.clear();
00308   is_lined_ = false;
00309   space_above_ = 0;
00310   space_below_ = 0;
00311   space_left_ = 0;
00312   space_right_ = 0;
00313   median_cell_height_ = 0;
00314   median_cell_width_ = 0;
00315 }
00316 
00317 // When a table has lines, the lines should not intersect any partitions.
00318 // The following function makes sure the previous assumption is met.
00319 bool StructuredTable::VerifyLinedTableCells() {
00320   // Function only called when lines exist.
00321   ASSERT_HOST(cell_y_.length() >= 2 && cell_x_.length() >= 2);
00322   for (int i = 0; i < cell_y_.length(); ++i) {
00323     if (CountHorizontalIntersections(cell_y_[i]) > 0)
00324       return false;
00325   }
00326   for (int i = 0; i < cell_x_.length(); ++i) {
00327     if (CountVerticalIntersections(cell_x_[i]) > 0)
00328       return false;
00329   }
00330   return true;
00331 }
00332 
00333 // TODO(nbeato): Could be much better than this.
00334 // Examples:
00335 //   - Caclulate the percentage of filled cells.
00336 //   - Calculate the average number of ColPartitions per cell.
00337 //   - Calculate the number of cells per row with partitions.
00338 //   - Check if ColPartitions in adjacent cells are similar.
00339 //   - Check that all columns are at least a certain width.
00340 //   - etc.
00341 bool StructuredTable::VerifyWhitespacedTable() {
00342   // criteria for a table, must be at least 2x3 or 3x2
00343   return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
00344 }
00345 
00346 // Finds vertical splits in the ColPartitions of text_grid_ by considering
00347 // all possible "good" guesses. A good guess is just the left/right sides of
00348 // the partitions, since these locations will uniquely define where the
00349 // extremal values where the splits can occur. The split happens
00350 // in the middle of the two nearest partitions.
00351 void StructuredTable::FindWhitespacedColumns() {
00352   // Set of the extents of all partitions on the page.
00353   GenericVectorEqEq<int> left_sides;
00354   GenericVectorEqEq<int> right_sides;
00355 
00356   // Look at each text partition. We want to find the partitions
00357   // that have extremal left/right sides. These will give us a basis
00358   // for the table columns.
00359   ColPartitionGridSearch gsearch(text_grid_);
00360   gsearch.SetUniqueMode(true);
00361   gsearch.StartRectSearch(bounding_box_);
00362   ColPartition* text = NULL;
00363   while ((text = gsearch.NextRectSearch()) != NULL) {
00364     if (!text->IsTextType())
00365       continue;
00366 
00367     ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
00368     int spacing = static_cast<int>(text->median_width() *
00369                                    kHorizontalSpacing / 2.0 + 0.5);
00370     left_sides.push_back(text->bounding_box().left() - spacing);
00371     right_sides.push_back(text->bounding_box().right() + spacing);
00372   }
00373   // It causes disaster below, so avoid it!
00374   if (left_sides.length() == 0 || right_sides.length() == 0)
00375     return;
00376 
00377   // Since data may be inserted in grid order, we sort the left/right sides.
00378   left_sides.sort();
00379   right_sides.sort();
00380 
00381   // At this point, in the "merged list", we expect to have a left side,
00382   // followed by either more left sides or a right side. The last number
00383   // should be a right side. We find places where the splits occur by looking
00384   // for "valleys". If we want to force gap sizes or allow overlap, change
00385   // the spacing above. If you want to let lines "slice" partitions as long
00386   // as it is infrequent, change the following function.
00387   FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold,
00388                          &cell_x_);
00389 }
00390 
00391 // Finds horizontal splits in the ColPartitions of text_grid_ by considering
00392 // all possible "good" guesses. A good guess is just the bottom/top sides of
00393 // the partitions, since these locations will uniquely define where the
00394 // extremal values where the splits can occur. The split happens
00395 // in the middle of the two nearest partitions.
00396 void StructuredTable::FindWhitespacedRows() {
00397   // Set of the extents of all partitions on the page.
00398   GenericVectorEqEq<int> bottom_sides;
00399   GenericVectorEqEq<int> top_sides;
00400   // We will be "shrinking" partitions, so keep the min/max around to
00401   // make sure the bottom/top lines do not intersect text.
00402   int min_bottom = MAX_INT32;
00403   int max_top = MIN_INT32;
00404 
00405   // Look at each text partition. We want to find the partitions
00406   // that have extremal bottom/top sides. These will give us a basis
00407   // for the table rows. Because the textlines can be skewed and close due
00408   // to warping, the height of the partitions is toned down a little bit.
00409   ColPartitionGridSearch gsearch(text_grid_);
00410   gsearch.SetUniqueMode(true);
00411   gsearch.StartRectSearch(bounding_box_);
00412   ColPartition* text = NULL;
00413   while ((text = gsearch.NextRectSearch()) != NULL) {
00414     if (!text->IsTextType())
00415       continue;
00416 
00417     ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
00418     min_bottom = MIN(min_bottom, text->bounding_box().bottom());
00419     max_top = MAX(max_top, text->bounding_box().top());
00420 
00421     // Ignore "tall" text partitions, as these are usually false positive
00422     // vertical text or multiple lines pulled together.
00423     if (text->bounding_box().height() > max_text_height_)
00424       continue;
00425 
00426     int spacing = static_cast<int>(text->bounding_box().height() *
00427                                    kVerticalSpacing / 2.0 + 0.5);
00428     int bottom = text->bounding_box().bottom() - spacing;
00429     int top = text->bounding_box().top() + spacing;
00430     // For horizontal text, the factor can be negative. This should
00431     // probably cause a warning or failure. I haven't actually checked if
00432     // it happens.
00433     if (bottom >= top)
00434       continue;
00435 
00436     bottom_sides.push_back(bottom);
00437     top_sides.push_back(top);
00438   }
00439   // It causes disaster below, so avoid it!
00440   if (bottom_sides.length() == 0 || top_sides.length() == 0)
00441     return;
00442 
00443   // Since data may be inserted in grid order, we sort the bottom/top sides.
00444   bottom_sides.sort();
00445   top_sides.sort();
00446 
00447   // At this point, in the "merged list", we expect to have a bottom side,
00448   // followed by either more bottom sides or a top side. The last number
00449   // should be a top side. We find places where the splits occur by looking
00450   // for "valleys". If we want to force gap sizes or allow overlap, change
00451   // the spacing above. If you want to let lines "slice" partitions as long
00452   // as it is infrequent, change the following function.
00453   FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold,
00454                          &cell_y_);
00455 
00456   // Recover the min/max correctly since it was shifted.
00457   cell_y_[0] = min_bottom;
00458   cell_y_[cell_y_.length() - 1] = max_top;
00459 }
00460 
00461 void StructuredTable::CalculateMargins() {
00462   space_above_ = MAX_INT32;
00463   space_below_ = MAX_INT32;
00464   space_right_ = MAX_INT32;
00465   space_left_ = MAX_INT32;
00466   UpdateMargins(text_grid_);
00467   UpdateMargins(line_grid_);
00468 }
00469 // Finds the nearest partition in grid to the table
00470 // boundaries and updates the margin.
00471 void StructuredTable::UpdateMargins(ColPartitionGrid* grid) {
00472   int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
00473   space_below_ = MIN(space_below_, below);
00474   int above = FindVerticalMargin(grid, bounding_box_.top(), false);
00475   space_above_ = MIN(space_above_, above);
00476   int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
00477   space_left_ = MIN(space_left_, left);
00478   int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
00479   space_right_ = MIN(space_right_, right);
00480 }
00481 int StructuredTable::FindVerticalMargin(ColPartitionGrid* grid, int border,
00482                                         bool decrease) const {
00483   ColPartitionGridSearch gsearch(grid);
00484   gsearch.SetUniqueMode(true);
00485   gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
00486                               border);
00487   ColPartition* part = NULL;
00488   while ((part = gsearch.NextVerticalSearch(decrease)) != NULL) {
00489     if (!part->IsTextType() && !part->IsHorizontalLine())
00490       continue;
00491     int distance = decrease ? border - part->bounding_box().top()
00492                             : part->bounding_box().bottom() - border;
00493     if (distance >= 0)
00494       return distance;
00495   }
00496   return MAX_INT32;
00497 }
00498 int StructuredTable::FindHorizontalMargin(ColPartitionGrid* grid, int border,
00499                                           bool decrease) const {
00500   ColPartitionGridSearch gsearch(grid);
00501   gsearch.SetUniqueMode(true);
00502   gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top());
00503   ColPartition* part = NULL;
00504   while ((part = gsearch.NextSideSearch(decrease)) != NULL) {
00505     if (!part->IsTextType() && !part->IsVerticalLine())
00506       continue;
00507     int distance = decrease ? border - part->bounding_box().right()
00508                             : part->bounding_box().left() - border;
00509     if (distance >= 0)
00510       return distance;
00511   }
00512   return MAX_INT32;
00513 }
00514 
00515 void StructuredTable::CalculateStats() {
00516   const int kMaxCellHeight = 1000;
00517   const int kMaxCellWidth = 1000;
00518   STATS height_stats(0, kMaxCellHeight + 1);
00519   STATS width_stats(0, kMaxCellWidth + 1);
00520 
00521   for (int i = 0; i < row_count(); ++i)
00522     height_stats.add(row_height(i), column_count());
00523   for (int i = 0; i < column_count(); ++i)
00524     width_stats.add(column_width(i), row_count());
00525 
00526   median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
00527   median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
00528 }
00529 
00530 // Looks for grid lines near the current bounding box and
00531 // grows the bounding box to include them if no intersections
00532 // will occur as a result. This is necessary because the margins
00533 // are calculated relative to the closest line/text. If the
00534 // line isn't absorbed, the margin will be the distance to the line.
00535 void StructuredTable::AbsorbNearbyLines() {
00536   ColPartitionGridSearch gsearch(line_grid_);
00537   gsearch.SetUniqueMode(true);
00538 
00539   // Is the closest line above good? Loop multiple times for tables with
00540   // multi-line (sometimes 2) borders. Limit the number of lines by
00541   // making sure they stay within a table cell or so.
00542   ColPartition* line = NULL;
00543   gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
00544                               bounding_box_.top());
00545   while ((line = gsearch.NextVerticalSearch(false)) != NULL) {
00546     if (!line->IsHorizontalLine())
00547       break;
00548     TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1,
00549                      bounding_box_.right(), line->MidY());
00550     if (text_search.height() > median_cell_height_ * 2)
00551       break;
00552     if (CountPartitions(text_search) > 0)
00553       break;
00554     bounding_box_.set_top(line->MidY());
00555   }
00556   // As above, is the closest line below good?
00557   line = NULL;
00558   gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
00559                               bounding_box_.bottom());
00560   while ((line = gsearch.NextVerticalSearch(true)) != NULL) {
00561     if (!line->IsHorizontalLine())
00562       break;
00563     TBOX text_search(bounding_box_.left(), line->MidY(),
00564                      bounding_box_.right(), bounding_box_.bottom() - 1);
00565     if (text_search.height() > median_cell_height_ * 2)
00566       break;
00567     if (CountPartitions(text_search) > 0)
00568       break;
00569     bounding_box_.set_bottom(line->MidY());
00570   }
00571   // TODO(nbeato): vertical lines
00572 }
00573 
00574 
00575 // This function will find all "0 valleys" (of any length) given two
00576 // arrays. The arrays are the mins and maxes of partitions (either
00577 // left and right or bottom and top). Since the min/max lists are generated
00578 // with pairs of increasing integers, we can make some assumptions in
00579 // the function about ordering of the overall list, which are shown in the
00580 // asserts.
00581 // The algorithm works as follows:
00582 //   While there are numbers to process, take the smallest number.
00583 //     If it is from the min_list, increment the "hill" counter.
00584 //     Otherwise, decrement the "hill" counter.
00585 //     In the process of doing this, keep track of "crossing" the
00586 //     desired height.
00587 // The first/last items are extremal values of the list and known.
00588 // NOTE: This function assumes the lists are sorted!
00589 void StructuredTable::FindCellSplitLocations(const GenericVector<int>& min_list,
00590                                              const GenericVector<int>& max_list,
00591                                              int max_merged,
00592                                              GenericVector<int>* locations) {
00593   locations->clear();
00594   ASSERT_HOST(min_list.length() == max_list.length());
00595   if (min_list.length() == 0)
00596     return;
00597   ASSERT_HOST(min_list.get(0) < max_list.get(0));
00598   ASSERT_HOST(min_list.get(min_list.length() - 1) <
00599               max_list.get(max_list.length() - 1));
00600 
00601   locations->push_back(min_list.get(0));
00602   int min_index = 0;
00603   int max_index = 0;
00604   int stacked_partitions = 0;
00605   int last_cross_position = MAX_INT32;
00606   // max_index will expire after min_index.
00607   // However, we can't "increase" the hill size if min_index expired.
00608   // So finish processing when min_index expires.
00609   while (min_index < min_list.length()) {
00610     // Increase the hill count.
00611     if (min_list[min_index] < max_list[max_index]) {
00612       ++stacked_partitions;
00613       if (last_cross_position != MAX_INT32 &&
00614           stacked_partitions > max_merged) {
00615         int mid = (last_cross_position + min_list[min_index]) / 2;
00616         locations->push_back(mid);
00617         last_cross_position = MAX_INT32;
00618       }
00619       ++min_index;
00620     } else {
00621       // Decrease the hill count.
00622       --stacked_partitions;
00623       if (last_cross_position == MAX_INT32 &&
00624           stacked_partitions <= max_merged) {
00625         last_cross_position = max_list[max_index];
00626       }
00627       ++max_index;
00628     }
00629   }
00630   locations->push_back(max_list.get(max_list.length() - 1));
00631 }
00632 
00633 // Counts the number of partitions in the table
00634 // box that intersection the given x value.
00635 int StructuredTable::CountVerticalIntersections(int x) {
00636   int count = 0;
00637   // Make a small box to keep the search time down.
00638   const int kGridSize = text_grid_->gridsize();
00639   TBOX vertical_box = bounding_box_;
00640   vertical_box.set_left(x - kGridSize);
00641   vertical_box.set_right(x + kGridSize);
00642 
00643   ColPartitionGridSearch gsearch(text_grid_);
00644   gsearch.SetUniqueMode(true);
00645   gsearch.StartRectSearch(vertical_box);
00646   ColPartition* text = NULL;
00647   while ((text = gsearch.NextRectSearch()) != NULL) {
00648     if (!text->IsTextType())
00649       continue;
00650     const TBOX& box = text->bounding_box();
00651     if (box.left() < x && x < box.right())
00652       ++count;
00653   }
00654   return count;
00655 }
00656 
00657 // Counts the number of partitions in the table
00658 // box that intersection the given y value.
00659 int StructuredTable::CountHorizontalIntersections(int y) {
00660   int count = 0;
00661   // Make a small box to keep the search time down.
00662   const int kGridSize = text_grid_->gridsize();
00663   TBOX horizontal_box = bounding_box_;
00664   horizontal_box.set_bottom(y - kGridSize);
00665   horizontal_box.set_top(y + kGridSize);
00666 
00667   ColPartitionGridSearch gsearch(text_grid_);
00668   gsearch.SetUniqueMode(true);
00669   gsearch.StartRectSearch(horizontal_box);
00670   ColPartition* text = NULL;
00671   while ((text = gsearch.NextRectSearch()) != NULL) {
00672     if (!text->IsTextType())
00673       continue;
00674 
00675     const TBOX& box = text->bounding_box();
00676     if (box.bottom() < y && y < box.top())
00677       ++count;
00678   }
00679   return count;
00680 }
00681 
00682 // Counts how many text partitions are in this box.
00683 // This is used to count partitons in cells, as that can indicate
00684 // how "strong" a potential table row/colum (or even full table) actually is.
00685 int StructuredTable::CountPartitions(const TBOX& box) {
00686   ColPartitionGridSearch gsearch(text_grid_);
00687   gsearch.SetUniqueMode(true);
00688   gsearch.StartRectSearch(box);
00689   int count = 0;
00690   ColPartition* text = NULL;
00691   while ((text = gsearch.NextRectSearch()) != NULL) {
00692     if (text->IsTextType())
00693       ++count;
00694   }
00695   return count;
00696 }
00697 
00701 
00702 TableRecognizer::TableRecognizer()
00703     : text_grid_(NULL),
00704       line_grid_(NULL),
00705       min_height_(0),
00706       min_width_(0),
00707       max_text_height_(MAX_INT32) {
00708 }
00709 
00710 TableRecognizer::~TableRecognizer() {
00711 }
00712 
00713 void TableRecognizer::Init() {
00714 }
00715 
00716 void TableRecognizer::set_text_grid(ColPartitionGrid* text_grid) {
00717   text_grid_ = text_grid;
00718 }
00719 void TableRecognizer::set_line_grid(ColPartitionGrid* line_grid) {
00720   line_grid_ = line_grid;
00721 }
00722 void TableRecognizer::set_min_height(int height) {
00723   min_height_ = height;
00724 }
00725 void TableRecognizer::set_min_width(int width) {
00726   min_width_ = width;
00727 }
00728 void TableRecognizer::set_max_text_height(int height) {
00729   max_text_height_ = height;
00730 }
00731 
00732 StructuredTable* TableRecognizer::RecognizeTable(const TBOX& guess) {
00733   StructuredTable* table = new StructuredTable();
00734   table->Init();
00735   table->set_text_grid(text_grid_);
00736   table->set_line_grid(line_grid_);
00737   table->set_max_text_height(max_text_height_);
00738 
00739   // Try to solve ths simple case, a table with *both*
00740   // vertical and horizontal lines.
00741   if (RecognizeLinedTable(guess, table))
00742     return table;
00743 
00744   // Fallback to whitespace if that failed.
00745   // TODO(nbeato): Break this apart to take advantage of horizontal
00746   // lines or vertical lines when present.
00747   if (RecognizeWhitespacedTable(guess, table))
00748     return table;
00749 
00750   // No table found...
00751   delete table;
00752   return NULL;
00753 }
00754 
00755 bool TableRecognizer::RecognizeLinedTable(const TBOX& guess_box,
00756                                           StructuredTable* table) {
00757   if (!HasSignificantLines(guess_box))
00758     return false;
00759   TBOX line_bound = guess_box;
00760   if (!FindLinesBoundingBox(&line_bound))
00761     return false;
00762   table->set_bounding_box(line_bound);
00763   return table->FindLinedStructure();
00764 }
00765 
00766 // Quick implementation. Just count the number of lines in the box.
00767 // A better implementation would counter intersections and look for connected
00768 // components. It could even go as far as finding similar length lines.
00769 // To account for these possible issues, the VerifyLinedTableCells function
00770 // will reject lined tables that cause intersections with text on the page.
00771 // TODO(nbeato): look for "better" lines
00772 bool TableRecognizer::HasSignificantLines(const TBOX& guess) {
00773   ColPartitionGridSearch box_search(line_grid_);
00774   box_search.SetUniqueMode(true);
00775   box_search.StartRectSearch(guess);
00776   ColPartition* line = NULL;
00777   int vertical_count = 0;
00778   int horizontal_count = 0;
00779 
00780   while ((line = box_search.NextRectSearch()) != NULL) {
00781     if (line->IsHorizontalLine())
00782       ++horizontal_count;
00783     if (line->IsVerticalLine())
00784       ++vertical_count;
00785   }
00786 
00787   return vertical_count >= kLinedTableMinVerticalLines &&
00788          horizontal_count >= kLinedTableMinHorizontalLines;
00789 }
00790 
00791 // Given a bounding box with a bunch of horizontal / vertical lines,
00792 // we just find the extents of all of these lines iteratively.
00793 // The box will be at least as large as guess. This
00794 // could possibly be a bad assumption.
00795 // It is guaranteed to halt in at least O(n * gridarea) where n
00796 // is the number of lines.
00797 // The assumption is that growing the box iteratively will add lines
00798 // several times, but eventually we'll find the extents.
00799 //
00800 // For tables, the approach is a bit aggressive, a single line (which could be
00801 // noise or a column ruling) can destroy the table inside.
00802 //
00803 // TODO(nbeato): This is a quick first implementation.
00804 // A better implementation would actually look for consistency
00805 // in extents of the lines and find the extents using lines
00806 // that clearly describe the table. This would allow the
00807 // lines to "vote" for height/width. An approach like
00808 // this would solve issues with page layout rulings.
00809 // I haven't looked for these issues yet, so I can't even
00810 // say they happen confidently.
00811 bool TableRecognizer::FindLinesBoundingBox(TBOX* bounding_box) {
00812   // The first iteration will tell us if there are lines
00813   // present and shrink the box to a minimal iterative size.
00814   if (!FindLinesBoundingBoxIteration(bounding_box))
00815     return false;
00816 
00817   // Keep growing until the area of the table stabilizes.
00818   // The box can only get bigger, increasing area.
00819   bool changed = true;
00820   while (changed) {
00821     changed = false;
00822     int old_area = bounding_box->area();
00823     bool check = FindLinesBoundingBoxIteration(bounding_box);
00824     // At this point, the function will return true.
00825     ASSERT_HOST(check);
00826     ASSERT_HOST(bounding_box->area() >= old_area);
00827     changed = (bounding_box->area() > old_area);
00828   }
00829 
00830   return true;
00831 }
00832 
00833 bool TableRecognizer::FindLinesBoundingBoxIteration(TBOX* bounding_box) {
00834   // Search for all of the lines in the current box, keeping track of extents.
00835   ColPartitionGridSearch box_search(line_grid_);
00836   box_search.SetUniqueMode(true);
00837   box_search.StartRectSearch(*bounding_box);
00838   ColPartition* line = NULL;
00839   bool first_line = true;
00840 
00841   while ((line = box_search.NextRectSearch()) != NULL) {
00842     if (line->IsLineType()) {
00843       if (first_line) {
00844         // The first iteration can shrink the box.
00845         *bounding_box = line->bounding_box();
00846         first_line = false;
00847       } else {
00848         *bounding_box += line->bounding_box();
00849       }
00850     }
00851   }
00852   return !first_line;
00853 }
00854 
00855 // The goal of this function is to move the table boundaries around and find
00856 // a table that maximizes the whitespace around the table while maximizing
00857 // the cellular structure. As a result, it gets confused by headers, footers,
00858 // and merged columns (text that crosses columns). There is a tolerance
00859 // that allows a few partitions to count towards potential cell merges.
00860 // It's the max_merged parameter to FindPartitionLocations.
00861 // It can work, but it needs some false positive remove on boundaries.
00862 // For now, the grid structure must not intersect any partitions.
00863 // Also, small tolerance is added to the horizontal lines for tightly packed
00864 // tables. The tolerance is added by adjusting the bounding boxes of the
00865 // partitions (in FindHorizontalPartitions). The current implementation
00866 // only adjusts the vertical extents of the table.
00867 //
00868 // Also note. This was hacked at a lot. It could probably use some
00869 // more hacking at to find a good set of border conditions and then a
00870 // nice clean up.
00871 bool TableRecognizer::RecognizeWhitespacedTable(const TBOX& guess_box,
00872                                                 StructuredTable* table) {
00873   TBOX best_box = guess_box;  // Best borders known.
00874   int best_below = 0;         // Margin size above best table.
00875   int best_above = 0;         // Margin size below best table.
00876   TBOX adjusted = guess_box;  // The search box.
00877 
00878   // We assume that the guess box is somewhat accurate, so we don't allow
00879   // the adjusted border to pass half of the guessed area. This prevents
00880   // "negative" tables from forming.
00881   const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
00882   // Keeps track of the most columns in an accepted table. The resulting table
00883   // may be less than the max, but we don't want to stray too far.
00884   int best_cols = 0;
00885   // Make sure we find a good border.
00886   bool found_good_border = false;
00887 
00888   // Find the bottom of the table by trying a few different locations. For
00889   // each location, the top, left, and right are fixed. We start the search
00890   // in a smaller table to favor best_cols getting a good estimate sooner.
00891   int last_bottom = MAX_INT32;
00892   int bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00893                                    kMidGuessY - min_height_ / 2, true);
00894   int top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00895                                 kMidGuessY + min_height_ / 2, false);
00896   adjusted.set_top(top);
00897 
00898   // Headers/footers can be spaced far from everything.
00899   // Make sure that the space below is greater than the space above
00900   // the lowest row.
00901   int previous_below = 0;
00902   const int kMaxChances = 10;
00903   int chances = kMaxChances;
00904   while (bottom != last_bottom) {
00905     adjusted.set_bottom(bottom);
00906 
00907     if (adjusted.height() >= min_height_) {
00908       // Try to fit the grid on the current box. We give it a chance
00909       // if the number of columns didn't significantly drop.
00910       table->set_bounding_box(adjusted);
00911       if (table->FindWhitespacedStructure() &&
00912           table->column_count() >= best_cols * kRequiredColumns) {
00913         if (false && IsWeakTableRow(table, 0)) {
00914           // Currently buggy, but was looking promising so disabled.
00915           --chances;
00916         } else {
00917           // We favor 2 things,
00918           //   1- Adding rows that have partitioned data.
00919           //   2- Better margins (to find header/footer).
00920           // For better tables, we just look for multiple cells in the
00921           // bottom row with data in them.
00922           // For margins, the space below the last row should
00923           // be better than a table with the last row removed.
00924           chances = kMaxChances;
00925           double max_row_height = kMaxRowSize * table->median_cell_height();
00926           if ((table->space_below() * kMarginFactor >= best_below &&
00927                table->space_below() >= previous_below) ||
00928               (table->CountFilledCellsInRow(0) > 1 &&
00929                table->row_height(0) < max_row_height)) {
00930             best_box.set_bottom(bottom);
00931             best_below = table->space_below();
00932             best_cols = MAX(table->column_count(), best_cols);
00933             found_good_border = true;
00934           }
00935         }
00936         previous_below = table->space_below();
00937       } else {
00938        --chances;
00939       }
00940     }
00941     if (chances <= 0)
00942       break;
00943 
00944     last_bottom = bottom;
00945     bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00946                                  last_bottom, true);
00947   }
00948   if (!found_good_border)
00949     return false;
00950 
00951   // TODO(nbeato) comments: follow modified code above... put it in a function!
00952   found_good_border = false;
00953   int last_top = MIN_INT32;
00954   top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00955                             kMidGuessY + min_height_ / 2, false);
00956   int previous_above = 0;
00957   chances = kMaxChances;
00958 
00959   adjusted.set_bottom(best_box.bottom());
00960   while (last_top != top) {
00961     adjusted.set_top(top);
00962     if (adjusted.height() >= min_height_) {
00963       table->set_bounding_box(adjusted);
00964       if (table->FindWhitespacedStructure() &&
00965           table->column_count() >= best_cols * kRequiredColumns) {
00966         int last_row = table->row_count() - 1;
00967         if (false && IsWeakTableRow(table, last_row)) {
00968           // Currently buggy, but was looking promising so disabled.
00969           --chances;
00970         } else {
00971           chances = kMaxChances;
00972           double max_row_height = kMaxRowSize * table->median_cell_height();
00973           if ((table->space_above() * kMarginFactor >= best_above &&
00974                table->space_above() >= previous_above) ||
00975               (table->CountFilledCellsInRow(last_row) > 1 &&
00976                table->row_height(last_row) < max_row_height)) {
00977             best_box.set_top(top);
00978             best_above = table->space_above();
00979             best_cols = MAX(table->column_count(), best_cols);
00980             found_good_border = true;
00981           }
00982         }
00983         previous_above = table->space_above();
00984       } else {
00985        --chances;
00986       }
00987     }
00988     if (chances <= 0)
00989       break;
00990 
00991     last_top = top;
00992     top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
00993                               last_top, false);
00994   }
00995 
00996   if (!found_good_border)
00997     return false;
00998 
00999   // If we get here, this shouldn't happen. It can be an assert, but
01000   // I haven't tested it enough to make it crash things.
01001   if (best_box.null_box())
01002     return false;
01003 
01004   // Given the best locations, fit the box to those locations.
01005   table->set_bounding_box(best_box);
01006   return table->FindWhitespacedStructure();
01007 }
01008 
01009 // Finds the closest value to y that can safely cause a horizontal
01010 // split in the partitions.
01011 // This function has been buggy and not as reliable as I would've
01012 // liked. I suggest finding all of the splits using the
01013 // FindPartitionLocations once and then just keeping the results
01014 // of that function cached somewhere.
01015 int TableRecognizer::NextHorizontalSplit(int left, int right, int y,
01016                                          bool top_to_bottom) {
01017   ColPartitionGridSearch gsearch(text_grid_);
01018   gsearch.SetUniqueMode(true);
01019   gsearch.StartVerticalSearch(left, right, y);
01020   ColPartition* text = NULL;
01021   int last_y = y;
01022   while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != NULL) {
01023     if (!text->IsTextType() || !text->IsHorizontalType())
01024       continue;
01025     if (text->bounding_box().height() > max_text_height_)
01026       continue;
01027 
01028     const TBOX& text_box = text->bounding_box();
01029     if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) {
01030       last_y = MIN(last_y, text_box.bottom());
01031       continue;
01032     }
01033     if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) {
01034       last_y = MAX(last_y, text_box.top());
01035       continue;
01036     }
01037 
01038     return last_y;
01039   }
01040   // If none is found, we at least want to preserve the min/max,
01041   // which defines the overlap of y with the last partition in the grid.
01042   return last_y;
01043 }
01044 
01045 // Code is buggy right now. It is disabled in the calling function.
01046 // It seems like sometimes the row that is passed in is not correct
01047 // sometimes (like a phantom row is introduced). There's something going
01048 // on in the cell_y_ data member before this is called... not certain.
01049 bool TableRecognizer::IsWeakTableRow(StructuredTable* table, int row) {
01050   if (!table->VerifyRowFilled(row))
01051     return false;
01052 
01053   double threshold = 0.0;
01054   if (table->column_count() > kGoodRowNumberOfColumnsSmallSize)
01055     threshold = table->column_count() * kGoodRowNumberOfColumnsLarge;
01056   else
01057     threshold = kGoodRowNumberOfColumnsSmall[table->column_count()];
01058 
01059   return table->CountFilledCellsInRow(row) < threshold;
01060 }
01061 
01062 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines