tesseract
3.03
|
// File: tablerecog.cpp
// Description: Helper class to help structure table areas. Given a bounding
// box from TableFinder, the TableRecognizer should give a
// StructuredTable (maybe a list in the future) of "good" tables
// in that area.
// Author: Nicholas Beato
// Created: Friday, Aug. 20, 2010
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif

#include "tablerecog.h"

namespace tesseract {

// The amount of space required between the ColPartitions in 2 columns
// of a non-lined table as a multiple of the median width.
// Half of this amount is added to each side of every text partition
// before the column splits are searched.
const double kHorizontalSpacing = 0.30;
// The amount of space required between the ColPartitions in 2 rows
// of a non-lined table as multiples of the median height.
// The value is negative, so partitions are shrunk vertically (rather than
// grown) before the row splits are searched.
const double kVerticalSpacing = -0.2;
// The number of cells that the grid lines may intersect.
// See FindCellSplitLocations for explanation; these are passed as its
// max_merged argument for rows and columns respectively.
const int kCellSplitRowThreshold = 0;
const int kCellSplitColumnThreshold = 0;
// For "lined tables", the number of required lines. Currently a guess.
const int kLinedTableMinVerticalLines = 3;
const int kLinedTableMinHorizontalLines = 3;
// Number of columns required, as a fraction of the most columns found.
00045 // None of these are tweaked at all. 00046 const double kRequiredColumns = 0.7; 00047 // The tolerance for comparing margins of potential tables. 00048 const double kMarginFactor = 1.1; 00049 // The first and last row should be consistent cell height. 00050 // This factor is the first and last row cell height max. 00051 const double kMaxRowSize = 2.5; 00052 // Number of filled columns required to form a strong table row. 00053 // For small tables, this is an absolute number. 00054 const double kGoodRowNumberOfColumnsSmall[] = { 2, 2, 2, 2, 2, 3, 3 }; 00055 const int kGoodRowNumberOfColumnsSmallSize = 00056 sizeof(kGoodRowNumberOfColumnsSmall) / sizeof(double) - 1; 00057 // For large tables, it is a relative number 00058 const double kGoodRowNumberOfColumnsLarge = 0.7; 00059 // The amount of area that must be covered in a cell by ColPartitions to 00060 // be considered "filled" 00061 const double kMinFilledArea = 0.35; 00062 00066 00067 StructuredTable::StructuredTable() 00068 : text_grid_(NULL), 00069 line_grid_(NULL), 00070 is_lined_(false), 00071 space_above_(0), 00072 space_below_(0), 00073 space_left_(0), 00074 space_right_(0), 00075 median_cell_height_(0), 00076 median_cell_width_(0), 00077 max_text_height_(MAX_INT32) { 00078 } 00079 00080 StructuredTable::~StructuredTable() { 00081 } 00082 00083 void StructuredTable::Init() { 00084 } 00085 00086 void StructuredTable::set_text_grid(ColPartitionGrid* text_grid) { 00087 text_grid_ = text_grid; 00088 } 00089 void StructuredTable::set_line_grid(ColPartitionGrid* line_grid) { 00090 line_grid_ = line_grid; 00091 } 00092 void StructuredTable::set_max_text_height(int height) { 00093 max_text_height_ = height; 00094 } 00095 bool StructuredTable::is_lined() const { 00096 return is_lined_; 00097 } 00098 int StructuredTable::row_count() const { 00099 return cell_y_.length() == 0 ? 0 : cell_y_.length() - 1; 00100 } 00101 int StructuredTable::column_count() const { 00102 return cell_x_.length() == 0 ? 
0 : cell_x_.length() - 1; 00103 } 00104 int StructuredTable::cell_count() const { 00105 return row_count() * column_count(); 00106 } 00107 void StructuredTable::set_bounding_box(const TBOX& box) { 00108 bounding_box_ = box; 00109 } 00110 const TBOX& StructuredTable::bounding_box() const { 00111 return bounding_box_; 00112 } 00113 int StructuredTable::median_cell_height() { 00114 return median_cell_height_; 00115 } 00116 int StructuredTable::median_cell_width() { 00117 return median_cell_width_; 00118 } 00119 int StructuredTable::row_height(int row) const { 00120 ASSERT_HOST(0 <= row && row < row_count()); 00121 return cell_y_[row + 1] - cell_y_[row]; 00122 } 00123 int StructuredTable::column_width(int column) const { 00124 ASSERT_HOST(0 <= column && column < column_count()); 00125 return cell_x_[column + 1] - cell_x_[column]; 00126 } 00127 int StructuredTable::space_above() const { 00128 return space_above_; 00129 } 00130 int StructuredTable::space_below() const { 00131 return space_below_; 00132 } 00133 00134 // At this point, we know that the lines are contained 00135 // by the box (by FindLinesBoundingBox). 00136 // So try to find the cell structure and make sure it works out. 00137 // The assumption is that all lines span the table. If this 00138 // assumption fails, the VerifyLinedTable method will 00139 // abort the lined table. The TableRecognizer will fall 00140 // back on FindWhitespacedStructure. 00141 bool StructuredTable::FindLinedStructure() { 00142 ClearStructure(); 00143 00144 // Search for all of the lines in the current box. 00145 // Update the cellular structure with the exact lines. 
00146 ColPartitionGridSearch box_search(line_grid_); 00147 box_search.SetUniqueMode(true); 00148 box_search.StartRectSearch(bounding_box_); 00149 ColPartition* line = NULL; 00150 00151 while ((line = box_search.NextRectSearch()) != NULL) { 00152 if (line->IsHorizontalLine()) 00153 cell_y_.push_back(line->MidY()); 00154 if (line->IsVerticalLine()) 00155 cell_x_.push_back(line->MidX()); 00156 } 00157 00158 // HasSignificantLines should guarantee cells. 00159 // Because that code is a different class, just gracefully 00160 // return false. This could be an assert. 00161 if (cell_x_.length() < 3 || cell_y_.length() < 3) 00162 return false; 00163 00164 cell_x_.sort(); 00165 cell_y_.sort(); 00166 00167 // Remove duplicates that may have occurred due to split lines. 00168 cell_x_.compact_sorted(); 00169 cell_y_.compact_sorted(); 00170 00171 // The border should be the extents of line boxes, not middle. 00172 cell_x_[0] = bounding_box_.left(); 00173 cell_x_[cell_x_.length() - 1] = bounding_box_.right(); 00174 cell_y_[0] = bounding_box_.bottom(); 00175 cell_y_[cell_y_.length() - 1] = bounding_box_.top(); 00176 00177 // Remove duplicates that may have occurred due to moving the borders. 00178 cell_x_.compact_sorted(); 00179 cell_y_.compact_sorted(); 00180 00181 CalculateMargins(); 00182 CalculateStats(); 00183 is_lined_ = VerifyLinedTableCells(); 00184 return is_lined_; 00185 } 00186 00187 // Finds the cellular structure given a particular box. 
00188 bool StructuredTable::FindWhitespacedStructure() { 00189 ClearStructure(); 00190 FindWhitespacedColumns(); 00191 FindWhitespacedRows(); 00192 00193 if (!VerifyWhitespacedTable()) { 00194 return false; 00195 } else { 00196 bounding_box_.set_left(cell_x_[0]); 00197 bounding_box_.set_right(cell_x_[cell_x_.length() - 1]); 00198 bounding_box_.set_bottom(cell_y_[0]); 00199 bounding_box_.set_top(cell_y_[cell_y_.length() - 1]); 00200 AbsorbNearbyLines(); 00201 CalculateMargins(); 00202 CalculateStats(); 00203 return true; 00204 } 00205 } 00206 00207 // Tests if a partition fits inside the table structure. 00208 // Partitions must fully span a grid line in order to intersect it. 00209 // This means that a partition does not intersect a line 00210 // that it "just" touches. This is mainly because the assumption 00211 // throughout the code is that "0" distance is a very very small space. 00212 bool StructuredTable::DoesPartitionFit(const ColPartition& part) const { 00213 const TBOX& box = part.bounding_box(); 00214 for (int i = 0; i < cell_x_.length(); ++i) 00215 if (box.left() < cell_x_[i] && cell_x_[i] < box.right()) 00216 return false; 00217 for (int i = 0; i < cell_y_.length(); ++i) 00218 if (box.bottom() < cell_y_[i] && cell_y_[i] < box.top()) 00219 return false; 00220 return true; 00221 } 00222 00223 // Checks if a sub-table has multiple data cells filled. 
00224 int StructuredTable::CountFilledCells() { 00225 return CountFilledCells(0, row_count() - 1, 0, column_count() - 1); 00226 } 00227 int StructuredTable::CountFilledCellsInRow(int row) { 00228 return CountFilledCells(row, row, 0, column_count() - 1); 00229 } 00230 int StructuredTable::CountFilledCellsInColumn(int column) { 00231 return CountFilledCells(0, row_count() - 1, column, column); 00232 } 00233 int StructuredTable::CountFilledCells(int row_start, int row_end, 00234 int column_start, int column_end) { 00235 ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count()); 00236 ASSERT_HOST(0 <= column_start && column_start <= column_end && 00237 column_end < column_count()); 00238 int cell_count = 0; 00239 TBOX cell_box; 00240 for (int row = row_start; row <= row_end; ++row) { 00241 cell_box.set_bottom(cell_y_[row]); 00242 cell_box.set_top(cell_y_[row + 1]); 00243 for (int col = column_start; col <= column_end; ++col) { 00244 cell_box.set_left(cell_x_[col]); 00245 cell_box.set_right(cell_x_[col + 1]); 00246 if (CountPartitions(cell_box) > 0) 00247 ++cell_count; 00248 } 00249 } 00250 return cell_count; 00251 } 00252 00253 // Makes sure that at least one cell in a row has substantial area filled. 00254 // This can filter out large whitespace caused by growing tables too far 00255 // and page numbers. 00256 bool StructuredTable::VerifyRowFilled(int row) { 00257 for (int i = 0; i < column_count(); ++i) { 00258 double area_filled = CalculateCellFilledPercentage(row, i); 00259 if (area_filled >= kMinFilledArea) 00260 return true; 00261 } 00262 return false; 00263 } 00264 00265 // Finds the filled area in a cell. 00266 // Assume ColPartitions do not overlap for simplicity (even though they do). 
00267 double StructuredTable::CalculateCellFilledPercentage(int row, int column) { 00268 ASSERT_HOST(0 <= row && row <= row_count()); 00269 ASSERT_HOST(0 <= column && column <= column_count()); 00270 const TBOX kCellBox(cell_x_[column], cell_y_[row], 00271 cell_x_[column + 1], cell_y_[row + 1]); 00272 ASSERT_HOST(!kCellBox.null_box()); 00273 00274 ColPartitionGridSearch gsearch(text_grid_); 00275 gsearch.SetUniqueMode(true); 00276 gsearch.StartRectSearch(kCellBox); 00277 double area_covered = 0; 00278 ColPartition* text = NULL; 00279 while ((text = gsearch.NextRectSearch()) != NULL) { 00280 if (text->IsTextType()) 00281 area_covered += text->bounding_box().intersection(kCellBox).area(); 00282 } 00283 return MIN(1.0, area_covered / kCellBox.area()); 00284 } 00285 00286 void StructuredTable::Display(ScrollView* window, ScrollView::Color color) { 00287 #ifndef GRAPHICS_DISABLED 00288 window->Brush(ScrollView::NONE); 00289 window->Pen(color); 00290 window->Rectangle(bounding_box_.left(), bounding_box_.bottom(), 00291 bounding_box_.right(), bounding_box_.top()); 00292 for (int i = 0; i < cell_x_.length(); i++) { 00293 window->Line(cell_x_[i], bounding_box_.bottom(), 00294 cell_x_[i], bounding_box_.top()); 00295 } 00296 for (int i = 0; i < cell_y_.length(); i++) { 00297 window->Line(bounding_box_.left(), cell_y_[i], 00298 bounding_box_.right(), cell_y_[i]); 00299 } 00300 window->UpdateWindow(); 00301 #endif 00302 } 00303 00304 // Clear structure information. 00305 void StructuredTable::ClearStructure() { 00306 cell_x_.clear(); 00307 cell_y_.clear(); 00308 is_lined_ = false; 00309 space_above_ = 0; 00310 space_below_ = 0; 00311 space_left_ = 0; 00312 space_right_ = 0; 00313 median_cell_height_ = 0; 00314 median_cell_width_ = 0; 00315 } 00316 00317 // When a table has lines, the lines should not intersect any partitions. 00318 // The following function makes sure the previous assumption is met. 
00319 bool StructuredTable::VerifyLinedTableCells() { 00320 // Function only called when lines exist. 00321 ASSERT_HOST(cell_y_.length() >= 2 && cell_x_.length() >= 2); 00322 for (int i = 0; i < cell_y_.length(); ++i) { 00323 if (CountHorizontalIntersections(cell_y_[i]) > 0) 00324 return false; 00325 } 00326 for (int i = 0; i < cell_x_.length(); ++i) { 00327 if (CountVerticalIntersections(cell_x_[i]) > 0) 00328 return false; 00329 } 00330 return true; 00331 } 00332 00333 // TODO(nbeato): Could be much better than this. 00334 // Examples: 00335 // - Caclulate the percentage of filled cells. 00336 // - Calculate the average number of ColPartitions per cell. 00337 // - Calculate the number of cells per row with partitions. 00338 // - Check if ColPartitions in adjacent cells are similar. 00339 // - Check that all columns are at least a certain width. 00340 // - etc. 00341 bool StructuredTable::VerifyWhitespacedTable() { 00342 // criteria for a table, must be at least 2x3 or 3x2 00343 return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6; 00344 } 00345 00346 // Finds vertical splits in the ColPartitions of text_grid_ by considering 00347 // all possible "good" guesses. A good guess is just the left/right sides of 00348 // the partitions, since these locations will uniquely define where the 00349 // extremal values where the splits can occur. The split happens 00350 // in the middle of the two nearest partitions. 00351 void StructuredTable::FindWhitespacedColumns() { 00352 // Set of the extents of all partitions on the page. 00353 GenericVectorEqEq<int> left_sides; 00354 GenericVectorEqEq<int> right_sides; 00355 00356 // Look at each text partition. We want to find the partitions 00357 // that have extremal left/right sides. These will give us a basis 00358 // for the table columns. 
00359 ColPartitionGridSearch gsearch(text_grid_); 00360 gsearch.SetUniqueMode(true); 00361 gsearch.StartRectSearch(bounding_box_); 00362 ColPartition* text = NULL; 00363 while ((text = gsearch.NextRectSearch()) != NULL) { 00364 if (!text->IsTextType()) 00365 continue; 00366 00367 ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right()); 00368 int spacing = static_cast<int>(text->median_width() * 00369 kHorizontalSpacing / 2.0 + 0.5); 00370 left_sides.push_back(text->bounding_box().left() - spacing); 00371 right_sides.push_back(text->bounding_box().right() + spacing); 00372 } 00373 // It causes disaster below, so avoid it! 00374 if (left_sides.length() == 0 || right_sides.length() == 0) 00375 return; 00376 00377 // Since data may be inserted in grid order, we sort the left/right sides. 00378 left_sides.sort(); 00379 right_sides.sort(); 00380 00381 // At this point, in the "merged list", we expect to have a left side, 00382 // followed by either more left sides or a right side. The last number 00383 // should be a right side. We find places where the splits occur by looking 00384 // for "valleys". If we want to force gap sizes or allow overlap, change 00385 // the spacing above. If you want to let lines "slice" partitions as long 00386 // as it is infrequent, change the following function. 00387 FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold, 00388 &cell_x_); 00389 } 00390 00391 // Finds horizontal splits in the ColPartitions of text_grid_ by considering 00392 // all possible "good" guesses. A good guess is just the bottom/top sides of 00393 // the partitions, since these locations will uniquely define where the 00394 // extremal values where the splits can occur. The split happens 00395 // in the middle of the two nearest partitions. 00396 void StructuredTable::FindWhitespacedRows() { 00397 // Set of the extents of all partitions on the page. 
00398 GenericVectorEqEq<int> bottom_sides; 00399 GenericVectorEqEq<int> top_sides; 00400 // We will be "shrinking" partitions, so keep the min/max around to 00401 // make sure the bottom/top lines do not intersect text. 00402 int min_bottom = MAX_INT32; 00403 int max_top = MIN_INT32; 00404 00405 // Look at each text partition. We want to find the partitions 00406 // that have extremal bottom/top sides. These will give us a basis 00407 // for the table rows. Because the textlines can be skewed and close due 00408 // to warping, the height of the partitions is toned down a little bit. 00409 ColPartitionGridSearch gsearch(text_grid_); 00410 gsearch.SetUniqueMode(true); 00411 gsearch.StartRectSearch(bounding_box_); 00412 ColPartition* text = NULL; 00413 while ((text = gsearch.NextRectSearch()) != NULL) { 00414 if (!text->IsTextType()) 00415 continue; 00416 00417 ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top()); 00418 min_bottom = MIN(min_bottom, text->bounding_box().bottom()); 00419 max_top = MAX(max_top, text->bounding_box().top()); 00420 00421 // Ignore "tall" text partitions, as these are usually false positive 00422 // vertical text or multiple lines pulled together. 00423 if (text->bounding_box().height() > max_text_height_) 00424 continue; 00425 00426 int spacing = static_cast<int>(text->bounding_box().height() * 00427 kVerticalSpacing / 2.0 + 0.5); 00428 int bottom = text->bounding_box().bottom() - spacing; 00429 int top = text->bounding_box().top() + spacing; 00430 // For horizontal text, the factor can be negative. This should 00431 // probably cause a warning or failure. I haven't actually checked if 00432 // it happens. 00433 if (bottom >= top) 00434 continue; 00435 00436 bottom_sides.push_back(bottom); 00437 top_sides.push_back(top); 00438 } 00439 // It causes disaster below, so avoid it! 
00440 if (bottom_sides.length() == 0 || top_sides.length() == 0) 00441 return; 00442 00443 // Since data may be inserted in grid order, we sort the bottom/top sides. 00444 bottom_sides.sort(); 00445 top_sides.sort(); 00446 00447 // At this point, in the "merged list", we expect to have a bottom side, 00448 // followed by either more bottom sides or a top side. The last number 00449 // should be a top side. We find places where the splits occur by looking 00450 // for "valleys". If we want to force gap sizes or allow overlap, change 00451 // the spacing above. If you want to let lines "slice" partitions as long 00452 // as it is infrequent, change the following function. 00453 FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold, 00454 &cell_y_); 00455 00456 // Recover the min/max correctly since it was shifted. 00457 cell_y_[0] = min_bottom; 00458 cell_y_[cell_y_.length() - 1] = max_top; 00459 } 00460 00461 void StructuredTable::CalculateMargins() { 00462 space_above_ = MAX_INT32; 00463 space_below_ = MAX_INT32; 00464 space_right_ = MAX_INT32; 00465 space_left_ = MAX_INT32; 00466 UpdateMargins(text_grid_); 00467 UpdateMargins(line_grid_); 00468 } 00469 // Finds the nearest partition in grid to the table 00470 // boundaries and updates the margin. 
00471 void StructuredTable::UpdateMargins(ColPartitionGrid* grid) { 00472 int below = FindVerticalMargin(grid, bounding_box_.bottom(), true); 00473 space_below_ = MIN(space_below_, below); 00474 int above = FindVerticalMargin(grid, bounding_box_.top(), false); 00475 space_above_ = MIN(space_above_, above); 00476 int left = FindHorizontalMargin(grid, bounding_box_.left(), true); 00477 space_left_ = MIN(space_left_, left); 00478 int right = FindHorizontalMargin(grid, bounding_box_.right(), false); 00479 space_right_ = MIN(space_right_, right); 00480 } 00481 int StructuredTable::FindVerticalMargin(ColPartitionGrid* grid, int border, 00482 bool decrease) const { 00483 ColPartitionGridSearch gsearch(grid); 00484 gsearch.SetUniqueMode(true); 00485 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), 00486 border); 00487 ColPartition* part = NULL; 00488 while ((part = gsearch.NextVerticalSearch(decrease)) != NULL) { 00489 if (!part->IsTextType() && !part->IsHorizontalLine()) 00490 continue; 00491 int distance = decrease ? border - part->bounding_box().top() 00492 : part->bounding_box().bottom() - border; 00493 if (distance >= 0) 00494 return distance; 00495 } 00496 return MAX_INT32; 00497 } 00498 int StructuredTable::FindHorizontalMargin(ColPartitionGrid* grid, int border, 00499 bool decrease) const { 00500 ColPartitionGridSearch gsearch(grid); 00501 gsearch.SetUniqueMode(true); 00502 gsearch.StartSideSearch(border, bounding_box_.bottom(), bounding_box_.top()); 00503 ColPartition* part = NULL; 00504 while ((part = gsearch.NextSideSearch(decrease)) != NULL) { 00505 if (!part->IsTextType() && !part->IsVerticalLine()) 00506 continue; 00507 int distance = decrease ? 
border - part->bounding_box().right() 00508 : part->bounding_box().left() - border; 00509 if (distance >= 0) 00510 return distance; 00511 } 00512 return MAX_INT32; 00513 } 00514 00515 void StructuredTable::CalculateStats() { 00516 const int kMaxCellHeight = 1000; 00517 const int kMaxCellWidth = 1000; 00518 STATS height_stats(0, kMaxCellHeight + 1); 00519 STATS width_stats(0, kMaxCellWidth + 1); 00520 00521 for (int i = 0; i < row_count(); ++i) 00522 height_stats.add(row_height(i), column_count()); 00523 for (int i = 0; i < column_count(); ++i) 00524 width_stats.add(column_width(i), row_count()); 00525 00526 median_cell_height_ = static_cast<int>(height_stats.median() + 0.5); 00527 median_cell_width_ = static_cast<int>(width_stats.median() + 0.5); 00528 } 00529 00530 // Looks for grid lines near the current bounding box and 00531 // grows the bounding box to include them if no intersections 00532 // will occur as a result. This is necessary because the margins 00533 // are calculated relative to the closest line/text. If the 00534 // line isn't absorbed, the margin will be the distance to the line. 00535 void StructuredTable::AbsorbNearbyLines() { 00536 ColPartitionGridSearch gsearch(line_grid_); 00537 gsearch.SetUniqueMode(true); 00538 00539 // Is the closest line above good? Loop multiple times for tables with 00540 // multi-line (sometimes 2) borders. Limit the number of lines by 00541 // making sure they stay within a table cell or so. 
00542 ColPartition* line = NULL; 00543 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), 00544 bounding_box_.top()); 00545 while ((line = gsearch.NextVerticalSearch(false)) != NULL) { 00546 if (!line->IsHorizontalLine()) 00547 break; 00548 TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1, 00549 bounding_box_.right(), line->MidY()); 00550 if (text_search.height() > median_cell_height_ * 2) 00551 break; 00552 if (CountPartitions(text_search) > 0) 00553 break; 00554 bounding_box_.set_top(line->MidY()); 00555 } 00556 // As above, is the closest line below good? 00557 line = NULL; 00558 gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(), 00559 bounding_box_.bottom()); 00560 while ((line = gsearch.NextVerticalSearch(true)) != NULL) { 00561 if (!line->IsHorizontalLine()) 00562 break; 00563 TBOX text_search(bounding_box_.left(), line->MidY(), 00564 bounding_box_.right(), bounding_box_.bottom() - 1); 00565 if (text_search.height() > median_cell_height_ * 2) 00566 break; 00567 if (CountPartitions(text_search) > 0) 00568 break; 00569 bounding_box_.set_bottom(line->MidY()); 00570 } 00571 // TODO(nbeato): vertical lines 00572 } 00573 00574 00575 // This function will find all "0 valleys" (of any length) given two 00576 // arrays. The arrays are the mins and maxes of partitions (either 00577 // left and right or bottom and top). Since the min/max lists are generated 00578 // with pairs of increasing integers, we can make some assumptions in 00579 // the function about ordering of the overall list, which are shown in the 00580 // asserts. 00581 // The algorithm works as follows: 00582 // While there are numbers to process, take the smallest number. 00583 // If it is from the min_list, increment the "hill" counter. 00584 // Otherwise, decrement the "hill" counter. 00585 // In the process of doing this, keep track of "crossing" the 00586 // desired height. 
00587 // The first/last items are extremal values of the list and known. 00588 // NOTE: This function assumes the lists are sorted! 00589 void StructuredTable::FindCellSplitLocations(const GenericVector<int>& min_list, 00590 const GenericVector<int>& max_list, 00591 int max_merged, 00592 GenericVector<int>* locations) { 00593 locations->clear(); 00594 ASSERT_HOST(min_list.length() == max_list.length()); 00595 if (min_list.length() == 0) 00596 return; 00597 ASSERT_HOST(min_list.get(0) < max_list.get(0)); 00598 ASSERT_HOST(min_list.get(min_list.length() - 1) < 00599 max_list.get(max_list.length() - 1)); 00600 00601 locations->push_back(min_list.get(0)); 00602 int min_index = 0; 00603 int max_index = 0; 00604 int stacked_partitions = 0; 00605 int last_cross_position = MAX_INT32; 00606 // max_index will expire after min_index. 00607 // However, we can't "increase" the hill size if min_index expired. 00608 // So finish processing when min_index expires. 00609 while (min_index < min_list.length()) { 00610 // Increase the hill count. 00611 if (min_list[min_index] < max_list[max_index]) { 00612 ++stacked_partitions; 00613 if (last_cross_position != MAX_INT32 && 00614 stacked_partitions > max_merged) { 00615 int mid = (last_cross_position + min_list[min_index]) / 2; 00616 locations->push_back(mid); 00617 last_cross_position = MAX_INT32; 00618 } 00619 ++min_index; 00620 } else { 00621 // Decrease the hill count. 00622 --stacked_partitions; 00623 if (last_cross_position == MAX_INT32 && 00624 stacked_partitions <= max_merged) { 00625 last_cross_position = max_list[max_index]; 00626 } 00627 ++max_index; 00628 } 00629 } 00630 locations->push_back(max_list.get(max_list.length() - 1)); 00631 } 00632 00633 // Counts the number of partitions in the table 00634 // box that intersection the given x value. 00635 int StructuredTable::CountVerticalIntersections(int x) { 00636 int count = 0; 00637 // Make a small box to keep the search time down. 
00638 const int kGridSize = text_grid_->gridsize(); 00639 TBOX vertical_box = bounding_box_; 00640 vertical_box.set_left(x - kGridSize); 00641 vertical_box.set_right(x + kGridSize); 00642 00643 ColPartitionGridSearch gsearch(text_grid_); 00644 gsearch.SetUniqueMode(true); 00645 gsearch.StartRectSearch(vertical_box); 00646 ColPartition* text = NULL; 00647 while ((text = gsearch.NextRectSearch()) != NULL) { 00648 if (!text->IsTextType()) 00649 continue; 00650 const TBOX& box = text->bounding_box(); 00651 if (box.left() < x && x < box.right()) 00652 ++count; 00653 } 00654 return count; 00655 } 00656 00657 // Counts the number of partitions in the table 00658 // box that intersection the given y value. 00659 int StructuredTable::CountHorizontalIntersections(int y) { 00660 int count = 0; 00661 // Make a small box to keep the search time down. 00662 const int kGridSize = text_grid_->gridsize(); 00663 TBOX horizontal_box = bounding_box_; 00664 horizontal_box.set_bottom(y - kGridSize); 00665 horizontal_box.set_top(y + kGridSize); 00666 00667 ColPartitionGridSearch gsearch(text_grid_); 00668 gsearch.SetUniqueMode(true); 00669 gsearch.StartRectSearch(horizontal_box); 00670 ColPartition* text = NULL; 00671 while ((text = gsearch.NextRectSearch()) != NULL) { 00672 if (!text->IsTextType()) 00673 continue; 00674 00675 const TBOX& box = text->bounding_box(); 00676 if (box.bottom() < y && y < box.top()) 00677 ++count; 00678 } 00679 return count; 00680 } 00681 00682 // Counts how many text partitions are in this box. 00683 // This is used to count partitons in cells, as that can indicate 00684 // how "strong" a potential table row/colum (or even full table) actually is. 
00685 int StructuredTable::CountPartitions(const TBOX& box) { 00686 ColPartitionGridSearch gsearch(text_grid_); 00687 gsearch.SetUniqueMode(true); 00688 gsearch.StartRectSearch(box); 00689 int count = 0; 00690 ColPartition* text = NULL; 00691 while ((text = gsearch.NextRectSearch()) != NULL) { 00692 if (text->IsTextType()) 00693 ++count; 00694 } 00695 return count; 00696 } 00697 00701 00702 TableRecognizer::TableRecognizer() 00703 : text_grid_(NULL), 00704 line_grid_(NULL), 00705 min_height_(0), 00706 min_width_(0), 00707 max_text_height_(MAX_INT32) { 00708 } 00709 00710 TableRecognizer::~TableRecognizer() { 00711 } 00712 00713 void TableRecognizer::Init() { 00714 } 00715 00716 void TableRecognizer::set_text_grid(ColPartitionGrid* text_grid) { 00717 text_grid_ = text_grid; 00718 } 00719 void TableRecognizer::set_line_grid(ColPartitionGrid* line_grid) { 00720 line_grid_ = line_grid; 00721 } 00722 void TableRecognizer::set_min_height(int height) { 00723 min_height_ = height; 00724 } 00725 void TableRecognizer::set_min_width(int width) { 00726 min_width_ = width; 00727 } 00728 void TableRecognizer::set_max_text_height(int height) { 00729 max_text_height_ = height; 00730 } 00731 00732 StructuredTable* TableRecognizer::RecognizeTable(const TBOX& guess) { 00733 StructuredTable* table = new StructuredTable(); 00734 table->Init(); 00735 table->set_text_grid(text_grid_); 00736 table->set_line_grid(line_grid_); 00737 table->set_max_text_height(max_text_height_); 00738 00739 // Try to solve ths simple case, a table with *both* 00740 // vertical and horizontal lines. 00741 if (RecognizeLinedTable(guess, table)) 00742 return table; 00743 00744 // Fallback to whitespace if that failed. 00745 // TODO(nbeato): Break this apart to take advantage of horizontal 00746 // lines or vertical lines when present. 00747 if (RecognizeWhitespacedTable(guess, table)) 00748 return table; 00749 00750 // No table found... 
00751 delete table; 00752 return NULL; 00753 } 00754 00755 bool TableRecognizer::RecognizeLinedTable(const TBOX& guess_box, 00756 StructuredTable* table) { 00757 if (!HasSignificantLines(guess_box)) 00758 return false; 00759 TBOX line_bound = guess_box; 00760 if (!FindLinesBoundingBox(&line_bound)) 00761 return false; 00762 table->set_bounding_box(line_bound); 00763 return table->FindLinedStructure(); 00764 } 00765 00766 // Quick implementation. Just count the number of lines in the box. 00767 // A better implementation would counter intersections and look for connected 00768 // components. It could even go as far as finding similar length lines. 00769 // To account for these possible issues, the VerifyLinedTableCells function 00770 // will reject lined tables that cause intersections with text on the page. 00771 // TODO(nbeato): look for "better" lines 00772 bool TableRecognizer::HasSignificantLines(const TBOX& guess) { 00773 ColPartitionGridSearch box_search(line_grid_); 00774 box_search.SetUniqueMode(true); 00775 box_search.StartRectSearch(guess); 00776 ColPartition* line = NULL; 00777 int vertical_count = 0; 00778 int horizontal_count = 0; 00779 00780 while ((line = box_search.NextRectSearch()) != NULL) { 00781 if (line->IsHorizontalLine()) 00782 ++horizontal_count; 00783 if (line->IsVerticalLine()) 00784 ++vertical_count; 00785 } 00786 00787 return vertical_count >= kLinedTableMinVerticalLines && 00788 horizontal_count >= kLinedTableMinHorizontalLines; 00789 } 00790 00791 // Given a bounding box with a bunch of horizontal / vertical lines, 00792 // we just find the extents of all of these lines iteratively. 00793 // The box will be at least as large as guess. This 00794 // could possibly be a bad assumption. 00795 // It is guaranteed to halt in at least O(n * gridarea) where n 00796 // is the number of lines. 00797 // The assumption is that growing the box iteratively will add lines 00798 // several times, but eventually we'll find the extents. 
00799 // 00800 // For tables, the approach is a bit aggressive, a single line (which could be 00801 // noise or a column ruling) can destroy the table inside. 00802 // 00803 // TODO(nbeato): This is a quick first implementation. 00804 // A better implementation would actually look for consistency 00805 // in extents of the lines and find the extents using lines 00806 // that clearly describe the table. This would allow the 00807 // lines to "vote" for height/width. An approach like 00808 // this would solve issues with page layout rulings. 00809 // I haven't looked for these issues yet, so I can't even 00810 // say they happen confidently. 00811 bool TableRecognizer::FindLinesBoundingBox(TBOX* bounding_box) { 00812 // The first iteration will tell us if there are lines 00813 // present and shrink the box to a minimal iterative size. 00814 if (!FindLinesBoundingBoxIteration(bounding_box)) 00815 return false; 00816 00817 // Keep growing until the area of the table stabilizes. 00818 // The box can only get bigger, increasing area. 00819 bool changed = true; 00820 while (changed) { 00821 changed = false; 00822 int old_area = bounding_box->area(); 00823 bool check = FindLinesBoundingBoxIteration(bounding_box); 00824 // At this point, the function will return true. 00825 ASSERT_HOST(check); 00826 ASSERT_HOST(bounding_box->area() >= old_area); 00827 changed = (bounding_box->area() > old_area); 00828 } 00829 00830 return true; 00831 } 00832 00833 bool TableRecognizer::FindLinesBoundingBoxIteration(TBOX* bounding_box) { 00834 // Search for all of the lines in the current box, keeping track of extents. 00835 ColPartitionGridSearch box_search(line_grid_); 00836 box_search.SetUniqueMode(true); 00837 box_search.StartRectSearch(*bounding_box); 00838 ColPartition* line = NULL; 00839 bool first_line = true; 00840 00841 while ((line = box_search.NextRectSearch()) != NULL) { 00842 if (line->IsLineType()) { 00843 if (first_line) { 00844 // The first iteration can shrink the box. 
00845 *bounding_box = line->bounding_box(); 00846 first_line = false; 00847 } else { 00848 *bounding_box += line->bounding_box(); 00849 } 00850 } 00851 } 00852 return !first_line; 00853 } 00854 00855 // The goal of this function is to move the table boundaries around and find 00856 // a table that maximizes the whitespace around the table while maximizing 00857 // the cellular structure. As a result, it gets confused by headers, footers, 00858 // and merged columns (text that crosses columns). There is a tolerance 00859 // that allows a few partitions to count towards potential cell merges. 00860 // It's the max_merged parameter to FindPartitionLocations. 00861 // It can work, but it needs some false positive remove on boundaries. 00862 // For now, the grid structure must not intersect any partitions. 00863 // Also, small tolerance is added to the horizontal lines for tightly packed 00864 // tables. The tolerance is added by adjusting the bounding boxes of the 00865 // partitions (in FindHorizontalPartitions). The current implementation 00866 // only adjusts the vertical extents of the table. 00867 // 00868 // Also note. This was hacked at a lot. It could probably use some 00869 // more hacking at to find a good set of border conditions and then a 00870 // nice clean up. 00871 bool TableRecognizer::RecognizeWhitespacedTable(const TBOX& guess_box, 00872 StructuredTable* table) { 00873 TBOX best_box = guess_box; // Best borders known. 00874 int best_below = 0; // Margin size above best table. 00875 int best_above = 0; // Margin size below best table. 00876 TBOX adjusted = guess_box; // The search box. 00877 00878 // We assume that the guess box is somewhat accurate, so we don't allow 00879 // the adjusted border to pass half of the guessed area. This prevents 00880 // "negative" tables from forming. 00881 const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2; 00882 // Keeps track of the most columns in an accepted table. 
The resulting table 00883 // may be less than the max, but we don't want to stray too far. 00884 int best_cols = 0; 00885 // Make sure we find a good border. 00886 bool found_good_border = false; 00887 00888 // Find the bottom of the table by trying a few different locations. For 00889 // each location, the top, left, and right are fixed. We start the search 00890 // in a smaller table to favor best_cols getting a good estimate sooner. 00891 int last_bottom = MAX_INT32; 00892 int bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00893 kMidGuessY - min_height_ / 2, true); 00894 int top = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00895 kMidGuessY + min_height_ / 2, false); 00896 adjusted.set_top(top); 00897 00898 // Headers/footers can be spaced far from everything. 00899 // Make sure that the space below is greater than the space above 00900 // the lowest row. 00901 int previous_below = 0; 00902 const int kMaxChances = 10; 00903 int chances = kMaxChances; 00904 while (bottom != last_bottom) { 00905 adjusted.set_bottom(bottom); 00906 00907 if (adjusted.height() >= min_height_) { 00908 // Try to fit the grid on the current box. We give it a chance 00909 // if the number of columns didn't significantly drop. 00910 table->set_bounding_box(adjusted); 00911 if (table->FindWhitespacedStructure() && 00912 table->column_count() >= best_cols * kRequiredColumns) { 00913 if (false && IsWeakTableRow(table, 0)) { 00914 // Currently buggy, but was looking promising so disabled. 00915 --chances; 00916 } else { 00917 // We favor 2 things, 00918 // 1- Adding rows that have partitioned data. 00919 // 2- Better margins (to find header/footer). 00920 // For better tables, we just look for multiple cells in the 00921 // bottom row with data in them. 00922 // For margins, the space below the last row should 00923 // be better than a table with the last row removed. 
00924 chances = kMaxChances; 00925 double max_row_height = kMaxRowSize * table->median_cell_height(); 00926 if ((table->space_below() * kMarginFactor >= best_below && 00927 table->space_below() >= previous_below) || 00928 (table->CountFilledCellsInRow(0) > 1 && 00929 table->row_height(0) < max_row_height)) { 00930 best_box.set_bottom(bottom); 00931 best_below = table->space_below(); 00932 best_cols = MAX(table->column_count(), best_cols); 00933 found_good_border = true; 00934 } 00935 } 00936 previous_below = table->space_below(); 00937 } else { 00938 --chances; 00939 } 00940 } 00941 if (chances <= 0) 00942 break; 00943 00944 last_bottom = bottom; 00945 bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00946 last_bottom, true); 00947 } 00948 if (!found_good_border) 00949 return false; 00950 00951 // TODO(nbeato) comments: follow modified code above... put it in a function! 00952 found_good_border = false; 00953 int last_top = MIN_INT32; 00954 top = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00955 kMidGuessY + min_height_ / 2, false); 00956 int previous_above = 0; 00957 chances = kMaxChances; 00958 00959 adjusted.set_bottom(best_box.bottom()); 00960 while (last_top != top) { 00961 adjusted.set_top(top); 00962 if (adjusted.height() >= min_height_) { 00963 table->set_bounding_box(adjusted); 00964 if (table->FindWhitespacedStructure() && 00965 table->column_count() >= best_cols * kRequiredColumns) { 00966 int last_row = table->row_count() - 1; 00967 if (false && IsWeakTableRow(table, last_row)) { 00968 // Currently buggy, but was looking promising so disabled. 
00969 --chances; 00970 } else { 00971 chances = kMaxChances; 00972 double max_row_height = kMaxRowSize * table->median_cell_height(); 00973 if ((table->space_above() * kMarginFactor >= best_above && 00974 table->space_above() >= previous_above) || 00975 (table->CountFilledCellsInRow(last_row) > 1 && 00976 table->row_height(last_row) < max_row_height)) { 00977 best_box.set_top(top); 00978 best_above = table->space_above(); 00979 best_cols = MAX(table->column_count(), best_cols); 00980 found_good_border = true; 00981 } 00982 } 00983 previous_above = table->space_above(); 00984 } else { 00985 --chances; 00986 } 00987 } 00988 if (chances <= 0) 00989 break; 00990 00991 last_top = top; 00992 top = NextHorizontalSplit(guess_box.left(), guess_box.right(), 00993 last_top, false); 00994 } 00995 00996 if (!found_good_border) 00997 return false; 00998 00999 // If we get here, this shouldn't happen. It can be an assert, but 01000 // I haven't tested it enough to make it crash things. 01001 if (best_box.null_box()) 01002 return false; 01003 01004 // Given the best locations, fit the box to those locations. 01005 table->set_bounding_box(best_box); 01006 return table->FindWhitespacedStructure(); 01007 } 01008 01009 // Finds the closest value to y that can safely cause a horizontal 01010 // split in the partitions. 01011 // This function has been buggy and not as reliable as I would've 01012 // liked. I suggest finding all of the splits using the 01013 // FindPartitionLocations once and then just keeping the results 01014 // of that function cached somewhere. 
01015 int TableRecognizer::NextHorizontalSplit(int left, int right, int y, 01016 bool top_to_bottom) { 01017 ColPartitionGridSearch gsearch(text_grid_); 01018 gsearch.SetUniqueMode(true); 01019 gsearch.StartVerticalSearch(left, right, y); 01020 ColPartition* text = NULL; 01021 int last_y = y; 01022 while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != NULL) { 01023 if (!text->IsTextType() || !text->IsHorizontalType()) 01024 continue; 01025 if (text->bounding_box().height() > max_text_height_) 01026 continue; 01027 01028 const TBOX& text_box = text->bounding_box(); 01029 if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) { 01030 last_y = MIN(last_y, text_box.bottom()); 01031 continue; 01032 } 01033 if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) { 01034 last_y = MAX(last_y, text_box.top()); 01035 continue; 01036 } 01037 01038 return last_y; 01039 } 01040 // If none is found, we at least want to preserve the min/max, 01041 // which defines the overlap of y with the last partition in the grid. 01042 return last_y; 01043 } 01044 01045 // Code is buggy right now. It is disabled in the calling function. 01046 // It seems like sometimes the row that is passed in is not correct 01047 // sometimes (like a phantom row is introduced). There's something going 01048 // on in the cell_y_ data member before this is called... not certain. 01049 bool TableRecognizer::IsWeakTableRow(StructuredTable* table, int row) { 01050 if (!table->VerifyRowFilled(row)) 01051 return false; 01052 01053 double threshold = 0.0; 01054 if (table->column_count() > kGoodRowNumberOfColumnsSmallSize) 01055 threshold = table->column_count() * kGoodRowNumberOfColumnsLarge; 01056 else 01057 threshold = kGoodRowNumberOfColumnsSmall[table->column_count()]; 01058 01059 return table->CountFilledCellsInRow(row) < threshold; 01060 } 01061 01062 } // namespace tesseract