tesseract
3.03
|
00001 00002 // File: tabvector.cpp 00003 // Description: Class to hold a near-vertical vector representing a tab-stop. 00004 // Author: Ray Smith 00005 // Created: Thu Apr 10 16:28:01 PST 2008 00006 // 00007 // (C) Copyright 2008, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #ifdef HAVE_CONFIG_H 00025 #include "config_auto.h" 00026 #endif 00027 00028 #include "tabvector.h" 00029 #include "blobbox.h" 00030 #include "colfind.h" 00031 #include "colpartitionset.h" 00032 #include "detlinefit.h" 00033 #include "statistc.h" 00034 00035 namespace tesseract { 00036 00037 // Multiple of height used as a gutter for evaluation search. 00038 const int kGutterMultiple = 4; 00039 // Multiple of neighbour gap that we expect the gutter gap to be at minimum. 00040 const int kGutterToNeighbourRatio = 3; 00041 // Pixel distance for tab vectors to be considered the same. 00042 const int kSimilarVectorDist = 10; 00043 // Pixel distance for ragged tab vectors to be considered the same if there 00044 // is nothing in the overlap box 00045 const int kSimilarRaggedDist = 50; 00046 // Max multiple of height to allow filling in between blobs when evaluating. 00047 const int kMaxFillinMultiple = 11; 00048 // Min fraction of mean gutter size to allow a gutter on a good tab blob. 00049 const double kMinGutterFraction = 0.5; 00050 // Multiple of 1/n lines as a minimum gutter in evaluation. 00051 const double kLineCountReciprocal = 4.0; 00052 // Constant add-on for minimum gutter for aligned tabs. 00053 const double kMinAlignedGutter = 0.25; 00054 // Constant add-on for minimum gutter for ragged tabs. 00055 const double kMinRaggedGutter = 1.5; 00056 00057 double_VAR(textord_tabvector_vertical_gap_fraction, 0.5, 00058 "max fraction of mean blob width allowed for vertical gaps in vertical text"); 00059 00060 double_VAR(textord_tabvector_vertical_box_ratio, 0.5, 00061 "Fraction of box matches required to declare a line vertical"); 00062 00063 ELISTIZE(TabConstraint) 00064 00065 // Create a constraint for the top or bottom of this TabVector. 00066 void TabConstraint::CreateConstraint(TabVector* vector, bool is_top) { 00067 TabConstraint* constraint = new TabConstraint(vector, is_top); 00068 TabConstraint_LIST* constraints = new TabConstraint_LIST; 00069 TabConstraint_IT it(constraints); 00070 it.add_to_end(constraint); 00071 if (is_top) 00072 vector->set_top_constraints(constraints); 00073 else 00074 vector->set_bottom_constraints(constraints); 00075 } 00076 00077 // Test to see if the constraints are compatible enough to merge. 00078 bool TabConstraint::CompatibleConstraints(TabConstraint_LIST* list1, 00079 TabConstraint_LIST* list2) { 00080 if (list1 == list2) 00081 return false; 00082 int y_min = -MAX_INT32; 00083 int y_max = MAX_INT32; 00084 if (textord_debug_tabfind > 3) 00085 tprintf("Testing constraint compatibility\n"); 00086 GetConstraints(list1, &y_min, &y_max); 00087 GetConstraints(list2, &y_min, &y_max); 00088 if (textord_debug_tabfind > 3) 00089 tprintf("Resulting range = [%d,%d]\n", y_min, y_max); 00090 return y_max >= y_min; 00091 } 00092 00093 // Merge the lists of constraints and update the TabVector pointers. 00094 // The second list is deleted. 00095 void TabConstraint::MergeConstraints(TabConstraint_LIST* list1, 00096 TabConstraint_LIST* list2) { 00097 if (list1 == list2) 00098 return; 00099 TabConstraint_IT it(list2); 00100 if (textord_debug_tabfind > 3) 00101 tprintf("Merging constraints\n"); 00102 // The vectors of all constraints on list2 are now going to be on list1. 00103 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00104 TabConstraint* constraint = it.data(); 00105 if (textord_debug_tabfind> 3) 00106 constraint->vector_->Print("Merge"); 00107 if (constraint->is_top_) 00108 constraint->vector_->set_top_constraints(list1); 00109 else 00110 constraint->vector_->set_bottom_constraints(list1); 00111 } 00112 it = list1; 00113 it.add_list_before(list2); 00114 delete list2; 00115 } 00116 00117 // Set all the tops and bottoms as appropriate to a mean of the 00118 // constrained range. Delete all the constraints and list. 00119 void TabConstraint::ApplyConstraints(TabConstraint_LIST* constraints) { 00120 int y_min = -MAX_INT32; 00121 int y_max = MAX_INT32; 00122 GetConstraints(constraints, &y_min, &y_max); 00123 int y = (y_min + y_max) / 2; 00124 TabConstraint_IT it(constraints); 00125 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00126 TabConstraint* constraint = it.data(); 00127 TabVector* v = constraint->vector_; 00128 if (constraint->is_top_) { 00129 v->SetYEnd(y); 00130 v->set_top_constraints(NULL); 00131 } else { 00132 v->SetYStart(y); 00133 v->set_bottom_constraints(NULL); 00134 } 00135 } 00136 delete constraints; 00137 } 00138 00139 TabConstraint::TabConstraint(TabVector* vector, bool is_top) 00140 : vector_(vector), is_top_(is_top) { 00141 if (is_top) { 00142 y_min_ = vector->endpt().y(); 00143 y_max_ = vector->extended_ymax(); 00144 } else { 00145 y_max_ = vector->startpt().y(); 00146 y_min_ = vector->extended_ymin(); 00147 } 00148 } 00149 00150 // Get the max of the mins and the min of the maxes. 00151 void TabConstraint::GetConstraints(TabConstraint_LIST* constraints, 00152 int* y_min, int* y_max) { 00153 TabConstraint_IT it(constraints); 00154 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00155 TabConstraint* constraint = it.data(); 00156 if (textord_debug_tabfind > 3) { 00157 tprintf("Constraint is [%d,%d]", constraint->y_min_, constraint->y_max_); 00158 constraint->vector_->Print(" for"); 00159 } 00160 *y_min = MAX(*y_min, constraint->y_min_); 00161 *y_max = MIN(*y_max, constraint->y_max_); 00162 } 00163 } 00164 00165 ELIST2IZE(TabVector) 00166 CLISTIZE(TabVector) 00167 00168 // The constructor is private. See the bottom of the file... 00169 00170 TabVector::~TabVector() { 00171 } 00172 00173 00174 // Public factory to build a TabVector from a list of boxes. 00175 // The TabVector will be of the given alignment type. 00176 // The input vertical vector is used in fitting, and the output 00177 // vertical_x, vertical_y have the resulting line vector added to them 00178 // if the alignment is not ragged. 00179 // The extended_start_y and extended_end_y are the maximum possible 00180 // extension to the line segment that can be used to align with others. 00181 // The input CLIST of BLOBNBOX good_points is consumed and taken over. 00182 TabVector* TabVector::FitVector(TabAlignment alignment, ICOORD vertical, 00183 int extended_start_y, int extended_end_y, 00184 BLOBNBOX_CLIST* good_points, 00185 int* vertical_x, int* vertical_y) { 00186 TabVector* vector = new TabVector(extended_start_y, extended_end_y, 00187 alignment, good_points); 00188 if (!vector->Fit(vertical, false)) { 00189 delete vector; 00190 return NULL; 00191 } 00192 if (!vector->IsRagged()) { 00193 vertical = vector->endpt_ - vector->startpt_; 00194 int weight = vector->BoxCount(); 00195 *vertical_x += vertical.x() * weight; 00196 *vertical_y += vertical.y() * weight; 00197 } 00198 return vector; 00199 } 00200 00201 // Build a ragged TabVector by copying another's direction, shifting it 00202 // to match the given blob, and making its initial extent the height 00203 // of the blob, but its extended bounds from the bounds of the original. 00204 TabVector::TabVector(const TabVector& src, TabAlignment alignment, 00205 const ICOORD& vertical_skew, BLOBNBOX* blob) 00206 : extended_ymin_(src.extended_ymin_), extended_ymax_(src.extended_ymax_), 00207 sort_key_(0), percent_score_(0), mean_width_(0), 00208 needs_refit_(true), needs_evaluation_(true), intersects_other_lines_(false), 00209 alignment_(alignment), 00210 top_constraints_(NULL), bottom_constraints_(NULL) { 00211 BLOBNBOX_C_IT it(&boxes_); 00212 it.add_to_end(blob); 00213 TBOX box = blob->bounding_box(); 00214 if (IsLeftTab()) { 00215 startpt_ = box.botleft(); 00216 endpt_ = box.topleft(); 00217 } else { 00218 startpt_ = box.botright(); 00219 endpt_ = box.topright(); 00220 } 00221 sort_key_ = SortKey(vertical_skew, 00222 (startpt_.x() + endpt_.x()) / 2, 00223 (startpt_.y() + endpt_.y()) / 2); 00224 if (textord_debug_tabfind > 3) 00225 Print("Constructed a new tab vector:"); 00226 } 00227 00228 // Copies basic attributes of a tab vector for simple operations. 00229 // Copies things such startpt, endpt, range. 00230 // Does not copy things such as partners, boxes, or constraints. 00231 // This is useful if you only need vector information for processing, such 00232 // as in the table detection code. 00233 TabVector* TabVector::ShallowCopy() const { 00234 TabVector* copy = new TabVector(); 00235 copy->startpt_ = startpt_; 00236 copy->endpt_ = endpt_; 00237 copy->alignment_ = alignment_; 00238 copy->extended_ymax_ = extended_ymax_; 00239 copy->extended_ymin_ = extended_ymin_; 00240 copy->intersects_other_lines_ = intersects_other_lines_; 00241 return copy; 00242 } 00243 00244 // Extend this vector to include the supplied blob if it doesn't 00245 // already have it. 00246 void TabVector::ExtendToBox(BLOBNBOX* new_blob) { 00247 TBOX new_box = new_blob->bounding_box(); 00248 BLOBNBOX_C_IT it(&boxes_); 00249 if (!it.empty()) { 00250 BLOBNBOX* blob = it.data(); 00251 TBOX box = blob->bounding_box(); 00252 while (!it.at_last() && box.top() <= new_box.top()) { 00253 if (blob == new_blob) 00254 return; // We have it already. 00255 it.forward(); 00256 blob = it.data(); 00257 box = blob->bounding_box(); 00258 } 00259 if (box.top() >= new_box.top()) { 00260 it.add_before_stay_put(new_blob); 00261 needs_refit_ = true; 00262 return; 00263 } 00264 } 00265 needs_refit_ = true; 00266 it.add_after_stay_put(new_blob); 00267 } 00268 00269 // Set the ycoord of the start and move the xcoord to match. 00270 void TabVector::SetYStart(int start_y) { 00271 startpt_.set_x(XAtY(start_y)); 00272 startpt_.set_y(start_y); 00273 } 00274 // Set the ycoord of the end and move the xcoord to match. 00275 void TabVector::SetYEnd(int end_y) { 00276 endpt_.set_x(XAtY(end_y)); 00277 endpt_.set_y(end_y); 00278 } 00279 00280 // Rotate the ends by the given vector. Auto flip start and end if needed. 00281 void TabVector::Rotate(const FCOORD& rotation) { 00282 startpt_.rotate(rotation); 00283 endpt_.rotate(rotation); 00284 int dx = endpt_.x() - startpt_.x(); 00285 int dy = endpt_.y() - startpt_.y(); 00286 if ((dy < 0 && abs(dy) > abs(dx)) || (dx < 0 && abs(dx) > abs(dy))) { 00287 // Need to flip start/end. 00288 ICOORD tmp = startpt_; 00289 startpt_ = endpt_; 00290 endpt_ = tmp; 00291 } 00292 } 00293 00294 // Setup the initial constraints, being the limits of 00295 // the vector and the extended ends. 00296 void TabVector::SetupConstraints() { 00297 TabConstraint::CreateConstraint(this, false); 00298 TabConstraint::CreateConstraint(this, true); 00299 } 00300 00301 // Setup the constraints between the partners of this TabVector. 00302 void TabVector::SetupPartnerConstraints() { 00303 // With the first and last partner, we want a common bottom and top, 00304 // respectively, and for each change of partner, we want a common 00305 // top of first with bottom of next. 00306 TabVector_C_IT it(&partners_); 00307 TabVector* prev_partner = NULL; 00308 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00309 TabVector* partner = it.data(); 00310 if (partner->top_constraints_ == NULL || 00311 partner->bottom_constraints_ == NULL) { 00312 partner->Print("Impossible: has no constraints"); 00313 Print("This vector has it as a partner"); 00314 continue; 00315 } 00316 if (prev_partner == NULL) { 00317 // This is the first partner, so common bottom. 00318 if (TabConstraint::CompatibleConstraints(bottom_constraints_, 00319 partner->bottom_constraints_)) 00320 TabConstraint::MergeConstraints(bottom_constraints_, 00321 partner->bottom_constraints_); 00322 } else { 00323 // We need prev top to be common with partner bottom. 00324 if (TabConstraint::CompatibleConstraints(prev_partner->top_constraints_, 00325 partner->bottom_constraints_)) 00326 TabConstraint::MergeConstraints(prev_partner->top_constraints_, 00327 partner->bottom_constraints_); 00328 } 00329 prev_partner = partner; 00330 if (it.at_last()) { 00331 // This is the last partner, so common top. 00332 if (TabConstraint::CompatibleConstraints(top_constraints_, 00333 partner->top_constraints_)) 00334 TabConstraint::MergeConstraints(top_constraints_, 00335 partner->top_constraints_); 00336 } 00337 } 00338 } 00339 00340 // Setup the constraints between this and its partner. 00341 void TabVector::SetupPartnerConstraints(TabVector* partner) { 00342 if (TabConstraint::CompatibleConstraints(bottom_constraints_, 00343 partner->bottom_constraints_)) 00344 TabConstraint::MergeConstraints(bottom_constraints_, 00345 partner->bottom_constraints_); 00346 if (TabConstraint::CompatibleConstraints(top_constraints_, 00347 partner->top_constraints_)) 00348 TabConstraint::MergeConstraints(top_constraints_, 00349 partner->top_constraints_); 00350 } 00351 00352 // Use the constraints to modify the top and bottom. 00353 void TabVector::ApplyConstraints() { 00354 if (top_constraints_ != NULL) 00355 TabConstraint::ApplyConstraints(top_constraints_); 00356 if (bottom_constraints_ != NULL) 00357 TabConstraint::ApplyConstraints(bottom_constraints_); 00358 } 00359 00360 // Merge close tab vectors of the same side that overlap. 00361 void TabVector::MergeSimilarTabVectors(const ICOORD& vertical, 00362 TabVector_LIST* vectors, 00363 BlobGrid* grid) { 00364 TabVector_IT it1(vectors); 00365 for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) { 00366 TabVector* v1 = it1.data(); 00367 TabVector_IT it2(it1); 00368 for (it2.forward(); !it2.at_first(); it2.forward()) { 00369 TabVector* v2 = it2.data(); 00370 if (v2->SimilarTo(vertical, *v1, grid)) { 00371 // Merge into the forward one, in case the combined vector now 00372 // overlaps one in between. 00373 if (textord_debug_tabfind) { 00374 v2->Print("Merging"); 00375 v1->Print("by deleting"); 00376 } 00377 v2->MergeWith(vertical, it1.extract()); 00378 if (textord_debug_tabfind) { 00379 v2->Print("Producing"); 00380 } 00381 ICOORD merged_vector = v2->endpt(); 00382 merged_vector -= v2->startpt(); 00383 if (textord_debug_tabfind && abs(merged_vector.x()) > 100) { 00384 v2->Print("Garbage result of merge?"); 00385 } 00386 break; 00387 } 00388 } 00389 } 00390 } 00391 00392 // Return true if this vector is the same side, overlaps, and close 00393 // enough to the other to be merged. 00394 bool TabVector::SimilarTo(const ICOORD& vertical, 00395 const TabVector& other, BlobGrid* grid) const { 00396 if ((IsRightTab() && other.IsRightTab()) || 00397 (IsLeftTab() && other.IsLeftTab())) { 00398 // If they don't overlap, at least in extensions, then there is no chance. 00399 if (ExtendedOverlap(other.extended_ymax_, other.extended_ymin_) < 0) 00400 return false; 00401 // A fast approximation to the scale factor of the sort_key_. 00402 int v_scale = abs(vertical.y()); 00403 if (v_scale == 0) 00404 v_scale = 1; 00405 // If they are close enough, then OK. 00406 if (sort_key_ + kSimilarVectorDist * v_scale >= other.sort_key_ && 00407 sort_key_ - kSimilarVectorDist * v_scale <= other.sort_key_) 00408 return true; 00409 // Ragged tabs get a bigger threshold. 00410 if (!IsRagged() || !other.IsRagged() || 00411 sort_key_ + kSimilarRaggedDist * v_scale < other.sort_key_ || 00412 sort_key_ - kSimilarRaggedDist * v_scale > other.sort_key_) 00413 return false; 00414 if (grid == NULL) { 00415 // There is nothing else to test! 00416 return true; 00417 } 00418 // If there is nothing in the rectangle between the vector that is going to 00419 // move, and the place it is moving to, then they can be merged. 00420 // Setup a vertical search for any blob. 00421 const TabVector* mover = (IsRightTab() && 00422 sort_key_ < other.sort_key_) ? this : &other; 00423 int top_y = mover->endpt_.y(); 00424 int bottom_y = mover->startpt_.y(); 00425 int left = MIN(mover->XAtY(top_y), mover->XAtY(bottom_y)); 00426 int right = MAX(mover->XAtY(top_y), mover->XAtY(bottom_y)); 00427 int shift = abs(sort_key_ - other.sort_key_) / v_scale; 00428 if (IsRightTab()) { 00429 right += shift; 00430 } else { 00431 left -= shift; 00432 } 00433 00434 GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> vsearch(grid); 00435 vsearch.StartVerticalSearch(left, right, top_y); 00436 BLOBNBOX* blob; 00437 while ((blob = vsearch.NextVerticalSearch(true)) != NULL) { 00438 TBOX box = blob->bounding_box(); 00439 if (box.top() > bottom_y) 00440 return true; // Nothing found. 00441 if (box.bottom() < top_y) 00442 continue; // Doesn't overlap. 00443 int left_at_box = XAtY(box.bottom()); 00444 int right_at_box = left_at_box; 00445 if (IsRightTab()) 00446 right_at_box += shift; 00447 else 00448 left_at_box -= shift; 00449 if (MIN(right_at_box, box.right()) > MAX(left_at_box, box.left())) 00450 return false; 00451 } 00452 return true; // Nothing found. 00453 } 00454 return false; 00455 } 00456 00457 // Eat the other TabVector into this and delete it. 00458 void TabVector::MergeWith(const ICOORD& vertical, TabVector* other) { 00459 extended_ymin_ = MIN(extended_ymin_, other->extended_ymin_); 00460 extended_ymax_ = MAX(extended_ymax_, other->extended_ymax_); 00461 if (other->IsRagged()) { 00462 alignment_ = other->alignment_; 00463 } 00464 // Merge sort the two lists of boxes. 00465 BLOBNBOX_C_IT it1(&boxes_); 00466 BLOBNBOX_C_IT it2(&other->boxes_); 00467 while (!it2.empty()) { 00468 BLOBNBOX* bbox2 = it2.extract(); 00469 it2.forward(); 00470 TBOX box2 = bbox2->bounding_box(); 00471 BLOBNBOX* bbox1 = it1.data(); 00472 TBOX box1 = bbox1->bounding_box(); 00473 while (box1.bottom() < box2.bottom() && !it1.at_last()) { 00474 it1.forward(); 00475 bbox1 = it1.data(); 00476 box1 = bbox1->bounding_box(); 00477 } 00478 if (box1.bottom() < box2.bottom()) { 00479 it1.add_to_end(bbox2); 00480 } else if (bbox1 != bbox2) { 00481 it1.add_before_stay_put(bbox2); 00482 } 00483 } 00484 Fit(vertical, true); 00485 other->Delete(this); 00486 } 00487 00488 // Add a new element to the list of partner TabVectors. 00489 // Partners must be added in order of increasing y coordinate of the text line 00490 // that makes them partners. 00491 // Groups of identical partners are merged into one. 00492 void TabVector::AddPartner(TabVector* partner) { 00493 if (IsSeparator() || partner->IsSeparator()) 00494 return; 00495 TabVector_C_IT it(&partners_); 00496 if (!it.empty()) { 00497 it.move_to_last(); 00498 if (it.data() == partner) 00499 return; 00500 } 00501 it.add_after_then_move(partner); 00502 } 00503 00504 // Return true if other is a partner of this. 00505 bool TabVector::IsAPartner(const TabVector* other) { 00506 TabVector_C_IT it(&partners_); 00507 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00508 if (it.data() == other) 00509 return true; 00510 } 00511 return false; 00512 } 00513 00514 // These names must be synced with the TabAlignment enum in tabvector.h. 00515 const char* kAlignmentNames[] = { 00516 "Left Aligned", 00517 "Left Ragged", 00518 "Center", 00519 "Right Aligned", 00520 "Right Ragged", 00521 "Separator" 00522 }; 00523 00524 // Print basic information about this tab vector. 00525 void TabVector::Print(const char* prefix) { 00526 if (this == NULL) { 00527 tprintf("%s <null>\n", prefix); 00528 } else { 00529 tprintf("%s %s (%d,%d)->(%d,%d) w=%d s=%d, sort key=%d, boxes=%d," 00530 " partners=%d\n", 00531 prefix, kAlignmentNames[alignment_], 00532 startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y(), 00533 mean_width_, percent_score_, sort_key_, 00534 boxes_.length(), partners_.length()); 00535 } 00536 } 00537 00538 // Print basic information about this tab vector and every box in it. 00539 void TabVector::Debug(const char* prefix) { 00540 Print(prefix); 00541 BLOBNBOX_C_IT it(&boxes_); 00542 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00543 BLOBNBOX* bbox = it.data(); 00544 const TBOX& box = bbox->bounding_box(); 00545 tprintf("Box at (%d,%d)->(%d,%d)\n", 00546 box.left(), box.bottom(), box.right(), box.top()); 00547 } 00548 } 00549 00550 // Draw this tabvector in place in the given window. 00551 void TabVector::Display(ScrollView* tab_win) { 00552 #ifndef GRAPHICS_DISABLED 00553 if (textord_debug_printable) 00554 tab_win->Pen(ScrollView::BLUE); 00555 else if (alignment_ == TA_LEFT_ALIGNED) 00556 tab_win->Pen(ScrollView::LIME_GREEN); 00557 else if (alignment_ == TA_LEFT_RAGGED) 00558 tab_win->Pen(ScrollView::DARK_GREEN); 00559 else if (alignment_ == TA_RIGHT_ALIGNED) 00560 tab_win->Pen(ScrollView::PINK); 00561 else if (alignment_ == TA_RIGHT_RAGGED) 00562 tab_win->Pen(ScrollView::CORAL); 00563 else 00564 tab_win->Pen(ScrollView::WHITE); 00565 tab_win->Line(startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y()); 00566 tab_win->Pen(ScrollView::GREY); 00567 tab_win->Line(startpt_.x(), startpt_.y(), startpt_.x(), extended_ymin_); 00568 tab_win->Line(endpt_.x(), extended_ymax_, endpt_.x(), endpt_.y()); 00569 char score_buf[64]; 00570 snprintf(score_buf, sizeof(score_buf), "%d", percent_score_); 00571 tab_win->TextAttributes("Times", 50, false, false, false); 00572 tab_win->Text(startpt_.x(), startpt_.y(), score_buf); 00573 #endif 00574 } 00575 00576 // Refit the line and/or re-evaluate the vector if the dirty flags are set. 00577 void TabVector::FitAndEvaluateIfNeeded(const ICOORD& vertical, 00578 TabFind* finder) { 00579 if (needs_refit_) 00580 Fit(vertical, true); 00581 if (needs_evaluation_) 00582 Evaluate(vertical, finder); 00583 } 00584 00585 // Evaluate the vector in terms of coverage of its length by good-looking 00586 // box edges. A good looking box is one where its nearest neighbour on the 00587 // inside is nearer than half the distance its nearest neighbour on the 00588 // outside of the putative column. Bad boxes are removed from the line. 00589 // A second pass then further filters boxes by requiring that the gutter 00590 // width be a minimum fraction of the mean gutter along the line. 00591 void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) { 00592 bool debug = false; 00593 needs_evaluation_ = false; 00594 int length = endpt_.y() - startpt_.y(); 00595 if (length == 0 || boxes_.empty()) { 00596 percent_score_ = 0; 00597 Print("Zero length in evaluate"); 00598 return; 00599 } 00600 // Compute the mean box height. 00601 BLOBNBOX_C_IT it(&boxes_); 00602 int mean_height = 0; 00603 int height_count = 0; 00604 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00605 BLOBNBOX* bbox = it.data(); 00606 const TBOX& box = bbox->bounding_box(); 00607 int height = box.height(); 00608 mean_height += height; 00609 ++height_count; 00610 } 00611 if (height_count > 0) mean_height /= height_count; 00612 int max_gutter = kGutterMultiple * mean_height; 00613 if (IsRagged()) { 00614 // Ragged edges face a tougher test in that the gap must always be within 00615 // the height of the blob. 00616 max_gutter = kGutterToNeighbourRatio * mean_height; 00617 } 00618 00619 STATS gutters(0, max_gutter + 1); 00620 // Evaluate the boxes for their goodness, calculating the coverage as we go. 00621 // Remove boxes that are not good and shorten the list to the first and 00622 // last good boxes. 00623 int num_deleted_boxes = 0; 00624 bool text_on_image = false; 00625 int good_length = 0; 00626 const TBOX* prev_good_box = NULL; 00627 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00628 BLOBNBOX* bbox = it.data(); 00629 const TBOX& box = bbox->bounding_box(); 00630 int mid_y = (box.top() + box.bottom()) / 2; 00631 if (TabFind::WithinTestRegion(2, XAtY(box.bottom()), box.bottom())) { 00632 if (!debug) { 00633 tprintf("After already deleting %d boxes, ", num_deleted_boxes); 00634 Print("Starting evaluation"); 00635 } 00636 debug = true; 00637 } 00638 // A good box is one where the nearest neighbour on the inside is closer 00639 // than half the distance to the nearest neighbour on the outside 00640 // (of the putative column). 00641 bool left = IsLeftTab(); 00642 int tab_x = XAtY(mid_y); 00643 int gutter_width; 00644 int neighbour_gap; 00645 finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left, 00646 bbox, &gutter_width, &neighbour_gap); 00647 if (debug) { 00648 tprintf("Box (%d,%d)->(%d,%d) has gutter %d, ndist %d\n", 00649 box.left(), box.bottom(), box.right(), box.top(), 00650 gutter_width, neighbour_gap); 00651 } 00652 // Now we can make the test. 00653 if (neighbour_gap * kGutterToNeighbourRatio <= gutter_width) { 00654 // A good box contributes its height to the good_length. 00655 good_length += box.top() - box.bottom(); 00656 gutters.add(gutter_width, 1); 00657 // Two good boxes together contribute the gap between them 00658 // to the good_length as well, as long as the gap is not 00659 // too big. 00660 if (prev_good_box != NULL) { 00661 int vertical_gap = box.bottom() - prev_good_box->top(); 00662 double size1 = sqrt(static_cast<double>(prev_good_box->area())); 00663 double size2 = sqrt(static_cast<double>(box.area())); 00664 if (vertical_gap < kMaxFillinMultiple * MIN(size1, size2)) 00665 good_length += vertical_gap; 00666 if (debug) { 00667 tprintf("Box and prev good, gap=%d, target %g, goodlength=%d\n", 00668 vertical_gap, kMaxFillinMultiple * MIN(size1, size2), 00669 good_length); 00670 } 00671 } else { 00672 // Adjust the start to the first good box. 00673 SetYStart(box.bottom()); 00674 } 00675 prev_good_box = &box; 00676 if (bbox->flow() == BTFT_TEXT_ON_IMAGE) 00677 text_on_image = true; 00678 } else { 00679 // Get rid of boxes that are not good. 00680 if (debug) { 00681 tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, ndist %d\n", 00682 box.left(), box.bottom(), box.right(), box.top(), 00683 gutter_width, neighbour_gap); 00684 } 00685 it.extract(); 00686 ++num_deleted_boxes; 00687 } 00688 } 00689 if (debug) { 00690 Print("Evaluating:"); 00691 } 00692 // If there are any good boxes, do it again, except this time get rid of 00693 // boxes that have a gutter that is a small fraction of the mean gutter. 00694 // This filters out ends that run into a coincidental gap in the text. 00695 int search_top = endpt_.y(); 00696 int search_bottom = startpt_.y(); 00697 int median_gutter = IntCastRounded(gutters.median()); 00698 if (gutters.get_total() > 0) { 00699 prev_good_box = NULL; 00700 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00701 BLOBNBOX* bbox = it.data(); 00702 const TBOX& box = bbox->bounding_box(); 00703 int mid_y = (box.top() + box.bottom()) / 2; 00704 // A good box is one where the gutter width is at least some constant 00705 // fraction of the mean gutter width. 00706 bool left = IsLeftTab(); 00707 int tab_x = XAtY(mid_y); 00708 int max_gutter = kGutterMultiple * mean_height; 00709 if (IsRagged()) { 00710 // Ragged edges face a tougher test in that the gap must always be 00711 // within the height of the blob. 00712 max_gutter = kGutterToNeighbourRatio * mean_height; 00713 } 00714 int gutter_width; 00715 int neighbour_gap; 00716 finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left, 00717 bbox, &gutter_width, &neighbour_gap); 00718 // Now we can make the test. 00719 if (gutter_width >= median_gutter * kMinGutterFraction) { 00720 if (prev_good_box == NULL) { 00721 // Adjust the start to the first good box. 00722 SetYStart(box.bottom()); 00723 search_bottom = box.top(); 00724 } 00725 prev_good_box = &box; 00726 search_top = box.bottom(); 00727 } else { 00728 // Get rid of boxes that are not good. 00729 if (debug) { 00730 tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, mean gutter %d\n", 00731 box.left(), box.bottom(), box.right(), box.top(), 00732 gutter_width, median_gutter); 00733 } 00734 it.extract(); 00735 ++num_deleted_boxes = true; 00736 } 00737 } 00738 } 00739 // If there has been a good box, adjust the end. 00740 if (prev_good_box != NULL) { 00741 SetYEnd(prev_good_box->top()); 00742 // Compute the percentage of the vector that is occupied by good boxes. 00743 int length = endpt_.y() - startpt_.y(); 00744 percent_score_ = 100 * good_length / length; 00745 if (num_deleted_boxes > 0) { 00746 needs_refit_ = true; 00747 FitAndEvaluateIfNeeded(vertical, finder); 00748 if (boxes_.empty()) 00749 return; 00750 } 00751 // Test the gutter over the whole vector, instead of just at the boxes. 00752 int required_shift; 00753 if (search_bottom > search_top) { 00754 search_bottom = startpt_.y(); 00755 search_top = endpt_.y(); 00756 } 00757 double min_gutter_width = kLineCountReciprocal / boxes_.length(); 00758 min_gutter_width += IsRagged() ? kMinRaggedGutter : kMinAlignedGutter; 00759 min_gutter_width *= mean_height; 00760 int max_gutter_width = IntCastRounded(min_gutter_width) + 1; 00761 if (median_gutter > max_gutter_width) 00762 max_gutter_width = median_gutter; 00763 int gutter_width = finder->GutterWidth(search_bottom, search_top, *this, 00764 text_on_image, max_gutter_width, 00765 &required_shift); 00766 if (gutter_width < min_gutter_width) { 00767 if (debug) { 00768 tprintf("Rejecting bad tab Vector with %d gutter vs %g min\n", 00769 gutter_width, min_gutter_width); 00770 } 00771 boxes_.shallow_clear(); 00772 percent_score_ = 0; 00773 } else if (debug) { 00774 tprintf("Final gutter %d, vs limit of %g, required shift = %d\n", 00775 gutter_width, min_gutter_width, required_shift); 00776 } 00777 } else { 00778 // There are no good boxes left, so score is 0. 00779 percent_score_ = 0; 00780 } 00781 00782 if (debug) { 00783 Print("Evaluation complete:"); 00784 } 00785 } 00786 00787 // (Re)Fit a line to the stored points. Returns false if the line 00788 // is degenerate. Althougth the TabVector code mostly doesn't care about the 00789 // direction of lines, XAtY would give silly results for a horizontal line. 00790 // The class is mostly aimed at use for vertical lines representing 00791 // horizontal tab stops. 00792 bool TabVector::Fit(ICOORD vertical, bool force_parallel) { 00793 needs_refit_ = false; 00794 if (boxes_.empty()) { 00795 // Don't refit something with no boxes, as that only happens 00796 // in Evaluate, and we don't want to end up with a zero vector. 00797 if (!force_parallel) 00798 return false; 00799 // If we are forcing parallel, then we just need to set the sort_key_. 00800 ICOORD midpt = startpt_; 00801 midpt += endpt_; 00802 midpt /= 2; 00803 sort_key_ = SortKey(vertical, midpt.x(), midpt.y()); 00804 return startpt_.y() != endpt_.y(); 00805 } 00806 if (!force_parallel && !IsRagged()) { 00807 // Use a fitted line as the vertical. 00808 DetLineFit linepoints; 00809 BLOBNBOX_C_IT it(&boxes_); 00810 // Fit a line to all the boxes in the list. 00811 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00812 BLOBNBOX* bbox = it.data(); 00813 TBOX box = bbox->bounding_box(); 00814 int x1 = IsRightTab() ? box.right() : box.left(); 00815 ICOORD boxpt(x1, box.bottom()); 00816 linepoints.Add(boxpt); 00817 if (it.at_last()) { 00818 ICOORD top_pt(x1, box.top()); 00819 linepoints.Add(top_pt); 00820 } 00821 } 00822 linepoints.Fit(&startpt_, &endpt_); 00823 if (startpt_.y() != endpt_.y()) { 00824 vertical = endpt_; 00825 vertical -= startpt_; 00826 } 00827 } 00828 int start_y = startpt_.y(); 00829 int end_y = endpt_.y(); 00830 sort_key_ = IsLeftTab() ? MAX_INT32 : -MAX_INT32; 00831 BLOBNBOX_C_IT it(&boxes_); 00832 // Choose a line parallel to the vertical such that all boxes are on the 00833 // correct side of it. 00834 mean_width_ = 0; 00835 int width_count = 0; 00836 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00837 BLOBNBOX* bbox = it.data(); 00838 TBOX box = bbox->bounding_box(); 00839 mean_width_ += box.width(); 00840 ++width_count; 00841 int x1 = IsRightTab() ? box.right() : box.left(); 00842 // Test both the bottom and the top, as one will be more extreme, depending 00843 // on the direction of skew. 00844 int bottom_y = box.bottom(); 00845 int top_y = box.top(); 00846 int key = SortKey(vertical, x1, bottom_y); 00847 if (IsLeftTab() == (key < sort_key_)) { 00848 sort_key_ = key; 00849 startpt_ = ICOORD(x1, bottom_y); 00850 } 00851 key = SortKey(vertical, x1, top_y); 00852 if (IsLeftTab() == (key < sort_key_)) { 00853 sort_key_ = key; 00854 startpt_ = ICOORD(x1, top_y); 00855 } 00856 if (it.at_first()) 00857 start_y = bottom_y; 00858 if (it.at_last()) 00859 end_y = top_y; 00860 } 00861 if (width_count > 0) { 00862 mean_width_ = (mean_width_ + width_count - 1) / width_count; 00863 } 00864 endpt_ = startpt_ + vertical; 00865 needs_evaluation_ = true; 00866 if (start_y != end_y) { 00867 // Set the ends of the vector to fully include the first and last blobs. 00868 startpt_.set_x(XAtY(vertical, sort_key_, start_y)); 00869 startpt_.set_y(start_y); 00870 endpt_.set_x(XAtY(vertical, sort_key_, end_y)); 00871 endpt_.set_y(end_y); 00872 return true; 00873 } 00874 return false; 00875 } 00876 00877 // Returns the singleton partner if there is one, or NULL otherwise. 00878 TabVector* TabVector::GetSinglePartner() { 00879 if (!partners_.singleton()) 00880 return NULL; 00881 TabVector_C_IT partner_it(&partners_); 00882 TabVector* partner = partner_it.data(); 00883 return partner; 00884 } 00885 00886 // Return the partner of this TabVector if the vector qualifies as 00887 // being a vertical text line, otherwise NULL. 00888 TabVector* TabVector::VerticalTextlinePartner() { 00889 if (!partners_.singleton()) 00890 return NULL; 00891 TabVector_C_IT partner_it(&partners_); 00892 TabVector* partner = partner_it.data(); 00893 BLOBNBOX_C_IT box_it1(&boxes_); 00894 BLOBNBOX_C_IT box_it2(&partner->boxes_); 00895 // Count how many boxes are also in the other list. 00896 // At the same time, gather the mean width and median vertical gap. 00897 if (textord_debug_tabfind > 1) { 00898 Print("Testing for vertical text"); 00899 partner->Print(" partner"); 00900 } 00901 int num_matched = 0; 00902 int num_unmatched = 0; 00903 int total_widths = 0; 00904 int width = startpt().x() - partner->startpt().x(); 00905 if (width < 0) 00906 width = -width; 00907 STATS gaps(0, width * 2); 00908 BLOBNBOX* prev_bbox = NULL; 00909 box_it2.mark_cycle_pt(); 00910 for (box_it1.mark_cycle_pt(); !box_it1.cycled_list(); box_it1.forward()) { 00911 BLOBNBOX* bbox = box_it1.data(); 00912 TBOX box = bbox->bounding_box(); 00913 if (prev_bbox != NULL) { 00914 gaps.add(box.bottom() - prev_bbox->bounding_box().top(), 1); 00915 } 00916 while (!box_it2.cycled_list() && box_it2.data() != bbox && 00917 box_it2.data()->bounding_box().bottom() < box.bottom()) { 00918 box_it2.forward(); 00919 } 00920 if (!box_it2.cycled_list() && box_it2.data() == bbox && 00921 bbox->region_type() >= BRT_UNKNOWN && 00922 (prev_bbox == NULL || prev_bbox->region_type() >= BRT_UNKNOWN)) 00923 ++num_matched; 00924 else 00925 ++num_unmatched; 00926 total_widths += box.width(); 00927 prev_bbox = bbox; 00928 } 00929 double avg_width = total_widths * 1.0 / (num_unmatched + num_matched); 00930 double max_gap = textord_tabvector_vertical_gap_fraction * avg_width; 00931 int min_box_match = static_cast<int>((num_matched + num_unmatched) * 00932 textord_tabvector_vertical_box_ratio); 00933 bool is_vertical = (gaps.get_total() > 0 && 00934 num_matched >= min_box_match && 00935 gaps.median() <= max_gap); 00936 if (textord_debug_tabfind > 1) { 00937 tprintf("gaps=%d, matched=%d, unmatched=%d, min_match=%d " 00938 "median gap=%.2f, width=%.2f max_gap=%.2f Vertical=%s\n", 00939 gaps.get_total(), num_matched, num_unmatched, min_box_match, 00940 gaps.median(), avg_width, max_gap, is_vertical?"Yes":"No"); 00941 } 00942 return (is_vertical) ? partner : NULL; 00943 } 00944 00945 // The constructor is private. 00946 TabVector::TabVector(int extended_ymin, int extended_ymax, 00947 TabAlignment alignment, BLOBNBOX_CLIST* boxes) 00948 : extended_ymin_(extended_ymin), extended_ymax_(extended_ymax), 00949 sort_key_(0), percent_score_(0), mean_width_(0), 00950 needs_refit_(true), needs_evaluation_(true), alignment_(alignment), 00951 top_constraints_(NULL), bottom_constraints_(NULL) { 00952 BLOBNBOX_C_IT it(&boxes_); 00953 it.add_list_after(boxes); 00954 } 00955 00956 // Delete this, but first, repoint all the partners to point to 00957 // replacement. If replacement is NULL, then partner relationships 00958 // are removed. 00959 void TabVector::Delete(TabVector* replacement) { 00960 TabVector_C_IT it(&partners_); 00961 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00962 TabVector* partner = it.data(); 00963 TabVector_C_IT p_it(&partner->partners_); 00964 // If partner already has replacement in its list, then make 00965 // replacement null, and just remove this TabVector when we find it. 00966 TabVector* partner_replacement = replacement; 00967 for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) { 00968 TabVector* p_partner = p_it.data(); 00969 if (p_partner == partner_replacement) { 00970 partner_replacement = NULL; 00971 break; 00972 } 00973 } 00974 // Remove all references to this, and replace with replacement if not NULL. 00975 for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) { 00976 TabVector* p_partner = p_it.data(); 00977 if (p_partner == this) { 00978 p_it.extract(); 00979 if (partner_replacement != NULL) 00980 p_it.add_before_stay_put(partner_replacement); 00981 } 00982 } 00983 if (partner_replacement != NULL) { 00984 partner_replacement->AddPartner(partner); 00985 } 00986 } 00987 delete this; 00988 } 00989 00990 00991 } // namespace tesseract.