tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/textord/tabvector.cpp
Go to the documentation of this file.
00001 
00002 // File:        tabvector.cpp
00003 // Description: Class to hold a near-vertical vector representing a tab-stop.
00004 // Author:      Ray Smith
00005 // Created:     Thu Apr 10 16:28:01 PST 2008
00006 //
00007 // (C) Copyright 2008, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023 
00024 #ifdef HAVE_CONFIG_H
00025 #include "config_auto.h"
00026 #endif
00027 
00028 #include "tabvector.h"
00029 #include "blobbox.h"
00030 #include "colfind.h"
00031 #include "colpartitionset.h"
00032 #include "detlinefit.h"
00033 #include "statistc.h"
00034 
00035 namespace tesseract {
00036 
// Multiple of the mean box height used as the maximum gutter width that is
// searched when evaluating a non-ragged tab vector.
const int kGutterMultiple = 4;
// Multiple of neighbour gap that we expect the gutter gap to be at minimum.
// The same factor is also applied to the mean box height to get the maximum
// gutter width that is searched when evaluating a ragged tab vector.
const int kGutterToNeighbourRatio = 3;
// Pixel distance for tab vectors to be considered the same. It is scaled by
// |vertical.y()| before being compared against sort key differences.
const int kSimilarVectorDist = 10;
// Pixel distance for ragged tab vectors to be considered the same if there
// is nothing in the overlap box. Scaled in the same way as kSimilarVectorDist.
const int kSimilarRaggedDist = 50;
// Max multiple of box size to allow filling in between blobs when evaluating.
// The size used is the smaller of the square roots of the two box areas.
const int kMaxFillinMultiple = 11;
// Min fraction of the median gutter size to allow a gutter on a good tab blob.
const double kMinGutterFraction = 0.5;
// Multiple of 1/n lines as a minimum gutter in evaluation, where n is the
// number of boxes on the vector. The result is in units of mean box height.
const double kLineCountReciprocal = 4.0;
// Constant add-on for minimum gutter for aligned tabs, in units of the mean
// box height.
const double kMinAlignedGutter = 0.25;
// Constant add-on for minimum gutter for ragged tabs, in units of the mean
// box height.
const double kMinRaggedGutter = 1.5;

double_VAR(textord_tabvector_vertical_gap_fraction, 0.5,
  "max fraction of mean blob width allowed for vertical gaps in vertical text");

double_VAR(textord_tabvector_vertical_box_ratio, 0.5,
  "Fraction of box matches required to declare a line vertical");

ELISTIZE(TabConstraint)
00064 
00065 // Create a constraint for the top or bottom of this TabVector.
00066 void TabConstraint::CreateConstraint(TabVector* vector, bool is_top) {
00067   TabConstraint* constraint = new TabConstraint(vector, is_top);
00068   TabConstraint_LIST* constraints = new TabConstraint_LIST;
00069   TabConstraint_IT it(constraints);
00070   it.add_to_end(constraint);
00071   if (is_top)
00072     vector->set_top_constraints(constraints);
00073   else
00074     vector->set_bottom_constraints(constraints);
00075 }
00076 
00077 // Test to see if the constraints are compatible enough to merge.
00078 bool TabConstraint::CompatibleConstraints(TabConstraint_LIST* list1,
00079                                           TabConstraint_LIST* list2) {
00080   if (list1 == list2)
00081     return false;
00082   int y_min = -MAX_INT32;
00083   int y_max = MAX_INT32;
00084   if (textord_debug_tabfind > 3)
00085     tprintf("Testing constraint compatibility\n");
00086   GetConstraints(list1, &y_min, &y_max);
00087   GetConstraints(list2, &y_min, &y_max);
00088   if (textord_debug_tabfind > 3)
00089     tprintf("Resulting range = [%d,%d]\n", y_min, y_max);
00090   return y_max >= y_min;
00091 }
00092 
00093 // Merge the lists of constraints and update the TabVector pointers.
00094 // The second list is deleted.
00095 void TabConstraint::MergeConstraints(TabConstraint_LIST* list1,
00096                                      TabConstraint_LIST* list2) {
00097   if (list1 == list2)
00098     return;
00099   TabConstraint_IT it(list2);
00100   if (textord_debug_tabfind > 3)
00101     tprintf("Merging constraints\n");
00102   // The vectors of all constraints on list2 are now going to be on list1.
00103   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00104     TabConstraint* constraint = it.data();
00105     if (textord_debug_tabfind> 3)
00106       constraint->vector_->Print("Merge");
00107     if (constraint->is_top_)
00108       constraint->vector_->set_top_constraints(list1);
00109     else
00110       constraint->vector_->set_bottom_constraints(list1);
00111   }
00112   it = list1;
00113   it.add_list_before(list2);
00114   delete list2;
00115 }
00116 
00117 // Set all the tops and bottoms as appropriate to a mean of the
00118 // constrained range. Delete all the constraints and list.
00119 void TabConstraint::ApplyConstraints(TabConstraint_LIST* constraints) {
00120   int y_min = -MAX_INT32;
00121   int y_max = MAX_INT32;
00122   GetConstraints(constraints, &y_min, &y_max);
00123   int y = (y_min + y_max) / 2;
00124   TabConstraint_IT it(constraints);
00125   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00126     TabConstraint* constraint = it.data();
00127     TabVector* v = constraint->vector_;
00128     if (constraint->is_top_) {
00129       v->SetYEnd(y);
00130       v->set_top_constraints(NULL);
00131     } else {
00132       v->SetYStart(y);
00133       v->set_bottom_constraints(NULL);
00134     }
00135   }
00136   delete constraints;
00137 }
00138 
00139 TabConstraint::TabConstraint(TabVector* vector, bool is_top)
00140   : vector_(vector), is_top_(is_top) {
00141   if (is_top) {
00142     y_min_ = vector->endpt().y();
00143     y_max_ = vector->extended_ymax();
00144   } else {
00145     y_max_ = vector->startpt().y();
00146     y_min_ = vector->extended_ymin();
00147   }
00148 }
00149 
00150 // Get the max of the mins and the min of the maxes.
00151 void TabConstraint::GetConstraints(TabConstraint_LIST* constraints,
00152                                    int* y_min, int* y_max) {
00153   TabConstraint_IT it(constraints);
00154   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00155     TabConstraint* constraint = it.data();
00156     if (textord_debug_tabfind > 3) {
00157       tprintf("Constraint is [%d,%d]", constraint->y_min_, constraint->y_max_);
00158       constraint->vector_->Print(" for");
00159     }
00160     *y_min = MAX(*y_min, constraint->y_min_);
00161     *y_max = MIN(*y_max, constraint->y_max_);
00162   }
00163 }
00164 
00165 ELIST2IZE(TabVector)
00166 CLISTIZE(TabVector)
00167 
00168 // The constructor is private. See the bottom of the file...
00169 
00170 TabVector::~TabVector() {
00171 }
00172 
00173 
00174 // Public factory to build a TabVector from a list of boxes.
00175 // The TabVector will be of the given alignment type.
00176 // The input vertical vector is used in fitting, and the output
00177 // vertical_x, vertical_y have the resulting line vector added to them
00178 // if the alignment is not ragged.
00179 // The extended_start_y and extended_end_y are the maximum possible
00180 // extension to the line segment that can be used to align with others.
00181 // The input CLIST of BLOBNBOX good_points is consumed and taken over.
00182 TabVector* TabVector::FitVector(TabAlignment alignment, ICOORD vertical,
00183                                 int  extended_start_y, int extended_end_y,
00184                                 BLOBNBOX_CLIST* good_points,
00185                                 int* vertical_x, int* vertical_y) {
00186   TabVector* vector = new TabVector(extended_start_y, extended_end_y,
00187                                     alignment, good_points);
00188   if (!vector->Fit(vertical, false)) {
00189     delete vector;
00190     return NULL;
00191   }
00192   if (!vector->IsRagged()) {
00193     vertical = vector->endpt_ - vector->startpt_;
00194     int weight = vector->BoxCount();
00195     *vertical_x += vertical.x() * weight;
00196     *vertical_y += vertical.y() * weight;
00197   }
00198   return vector;
00199 }
00200 
00201 // Build a ragged TabVector by copying another's direction, shifting it
00202 // to match the given blob, and making its initial extent the height
00203 // of the blob, but its extended bounds from the bounds of the original.
00204 TabVector::TabVector(const TabVector& src, TabAlignment alignment,
00205                      const ICOORD& vertical_skew, BLOBNBOX* blob)
00206   : extended_ymin_(src.extended_ymin_), extended_ymax_(src.extended_ymax_),
00207     sort_key_(0), percent_score_(0), mean_width_(0),
00208     needs_refit_(true), needs_evaluation_(true), intersects_other_lines_(false),
00209     alignment_(alignment),
00210     top_constraints_(NULL), bottom_constraints_(NULL) {
00211   BLOBNBOX_C_IT it(&boxes_);
00212   it.add_to_end(blob);
00213   TBOX box = blob->bounding_box();
00214   if (IsLeftTab()) {
00215     startpt_ = box.botleft();
00216     endpt_ = box.topleft();
00217   } else {
00218     startpt_ = box.botright();
00219     endpt_ = box.topright();
00220   }
00221   sort_key_ = SortKey(vertical_skew,
00222                       (startpt_.x() + endpt_.x()) / 2,
00223                       (startpt_.y() + endpt_.y()) / 2);
00224   if (textord_debug_tabfind > 3)
00225     Print("Constructed a new tab vector:");
00226 }
00227 
00228 // Copies basic attributes of a tab vector for simple operations.
00229 // Copies things such startpt, endpt, range.
00230 // Does not copy things such as partners, boxes, or constraints.
00231 // This is useful if you only need vector information for processing, such
00232 // as in the table detection code.
00233 TabVector* TabVector::ShallowCopy() const {
00234   TabVector* copy = new TabVector();
00235   copy->startpt_ = startpt_;
00236   copy->endpt_ = endpt_;
00237   copy->alignment_ = alignment_;
00238   copy->extended_ymax_ = extended_ymax_;
00239   copy->extended_ymin_ = extended_ymin_;
00240   copy->intersects_other_lines_ = intersects_other_lines_;
00241   return copy;
00242 }
00243 
00244 // Extend this vector to include the supplied blob if it doesn't
00245 // already have it.
00246 void TabVector::ExtendToBox(BLOBNBOX* new_blob) {
00247   TBOX new_box = new_blob->bounding_box();
00248   BLOBNBOX_C_IT it(&boxes_);
00249   if (!it.empty()) {
00250     BLOBNBOX* blob = it.data();
00251     TBOX box = blob->bounding_box();
00252     while (!it.at_last() && box.top() <= new_box.top()) {
00253       if (blob == new_blob)
00254         return;  // We have it already.
00255       it.forward();
00256       blob = it.data();
00257       box = blob->bounding_box();
00258     }
00259     if (box.top() >= new_box.top()) {
00260       it.add_before_stay_put(new_blob);
00261       needs_refit_ = true;
00262       return;
00263     }
00264   }
00265   needs_refit_ = true;
00266   it.add_after_stay_put(new_blob);
00267 }
00268 
00269 // Set the ycoord of the start and move the xcoord to match.
00270 void TabVector::SetYStart(int start_y) {
00271   startpt_.set_x(XAtY(start_y));
00272   startpt_.set_y(start_y);
00273 }
00274 // Set the ycoord of the end and move the xcoord to match.
00275 void TabVector::SetYEnd(int end_y) {
00276   endpt_.set_x(XAtY(end_y));
00277   endpt_.set_y(end_y);
00278 }
00279 
00280 // Rotate the ends by the given vector. Auto flip start and end if needed.
00281 void TabVector::Rotate(const FCOORD& rotation) {
00282   startpt_.rotate(rotation);
00283   endpt_.rotate(rotation);
00284   int dx = endpt_.x() - startpt_.x();
00285   int dy = endpt_.y() - startpt_.y();
00286   if ((dy < 0 && abs(dy) > abs(dx)) || (dx < 0 && abs(dx) > abs(dy))) {
00287     // Need to flip start/end.
00288     ICOORD tmp = startpt_;
00289     startpt_ = endpt_;
00290     endpt_ = tmp;
00291   }
00292 }
00293 
00294 // Setup the initial constraints, being the limits of
00295 // the vector and the extended ends.
00296 void TabVector::SetupConstraints() {
00297   TabConstraint::CreateConstraint(this, false);
00298   TabConstraint::CreateConstraint(this, true);
00299 }
00300 
00301 // Setup the constraints between the partners of this TabVector.
00302 void TabVector::SetupPartnerConstraints() {
00303   // With the first and last partner, we want a common bottom and top,
00304   // respectively, and for each change of partner, we want a common
00305   // top of first with bottom of next.
00306   TabVector_C_IT it(&partners_);
00307   TabVector* prev_partner = NULL;
00308   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00309     TabVector* partner = it.data();
00310     if (partner->top_constraints_ == NULL ||
00311         partner->bottom_constraints_ == NULL) {
00312       partner->Print("Impossible: has no constraints");
00313       Print("This vector has it as a partner");
00314       continue;
00315     }
00316     if (prev_partner == NULL) {
00317       // This is the first partner, so common bottom.
00318       if (TabConstraint::CompatibleConstraints(bottom_constraints_,
00319                                                partner->bottom_constraints_))
00320         TabConstraint::MergeConstraints(bottom_constraints_,
00321                                         partner->bottom_constraints_);
00322     } else {
00323       // We need prev top to be common with partner bottom.
00324       if (TabConstraint::CompatibleConstraints(prev_partner->top_constraints_,
00325                                                partner->bottom_constraints_))
00326         TabConstraint::MergeConstraints(prev_partner->top_constraints_,
00327                                         partner->bottom_constraints_);
00328     }
00329     prev_partner = partner;
00330     if (it.at_last()) {
00331       // This is the last partner, so common top.
00332       if (TabConstraint::CompatibleConstraints(top_constraints_,
00333                                                partner->top_constraints_))
00334         TabConstraint::MergeConstraints(top_constraints_,
00335                                         partner->top_constraints_);
00336     }
00337   }
00338 }
00339 
00340 // Setup the constraints between this and its partner.
00341 void TabVector::SetupPartnerConstraints(TabVector* partner) {
00342   if (TabConstraint::CompatibleConstraints(bottom_constraints_,
00343                                            partner->bottom_constraints_))
00344     TabConstraint::MergeConstraints(bottom_constraints_,
00345                                     partner->bottom_constraints_);
00346   if (TabConstraint::CompatibleConstraints(top_constraints_,
00347                                            partner->top_constraints_))
00348     TabConstraint::MergeConstraints(top_constraints_,
00349                                     partner->top_constraints_);
00350 }
00351 
00352 // Use the constraints to modify the top and bottom.
00353 void TabVector::ApplyConstraints() {
00354   if (top_constraints_ != NULL)
00355     TabConstraint::ApplyConstraints(top_constraints_);
00356   if (bottom_constraints_ != NULL)
00357     TabConstraint::ApplyConstraints(bottom_constraints_);
00358 }
00359 
00360 // Merge close tab vectors of the same side that overlap.
00361 void TabVector::MergeSimilarTabVectors(const ICOORD& vertical,
00362                                        TabVector_LIST* vectors,
00363                                        BlobGrid* grid) {
00364   TabVector_IT it1(vectors);
00365   for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
00366     TabVector* v1 = it1.data();
00367     TabVector_IT it2(it1);
00368     for (it2.forward(); !it2.at_first(); it2.forward()) {
00369       TabVector* v2 = it2.data();
00370       if (v2->SimilarTo(vertical, *v1, grid)) {
00371         // Merge into the forward one, in case the combined vector now
00372         // overlaps one in between.
00373         if (textord_debug_tabfind) {
00374           v2->Print("Merging");
00375           v1->Print("by deleting");
00376         }
00377         v2->MergeWith(vertical, it1.extract());
00378         if (textord_debug_tabfind) {
00379           v2->Print("Producing");
00380         }
00381         ICOORD merged_vector = v2->endpt();
00382         merged_vector -= v2->startpt();
00383         if (textord_debug_tabfind && abs(merged_vector.x()) > 100) {
00384           v2->Print("Garbage result of merge?");
00385         }
00386         break;
00387       }
00388     }
00389   }
00390 }
00391 
00392 // Return true if this vector is the same side, overlaps, and close
00393 // enough to the other to be merged.
00394 bool TabVector::SimilarTo(const ICOORD& vertical,
00395                           const TabVector& other, BlobGrid* grid) const {
00396   if ((IsRightTab() && other.IsRightTab()) ||
00397       (IsLeftTab() && other.IsLeftTab())) {
00398     // If they don't overlap, at least in extensions, then there is no chance.
00399     if (ExtendedOverlap(other.extended_ymax_, other.extended_ymin_) < 0)
00400       return false;
00401     // A fast approximation to the scale factor of the sort_key_.
00402     int v_scale = abs(vertical.y());
00403     if (v_scale == 0)
00404       v_scale = 1;
00405     // If they are close enough, then OK.
00406     if (sort_key_ + kSimilarVectorDist * v_scale >= other.sort_key_ &&
00407         sort_key_ - kSimilarVectorDist * v_scale <= other.sort_key_)
00408       return true;
00409     // Ragged tabs get a bigger threshold.
00410     if (!IsRagged() || !other.IsRagged() ||
00411         sort_key_ + kSimilarRaggedDist * v_scale < other.sort_key_ ||
00412         sort_key_ - kSimilarRaggedDist * v_scale > other.sort_key_)
00413       return false;
00414     if (grid == NULL) {
00415       // There is nothing else to test!
00416       return true;
00417     }
00418     // If there is nothing in the rectangle between the vector that is going to
00419     // move, and the place it is moving to, then they can be merged.
00420     // Setup a vertical search for any blob.
00421     const TabVector* mover = (IsRightTab() &&
00422        sort_key_ < other.sort_key_) ? this : &other;
00423     int top_y = mover->endpt_.y();
00424     int bottom_y = mover->startpt_.y();
00425     int left = MIN(mover->XAtY(top_y), mover->XAtY(bottom_y));
00426     int right = MAX(mover->XAtY(top_y), mover->XAtY(bottom_y));
00427     int shift = abs(sort_key_ - other.sort_key_) / v_scale;
00428     if (IsRightTab()) {
00429       right += shift;
00430     } else {
00431       left -= shift;
00432     }
00433 
00434     GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> vsearch(grid);
00435     vsearch.StartVerticalSearch(left, right, top_y);
00436     BLOBNBOX* blob;
00437     while ((blob = vsearch.NextVerticalSearch(true)) != NULL) {
00438       TBOX box = blob->bounding_box();
00439       if (box.top() > bottom_y)
00440         return true;  // Nothing found.
00441       if (box.bottom() < top_y)
00442         continue;  // Doesn't overlap.
00443       int left_at_box = XAtY(box.bottom());
00444       int right_at_box = left_at_box;
00445       if (IsRightTab())
00446         right_at_box += shift;
00447       else
00448         left_at_box -= shift;
00449       if (MIN(right_at_box, box.right()) > MAX(left_at_box, box.left()))
00450         return false;
00451     }
00452     return true;  // Nothing found.
00453   }
00454   return false;
00455 }
00456 
00457 // Eat the other TabVector into this and delete it.
00458 void TabVector::MergeWith(const ICOORD& vertical, TabVector* other) {
00459   extended_ymin_ = MIN(extended_ymin_, other->extended_ymin_);
00460   extended_ymax_ = MAX(extended_ymax_, other->extended_ymax_);
00461   if (other->IsRagged()) {
00462     alignment_ = other->alignment_;
00463   }
00464   // Merge sort the two lists of boxes.
00465   BLOBNBOX_C_IT it1(&boxes_);
00466   BLOBNBOX_C_IT it2(&other->boxes_);
00467   while (!it2.empty()) {
00468     BLOBNBOX* bbox2 = it2.extract();
00469     it2.forward();
00470     TBOX box2 = bbox2->bounding_box();
00471     BLOBNBOX* bbox1 = it1.data();
00472     TBOX box1 = bbox1->bounding_box();
00473     while (box1.bottom() < box2.bottom() && !it1.at_last()) {
00474       it1.forward();
00475       bbox1 = it1.data();
00476       box1 = bbox1->bounding_box();
00477     }
00478     if (box1.bottom() < box2.bottom()) {
00479       it1.add_to_end(bbox2);
00480     } else if (bbox1 != bbox2) {
00481       it1.add_before_stay_put(bbox2);
00482     }
00483   }
00484   Fit(vertical, true);
00485   other->Delete(this);
00486 }
00487 
00488 // Add a new element to the list of partner TabVectors.
00489 // Partners must be added in order of increasing y coordinate of the text line
00490 // that makes them partners.
00491 // Groups of identical partners are merged into one.
00492 void TabVector::AddPartner(TabVector* partner) {
00493   if (IsSeparator() || partner->IsSeparator())
00494     return;
00495   TabVector_C_IT it(&partners_);
00496   if (!it.empty()) {
00497     it.move_to_last();
00498     if (it.data() == partner)
00499       return;
00500   }
00501   it.add_after_then_move(partner);
00502 }
00503 
00504 // Return true if other is a partner of this.
00505 bool TabVector::IsAPartner(const TabVector* other) {
00506   TabVector_C_IT it(&partners_);
00507   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00508     if (it.data() == other)
00509       return true;
00510   }
00511   return false;
00512 }
00513 
// Printable names of the tab alignment types, indexed by the alignment value
// of a TabVector when it is printed.
// These names must be synced with the TabAlignment enum in tabvector.h.
const char* kAlignmentNames[] = {
  "Left Aligned",
  "Left Ragged",
  "Center",
  "Right Aligned",
  "Right Ragged",
  "Separator"
};
00523 
00524 // Print basic information about this tab vector.
00525 void TabVector::Print(const char* prefix) {
00526   if (this == NULL) {
00527     tprintf("%s <null>\n", prefix);
00528   } else {
00529     tprintf("%s %s (%d,%d)->(%d,%d) w=%d s=%d, sort key=%d, boxes=%d,"
00530             " partners=%d\n",
00531             prefix, kAlignmentNames[alignment_],
00532             startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y(),
00533             mean_width_, percent_score_, sort_key_,
00534             boxes_.length(), partners_.length());
00535   }
00536 }
00537 
00538 // Print basic information about this tab vector and every box in it.
00539 void TabVector::Debug(const char* prefix) {
00540   Print(prefix);
00541   BLOBNBOX_C_IT it(&boxes_);
00542   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00543     BLOBNBOX* bbox = it.data();
00544     const TBOX& box = bbox->bounding_box();
00545     tprintf("Box at (%d,%d)->(%d,%d)\n",
00546             box.left(), box.bottom(), box.right(), box.top());
00547   }
00548 }
00549 
00550 // Draw this tabvector in place in the given window.
00551 void TabVector::Display(ScrollView* tab_win) {
00552 #ifndef GRAPHICS_DISABLED
00553   if (textord_debug_printable)
00554     tab_win->Pen(ScrollView::BLUE);
00555   else if (alignment_ == TA_LEFT_ALIGNED)
00556     tab_win->Pen(ScrollView::LIME_GREEN);
00557   else if (alignment_ == TA_LEFT_RAGGED)
00558     tab_win->Pen(ScrollView::DARK_GREEN);
00559   else if (alignment_ == TA_RIGHT_ALIGNED)
00560     tab_win->Pen(ScrollView::PINK);
00561   else if (alignment_ == TA_RIGHT_RAGGED)
00562     tab_win->Pen(ScrollView::CORAL);
00563   else
00564     tab_win->Pen(ScrollView::WHITE);
00565   tab_win->Line(startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y());
00566   tab_win->Pen(ScrollView::GREY);
00567   tab_win->Line(startpt_.x(), startpt_.y(), startpt_.x(), extended_ymin_);
00568   tab_win->Line(endpt_.x(), extended_ymax_, endpt_.x(), endpt_.y());
00569   char score_buf[64];
00570   snprintf(score_buf, sizeof(score_buf), "%d", percent_score_);
00571   tab_win->TextAttributes("Times", 50, false, false, false);
00572   tab_win->Text(startpt_.x(), startpt_.y(), score_buf);
00573 #endif
00574 }
00575 
00576 // Refit the line and/or re-evaluate the vector if the dirty flags are set.
00577 void TabVector::FitAndEvaluateIfNeeded(const ICOORD& vertical,
00578                                        TabFind* finder) {
00579   if (needs_refit_)
00580     Fit(vertical, true);
00581   if (needs_evaluation_)
00582     Evaluate(vertical, finder);
00583 }
00584 
00585 // Evaluate the vector in terms of coverage of its length by good-looking
00586 // box edges. A good looking box is one where its nearest neighbour on the
00587 // inside is nearer than half the distance its nearest neighbour on the
00588 // outside of the putative column. Bad boxes are removed from the line.
00589 // A second pass then further filters boxes by requiring that the gutter
00590 // width be a minimum fraction of the mean gutter along the line.
00591 void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
00592   bool debug = false;
00593   needs_evaluation_ = false;
00594   int length = endpt_.y() - startpt_.y();
00595   if (length == 0 || boxes_.empty()) {
00596     percent_score_ = 0;
00597     Print("Zero length in evaluate");
00598     return;
00599   }
00600   // Compute the mean box height.
00601   BLOBNBOX_C_IT it(&boxes_);
00602   int mean_height = 0;
00603   int height_count = 0;
00604   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00605     BLOBNBOX* bbox = it.data();
00606     const TBOX& box = bbox->bounding_box();
00607     int height = box.height();
00608     mean_height += height;
00609     ++height_count;
00610   }
00611   if (height_count > 0) mean_height /= height_count;
00612   int max_gutter = kGutterMultiple * mean_height;
00613   if (IsRagged()) {
00614     // Ragged edges face a tougher test in that the gap must always be within
00615     // the height of the blob.
00616     max_gutter = kGutterToNeighbourRatio * mean_height;
00617   }
00618 
00619   STATS gutters(0, max_gutter + 1);
00620   // Evaluate the boxes for their goodness, calculating the coverage as we go.
00621   // Remove boxes that are not good and shorten the list to the first and
00622   // last good boxes.
00623   int num_deleted_boxes = 0;
00624   bool text_on_image = false;
00625   int good_length = 0;
00626   const TBOX* prev_good_box = NULL;
00627   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00628     BLOBNBOX* bbox = it.data();
00629     const TBOX& box = bbox->bounding_box();
00630     int mid_y = (box.top() + box.bottom()) / 2;
00631     if (TabFind::WithinTestRegion(2, XAtY(box.bottom()), box.bottom())) {
00632       if (!debug) {
00633         tprintf("After already deleting %d boxes, ", num_deleted_boxes);
00634         Print("Starting evaluation");
00635       }
00636       debug = true;
00637     }
00638     // A good box is one where the nearest neighbour on the inside is closer
00639     // than half the distance to the nearest neighbour on the outside
00640     // (of the putative column).
00641     bool left = IsLeftTab();
00642     int tab_x = XAtY(mid_y);
00643     int gutter_width;
00644     int neighbour_gap;
00645     finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left,
00646                                        bbox, &gutter_width, &neighbour_gap);
00647     if (debug) {
00648       tprintf("Box (%d,%d)->(%d,%d) has gutter %d, ndist %d\n",
00649               box.left(), box.bottom(), box.right(), box.top(),
00650               gutter_width, neighbour_gap);
00651     }
00652     // Now we can make the test.
00653     if (neighbour_gap * kGutterToNeighbourRatio <= gutter_width) {
00654       // A good box contributes its height to the good_length.
00655       good_length += box.top() - box.bottom();
00656       gutters.add(gutter_width, 1);
00657       // Two good boxes together contribute the gap between them
00658       // to the good_length as well, as long as the gap is not
00659       // too big.
00660       if (prev_good_box != NULL) {
00661         int vertical_gap = box.bottom() - prev_good_box->top();
00662         double size1 = sqrt(static_cast<double>(prev_good_box->area()));
00663         double size2 = sqrt(static_cast<double>(box.area()));
00664         if (vertical_gap < kMaxFillinMultiple * MIN(size1, size2))
00665           good_length += vertical_gap;
00666         if (debug) {
00667           tprintf("Box and prev good, gap=%d, target %g, goodlength=%d\n",
00668                   vertical_gap, kMaxFillinMultiple * MIN(size1, size2),
00669                   good_length);
00670         }
00671       } else {
00672         // Adjust the start to the first good box.
00673         SetYStart(box.bottom());
00674       }
00675       prev_good_box = &box;
00676       if (bbox->flow() == BTFT_TEXT_ON_IMAGE)
00677         text_on_image = true;
00678     } else {
00679       // Get rid of boxes that are not good.
00680       if (debug) {
00681         tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, ndist %d\n",
00682                 box.left(), box.bottom(), box.right(), box.top(),
00683                 gutter_width, neighbour_gap);
00684       }
00685       it.extract();
00686       ++num_deleted_boxes;
00687     }
00688   }
00689   if (debug) {
00690     Print("Evaluating:");
00691   }
00692   // If there are any good boxes, do it again, except this time get rid of
00693   // boxes that have a gutter that is a small fraction of the mean gutter.
00694   // This filters out ends that run into a coincidental gap in the text.
00695   int search_top = endpt_.y();
00696   int search_bottom = startpt_.y();
00697   int median_gutter = IntCastRounded(gutters.median());
00698   if (gutters.get_total() > 0) {
00699     prev_good_box = NULL;
00700     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00701       BLOBNBOX* bbox = it.data();
00702       const TBOX& box = bbox->bounding_box();
00703       int mid_y = (box.top() + box.bottom()) / 2;
00704       // A good box is one where the gutter width is at least some constant
00705       // fraction of the mean gutter width.
00706       bool left = IsLeftTab();
00707       int tab_x = XAtY(mid_y);
00708       int max_gutter = kGutterMultiple * mean_height;
00709       if (IsRagged()) {
00710         // Ragged edges face a tougher test in that the gap must always be
00711         // within the height of the blob.
00712         max_gutter = kGutterToNeighbourRatio * mean_height;
00713       }
00714       int gutter_width;
00715       int neighbour_gap;
00716       finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left,
00717                                          bbox, &gutter_width, &neighbour_gap);
00718       // Now we can make the test.
00719       if (gutter_width >= median_gutter * kMinGutterFraction) {
00720         if (prev_good_box == NULL) {
00721           // Adjust the start to the first good box.
00722           SetYStart(box.bottom());
00723           search_bottom = box.top();
00724         }
00725         prev_good_box = &box;
00726         search_top = box.bottom();
00727       } else {
00728         // Get rid of boxes that are not good.
00729         if (debug) {
00730           tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, mean gutter %d\n",
00731                   box.left(), box.bottom(), box.right(), box.top(),
00732                   gutter_width, median_gutter);
00733         }
00734         it.extract();
00735         ++num_deleted_boxes = true;
00736       }
00737     }
00738   }
00739   // If there has been a good box, adjust the end.
00740   if (prev_good_box != NULL) {
00741     SetYEnd(prev_good_box->top());
00742     // Compute the percentage of the vector that is occupied by good boxes.
00743     int length = endpt_.y() - startpt_.y();
00744     percent_score_ = 100 * good_length / length;
00745     if (num_deleted_boxes > 0) {
00746       needs_refit_ = true;
00747       FitAndEvaluateIfNeeded(vertical, finder);
00748       if (boxes_.empty())
00749         return;
00750     }
00751     // Test the gutter over the whole vector, instead of just at the boxes.
00752     int required_shift;
00753     if (search_bottom > search_top) {
00754       search_bottom = startpt_.y();
00755       search_top = endpt_.y();
00756     }
00757     double min_gutter_width = kLineCountReciprocal / boxes_.length();
00758     min_gutter_width += IsRagged() ? kMinRaggedGutter : kMinAlignedGutter;
00759     min_gutter_width *= mean_height;
00760     int max_gutter_width = IntCastRounded(min_gutter_width) + 1;
00761     if (median_gutter > max_gutter_width)
00762       max_gutter_width = median_gutter;
00763     int gutter_width = finder->GutterWidth(search_bottom, search_top, *this,
00764                                            text_on_image, max_gutter_width,
00765                                            &required_shift);
00766     if (gutter_width < min_gutter_width) {
00767       if (debug) {
00768         tprintf("Rejecting bad tab Vector with %d gutter vs %g min\n",
00769                 gutter_width, min_gutter_width);
00770       }
00771       boxes_.shallow_clear();
00772       percent_score_ = 0;
00773     } else if (debug) {
00774       tprintf("Final gutter %d, vs limit of %g, required shift = %d\n",
00775               gutter_width, min_gutter_width, required_shift);
00776     }
00777   } else {
00778     // There are no good boxes left, so score is 0.
00779     percent_score_ = 0;
00780   }
00781 
00782   if (debug) {
00783     Print("Evaluation complete:");
00784   }
00785 }
00786 
00787 // (Re)Fit a line to the stored points. Returns false if the line
00788 // is degenerate. Althougth the TabVector code mostly doesn't care about the
00789 // direction of lines, XAtY would give silly results for a horizontal line.
00790 // The class is mostly aimed at use for vertical lines representing
00791 // horizontal tab stops.
00792 bool TabVector::Fit(ICOORD vertical, bool force_parallel) {
00793   needs_refit_ = false;
00794   if (boxes_.empty()) {
00795     // Don't refit something with no boxes, as that only happens
00796     // in Evaluate, and we don't want to end up with a zero vector.
00797     if (!force_parallel)
00798       return false;
00799     // If we are forcing parallel, then we just need to set the sort_key_.
00800     ICOORD midpt = startpt_;
00801     midpt += endpt_;
00802     midpt /= 2;
00803     sort_key_ = SortKey(vertical, midpt.x(), midpt.y());
00804     return startpt_.y() != endpt_.y();
00805   }
00806   if (!force_parallel && !IsRagged()) {
00807     // Use a fitted line as the vertical.
00808     DetLineFit linepoints;
00809     BLOBNBOX_C_IT it(&boxes_);
00810     // Fit a line to all the boxes in the list.
00811     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00812       BLOBNBOX* bbox = it.data();
00813       TBOX box = bbox->bounding_box();
00814       int x1 = IsRightTab() ? box.right() : box.left();
00815       ICOORD boxpt(x1, box.bottom());
00816       linepoints.Add(boxpt);
00817       if (it.at_last()) {
00818         ICOORD top_pt(x1, box.top());
00819         linepoints.Add(top_pt);
00820       }
00821     }
00822     linepoints.Fit(&startpt_, &endpt_);
00823     if (startpt_.y() != endpt_.y()) {
00824       vertical = endpt_;
00825       vertical -= startpt_;
00826     }
00827   }
00828   int start_y = startpt_.y();
00829   int end_y = endpt_.y();
00830   sort_key_ = IsLeftTab() ? MAX_INT32 : -MAX_INT32;
00831   BLOBNBOX_C_IT it(&boxes_);
00832   // Choose a line parallel to the vertical such that all boxes are on the
00833   // correct side of it.
00834   mean_width_ = 0;
00835   int width_count = 0;
00836   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00837     BLOBNBOX* bbox = it.data();
00838     TBOX box = bbox->bounding_box();
00839     mean_width_ += box.width();
00840     ++width_count;
00841     int x1 = IsRightTab() ? box.right() : box.left();
00842     // Test both the bottom and the top, as one will be more extreme, depending
00843     // on the direction of skew.
00844     int bottom_y = box.bottom();
00845     int top_y = box.top();
00846     int key = SortKey(vertical, x1, bottom_y);
00847     if (IsLeftTab() == (key < sort_key_)) {
00848       sort_key_ = key;
00849       startpt_ = ICOORD(x1, bottom_y);
00850     }
00851     key = SortKey(vertical, x1, top_y);
00852     if (IsLeftTab() == (key < sort_key_)) {
00853       sort_key_ = key;
00854       startpt_ = ICOORD(x1, top_y);
00855     }
00856     if (it.at_first())
00857       start_y = bottom_y;
00858     if (it.at_last())
00859       end_y = top_y;
00860   }
00861   if (width_count > 0) {
00862     mean_width_ = (mean_width_ + width_count - 1) / width_count;
00863   }
00864   endpt_ = startpt_ + vertical;
00865   needs_evaluation_ = true;
00866   if (start_y != end_y) {
00867     // Set the ends of the vector to fully include the first and last blobs.
00868     startpt_.set_x(XAtY(vertical, sort_key_, start_y));
00869     startpt_.set_y(start_y);
00870     endpt_.set_x(XAtY(vertical, sort_key_, end_y));
00871     endpt_.set_y(end_y);
00872     return true;
00873   }
00874   return false;
00875 }
00876 
00877 // Returns the singleton partner if there is one, or NULL otherwise.
00878 TabVector* TabVector::GetSinglePartner() {
00879   if (!partners_.singleton())
00880     return NULL;
00881   TabVector_C_IT partner_it(&partners_);
00882   TabVector* partner = partner_it.data();
00883   return partner;
00884 }
00885 
00886 // Return the partner of this TabVector if the vector qualifies as
00887 // being a vertical text line, otherwise NULL.
00888 TabVector* TabVector::VerticalTextlinePartner() {
00889   if (!partners_.singleton())
00890     return NULL;
00891   TabVector_C_IT partner_it(&partners_);
00892   TabVector* partner = partner_it.data();
00893   BLOBNBOX_C_IT box_it1(&boxes_);
00894   BLOBNBOX_C_IT box_it2(&partner->boxes_);
00895   // Count how many boxes are also in the other list.
00896   // At the same time, gather the mean width and median vertical gap.
00897   if (textord_debug_tabfind > 1) {
00898     Print("Testing for vertical text");
00899     partner->Print("           partner");
00900   }
00901   int num_matched = 0;
00902   int num_unmatched = 0;
00903   int total_widths = 0;
00904   int width = startpt().x() - partner->startpt().x();
00905   if (width < 0)
00906     width = -width;
00907   STATS gaps(0, width * 2);
00908   BLOBNBOX* prev_bbox = NULL;
00909   box_it2.mark_cycle_pt();
00910   for (box_it1.mark_cycle_pt(); !box_it1.cycled_list(); box_it1.forward()) {
00911     BLOBNBOX* bbox = box_it1.data();
00912     TBOX box = bbox->bounding_box();
00913     if (prev_bbox != NULL) {
00914       gaps.add(box.bottom() - prev_bbox->bounding_box().top(), 1);
00915     }
00916     while (!box_it2.cycled_list() && box_it2.data() != bbox &&
00917            box_it2.data()->bounding_box().bottom() < box.bottom()) {
00918       box_it2.forward();
00919     }
00920     if (!box_it2.cycled_list() && box_it2.data() == bbox &&
00921         bbox->region_type() >= BRT_UNKNOWN &&
00922         (prev_bbox == NULL || prev_bbox->region_type() >= BRT_UNKNOWN))
00923       ++num_matched;
00924     else
00925       ++num_unmatched;
00926     total_widths += box.width();
00927     prev_bbox = bbox;
00928   }
00929   double avg_width = total_widths * 1.0 / (num_unmatched + num_matched);
00930   double max_gap = textord_tabvector_vertical_gap_fraction * avg_width;
00931   int min_box_match = static_cast<int>((num_matched + num_unmatched) *
00932                                        textord_tabvector_vertical_box_ratio);
00933   bool is_vertical = (gaps.get_total() > 0 &&
00934                       num_matched >= min_box_match &&
00935                       gaps.median() <= max_gap);
00936   if (textord_debug_tabfind > 1) {
00937     tprintf("gaps=%d, matched=%d, unmatched=%d, min_match=%d "
00938             "median gap=%.2f, width=%.2f max_gap=%.2f Vertical=%s\n",
00939             gaps.get_total(), num_matched, num_unmatched, min_box_match,
00940             gaps.median(), avg_width, max_gap, is_vertical?"Yes":"No");
00941   }
00942   return (is_vertical) ? partner : NULL;
00943 }
00944 
00945 // The constructor is private.
00946 TabVector::TabVector(int extended_ymin, int extended_ymax,
00947                      TabAlignment alignment, BLOBNBOX_CLIST* boxes)
00948   : extended_ymin_(extended_ymin), extended_ymax_(extended_ymax),
00949     sort_key_(0), percent_score_(0), mean_width_(0),
00950     needs_refit_(true), needs_evaluation_(true), alignment_(alignment),
00951     top_constraints_(NULL), bottom_constraints_(NULL) {
00952   BLOBNBOX_C_IT it(&boxes_);
00953   it.add_list_after(boxes);
00954 }
00955 
00956 // Delete this, but first, repoint all the partners to point to
00957 // replacement. If replacement is NULL, then partner relationships
00958 // are removed.
00959 void TabVector::Delete(TabVector* replacement) {
00960   TabVector_C_IT it(&partners_);
00961   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00962     TabVector* partner = it.data();
00963     TabVector_C_IT p_it(&partner->partners_);
00964     // If partner already has replacement in its list, then make
00965     // replacement null, and just remove this TabVector when we find it.
00966     TabVector* partner_replacement = replacement;
00967     for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
00968       TabVector* p_partner = p_it.data();
00969       if (p_partner == partner_replacement) {
00970         partner_replacement = NULL;
00971         break;
00972       }
00973     }
00974     // Remove all references to this, and replace with replacement if not NULL.
00975     for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
00976       TabVector* p_partner = p_it.data();
00977       if (p_partner == this) {
00978         p_it.extract();
00979         if (partner_replacement != NULL)
00980           p_it.add_before_stay_put(partner_replacement);
00981       }
00982     }
00983     if (partner_replacement != NULL) {
00984       partner_replacement->AddPartner(partner);
00985     }
00986   }
00987   delete this;
00988 }
00989 
00990 
00991 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines