tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/textord/colpartition.cpp
Go to the documentation of this file.
00001 
00002 // File:        colpartition.cpp
00003 // Description: Class to hold partitions of the page that correspond
00004 //              roughly to text lines.
00005 // Author:      Ray Smith
00006 // Created:     Thu Aug 14 10:54:01 PDT 2008
00007 //
00008 // (C) Copyright 2008, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #ifdef _MSC_VER
00022 #pragma warning(disable:4244)  // Conversion warnings
00023 #endif
00024 
00025 #ifdef HAVE_CONFIG_H
00026 #include "config_auto.h"
00027 #endif
00028 
00029 #include "colpartition.h"
00030 #include "colpartitiongrid.h"
00031 #include "colpartitionset.h"
00032 #include "detlinefit.h"
00033 #include "dppoint.h"
00034 #include "imagefind.h"
00035 #include "workingpartset.h"
00036 
00037 namespace tesseract {
00038 
00039 ELIST2IZE(ColPartition)
00040 CLISTIZE(ColPartition)
00041 
00043 
00044 // If more than one partner survives the partner depth test beyond this
00045 // level, then one of them is picked arbitrarily.
00046 const int kMaxPartnerDepth = 4;
00047 // Maximum change in spacing (in inches) to ignore.
00048 const double kMaxSpacingDrift = 1.0 / 72;  // 1/72 inch is one point.
00049 // Maximum fraction of line height used as an additional allowance
00050 // for top spacing.
00051 const double kMaxTopSpacingFraction = 0.25;
00052 // What multiple of the largest line height should be used as an upper bound
00053 // for whether lines are in the same text block?
00054 const double kMaxSameBlockLineSpacing = 3;
00055 // Maximum ratio of sizes for lines to be considered the same size.
00056 const double kMaxSizeRatio = 1.5;
00057 // Fraction of max of leader width and gap for max IQR of gaps.
00058 const double kMaxLeaderGapFractionOfMax = 0.25;
00059 // Fraction of min of leader width and gap for max IQR of gaps.
00060 const double kMaxLeaderGapFractionOfMin = 0.5;
00061 // Minimum number of blobs to be considered a leader.
00062 const int kMinLeaderCount = 5;
00063 // Cost of a cut through a leader.
00064 const int kLeaderCutCost = 8;
00065 // Minimum score for a STRONG_CHAIN textline.
00066 const int kMinStrongTextValue = 6;
00067 // Minimum score for a CHAIN textline.
00068 const int kMinChainTextValue = 3;
00069 // Minimum number of blobs for strong horizontal text lines.
00070 const int kHorzStrongTextlineCount = 8;
00071 // Minimum height (in image pixels) for strong horizontal text lines.
00072 const int kHorzStrongTextlineHeight = 10;
00073 // Minimum aspect ratio for strong horizontal text lines.
00074 const int kHorzStrongTextlineAspect = 5;
00075 // Maximum upper quartile error allowed on a baseline fit as a fraction
00076 // of height.
00077 const double kMaxBaselineError = 0.4375;
00078 // Minimum coverage for a good baseline between vectors.
00079 const double kMinBaselineCoverage = 0.5;
00080 // Maximum RMS color noise at which colors are still compared.
00081 const int kMaxRMSColorNoise = 128;
00082 // Maximum color distance at which a partition's color may still be used
00083 // in smoothing neighbouring types. This is a squared distance.
00084 const int kMaxColorDistance = 900;
00085 
00085 
00086 // blob_type is the blob_region_type_ of the blobs in this partition.
00087 // Vertical is the direction of logical vertical on the possibly skewed image.
00088 ColPartition::ColPartition(BlobRegionType blob_type, const ICOORD& vertical)
00089   : left_margin_(-MAX_INT32), right_margin_(MAX_INT32),
00090     median_bottom_(MAX_INT32), median_top_(-MAX_INT32), median_size_(0),
00091     median_left_(MAX_INT32), median_right_(-MAX_INT32), median_width_(0),
00092     blob_type_(blob_type), flow_(BTFT_NONE), good_blob_score_(0),
00093     good_width_(false), good_column_(false),
00094     left_key_tab_(false), right_key_tab_(false),
00095     left_key_(0), right_key_(0), type_(PT_UNKNOWN), vertical_(vertical),
00096     working_set_(NULL), last_add_was_vertical_(false), block_owned_(false),
00097     desperately_merged_(false),
00098     first_column_(-1), last_column_(-1), column_set_(NULL),
00099     side_step_(0), top_spacing_(0), bottom_spacing_(0),
00100     type_before_table_(PT_UNKNOWN), inside_table_column_(false),
00101     nearest_neighbor_above_(NULL), nearest_neighbor_below_(NULL),
00102     space_above_(0), space_below_(0), space_to_left_(0), space_to_right_(0),
00103     owns_blobs_(true) {
00104   memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
00105 }
00106 
00107 // Constructs a fake ColPartition with a single fake BLOBNBOX, all made
00108 // from a single TBOX.
00109 // WARNING: Despite being on C_LISTs, the BLOBNBOX owns the C_BLOB and
00110 // the ColPartition owns the BLOBNBOX!!!
00111 // Call DeleteBoxes before deleting the ColPartition.
00112 ColPartition* ColPartition::FakePartition(const TBOX& box,
00113                                           PolyBlockType block_type,
00114                                           BlobRegionType blob_type,
00115                                           BlobTextFlowType flow) {
00116   ColPartition* part = new ColPartition(blob_type, ICOORD(0, 1));
00117   part->set_type(block_type);
00118   part->set_flow(flow);
00119   part->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(box)));
00120   part->set_left_margin(box.left());
00121   part->set_right_margin(box.right());
00122   part->SetBlobTypes();
00123   part->ComputeLimits();
00124   part->ClaimBoxes();
00125   return part;
00126 }
00127 
00128 // Constructs and returns a ColPartition with the given real BLOBNBOX,
00129 // and sets it up to be a "big" partition (single-blob partition bigger
00130 // than the surrounding text that may be a dropcap, two or more vertically
00131 // touching characters, or some graphic element.
00132 // If the given list is not NULL, the partition is also added to the list.
00133 ColPartition* ColPartition::MakeBigPartition(BLOBNBOX* box,
00134                                              ColPartition_LIST* big_part_list) {
00135   box->set_owner(NULL);
00136   ColPartition* single = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
00137   single->set_flow(BTFT_NONE);
00138   single->AddBox(box);
00139   single->ComputeLimits();
00140   single->ClaimBoxes();
00141   single->SetBlobTypes();
00142   single->set_block_owned(true);
00143   if (big_part_list != NULL) {
00144     ColPartition_IT part_it(big_part_list);
00145     part_it.add_to_end(single);
00146   }
00147   return single;
00148 }
00149 
00150 ColPartition::~ColPartition() {
00151   // Remove this as a partner of all partners, as we don't want them
00152   // referring to a deleted object.
00153   ColPartition_C_IT it(&upper_partners_);
00154   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00155     it.data()->RemovePartner(false, this);
00156   }
00157   it.set_to_list(&lower_partners_);
00158   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00159     it.data()->RemovePartner(true, this);
00160   }
00161 }
00162 
00163 // Constructs a fake ColPartition with no BLOBNBOXes to represent a
00164 // horizontal or vertical line, given a type and a bounding box.
00165 ColPartition* ColPartition::MakeLinePartition(BlobRegionType blob_type,
00166                                               const ICOORD& vertical,
00167                                               int left, int bottom,
00168                                               int right, int top) {
00169   ColPartition* part = new ColPartition(blob_type, vertical);
00170   part->bounding_box_ = TBOX(left, bottom, right, top);
00171   part->median_bottom_ = bottom;
00172   part->median_top_ = top;
00173   part->median_size_ = top - bottom;
00174   part->median_width_ = right - left;
00175   part->left_key_ = part->BoxLeftKey();
00176   part->right_key_ = part->BoxRightKey();
00177   return part;
00178 }
00179 
00180 
00181 // Adds the given box to the partition, updating the partition bounds.
00182 // The list of boxes in the partition is updated, ensuring that no box is
00183 // recorded twice, and the boxes are kept in increasing left position.
00184 void ColPartition::AddBox(BLOBNBOX* bbox) {
00185   TBOX box = bbox->bounding_box();
00186   // Update the partition limits.
00187   if (boxes_.length() == 0) {
00188     bounding_box_ = box;
00189   } else {
00190     bounding_box_ += box;
00191   }
00192 
00193   if (IsVerticalType()) {
00194     if (!last_add_was_vertical_) {
00195       boxes_.sort(SortByBoxBottom<BLOBNBOX>);
00196       last_add_was_vertical_ = true;
00197     }
00198     boxes_.add_sorted(SortByBoxBottom<BLOBNBOX>, true, bbox);
00199   } else {
00200     if (last_add_was_vertical_) {
00201       boxes_.sort(SortByBoxLeft<BLOBNBOX>);
00202       last_add_was_vertical_ = false;
00203     }
00204     boxes_.add_sorted(SortByBoxLeft<BLOBNBOX>, true, bbox);
00205   }
00206   if (!left_key_tab_)
00207     left_key_ = BoxLeftKey();
00208   if (!right_key_tab_)
00209     right_key_ = BoxRightKey();
00210   if (TabFind::WithinTestRegion(2, box.left(), box.bottom()))
00211     tprintf("Added box (%d,%d)->(%d,%d) left_blob_x_=%d, right_blob_x_ = %d\n",
00212             box.left(), box.bottom(), box.right(), box.top(),
00213             bounding_box_.left(), bounding_box_.right());
00214 }
00215 
00216 // Removes the given box from the partition, updating the bounds.
00217 void ColPartition::RemoveBox(BLOBNBOX* box) {
00218   BLOBNBOX_C_IT bb_it(&boxes_);
00219   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00220     if (box == bb_it.data()) {
00221       bb_it.extract();
00222       ComputeLimits();
00223       return;
00224     }
00225   }
00226 }
00227 
00228 // Returns the tallest box in the partition, as measured perpendicular to the
00229 // presumed flow of text.
00230 BLOBNBOX* ColPartition::BiggestBox() {
00231   BLOBNBOX* biggest = NULL;
00232   BLOBNBOX_C_IT bb_it(&boxes_);
00233   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00234     BLOBNBOX* bbox = bb_it.data();
00235     if (IsVerticalType()) {
00236       if (biggest == NULL ||
00237           bbox->bounding_box().width() > biggest->bounding_box().width())
00238         biggest = bbox;
00239     } else {
00240       if (biggest == NULL ||
00241           bbox->bounding_box().height() > biggest->bounding_box().height())
00242         biggest = bbox;
00243     }
00244   }
00245   return biggest;
00246 }
00247 
00248 // Returns the bounding box excluding the given box.
00249 TBOX ColPartition::BoundsWithoutBox(BLOBNBOX* box) {
00250   TBOX result;
00251   BLOBNBOX_C_IT bb_it(&boxes_);
00252   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00253     if (box != bb_it.data()) {
00254       result += bb_it.data()->bounding_box();
00255     }
00256   }
00257   return result;
00258 }
00259 
00260 // Claims the boxes in the boxes_list by marking them with a this owner
00261 // pointer. If a box is already owned, then it must be owned by this.
00262 void ColPartition::ClaimBoxes() {
00263   BLOBNBOX_C_IT bb_it(&boxes_);
00264   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00265     BLOBNBOX* bblob = bb_it.data();
00266     ColPartition* other = bblob->owner();
00267     if (other == NULL) {
00268       // Normal case: ownership is available.
00269       bblob->set_owner(this);
00270     } else {
00271       ASSERT_HOST(other == this);
00272     }
00273   }
00274 }
00275 
00276 // NULL the owner of the blobs in this partition, so they can be deleted
00277 // independently of the ColPartition.
00278 void ColPartition::DisownBoxes() {
00279   BLOBNBOX_C_IT bb_it(&boxes_);
00280   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00281     BLOBNBOX* bblob = bb_it.data();
00282     ASSERT_HOST(bblob->owner() == this || bblob->owner() == NULL);
00283     bblob->set_owner(NULL);
00284   }
00285 }
00286 
00287 // NULL the owner of the blobs in this partition that are owned by this
00288 // partition, so they can be deleted independently of the ColPartition.
00289 // Any blobs that are not owned by this partition get to keep their owner
00290 // without an assert failure.
00291 void ColPartition::DisownBoxesNoAssert() {
00292   BLOBNBOX_C_IT bb_it(&boxes_);
00293   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00294     BLOBNBOX* bblob = bb_it.data();
00295     if (bblob->owner() == this)
00296       bblob->set_owner(NULL);
00297   }
00298 }
00299 
00300 // Delete the boxes that this partition owns.
00301 void ColPartition::DeleteBoxes() {
00302   // Although the boxes_ list is a C_LIST, in some cases it owns the
00303   // BLOBNBOXes, as the ColPartition takes ownership from the grid,
00304   // and the BLOBNBOXes own the underlying C_BLOBs.
00305   for (BLOBNBOX_C_IT bb_it(&boxes_); !bb_it.empty(); bb_it.forward()) {
00306     BLOBNBOX* bblob = bb_it.extract();
00307     delete bblob->cblob();
00308     delete bblob;
00309   }
00310 }
00311 
00312 // Reflects the partition in the y-axis, assuming that its blobs have
00313 // already been done. Corrects only a limited part of the members, since
00314 // this function is assumed to be used shortly after initial creation, which
00315 // is before a lot of the members are used.
00316 void ColPartition::ReflectInYAxis() {
00317   BLOBNBOX_CLIST reversed_boxes;
00318   BLOBNBOX_C_IT reversed_it(&reversed_boxes);
00319   // Reverse the order of the boxes_.
00320   BLOBNBOX_C_IT bb_it(&boxes_);
00321   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00322     reversed_it.add_before_then_move(bb_it.extract());
00323   }
00324   bb_it.add_list_after(&reversed_boxes);
00325   ASSERT_HOST(!left_key_tab_ && !right_key_tab_);
00326   int tmp = left_margin_;
00327   left_margin_ = -right_margin_;
00328   right_margin_ = -tmp;
00329   ComputeLimits();
00330 }
00331 
00332 // Returns true if this is a legal partition - meaning that the conditions
00333 // left_margin <= bounding_box left
00334 // left_key <= bounding box left key
00335 // bounding box left <= bounding box right
00336 // and likewise for right margin and key
00337 // are all met.
00338 bool ColPartition::IsLegal() {
00339   if (bounding_box_.left() > bounding_box_.right()) {
00340     if (textord_debug_bugs) {
00341       tprintf("Bounding box invalid\n");
00342       Print();
00343     }
00344     return false;  // Bounding box invalid.
00345   }
00346   if (left_margin_ > bounding_box_.left() ||
00347       right_margin_ < bounding_box_.right()) {
00348     if (textord_debug_bugs) {
00349       tprintf("Margins invalid\n");
00350       Print();
00351     }
00352     return false;  // Margins invalid.
00353   }
00354   if (left_key_ > BoxLeftKey() || right_key_ < BoxRightKey()) {
00355     if (textord_debug_bugs) {
00356       tprintf("Key inside box: %d v %d or %d v %d\n",
00357               left_key_, BoxLeftKey(), right_key_, BoxRightKey());
00358       Print();
00359     }
00360     return false;  // Keys inside the box.
00361   }
00362   return true;
00363 }
00364 
00365 // Returns true if the left and right edges are approximately equal.
00366 bool ColPartition::MatchingColumns(const ColPartition& other) const {
00367   int y = (MidY() + other.MidY()) / 2;
00368   if (!NearlyEqual(other.LeftAtY(y) / kColumnWidthFactor,
00369                    LeftAtY(y) / kColumnWidthFactor, 1))
00370     return false;
00371   if (!NearlyEqual(other.RightAtY(y) / kColumnWidthFactor,
00372                    RightAtY(y) / kColumnWidthFactor, 1))
00373     return false;
00374   return true;
00375 }
00376 
00377 // Returns true if the colors match for two text partitions.
00378 bool ColPartition::MatchingTextColor(const ColPartition& other) const {
00379   if (color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise &&
00380       other.color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise)
00381     return false;  // Too noisy.
00382 
00383   // Colors must match for other to count.
00384   double d_this1_o = ImageFind::ColorDistanceFromLine(other.color1_,
00385                                                       other.color2_,
00386                                                       color1_);
00387   double d_this2_o = ImageFind::ColorDistanceFromLine(other.color1_,
00388                                                       other.color2_,
00389                                                       color2_);
00390   double d_o1_this = ImageFind::ColorDistanceFromLine(color1_, color2_,
00391                                                       other.color1_);
00392   double d_o2_this = ImageFind::ColorDistanceFromLine(color1_, color2_,
00393                                                       other.color2_);
00394 // All 4 distances must be small enough.
00395   return d_this1_o < kMaxColorDistance && d_this2_o < kMaxColorDistance &&
00396          d_o1_this < kMaxColorDistance && d_o2_this < kMaxColorDistance;
00397 }
00398 
00399 // Returns true if the sizes match for two text partitions,
00400 // taking orientation into account. See also SizesSimilar.
00401 bool ColPartition::MatchingSizes(const ColPartition& other) const {
00402   if (blob_type_ == BRT_VERT_TEXT || other.blob_type_ == BRT_VERT_TEXT)
00403     return !TabFind::DifferentSizes(median_width_, other.median_width_);
00404   else
00405     return !TabFind::DifferentSizes(median_size_, other.median_size_);
00406 }
00407 
00408 // Returns true if there is no tabstop violation in merging this and other.
00409 bool ColPartition::ConfirmNoTabViolation(const ColPartition& other) const {
00410   if (bounding_box_.right() < other.bounding_box_.left() &&
00411       bounding_box_.right() < other.LeftBlobRule())
00412     return false;
00413   if (other.bounding_box_.right() < bounding_box_.left() &&
00414       other.bounding_box_.right() < LeftBlobRule())
00415     return false;
00416   if (bounding_box_.left() > other.bounding_box_.right() &&
00417       bounding_box_.left() > other.RightBlobRule())
00418     return false;
00419   if (other.bounding_box_.left() > bounding_box_.right() &&
00420       other.bounding_box_.left() > RightBlobRule())
00421     return false;
00422   return true;
00423 }
00424 
00425 // Returns true if other has a similar stroke width to this.
00426 bool ColPartition::MatchingStrokeWidth(const ColPartition& other,
00427                                        double fractional_tolerance,
00428                                        double constant_tolerance) const {
00429   int match_count = 0;
00430   int nonmatch_count = 0;
00431   BLOBNBOX_C_IT box_it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
00432   BLOBNBOX_C_IT other_it(const_cast<BLOBNBOX_CLIST*>(&other.boxes_));
00433   box_it.mark_cycle_pt();
00434   other_it.mark_cycle_pt();
00435   while (!box_it.cycled_list() && !other_it.cycled_list()) {
00436     if (box_it.data()->MatchingStrokeWidth(*other_it.data(),
00437                                            fractional_tolerance,
00438                                            constant_tolerance))
00439       ++match_count;
00440     else
00441       ++nonmatch_count;
00442     box_it.forward();
00443     other_it.forward();
00444   }
00445   return match_count > nonmatch_count;
00446 }
00447 
00448 // Returns true if base is an acceptable diacritic base char merge
00449 // with this as the diacritic.
00450 // Returns true if:
00451 // (1) this is a ColPartition containing only diacritics, and
00452 // (2) the base characters indicated on the diacritics all believably lie
00453 // within the text line of the candidate ColPartition.
00454 bool ColPartition::OKDiacriticMerge(const ColPartition& candidate,
00455                                     bool debug) const {
00456   BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
00457   int min_top = MAX_INT32;
00458   int max_bottom = -MAX_INT32;
00459   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00460     BLOBNBOX* blob = it.data();
00461     if (!blob->IsDiacritic()) {
00462       if (debug) {
00463         tprintf("Blob is not a diacritic:");
00464         blob->bounding_box().print();
00465       }
00466       return false;  // All blobs must have diacritic bases.
00467     }
00468     if (blob->base_char_top() < min_top)
00469       min_top = blob->base_char_top();
00470     if (blob->base_char_bottom() > max_bottom)
00471       max_bottom = blob->base_char_bottom();
00472   }
00473   // If the intersection of all vertical ranges of all base characters
00474   // overlaps the median range of this, then it is OK.
00475   bool result = min_top > candidate.median_bottom_ &&
00476                 max_bottom < candidate.median_top_;
00477   if (debug) {
00478     if (result)
00479       tprintf("OKDiacritic!\n");
00480     else
00481       tprintf("y ranges don\'t overlap: %d-%d / %d-%d\n",
00482               max_bottom, min_top, median_bottom_, median_top_);
00483   }
00484   return result;
00485 }
00486 
00487 // Sets the sort key using either the tab vector, or the bounding box if
00488 // the tab vector is NULL. If the tab_vector lies inside the bounding_box,
00489 // use the edge of the box as a key any way.
00490 void ColPartition::SetLeftTab(const TabVector* tab_vector) {
00491   if (tab_vector != NULL) {
00492     left_key_ = tab_vector->sort_key();
00493     left_key_tab_ = left_key_ <= BoxLeftKey();
00494   } else {
00495     left_key_tab_ = false;
00496   }
00497   if (!left_key_tab_)
00498     left_key_ = BoxLeftKey();
00499 }
00500 
00501 // As SetLeftTab, but with the right.
00502 void ColPartition::SetRightTab(const TabVector* tab_vector) {
00503   if (tab_vector != NULL) {
00504     right_key_ = tab_vector->sort_key();
00505     right_key_tab_ = right_key_ >= BoxRightKey();
00506   } else {
00507     right_key_tab_ = false;
00508   }
00509   if (!right_key_tab_)
00510     right_key_ = BoxRightKey();
00511 }
00512 
00513 // Copies the left/right tab from the src partition, but if take_box is
00514 // true, copies the box instead and uses that as a key.
00515 void ColPartition::CopyLeftTab(const ColPartition& src, bool take_box) {
00516   left_key_tab_ = take_box ? false : src.left_key_tab_;
00517   if (left_key_tab_) {
00518     left_key_ = src.left_key_;
00519   } else {
00520     bounding_box_.set_left(XAtY(src.BoxLeftKey(), MidY()));
00521     left_key_ = BoxLeftKey();
00522   }
00523   if (left_margin_ > bounding_box_.left())
00524     left_margin_ = src.left_margin_;
00525 }
00526 
00527 // As CopyLeftTab, but with the right.
00528 void ColPartition::CopyRightTab(const ColPartition& src, bool take_box) {
00529   right_key_tab_ = take_box ? false : src.right_key_tab_;
00530   if (right_key_tab_) {
00531     right_key_ = src.right_key_;
00532   } else {
00533     bounding_box_.set_right(XAtY(src.BoxRightKey(), MidY()));
00534     right_key_ = BoxRightKey();
00535   }
00536   if (right_margin_ < bounding_box_.right())
00537     right_margin_ = src.right_margin_;
00538 }
00539 
00540 // Returns the left rule line x coord of the leftmost blob.
00541 int ColPartition::LeftBlobRule() const {
00542   BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
00543   return it.data()->left_rule();
00544 }
00545 // Returns the right rule line x coord of the rightmost blob.
00546 int ColPartition::RightBlobRule() const {
00547   BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
00548   it.move_to_last();
00549   return it.data()->right_rule();
00550 }
00551 
00552 float ColPartition::SpecialBlobsDensity(const BlobSpecialTextType type) const {
00553   ASSERT_HOST(type < BSTT_COUNT);
00554   return special_blobs_densities_[type];
00555 }
00556 
00557 int ColPartition::SpecialBlobsCount(const BlobSpecialTextType type) {
00558   ASSERT_HOST(type < BSTT_COUNT);
00559   BLOBNBOX_C_IT blob_it(&boxes_);
00560   int count = 0;
00561   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00562     BLOBNBOX* blob = blob_it.data();
00563     BlobSpecialTextType blob_type = blob->special_text_type();
00564     if (blob_type == type) {
00565       count++;
00566     }
00567   }
00568 
00569   return count;
00570 }
00571 
00572 void ColPartition::SetSpecialBlobsDensity(
00573     const BlobSpecialTextType type, const float density) {
00574   ASSERT_HOST(type < BSTT_COUNT);
00575   special_blobs_densities_[type] = density;
00576 }
00577 
00578 void ColPartition::ComputeSpecialBlobsDensity() {
00579   memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
00580   if (boxes_.empty()) {
00581     return;
00582   }
00583 
00584   BLOBNBOX_C_IT blob_it(&boxes_);
00585   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00586     BLOBNBOX* blob = blob_it.data();
00587     BlobSpecialTextType type = blob->special_text_type();
00588     special_blobs_densities_[type]++;
00589   }
00590 
00591   for (int type = 0; type < BSTT_COUNT; ++type) {
00592     special_blobs_densities_[type] /= boxes_.length();
00593   }
00594 }
00595 
00596 // Add a partner above if upper, otherwise below.
00597 // Add them uniquely and keep the list sorted by box left.
00598 // Partnerships are added symmetrically to partner and this.
00599 void ColPartition::AddPartner(bool upper, ColPartition* partner) {
00600   if (upper) {
00601     partner->lower_partners_.add_sorted(SortByBoxLeft<ColPartition>,
00602                                         true, this);
00603     upper_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
00604   } else {
00605     partner->upper_partners_.add_sorted(SortByBoxLeft<ColPartition>,
00606                                         true, this);
00607     lower_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
00608   }
00609 }
00610 
00611 // Removes the partner from this, but does not remove this from partner.
00612 // This asymmetric removal is so as not to mess up the iterator that is
00613 // working on partner's partner list.
00614 void ColPartition::RemovePartner(bool upper, ColPartition* partner) {
00615   ColPartition_C_IT it(upper ? &upper_partners_ : &lower_partners_);
00616   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00617     if (it.data() == partner) {
00618       it.extract();
00619       break;
00620     }
00621   }
00622 }
00623 
00624 // Returns the partner if the given partner is a singleton, otherwise NULL.
00625 ColPartition* ColPartition::SingletonPartner(bool upper) {
00626   ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
00627   if (!partners->singleton())
00628     return NULL;
00629   ColPartition_C_IT it(partners);
00630   return it.data();
00631 }
00632 
00633 // Merge with the other partition and delete it.
00634 void ColPartition::Absorb(ColPartition* other, WidthCallback* cb) {
00635   // The result has to either own all of the blobs or none of them.
00636   // Verify the flag is consisent.
00637   ASSERT_HOST(owns_blobs() == other->owns_blobs());
00638   // TODO(nbeato): check owns_blobs better. Right now owns_blobs
00639   // should always be true when this is called. So there is no issues.
00640   if (TabFind::WithinTestRegion(2, bounding_box_.left(),
00641                                 bounding_box_.bottom()) ||
00642       TabFind::WithinTestRegion(2, other->bounding_box_.left(),
00643                                 other->bounding_box_.bottom())) {
00644     tprintf("Merging:");
00645     Print();
00646     other->Print();
00647   }
00648 
00649   // Update the special_blobs_densities_.
00650   memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
00651   for (int type = 0; type < BSTT_COUNT; ++type) {
00652     int w1 = boxes_.length(), w2 = other->boxes_.length();
00653     float new_val = special_blobs_densities_[type] * w1 +
00654         other->special_blobs_densities_[type] * w2;
00655     if (!w1 || !w2) {
00656       special_blobs_densities_[type] = new_val / (w1 + w2);
00657     }
00658   }
00659 
00660   // Merge the two sorted lists.
00661   BLOBNBOX_C_IT it(&boxes_);
00662   BLOBNBOX_C_IT it2(&other->boxes_);
00663   for (; !it2.empty(); it2.forward()) {
00664     BLOBNBOX* bbox2 = it2.extract();
00665     ColPartition* prev_owner = bbox2->owner();
00666     if (prev_owner != other && prev_owner != NULL) {
00667       // A blob on other's list is owned by someone else; let them have it.
00668       continue;
00669     }
00670     ASSERT_HOST(prev_owner == other || prev_owner == NULL);
00671     if (prev_owner == other)
00672       bbox2->set_owner(this);
00673     it.add_to_end(bbox2);
00674   }
00675   left_margin_ = MIN(left_margin_, other->left_margin_);
00676   right_margin_ = MAX(right_margin_, other->right_margin_);
00677   if (other->left_key_ < left_key_) {
00678     left_key_ = other->left_key_;
00679     left_key_tab_ = other->left_key_tab_;
00680   }
00681   if (other->right_key_ > right_key_) {
00682     right_key_ = other->right_key_;
00683     right_key_tab_ = other->right_key_tab_;
00684   }
00685   // Combine the flow and blob_type in a sensible way.
00686   // Dominant flows stay.
00687   if (!DominatesInMerge(flow_, other->flow_)) {
00688     flow_ = other->flow_;
00689     blob_type_ = other->blob_type_;
00690   }
00691   SetBlobTypes();
00692   if (IsVerticalType()) {
00693     boxes_.sort(SortByBoxBottom<BLOBNBOX>);
00694     last_add_was_vertical_ = true;
00695   } else {
00696     boxes_.sort(SortByBoxLeft<BLOBNBOX>);
00697     last_add_was_vertical_ = false;
00698   }
00699   ComputeLimits();
00700   // Fix partner lists. other is going away, so remove it as a
00701   // partner of all its partners and add this in its place.
00702   for (int upper = 0; upper < 2; ++upper) {
00703     ColPartition_CLIST partners;
00704     ColPartition_C_IT part_it(&partners);
00705     part_it.add_list_after(upper ? &other->upper_partners_
00706                                  : &other->lower_partners_);
00707     for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
00708       ColPartition* partner = part_it.extract();
00709       partner->RemovePartner(!upper, other);
00710       partner->RemovePartner(!upper, this);
00711       partner->AddPartner(!upper, this);
00712     }
00713   }
00714   delete other;
00715   if (cb != NULL) {
00716     SetColumnGoodness(cb);
00717   }
00718 }
00719 
00720 // Decides whether merge1 and merge2 may still be merged although their
00721 // combined bounding box overlaps this partition.
00722 // Returns true if that overlap is trivial enough to be acceptable, and
00723 // false if any of the three partitions is vertical, if merge1 and merge2
00724 // lack a significant vertical core overlap, or if the overlap is excessive.
00725 // The merged box may graze the edge of this by up to ok_box_overlap when
00726 // that exceeds the margin to the median top and bottom.
00727 // The caller picks ok_box_overlap to suit the text sizes involved, usually
00728 // a fraction of the median size of merge1 and/or merge2, or this.
00729 // TODO(rays) Determine whether vertical text needs to be considered.
00730 bool ColPartition::OKMergeOverlap(const ColPartition& merge1,
00731                                   const ColPartition& merge2,
00732                                   int ok_box_overlap, bool debug) {
00733   // Refuse outright if anything vertical is involved.
00734   const bool any_vertical =
00735       IsVerticalType() || merge1.IsVerticalType() || merge2.IsVerticalType();
00736   if (any_vertical) {
00737     if (debug) tprintf("Vertical partition\n");
00738     return false;
00739   }
00740   // The two merge candidates have to overlap each other strongly.
00741   if (!merge1.VSignificantCoreOverlap(merge2)) {
00742     if (debug)
00743       tprintf("Voverlap %d (%d)\n", merge1.VCoreOverlap(merge2),
00744               merge1.VSignificantCoreOverlap(merge2));
00745     return false;
00746   }
00747   TBOX merged_box(merge1.bounding_box());
00748   merged_box += merge2.bounding_box();
00749   const bool in_median_band = merged_box.bottom() < median_top_ &&
00750                               merged_box.top() > median_bottom_;
00751   const bool beyond_graze =
00752       merged_box.bottom() < bounding_box_.top() - ok_box_overlap &&
00753       merged_box.top() > bounding_box_.bottom() + ok_box_overlap;
00754   if (in_median_band && beyond_graze) {
00755     if (debug) tprintf("Excessive box overlap\n");
00756     return false;
00757   }
00758   return true;  // Nothing objected, so the merge is allowed.
00759 }
00760 
00761 // Finds where to split this partition to minimize the overlap with box.
00762 // Returns the first blob that belongs in the second partition, or NULL if
00763 // there are fewer than two blobs or the growing left box never overlaps box.
00764 BLOBNBOX* ColPartition::OverlapSplitBlob(const TBOX& box) {
00765   if (boxes_.empty() || boxes_.singleton()) return NULL;
00766   BLOBNBOX_C_IT blob_it(&boxes_);
00767   TBOX grown_box(blob_it.data()->bounding_box());
00768   blob_it.forward();
00769   while (!blob_it.at_first()) {
00770     grown_box += blob_it.data()->bounding_box();
00771     if (grown_box.overlap(box)) return blob_it.data();
00772     blob_it.forward();
00773   }
00774   return NULL;
00775 }
00776 
00777 // Split this partition keeping the first half in this and returning
00778 // the second half, or NULL if split_blob was not found (nothing is moved).
00779 // Splits by putting the split_blob and the blobs that follow
00780 // in the second half, and the rest in the first half.
00781 ColPartition* ColPartition::SplitAtBlob(BLOBNBOX* split_blob) {
00782   ColPartition* split_part = ShallowCopy();
00783   split_part->set_owns_blobs(owns_blobs());
00784   BLOBNBOX_C_IT it(&boxes_);
00785   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00786     BLOBNBOX* bbox = it.data();
00787     ColPartition* prev_owner = bbox->owner();
00788     ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == NULL);
00789     if (bbox == split_blob || !split_part->boxes_.empty()) {  // At or after the split point.
00790       split_part->AddBox(it.extract());
00791       if (owns_blobs() && prev_owner != NULL)
00792         bbox->set_owner(split_part);  // Only re-own blobs that already had an owner.
00793     }
00794   }
00795   ASSERT_HOST(!it.empty());  // This must keep at least one blob.
00796   if (split_part->IsEmpty()) {
00797     // Split part ended up with nothing. Possible if split_blob is not
00798     // in the list of blobs.
00799     delete split_part;
00800     return NULL;
00801   }
00802   right_key_tab_ = false;  // The new inner edges are no longer marked as key tabs.
00803   split_part->left_key_tab_ = false;
00804   ComputeLimits();
00805   // TODO(nbeato) Merge Ray's CL like this:
00806   // if (owns_blobs())
00807   //  SetBlobTextlineGoodness();
00808   split_part->ComputeLimits();
00809   // TODO(nbeato) Merge Ray's CL like this:
00810   // if (split_part->owns_blobs())
00811   //   split_part->SetBlobTextlineGoodness();
00812   return split_part;
00813 }
00814 
00815 // Split this partition at the given x coordinate, returning the right
00816 // half and keeping the left half in this. Returns NULL if nothing moves.
00817 ColPartition* ColPartition::SplitAt(int split_x) {
00818   if (split_x <= bounding_box_.left() || split_x >= bounding_box_.right())
00819     return NULL;  // There will be no change.
00820   ColPartition* split_part = ShallowCopy();
00821   split_part->set_owns_blobs(owns_blobs());
00822   BLOBNBOX_C_IT it(&boxes_);
00823   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00824     BLOBNBOX* bbox = it.data();
00825     ColPartition* prev_owner = bbox->owner();
00826     ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == NULL);
00827     const TBOX& box = bbox->bounding_box();
00828     if (box.left() >= split_x) {  // Blobs straddling split_x stay in this.
00829       split_part->AddBox(it.extract());
00830       if (owns_blobs() && prev_owner != NULL)
00831         bbox->set_owner(split_part);  // Only re-own blobs that already had an owner.
00832     }
00833   }
00834   ASSERT_HOST(!it.empty());  // This must keep at least one blob.
00835   if (split_part->IsEmpty()) {
00836     // Split part ended up with nothing. Possible if split_x passes
00837     // through the last blob.
00838     delete split_part;
00839     return NULL;
00840   }
00841   right_key_tab_ = false;  // The new inner edges are no longer marked as key tabs.
00842   split_part->left_key_tab_ = false;
00843   right_margin_ = split_x;  // Both halves take split_x as their inner margin.
00844   split_part->left_margin_ = split_x;
00845   ComputeLimits();
00846   split_part->ComputeLimits();
00847   return split_part;
00848 }
00849 
00850 // Recalculates all the coordinate limits of the partition: bounding box, keys, medians and partner ordering.
00851 void ColPartition::ComputeLimits() {
00852   bounding_box_ = TBOX();  // Clear it
00853   BLOBNBOX_C_IT it(&boxes_);
00854   BLOBNBOX* bbox = NULL;
00855   int non_leader_count = 0;  // Number of blobs whose flow is not BTFT_LEADER.
00856   if (it.empty()) {
00857     bounding_box_.set_left(left_margin_);  // No blobs: span the margins at zero height.
00858     bounding_box_.set_right(right_margin_);
00859     bounding_box_.set_bottom(0);
00860     bounding_box_.set_top(0);
00861   } else {
00862     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00863       bbox = it.data();
00864       bounding_box_ += bbox->bounding_box();
00865       if (bbox->flow() != BTFT_LEADER)
00866         ++non_leader_count;
00867     }
00868   }
00869   if (!left_key_tab_)  // A key fixed by a tab is kept; otherwise follow the box.
00870     left_key_ = BoxLeftKey();
00871   if (left_key_ > BoxLeftKey() && textord_debug_bugs) {
00872     // TODO(rays) investigate the causes of these error messages, to find
00873     // out if they are genuinely harmful, or just indicative of junk input.
00874     tprintf("Computed left-illegal partition\n");
00875     Print();
00876   }
00877   if (!right_key_tab_)
00878     right_key_ = BoxRightKey();
00879   if (right_key_ < BoxRightKey() && textord_debug_bugs) {
00880     tprintf("Computed right-illegal partition\n");
00881     Print();
00882   }
00883   if (it.empty())
00884     return;  // Medians and partner order are left untouched without blobs.
00885   if (IsImageType() || blob_type() == BRT_RECTIMAGE ||
00886       blob_type() == BRT_POLYIMAGE) {
00887     median_top_ = bounding_box_.top();  // Images take their medians from the bounding box.
00888     median_bottom_ = bounding_box_.bottom();
00889     median_size_ = bounding_box_.height();
00890     median_left_ = bounding_box_.left();
00891     median_right_ = bounding_box_.right();
00892     median_width_ = bounding_box_.width();
00893   } else {
00894     STATS top_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
00895     STATS bottom_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
00896     STATS size_stats(0, bounding_box_.height() + 1);
00897     STATS left_stats(bounding_box_.left(), bounding_box_.right() + 1);
00898     STATS right_stats(bounding_box_.left(), bounding_box_.right() + 1);
00899     STATS width_stats(0, bounding_box_.width() + 1);
00900     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00901       bbox = it.data();
00902       if (non_leader_count == 0 || bbox->flow() != BTFT_LEADER) {  // Skip leaders unless all are leaders.
00903         TBOX box = bbox->bounding_box();
00904         int area = box.area();  // Each blob is weighted by its box area.
00905         top_stats.add(box.top(), area);
00906         bottom_stats.add(box.bottom(), area);
00907         size_stats.add(box.height(), area);
00908         left_stats.add(box.left(), area);
00909         right_stats.add(box.right(), area);
00910         width_stats.add(box.width(), area);
00911       }
00912     }
00913     median_top_ = static_cast<int>(top_stats.median() + 0.5);  // +0.5 rounds to nearest.
00914     median_bottom_ = static_cast<int>(bottom_stats.median() + 0.5);
00915     median_size_ = static_cast<int>(size_stats.median() + 0.5);
00916     median_left_ = static_cast<int>(left_stats.median() + 0.5);
00917     median_right_ = static_cast<int>(right_stats.median() + 0.5);
00918     median_width_ = static_cast<int>(width_stats.median() + 0.5);
00919   }
00920
00921   if (right_margin_ < bounding_box_.right() && textord_debug_bugs) {
00922     tprintf("Made partition with bad right coords\n");  // Newline keeps Print() output on its own line.
00923     Print();
00924   }
00925   if (left_margin_ > bounding_box_.left() && textord_debug_bugs) {
00926     tprintf("Made partition with bad left coords\n");  // Newline keeps Print() output on its own line.
00927     Print();
00928   }
00929   // Fix partner lists. The bounding box has changed and partners are stored
00930   // in bounding box order, so remove and reinsert this as a partner
00931   // of all its partners.
00932   for (int upper = 0; upper < 2; ++upper) {
00933     ColPartition_CLIST partners;  // Temporary list that is drained below.
00934     ColPartition_C_IT part_it(&partners);
00935     part_it.add_list_after(upper ? &upper_partners_ : &lower_partners_);
00936     for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
00937       ColPartition* partner = part_it.extract();
00938       partner->RemovePartner(!upper, this);
00939       partner->AddPartner(!upper, this);
00940     }
00941   }
00942   if (TabFind::WithinTestRegion(2, bounding_box_.left(),
00943                                 bounding_box_.bottom())) {
00944     tprintf("Recomputed box for partition %p\n", this);
00945     Print();
00946   }
00947 }
00948 
00949 // Counts the blobs of this partition whose bounding box overlaps box.
00950 int ColPartition::CountOverlappingBoxes(const TBOX& box) {
00951   int num_overlapping = 0;
00952   BLOBNBOX_C_IT blob_it(&boxes_);
00953   blob_it.mark_cycle_pt();
00954   while (!blob_it.cycled_list()) {
00955     if (box.overlap(blob_it.data()->bounding_box())) ++num_overlapping;
00956     blob_it.forward();
00957   }
00958   return num_overlapping;
00959 }
00960 
00961 // Determines type_, first_column_, last_column_ and column_set_ for this
00962 // partition from columns. resolution is the ppi resolution of the image.
00963 void ColPartition::SetPartitionType(int resolution, ColPartitionSet* columns) {
00964   int first_spanned_col = -1;
00965   ColumnSpanningType span_type = columns->SpanningType(
00966       resolution, bounding_box_.left(), bounding_box_.right(),
00967       MIN(bounding_box_.height(), bounding_box_.width()), MidY(),
00968       left_margin_, right_margin_, &first_column_, &last_column_,
00969       &first_spanned_col);
00970   column_set_ = columns;
00971   const bool multi_column_pullout = first_column_ < last_column_ &&
00972                                     span_type == CST_PULLOUT && !IsLineType();
00973   if (multi_column_pullout) {
00974     // Unequal columns may mean the pullout spans one of the columns it lies
00975     // in, so pin it to a single column.
00976     if (first_spanned_col >= 0) {
00977       first_column_ = last_column_ = first_spanned_col;
00978     } else if ((first_column_ & 1) == 0) {
00979       last_column_ = first_column_;
00980     } else if ((last_column_ & 1) == 0) {
00981       first_column_ = last_column_;
00982     } else {
00983       first_column_ = last_column_ = (first_column_ + last_column_) / 2;
00984     }
00985   }
00986   type_ = PartitionType(span_type);
00987 }
00991 
00992 // Maps the current BlobRegionType of this partition, together with the
00993 // column spanning type produced by ColPartitionSet::SpanningType, onto
00994 // the PolyBlockType that describes how the partition sits in the columns.
00995 PolyBlockType ColPartition::PartitionType(ColumnSpanningType flow) const {
00996   if (flow == CST_NOISE) {
00997     // Lines, rectangular images and vertical text survive a noise verdict
00998     // and are then treated as flowing; anything else is plain noise.
00999     const bool survives_noise =
01000         blob_type_ == BRT_HLINE || blob_type_ == BRT_VLINE ||
01001         blob_type_ == BRT_RECTIMAGE || blob_type_ == BRT_VERT_TEXT;
01002     if (!survives_noise) return PT_NOISE;
01003     flow = CST_FLOWING;
01004   }
01005   // Blob types whose result does not depend on the flow.
01006   if (blob_type_ == BRT_NOISE) return PT_NOISE;
01007   if (blob_type_ == BRT_HLINE) return PT_HORZ_LINE;
01008   if (blob_type_ == BRT_VLINE) return PT_VERT_LINE;
01009   if (blob_type_ == BRT_VERT_TEXT) return PT_VERTICAL_TEXT;
01010   // What remains is either an image or, for every other type, text.
01011   const bool is_image =
01012       blob_type_ == BRT_RECTIMAGE || blob_type_ == BRT_POLYIMAGE;
01013   switch (flow) {
01014     case CST_FLOWING:
01015       return is_image ? PT_FLOWING_IMAGE : PT_FLOWING_TEXT;
01016     case CST_HEADING:
01017       return is_image ? PT_HEADING_IMAGE : PT_HEADING_TEXT;
01018     case CST_PULLOUT:
01019       return is_image ? PT_PULLOUT_IMAGE : PT_PULLOUT_TEXT;
01020     default:
01021       if (is_image) {
01022         ASSERT_HOST(!"Undefined flow type for image!");
01023       } else {
01024         ASSERT_HOST(!"Undefined flow type for text!");
01025       }
01026   }
01027   ASSERT_HOST(!"Should never get here!");
01028   return PT_NOISE;
01029 }
01043 
01044 // Reports, through first_col and last_col, the first and last column of
01045 // columns touched by this partition, and refreshes type_ from the spanning
01046 // type found. resolution is the ppi resolution of the image.
01047 void ColPartition::ColumnRange(int resolution, ColPartitionSet* columns,
01048                                int* first_col, int* last_col) {
01049   int spanned_col = -1;  // Filled in by SpanningType, not used afterwards.
01050   const int short_side = MIN(bounding_box_.height(), bounding_box_.width());
01051   type_ = PartitionType(columns->SpanningType(
01052       resolution, bounding_box_.left(), bounding_box_.right(), short_side,
01053       MidY(), left_margin_, right_margin_, first_col, last_col,
01054       &spanned_col));
01055 }
01058 
01059 // Updates good_width_ (cb's verdict on the width at mid height) and
01060 // good_column_ (text partition with a key tab on both sides).
01061 void ColPartition::SetColumnGoodness(WidthCallback* cb) {
01062   const int mid_y = MidY();
01063   good_width_ = cb->Run(RightAtY(mid_y) - LeftAtY(mid_y));
01064   good_column_ = left_key_tab_ && right_key_tab_ && blob_type_ == BRT_TEXT;
01065 }
01066 
01067 // Determines whether the blobs in this partition mostly represent
01068 // a leader (fixed pitch sequence) and sets the member blobs accordingly.
01069 // Returns true if the partition was marked as a leader. Every blob first has
01070 // its flow reset to BTFT_NEIGHBOURS. Height is assumed to have been tested
01071 // elsewhere; without a height filter most fixed-pitch text is found as leader.
01072 // Only runs of identical width objects (.... or ----) are found, not .-.-.-.
01073 bool ColPartition::MarkAsLeaderIfMonospaced() {
01074   bool result = false;
01075   // Gather statistics on the gaps between blobs and the widths of the blobs.
01076   int part_width = bounding_box_.width();
01077   STATS gap_stats(0, part_width);
01078   STATS width_stats(0, part_width);
01079   BLOBNBOX_C_IT it(&boxes_);
01080   BLOBNBOX* prev_blob = it.data();  // NOTE(review): assumes boxes_ is not empty - confirm callers.
01081   prev_blob->set_flow(BTFT_NEIGHBOURS);
01082   width_stats.add(prev_blob->bounding_box().width(), 1);
01083   int blob_count = 1;
01084   for (it.forward(); !it.at_first(); it.forward()) {
01085     BLOBNBOX* blob = it.data();
01086     int left = blob->bounding_box().left();
01087     int right = blob->bounding_box().right();
01088     gap_stats.add(left - prev_blob->bounding_box().right(), 1);
01089     width_stats.add(right - left, 1);
01090     blob->set_flow(BTFT_NEIGHBOURS);
01091     prev_blob = blob;
01092     ++blob_count;
01093   }
01094   double median_gap = gap_stats.median();
01095   double median_width = width_stats.median();
01096   double max_width = MAX(median_gap, median_width);
01097   double min_width = MIN(median_gap, median_width);
01098   double gap_iqr = gap_stats.ile(0.75f) - gap_stats.ile(0.25f);  // Inter-quartile range of the gaps.
01099   if (textord_debug_tabfind >= 4) {
01100     tprintf("gap iqr = %g, blob_count=%d, limits=%g,%g\n",
01101             gap_iqr, blob_count, max_width * kMaxLeaderGapFractionOfMax,
01102             min_width * kMaxLeaderGapFractionOfMin);
01103   }
01104   if (gap_iqr < max_width * kMaxLeaderGapFractionOfMax &&
01105       gap_iqr < min_width * kMaxLeaderGapFractionOfMin &&
01106       blob_count >= kMinLeaderCount) {
01107     // This is stable enough to be called a leader, so check the widths.
01108     // Since leader dashes can join, run a dp cutting algorithm and go
01109     // on the cost.
01110     int offset = static_cast<int>(ceil(gap_iqr * 2));
01111     int min_step = static_cast<int>(median_gap + median_width + 0.5);
01112     int max_step = min_step + offset;
01113     min_step -= offset;
01114     // Pad the buffer with min_step/2 on each end.
01115     int part_left = bounding_box_.left() - min_step / 2;
01116     part_width += min_step;
01117     DPPoint* projection = new DPPoint[part_width];
01118     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01119       BLOBNBOX* blob = it.data();
01120       int left = blob->bounding_box().left();
01121       int right = blob->bounding_box().right();
01122       int height = blob->bounding_box().height();
01123       for (int x = left; x < right; ++x) {
01124         projection[left - part_left].AddLocalCost(height);  // NOTE(review): x is unused, so the whole width is charged at the left edge - confirm intended.
01125       }
01126     }
01127     DPPoint* best_end = DPPoint::Solve(min_step, max_step, false,
01128                                        &DPPoint::CostWithVariance,
01129                                        part_width, projection);
01130     if (best_end != NULL && best_end->total_cost() < blob_count) {
01131       // Good enough. Call it a leader.
01132       result = true;
01133       for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01134         BLOBNBOX* blob = it.data();
01135         TBOX box = blob->bounding_box();
01136         // If the first or last blob is spaced too much, don't mark it.
01137         if (it.at_first()) {
01138           int gap = it.data_relative(1)->bounding_box().left() -
01139                      blob->bounding_box().right();
01140           if (blob->bounding_box().width() + gap > max_step) {
01141             it.extract();
01142             continue;
01143           }
01144         }
01145         if (it.at_last()) {
01146           int gap = blob->bounding_box().left() -
01147                      it.data_relative(-1)->bounding_box().right();
01148           if (blob->bounding_box().width() + gap > max_step) {
01149             it.extract();
01150             break;
01151           }
01152         }
01153         blob->set_region_type(BRT_TEXT);
01154         blob->set_flow(BTFT_LEADER);
01155       }
01156       blob_type_ = BRT_TEXT;
01157       flow_ = BTFT_LEADER;
01158     } else if (textord_debug_tabfind) {
01159       if (best_end == NULL) {
01160         tprintf("No path\n");
01161       } else {
01162         tprintf("Total cost = %d vs allowed %d\n",  // One argument per %d: the cost, then the limit.
01163                 static_cast<int>(best_end->total_cost()), blob_count);
01164       }
01165     }
01166     delete [] projection;
01167   }
01168   return result;
01169 }
01170 
01171 // Given the result of TextlineProjection::EvaluateColPartition, (positive for
01172 // horizontal text, negative for vertical text, and near zero for non-text),
01173 // sets the blob_type_ and flow_ for this partition to indicate whether it
01174 // is strongly or weakly vertical or horizontal text, or non-text, and then
01175 // pushes the result to the blobs via SetBlobTypes.
01176 // The function assumes that the blob neighbours are valid (from
01177 // StrokeWidth::SetNeighbours) and that those neighbours have their region_type() set.
01178 void ColPartition::SetRegionAndFlowTypesFromProjectionValue(int value) {
01179   int blob_count = 0;        // Total # blobs.
01180   int good_blob_score_ = 0;  // Total # good strokewidth neighbours. A local despite the trailing underscore; only printed below.
01181   int noisy_count = 0;       // Total # neighbours marked as noise.
01182   int hline_count = 0;  // # blobs with region type BRT_HLINE.
01183   int vline_count = 0;  // # blobs with region type BRT_VLINE.
01184   BLOBNBOX_C_IT it(&boxes_);
01185   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01186     BLOBNBOX* blob = it.data();
01187     ++blob_count;
01188     noisy_count += blob->NoisyNeighbours();
01189     good_blob_score_ += blob->GoodTextBlob();
01190     if (blob->region_type() == BRT_HLINE) ++hline_count;
01191     if (blob->region_type() == BRT_VLINE) ++vline_count;
01192   }
01193   flow_ = BTFT_NEIGHBOURS;  // Defaults, used when no branch below applies.
01194   blob_type_ = BRT_UNKNOWN;
01195   if (hline_count > vline_count) {  // A majority of line blobs takes precedence over value.
01196     flow_ = BTFT_NONE;
01197     blob_type_ = BRT_HLINE;
01198   } else if (vline_count > hline_count) {
01199     flow_ = BTFT_NONE;
01200     blob_type_ = BRT_VLINE;
01201   } else if (value < -1 || 1 < value) {  // i.e. abs(value) > 1.
01202     int long_side;
01203     int short_side;
01204     if (value > 0) {
01205       long_side = bounding_box_.width();
01206       short_side = bounding_box_.height();
01207       blob_type_ = BRT_TEXT;
01208     } else {
01209       long_side = bounding_box_.height();
01210       short_side = bounding_box_.width();
01211       blob_type_ = BRT_VERT_TEXT;
01212     }
01213     // We will combine the old metrics using aspect ratio and blob counts
01214     // with the input value by allowing a strong indication to flip the
01215     // STRONG_CHAIN/CHAIN flow values.
01216     int strong_score = blob_count >= kHorzStrongTextlineCount ? 1 : 0;  // Ends up in the range 0..3.
01217     if (short_side > kHorzStrongTextlineHeight) ++strong_score;
01218     if (short_side * kHorzStrongTextlineAspect < long_side) ++strong_score;
01219     if (abs(value) >= kMinStrongTextValue)
01220       flow_ = BTFT_STRONG_CHAIN;
01221     else if (abs(value) >= kMinChainTextValue)
01222       flow_ = BTFT_CHAIN;
01223     else
01224       flow_ = BTFT_NEIGHBOURS;
01225     // Upgrade chain to strong chain if the other indicators are good
01226     if (flow_ == BTFT_CHAIN && strong_score == 3)
01227       flow_ = BTFT_STRONG_CHAIN;
01228     // Downgrade strong vertical text to chain if the indicators are bad.
01229     if (flow_ == BTFT_STRONG_CHAIN && value < 0 && strong_score < 2)
01230       flow_ = BTFT_CHAIN;
01231   }
01232   if (flow_ == BTFT_NEIGHBOURS) {
01233     // Check for noisy neighbours.
01234     if (noisy_count >= blob_count) {
01235       flow_ = BTFT_NONTEXT;
01236       blob_type_= BRT_NOISE;
01237     }
01238   }
01239   if (TabFind::WithinTestRegion(2, bounding_box_.left(),
01240                                 bounding_box_.bottom())) {
01241     tprintf("RegionFlowTypesFromProjectionValue count=%d, noisy=%d, score=%d,",
01242             blob_count, noisy_count, good_blob_score_);
01243     tprintf(" Projection value=%d, flow=%d, blob_type=%d\n",
01244             value, flow_, blob_type_);
01245     Print();
01246   }
01247   SetBlobTypes();
01248 }
01249 
01250 // Copies the partition's blob type and flow onto each of its blobs, leaving
01251 // the flow of leader blobs alone. Does nothing unless this owns its blobs.
01252 void ColPartition::SetBlobTypes() {
01253   if (!owns_blobs()) return;
01254   BLOBNBOX_C_IT blob_it(&boxes_);
01255   blob_it.mark_cycle_pt();
01256   while (!blob_it.cycled_list()) {
01257     BLOBNBOX* blob = blob_it.data();
01258     if (blob->flow() != BTFT_LEADER) blob->set_flow(flow_);
01259     blob->set_region_type(blob_type_);
01260     ASSERT_HOST(blob->owner() == NULL || blob->owner() == this);
01261     blob_it.forward();
01262   }
01263 }
01264 
01265 // Returns true if a decent baseline can be fitted through the blobs.
01266 // Works for both horizontal and vertical text.
01267 bool ColPartition::HasGoodBaseline() {
01268   // Approximation of the baseline.
01269   DetLineFit linepoints;
01270   // Calculation of the mean height on this line segment. Note that these
01271   // variable names apply to the context of a horizontal line, and work
01272   // analogously, rather than literally in the case of a vertical line.
01273   int total_height = 0;  // Sum of sizes across the line, interior blobs only.
01274   int coverage = 0;  // Sum of sizes along the line, interior blobs only.
01275   int height_count = 0;  // Number of interior blobs, i.e. all but the first and last.
01276   int width = 0;  // Extent between the first and last fitted points.
01277   BLOBNBOX_C_IT it(&boxes_);
01278   TBOX box(it.data()->bounding_box());  // NOTE(review): assumes boxes_ is not empty - confirm callers.
01279   // Accumulate points representing the baseline at the middle of each blob,
01280   // but add an additional point for each end of the line. This makes it
01281   // harder to fit a severe skew angle, as it is most likely not right.
01282   if (IsVerticalType()) {
01283     // For a vertical line, use the right side as the baseline.
01284     ICOORD first_pt(box.right(), box.bottom());
01285     // Use the bottom-right of the first (bottom) box, the top-right of the
01286     // last, and the middle-right of all others.
01287     linepoints.Add(first_pt);
01288     for (it.forward(); !it.at_last(); it.forward()) {
01289       BLOBNBOX* blob = it.data();
01290       box = blob->bounding_box();
01291       ICOORD box_pt(box.right(), (box.top() + box.bottom()) / 2);
01292       linepoints.Add(box_pt);
01293       total_height += box.width();
01294       coverage += box.height();
01295       ++height_count;
01296     }
01297     box = it.data()->bounding_box();
01298     ICOORD last_pt(box.right(), box.top());
01299     linepoints.Add(last_pt);
01300     width = last_pt.y() - first_pt.y();
01301
01302   } else {
01303     // Horizontal lines use the bottom as the baseline.
01304     TBOX box(it.data()->bounding_box());  // Shadows the outer box within this branch.
01305     // Use the bottom-left of the first box, the bottom-right of the last,
01306     // and the middle of all others.
01307     ICOORD first_pt(box.left(), box.bottom());
01308     linepoints.Add(first_pt);
01309     for (it.forward(); !it.at_last(); it.forward()) {
01310       BLOBNBOX* blob = it.data();
01311       box = blob->bounding_box();
01312       ICOORD box_pt((box.left() + box.right()) / 2, box.bottom());
01313       linepoints.Add(box_pt);
01314       total_height += box.height();
01315       coverage += box.width();
01316       ++height_count;
01317     }
01318     box = it.data()->bounding_box();
01319     ICOORD last_pt(box.right(), box.bottom());
01320     linepoints.Add(last_pt);
01321     width = last_pt.x() - first_pt.x();
01322   }
01323   // Maximum median error allowed to be a good text line.
01324   double max_error = kMaxBaselineError * total_height / height_count;  // NOTE(review): height_count is 0 with fewer than 3 blobs, so this divides by zero - confirm callers.
01325   ICOORD start_pt, end_pt;
01326   double error = linepoints.Fit(&start_pt, &end_pt);
01327   return error < max_error && coverage >= kMinBaselineCoverage * width;
01328 }
01329 
01330 // Adds this ColPartition to a matching WorkingPartSet if one can be found,
01331 // otherwise starts a new one in the appropriate column, ending the previous.
01332 void ColPartition::AddToWorkingSet(const ICOORD& bleft, const ICOORD& tright,
01333                                    int resolution,
01334                                    ColPartition_LIST* used_parts,
01335                                    WorkingPartSet_LIST* working_sets) {
01336   if (block_owned_)
01337     return;  // Done it already.
01338   block_owned_ = true;
01339   WorkingPartSet_IT it(working_sets);
01340   // If there is an upper partner use its working_set_ directly.
01341   ColPartition* partner = SingletonPartner(true);
01342   if (partner != NULL && partner->working_set_ != NULL) {
01343     working_set_ = partner->working_set_;
01344     working_set_->AddPartition(this);
01345     return;
01346   }
01347   if (partner != NULL && textord_debug_bugs) {
01348     tprintf("Partition with partner has no working set!:");
01349     Print();
01350     partner->Print();
01351   }
01352   // Search for the column that the left edge fits in.
01353   WorkingPartSet* work_set = NULL;
01354   it.move_to_first();
01355   int col_index = 0;
01356   for (it.mark_cycle_pt(); !it.cycled_list() &&
01357        col_index != first_column_;
01358         it.forward(), ++col_index);  // Empty body: just advance to entry first_column_.
01359   if (textord_debug_tabfind >= 2) {
01360     tprintf("Match is %s for:", (col_index & 1) ? "Real" : "Between");  // Odd indices are reported as real columns.
01361     Print();
01362   }
01363   if (it.cycled_list() && textord_debug_bugs) {
01364     tprintf("Target column=%d, only had %d\n", first_column_, col_index);
01365   }
01366   ASSERT_HOST(!it.cycled_list());  // first_column_ must exist in working_sets.
01367   work_set = it.data();
01368   // If last_column_ != first_column, then we need to scoop up all blocks
01369   // between here and the last_column_ and put back in work_set.
01370   if (!it.cycled_list() && last_column_ != first_column_ && !IsPulloutType()) {
01371     // Find the column that the right edge falls in.
01372     BLOCK_LIST completed_blocks;
01373     TO_BLOCK_LIST to_blocks;
01374     for (; !it.cycled_list() && col_index <= last_column_;
01375          it.forward(), ++col_index) {  // Continues from the first_column_ entry found above.
01376       WorkingPartSet* end_set = it.data();
01377       end_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts,
01378                                       &completed_blocks, &to_blocks);
01379     }
01380     work_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);
01381   }
01382   working_set_ = work_set;
01383   work_set->AddPartition(this);
01384 }
01385 
01386 // From the given block_parts list, builds one or more BLOCKs and
01387 // corresponding TO_BLOCKs, such that the line spacing is uniform in each.
01388 // Created blocks are appended to the end of completed_blocks and to_blocks.
01389 // The used partitions are put onto used_parts, as they may still be referred
01390 // to in the partition grid. bleft, tright and resolution are the bounds
01391 // and resolution of the original image.
01392 void ColPartition::LineSpacingBlocks(const ICOORD& bleft, const ICOORD& tright,
01393                                      int resolution,
01394                                      ColPartition_LIST* block_parts,
01395                                      ColPartition_LIST* used_parts,
01396                                      BLOCK_LIST* completed_blocks,
01397                                      TO_BLOCK_LIST* to_blocks) {
01398   int page_height = tright.y() - bleft.y();
01399   // Compute the initial spacing stats.
01400   ColPartition_IT it(block_parts);
01401   int part_count = 0;
01402   int max_line_height = 0;
01403 
01404   // TODO(joeliu): We should add some special logic for PT_INLINE_EQUATION type
01405   // because their line spacing with their neighbors maybe smaller and their
01406   // height may be slightly larger.
01407 
01408   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01409     ColPartition* part = it.data();
01410     ASSERT_HOST(!part->boxes()->empty());
01411     STATS side_steps(0, part->bounding_box().height());
01412     if (part->bounding_box().height() > max_line_height)
01413       max_line_height = part->bounding_box().height();
01414     BLOBNBOX_C_IT blob_it(part->boxes());
01415     int prev_bottom = blob_it.data()->bounding_box().bottom();
01416     for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
01417       BLOBNBOX* blob = blob_it.data();
01418       int bottom = blob->bounding_box().bottom();
01419       int step = bottom - prev_bottom;
01420       if (step < 0)
01421         step = -step;
01422       side_steps.add(step, 1);
01423       prev_bottom = bottom;
01424     }
01425     part->set_side_step(static_cast<int>(side_steps.median() + 0.5));
01426     if (!it.at_last()) {
01427       ColPartition* next_part = it.data_relative(1);
01428       part->set_bottom_spacing(part->median_bottom() -
01429                                next_part->median_bottom());
01430       part->set_top_spacing(part->median_top() - next_part->median_top());
01431     } else {
01432       part->set_bottom_spacing(page_height);
01433       part->set_top_spacing(page_height);
01434     }
01435     if (textord_debug_tabfind) {
01436       part->Print();
01437       tprintf("side step = %.2f, top spacing = %d, bottom spacing=%d\n",
01438               side_steps.median(), part->top_spacing(), part->bottom_spacing());
01439     }
01440     ++part_count;
01441   }
01442   if (part_count == 0)
01443     return;
01444 
01445   SmoothSpacings(resolution, page_height, block_parts);
01446 
01447   // Move the partitions into individual block lists and make the blocks.
01448   BLOCK_IT block_it(completed_blocks);
01449   TO_BLOCK_IT to_block_it(to_blocks);
01450   ColPartition_LIST spacing_parts;
01451   ColPartition_IT sp_block_it(&spacing_parts);
01452   int same_block_threshold = max_line_height * kMaxSameBlockLineSpacing;
01453   for (it.mark_cycle_pt(); !it.empty();) {
01454     ColPartition* part = it.extract();
01455     sp_block_it.add_to_end(part);
01456     it.forward();
01457     if (it.empty() || part->bottom_spacing() > same_block_threshold ||
01458         !part->SpacingsEqual(*it.data(), resolution)) {
01459       // There is a spacing boundary. Check to see if it.data() belongs
01460       // better in the current block or the next one.
01461       if (!it.empty() && part->bottom_spacing() <= same_block_threshold) {
01462         ColPartition* next_part = it.data();
01463         // If there is a size match one-way, then the middle line goes with
01464         // its matched size, otherwise it goes with the smallest spacing.
01465         ColPartition* third_part = it.at_last() ? NULL : it.data_relative(1);
01466         if (textord_debug_tabfind) {
01467           tprintf("Spacings unequal: upper:%d/%d, lower:%d/%d,"
01468                   " sizes %d %d %d\n",
01469                   part->top_spacing(), part->bottom_spacing(),
01470                   next_part->top_spacing(), next_part->bottom_spacing(),
01471                   part->median_size(), next_part->median_size(),
01472                   third_part != NULL ? third_part->median_size() : 0);
01473         }
01474         // We can only consider adding the next line to the block if the sizes
01475         // match and the lines are close enough for their size.
01476         if (part->SizesSimilar(*next_part) &&
01477             next_part->median_size() * kMaxSameBlockLineSpacing >
01478                 part->bottom_spacing() &&
01479             part->median_size() * kMaxSameBlockLineSpacing >
01480                 part->top_spacing()) {
01481           // Even now, we can only add it as long as the third line doesn't
01482           // match in the same way and have a smaller bottom spacing.
01483           if (third_part == NULL ||
01484               !next_part->SizesSimilar(*third_part) ||
01485               third_part->median_size() * kMaxSameBlockLineSpacing <=
01486                   next_part->bottom_spacing() ||
01487               next_part->median_size() * kMaxSameBlockLineSpacing <=
01488                   next_part->top_spacing() ||
01489                   next_part->bottom_spacing() > part->bottom_spacing()) {
01490             // Add to the current block.
01491             sp_block_it.add_to_end(it.extract());
01492             it.forward();
01493             if (textord_debug_tabfind) {
01494               tprintf("Added line to current block.\n");
01495             }
01496           }
01497         }
01498       }
01499       TO_BLOCK* to_block = MakeBlock(bleft, tright, &spacing_parts, used_parts);
01500       if (to_block != NULL) {
01501         to_block_it.add_to_end(to_block);
01502         block_it.add_to_end(to_block->block);
01503       }
01504       sp_block_it.set_to_list(&spacing_parts);
01505     } else {
01506       if (textord_debug_tabfind && !it.empty()) {
01507         ColPartition* next_part = it.data();
01508         tprintf("Spacings equal: upper:%d/%d, lower:%d/%d\n",
01509                 part->top_spacing(), part->bottom_spacing(),
01510                 next_part->top_spacing(), next_part->bottom_spacing(),
01511                 part->median_size(), next_part->median_size());
01512       }
01513     }
01514   }
01515 }
01516 
01517 // Helper function to clip the input pos to the given bleft, tright bounds.
      // Each coordinate of *pos is clamped independently and in place: x is
      // raised to bleft.x() or lowered to tright.x(), y likewise against
      // bleft.y() and tright.y(). bleft and tright are only read.
      // NOTE(review): assumes bleft <= tright in both coordinates; if not, the
      // upper-bound test runs last and wins - confirm callers guarantee this.
01518 static void ClipCoord(const ICOORD& bleft, const ICOORD& tright, ICOORD* pos) {
01519   if (pos->x() < bleft.x())
01520     pos->set_x(bleft.x());
01521   if (pos->x() > tright.x())
01522     pos->set_x(tright.x());
01523   if (pos->y() < bleft.y())
01524     pos->set_y(bleft.y());
01525   if (pos->y() > tright.y())
01526     pos->set_y(tright.y());
01527 }
01528 
01529 // Helper moves the blobs from the given list of block_parts into the block
01530 // itself. Sets up the block for (old) textline formation correctly for
01531 // vertical and horizontal text. The partitions are moved to used_parts
01532 // afterwards, as they cannot be deleted yet.
      // vertical_text selects blob width (true) or blob height (false) as the
      // size statistic, and the block width or height as the cap on
      // line_spacing.
      // Returns the new TO_BLOCK, or NULL for a text block that received no
      // blobs. In the NULL case both block and the TO_BLOCK are deleted here, so
      // the caller must not touch block afterwards.
01533 static TO_BLOCK* MoveBlobsToBlock(bool vertical_text, int line_spacing,
01534                                   BLOCK* block,
01535                                   ColPartition_LIST* block_parts,
01536                                   ColPartition_LIST* used_parts) {
01537   // Make a matching TO_BLOCK and put all the BLOBNBOXes from the parts in it.
01538   // Move all the parts to a done list as they are no longer needed, except
01539   // that they have to continue to exist until the part grid is deleted.
01540   // Compute the median blob size as we go, as the block needs to know.
01541   TBOX block_box(block->bounding_box());
01542   STATS sizes(0, MAX(block_box.width(), block_box.height()));
01543   bool text_type = block->poly_block()->IsText();
01544   ColPartition_IT it(block_parts);
01545   TO_BLOCK* to_block = new TO_BLOCK(block);
01546   BLOBNBOX_IT blob_it(&to_block->blobs);
01547   ColPartition_IT used_it(used_parts);
        // Every pass extracts the current partition, so the loop runs until
        // block_parts has been emptied.
01548   for (it.move_to_first(); !it.empty(); it.forward()) {
01549     ColPartition* part = it.extract();
01550     // Transfer blobs from all regions to the output blocks.
01551     // Blobs for non-text regions will be used to define the polygonal
01552     // bounds of the region.
01553     for (BLOBNBOX_C_IT bb_it(part->boxes()); !bb_it.empty();
01554          bb_it.forward()) {
01555       BLOBNBOX* bblob = bb_it.extract();
            // Dump diagnostics before the ownership assert below fires.
01556       if (bblob->owner() != part) {
01557         tprintf("Ownership incorrect for blob:");
01558         bblob->bounding_box().print();
01559         tprintf("Part=");
01560         part->Print();
01561         if (bblob->owner() == NULL) {
01562           tprintf("Not owned\n");
01563         } else {
01564           tprintf("Owner part:");
01565           bblob->owner()->Print();
01566         }
01567       }
01568       ASSERT_HOST(bblob->owner() == part);
01569       // Assert failure here is caused by arbitrarily changing the partition
01570       // type without also changing the blob type, such as in
01571       // InsertSmallBlobsAsUnknowns.
01572       ASSERT_HOST(!text_type || bblob->region_type() >= BRT_UNKNOWN);
01573       C_OUTLINE_LIST* outlines = bblob->cblob()->out_list();
01574       C_OUTLINE_IT ol_it(outlines);
            // The first outline is only inspected for text blocks (short-circuit).
01575       ASSERT_HOST(!text_type || ol_it.data()->pathlength() > 0);
01576       if (vertical_text)
01577         sizes.add(bblob->bounding_box().width(), 1);
01578       else
01579         sizes.add(bblob->bounding_box().height(), 1);
01580       blob_it.add_after_then_move(bblob);
01581     }
01582     used_it.add_to_end(part);
01583   }
        // A text block that ended up with no blobs is discarded entirely.
01584   if (text_type && blob_it.empty()) {
01585     delete block;
01586     delete to_block;
01587     return NULL;
01588   }
01589   to_block->line_size = sizes.median();
        // Cap line_spacing at the block extent across the text lines: width for
        // vertical text, height otherwise. max_blob_size is that extent plus one.
01590   if (vertical_text) {
01591     int block_width = block->bounding_box().width();
01592     if (block_width < line_spacing)
01593       line_spacing = block_width;
01594     to_block->line_spacing = static_cast<float>(line_spacing);
01595     to_block->max_blob_size = static_cast<float>(block_width + 1);
01596   } else {
01597     int block_height = block->bounding_box().height();
01598     if (block_height < line_spacing)
01599       line_spacing = block_height;
01600     to_block->line_spacing = static_cast<float>(line_spacing);
01601     to_block->max_blob_size = static_cast<float>(block_height + 1);
01602   }
01603   return to_block;
01604 }
01605 
01606 // Constructs a block from the given list of partitions.
01607 // Arguments are as LineSpacingBlocks above.
      // Returns NULL when block_parts is empty. If the first partition is
      // PT_VERTICAL_TEXT the work is delegated to MakeVerticalTextBlock.
      // Otherwise a polygonal block is built from clipped edge-run vertices and
      // the result of MoveBlobsToBlock is returned (which may itself be NULL).
01608 TO_BLOCK* ColPartition::MakeBlock(const ICOORD& bleft, const ICOORD& tright,
01609                                   ColPartition_LIST* block_parts,
01610                                   ColPartition_LIST* used_parts) {
01611   if (block_parts->empty())
01612     return NULL;  // Nothing to do.
01613   ColPartition_IT it(block_parts);
01614   ColPartition* part = it.data();
01615   PolyBlockType type = part->type();
01616   if (type == PT_VERTICAL_TEXT)
01617     return MakeVerticalTextBlock(bleft, tright, block_parts, used_parts);
01618   // LineSpacingBlocks has handed us a collection of evenly spaced lines and
01619   // put the average spacing in each partition, so we can just take the
01620   // linespacing from the first partition.
01621   int line_spacing = part->bottom_spacing();
        // A spacing below the median blob size is replaced by the height of the
        // first partition's bounding box.
01622   if (line_spacing < part->median_size())
01623     line_spacing = part->bounding_box().height();
01624   ICOORDELT_LIST vertices;
01625   ICOORDELT_IT vert_it(&vertices);
01626   ICOORD start, end;
        // Running bounding box of all (clipped) vertices emitted below.
01627   int min_x = MAX_INT32;
01628   int max_x = -MAX_INT32;
01629   int min_y = MAX_INT32;
01630   int max_y = -MAX_INT32;
        // iteration 0 collects left-edge runs until the iterator is at the first
        // element; the iterator is then moved to the last element and iteration 1
        // collects right-edge runs until the iterator is at the last element.
        // NOTE(review): LeftEdgeRun/RightEdgeRun are defined outside this chunk;
        // presumably they advance the iterator and fill start/end - confirm.
01631   int iteration = 0;
01632   do {
01633     if (iteration == 0)
01634       ColPartition::LeftEdgeRun(&it, &start, &end);
01635     else
01636       ColPartition::RightEdgeRun(&it, &start, &end);
01637     ClipCoord(bleft, tright, &start);
01638     ClipCoord(bleft, tright, &end);
01639     vert_it.add_after_then_move(new ICOORDELT(start));
01640     vert_it.add_after_then_move(new ICOORDELT(end));
01641     UpdateRange(start.x(), &min_x, &max_x);
01642     UpdateRange(end.x(), &min_x, &max_x);
01643     UpdateRange(start.y(), &min_y, &max_y);
01644     UpdateRange(end.y(), &min_y, &max_y);
01645     if ((iteration == 0 && it.at_first()) ||
01646         (iteration == 1 && it.at_last())) {
01647       ++iteration;
01648       it.move_to_last();
01649     }
01650   } while (iteration < 2);
01651   if (textord_debug_tabfind)
01652     tprintf("Making block at (%d,%d)->(%d,%d)\n",
01653             min_x, min_y, max_x, max_y);
01654   BLOCK* block = new BLOCK("", true, 0, 0, min_x, min_y, max_x, max_y);
01655   block->set_poly_block(new POLY_BLOCK(&vertices, type));
        // vertical_text = false: sizes and spacing are measured by height.
01656   return MoveBlobsToBlock(false, line_spacing, block, block_parts, used_parts);
01657 }
01658 
01659 // Constructs a block from the given list of vertical text partitions.
01660 // Currently only creates rectangular blocks.
      // Returns NULL when block_parts is empty, otherwise whatever
      // MoveBlobsToBlock returns for the new rectangular block.
      // bleft and tright are accepted but not used in this function.
01661 TO_BLOCK* ColPartition::MakeVerticalTextBlock(const ICOORD& bleft,
01662                                               const ICOORD& tright,
01663                                               ColPartition_LIST* block_parts,
01664                                               ColPartition_LIST* used_parts) {
01665   if (block_parts->empty())
01666     return NULL;  // Nothing to do.
01667   ColPartition_IT it(block_parts);
01668   ColPartition* part = it.data();
01669   TBOX block_box = part->bounding_box();
        // Line spacing is taken from the width of the first partition only.
01670   int line_spacing = block_box.width();
01671   PolyBlockType type = it.data()->type();
        // Accumulate every partition's box into block_box.
        // NOTE(review): TBOX::operator+= is presumably a bounding union - confirm.
01672   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01673     block_box += it.data()->bounding_box();
01674   }
01675   if (textord_debug_tabfind) {
01676     tprintf("Making block at:");
01677     block_box.print();
01678   }
01679   BLOCK* block = new BLOCK("", true, 0, 0, block_box.left(), block_box.bottom(),
01680                            block_box.right(), block_box.top());
01681   block->set_poly_block(new POLY_BLOCK(block_box, type));
        // vertical_text = true: sizes and spacing are measured by width.
01682   return MoveBlobsToBlock(true, line_spacing, block, block_parts, used_parts);
01683 }
01684 
01685 // Makes a TO_ROW matching this and moves all the blobs to it, transferring
01686 // ownership to the returned TO_ROW.
      // Returns NULL if this partition has no blobs, since the row is only
      // created when the first blob is seen. boxes_ is left empty afterwards.
01687 TO_ROW* ColPartition::MakeToRow() {
01688   BLOBNBOX_C_IT blob_it(&boxes_);
01689   TO_ROW* row = NULL;
        // Vertical partitions use the median blob width as the line size,
        // all others the median blob height (median_size_).
01690   int line_size = IsVerticalType() ? median_width_ : median_size_;
01691   // Add all the blobs to a single TO_ROW.
01692   for (; !blob_it.empty(); blob_it.forward()) {
01693     BLOBNBOX* blob = blob_it.extract();
01694 //    blob->compute_bounding_box();
01695     int top = blob->bounding_box().top();
01696     int bottom = blob->bounding_box().bottom();
          // The first blob creates the row; every later blob is appended to it.
01697     if (row == NULL) {
01698       row = new TO_ROW(blob, static_cast<float>(top),
01699                        static_cast<float>(bottom),
01700                        static_cast<float>(line_size));
01701     } else {
01702       row->add_blob(blob, static_cast<float>(top),
01703                     static_cast<float>(bottom),
01704                     static_cast<float>(line_size));
01705     }
01706   }
01707   return row;
01708 }
01709 
01710 // Returns a copy of everything except the list of boxes. The resulting
01711 // ColPartition is only suitable for keeping in a column candidate list.
      // The copy is heap-allocated with new and returned as a raw pointer.
      // Only the members assigned below are copied; the box list of the copy is
      // left as the constructor made it, and owns_blobs_ is forced to false.
01712 ColPartition* ColPartition::ShallowCopy() const {
01713   ColPartition* part = new ColPartition(blob_type_, vertical_);
01714   part->left_margin_ = left_margin_;
01715   part->right_margin_ = right_margin_;
01716   part->bounding_box_ = bounding_box_;
        // sizeof is taken on the source array, so both arrays must have the
        // same size (they are the same member of the same class).
01717   memcpy(part->special_blobs_densities_, special_blobs_densities_,
01718          sizeof(special_blobs_densities_));
01719   part->median_bottom_ = median_bottom_;
01720   part->median_top_ = median_top_;
01721   part->median_size_ = median_size_;
01722   part->median_left_ = median_left_;
01723   part->median_right_ = median_right_;
01724   part->median_width_ = median_width_;
01725   part->good_width_ = good_width_;
01726   part->good_column_ = good_column_;
01727   part->left_key_tab_ = left_key_tab_;
01728   part->right_key_tab_ = right_key_tab_;
01729   part->type_ = type_;
01730   part->flow_ = flow_;
01731   part->left_key_ = left_key_;
01732   part->right_key_ = right_key_;
01733   part->first_column_ = first_column_;
01734   part->last_column_ = last_column_;
01735   part->owns_blobs_ = false;
01736   return part;
01737 }
01738 
01739 ColPartition* ColPartition::CopyButDontOwnBlobs() {
        // Returns a ShallowCopy() of this whose box list additionally references
        // the same BLOBNBOX objects as this partition, in the same order.
        // The blob pointers are shared, not cloned, and the copy is marked as
        // not owning them.
01740   ColPartition* copy = ShallowCopy();
        // ShallowCopy() already clears owns_blobs_; this call restates it.
01741   copy->set_owns_blobs(false);
01742   BLOBNBOX_C_IT inserter(copy->boxes());
01743   BLOBNBOX_C_IT traverser(boxes());
01744   for (traverser.mark_cycle_pt(); !traverser.cycled_list(); traverser.forward())
01745     inserter.add_after_then_move(traverser.data());
01746   return copy;
01747 }
01748 
01749 #ifndef GRAPHICS_DISABLED
01750 // Provides a color for BBGrid to draw the rectangle.
01751 // Must be kept in sync with PolyBlockType.
      // Partitions whose type_ is still PT_UNKNOWN are colored from their blob
      // type and flow; all others from their PolyBlockType.
      // Compiled only when GRAPHICS_DISABLED is not defined.
01752 ScrollView::Color  ColPartition::BoxColor() const {
01753   if (type_ == PT_UNKNOWN)
01754     return BLOBNBOX::TextlineColor(blob_type_, flow_);
01755   return POLY_BLOCK::ColorForPolyBlockType(type_);
01756 }
01757 #endif  // GRAPHICS_DISABLED
01758 
01759 // Keep in sync with BlobRegionType.
      // One character per BlobRegionType value, indexed by blob_type_ in Print().
      // The array is sized BRT_COUNT + 1 to leave room for the terminating NUL.
01760 static char kBlobTypes[BRT_COUNT + 1] = "NHSRIUVT";
01761 
01762 // Prints debug information on this.
      // Emits a single tprintf line. The leading character is 'E' when boxes_
      // is empty and a space otherwise; 'T'/'B' reflect left_key_tab_ and
      // right_key_tab_. LeftAtY/RightAtY are evaluated at MidY().
      // The trailing ts/bs/ls/rs values are space_above_, space_below_,
      // space_to_left_ and space_to_right_.
01763 void ColPartition::Print() const {
01764   int y = MidY();
01765   tprintf("ColPart:%c(M%d-%c%d-B%d/%d,%d/%d)->(%dB-%d%c-%dM/%d,%d/%d)"
01766           " w-ok=%d, v-ok=%d, type=%d%c%d, fc=%d, lc=%d, boxes=%d"
01767           " ts=%d bs=%d ls=%d rs=%d\n",
01768           boxes_.empty() ? 'E' : ' ',
01769           left_margin_, left_key_tab_ ? 'T' : 'B', LeftAtY(y),
01770           bounding_box_.left(), median_left_,
01771           bounding_box_.bottom(), median_bottom_,
01772           bounding_box_.right(), RightAtY(y), right_key_tab_ ? 'T' : 'B',
01773           right_margin_, median_right_, bounding_box_.top(), median_top_,
01774           good_width_, good_column_, type_,
01775           kBlobTypes[blob_type_], flow_,
01776           first_column_, last_column_, boxes_.length(),
01777           space_above_, space_below_, space_to_left_, space_to_right_);
01778 }
01779 
01780 // Prints debug information on the colors.
      // Prints the red, green, blue and L_ALPHA_CHANNEL entries of color1_,
      // followed by only the red, green and blue entries of color2_.
01781 void ColPartition::PrintColors() {
01782   tprintf("Colors:(%d, %d, %d)%d -> (%d, %d, %d)\n",
01783           color1_[COLOR_RED], color1_[COLOR_GREEN], color1_[COLOR_BLUE],
01784           color1_[L_ALPHA_CHANNEL],
01785           color2_[COLOR_RED], color2_[COLOR_GREEN], color2_[COLOR_BLUE]);
01786 }
01787 
01788 // Sets the types of all partitions in the run to be the max of the types.
      // The run is this partition followed by the chain reached by repeatedly
      // calling SingletonPartner(false) until it returns NULL.
      // working_set_count is only used as the upper bound of the two column
      // histograms, whose results are currently unused (see the #if 0 code).
01789 void ColPartition::SmoothPartnerRun(int working_set_count) {
01790   STATS left_stats(0, working_set_count);
01791   STATS right_stats(0, working_set_count);
01792   PolyBlockType max_type = type_;
01793   ColPartition* partner;
        // First pass: find the largest type in the chain and histogram the
        // column range of partners that share this partition's column_set_.
01794   for (partner = SingletonPartner(false); partner != NULL;
01795        partner = partner->SingletonPartner(false)) {
01796     if (partner->type_ > max_type)
01797       max_type = partner->type_;
01798     if (column_set_ == partner->column_set_) {
01799       left_stats.add(partner->first_column_, 1);
01800       right_stats.add(partner->last_column_, 1);
01801     }
01802   }
01803   type_ = max_type;
01804   // TODO(rays) Either establish that it isn't necessary to set the columns,
01805   // or find a way to do it that does not cause an assert failure in
01806   // AddToWorkingSet.
01807 #if 0
01808   first_column_ = left_stats.mode();
01809   last_column_ = right_stats.mode();
01810   if (last_column_ < first_column_)
01811     last_column_ = first_column_;
01812 #endif
01813 
        // Second pass: push the maximum type down the same chain.
01814   for (partner = SingletonPartner(false); partner != NULL;
01815        partner = partner->SingletonPartner(false)) {
01816     partner->type_ = max_type;
01817 #if 0  // See TODO above
01818     if (column_set_ == partner->column_set_) {
01819       partner->first_column_ = first_column_;
01820       partner->last_column_ = last_column_;
01821     }
01822 #endif
01823   }
01824 }
01825 
01826 // ======= Scenario common to all Refine*Partners* functions =======
01827 // ColPartitions are aiming to represent textlines, or horizontal slices
01828 // of images, and we are trying to form bi-directional (upper/lower) chains
01829 // of UNIQUE partner ColPartitions that can be made into blocks.
01830 // The ColPartitions have previously been typed (see SetPartitionType)
01831 // according to a combination of the content type and
01832 // how they lie on the columns. We want to chain text into
01833 // groups of a single type, but image ColPartitions may have been typed
01834 // differently in different parts of the image, due to being non-rectangular.
01835 //
01836 // We previously ran a search for upper and lower partners, but there may
01837 // be more than one, and they may be of mixed types, so now we wish to
01838 // refine the partners down to at most one.
01839 // A heading may have multiple partners:
01840 // ===============================
01841 // ========  ==========  =========
01842 // ========  ==========  =========
01843 // but it should be a different type.
01844 // A regular flowing text line may have multiple partners:
01845 // ==================   ===================
01846 // =======   =================  ===========
01847 // This could be the start of a pull-out, or it might all be in a single
01848 // column and might be caused by tightly spaced text, bold words, bullets,
01849 // funny punctuation etc, all of which can cause textlines to be split into
01850 // multiple ColPartitions. Pullouts and figure captions should now be different
01851 // types so we can more aggressively merge groups of partners that all sit
01852 // in a single column.
01853 //
01854 // Cleans up the partners of the given type so that there is at most
01855 // one partner. This makes block creation simpler.
01856 // If get_desperate is true, goes to more desperate merge methods
01857 // to merge flowing text before breaking partnerships.
      // Passing type == PT_COUNT selects the final pass, which filters both
      // partner lists by type and then, if needed, by overlap.
      // If type is neither similar to type_ nor PT_COUNT, nothing is done.
01858 void ColPartition::RefinePartners(PolyBlockType type, bool get_desperate,
01859                                   ColPartitionGrid* grid) {
01860   if (TypesSimilar(type_, type)) {
          // Refine the upper partners first, then the lower partners.
01861     RefinePartnersInternal(true, get_desperate, grid);
01862     RefinePartnersInternal(false, get_desperate, grid);
01863   } else if (type == PT_COUNT) {
01864     // This is the final pass. Make sure only the correctly typed
01865     // partners survive, however many there are.
01866     RefinePartnersByType(true, &upper_partners_);
01867     RefinePartnersByType(false, &lower_partners_);
01868     // It is possible for a merge to have given a partition multiple
01869     // partners again, so the last resort is to use overlap which is
01870     // guaranteed to leave at most one partner left.
01871     if (!upper_partners_.empty() && !upper_partners_.singleton())
01872       RefinePartnersByOverlap(true, &upper_partners_);
01873     if (!lower_partners_.empty() && !lower_partners_.singleton())
01874       RefinePartnersByOverlap(false, &lower_partners_);
01875   }
01876 }
01877 
01879 
01880 // Cleans up the partners above if upper is true, else below.
01881 // If get_desperate is true, goes to more desperate merge methods
01882 // to merge flowing text before breaking partnerships.
      // Applies increasingly aggressive steps, stopping as soon as the chosen
      // list holds at most one partner: filter by type, remove shortcut
      // partnerships, merge text partners (flowing text with get_desperate
      // only), and finally keep just the partner with the best overlap.
01883 void ColPartition::RefinePartnersInternal(bool upper, bool get_desperate,
01884                                           ColPartitionGrid* grid) {
01885   ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
01886   if (!partners->empty() && !partners->singleton()) {
01887     RefinePartnersByType(upper, partners);
01888     if (!partners->empty() && !partners->singleton()) {
01889       // Check for transitive partnerships and break the cycle.
01890       RefinePartnerShortcuts(upper, partners);
01891       if (!partners->empty() && !partners->singleton()) {
01892         // Types didn't fix it. Flowing text keeps the one with the longest
01893         // sequence of singleton matching partners. All others max overlap.
01894         if (TypesSimilar(type_, PT_FLOWING_TEXT) && get_desperate) {
                // Try a merge that may not increase overlap first, and only
                // then the desperate variant that allows it.
01895           RefineTextPartnersByMerge(upper, false, partners, grid);
01896           if (!partners->empty() && !partners->singleton())
01897             RefineTextPartnersByMerge(upper, true, partners, grid);
01898         }
01899         // The last resort is to use overlap.
01900         if (!partners->empty() && !partners->singleton())
01901           RefinePartnersByOverlap(upper, partners);
01902       }
01903     }
01904   }
01905 }
01906 
01907 // Cleans up the partners above if upper is true, else below.
01908 // Restricts the partners to only desirable types. For text and BRT_HLINE this
01909 // means the same type_ , and for image types it means any image type.
      // NOTE(review): the visible code treats line types together with image and
      // PT_TABLE partitions (second branch), where a partnership survives only
      // if both sides have blob type BRT_POLYIMAGE - confirm the summary above.
      // Every removal is done on both sides: this is removed from the partner's
      // opposite-direction list and the partner is extracted from partners.
01910 void ColPartition::RefinePartnersByType(bool upper,
01911                                         ColPartition_CLIST* partners) {
01912   bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
01913                                          bounding_box_.bottom());
01914   if (debug) {
01915     tprintf("Refining %d %s partners by type for:\n",
01916             partners->length(), upper ? "Upper" : "Lower");
01917     Print();
01918   }
01919   ColPartition_C_IT it(partners);
01920   // Purify text by type.
01921   if (!IsImageType() && !IsLineType() && type() != PT_TABLE) {
01922     // Keep only partners matching type_.
01923     // Exception: PT_VERTICAL_TEXT is allowed to stay with the other
01924     // text types if it is the only partner.
          // NOTE(review): this loop applies TypesSimilar() to every partner and
          // has no single-partner special case of its own - confirm where the
          // exception above is implemented.
01925     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01926       ColPartition* partner = it.data();
01927       if (!TypesSimilar(type_, partner->type_)) {
01928         if (debug) {
01929           tprintf("Removing partner:");
01930           partner->Print();
01931         }
01932         partner->RemovePartner(!upper, this);
01933         it.extract();
01934       } else if (debug) {
01935         tprintf("Keeping partner:");
01936         partner->Print();
01937       }
01938     }
01939   } else {
01940     // Only polyimages are allowed to have partners of any kind!
01941     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01942       ColPartition* partner = it.data();
01943       if (partner->blob_type() != BRT_POLYIMAGE ||
01944           blob_type() != BRT_POLYIMAGE) {
01945         if (debug) {
01946           tprintf("Removing partner:");
01947           partner->Print();
01948         }
01949         partner->RemovePartner(!upper, this);
01950         it.extract();
01951       } else if (debug) {
01952         tprintf("Keeping partner:");
01953         partner->Print();
01954       }
01955     }
01956   }
01957 }
01958 
01959 // Cleans up the partners above if upper is true, else below.
01960 // Remove transitive partnerships: this<->a, and a<->b and this<->b.
01961 // Gets rid of this<->b, leaving a clean chain.
01962 // Also if we have this<->a and a<->this, then gets rid of this<->a, as
01963 // this has multiple partners.
      // At most one partnership is removed per scan. After a removal the scan
      // restarts with fresh iterators, and the whole process stops when a scan
      // removes nothing or partners holds at most one element.
01964 void ColPartition::RefinePartnerShortcuts(bool upper,
01965                                           ColPartition_CLIST* partners) {
01966   bool done_any = false;
01967   do {
01968     done_any = false;
01969     ColPartition_C_IT it(partners);
01970     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01971       ColPartition* a = it.data();
01972       // Check for a match between all of a's partners (it1/b1) and all
01973       // of this's partners (it2/b2).
01974       ColPartition_C_IT it1(upper ? &a->upper_partners_ : &a->lower_partners_);
01975       for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
01976         ColPartition* b1 = it1.data();
              // a lists this as a partner in the same direction in which this
              // lists a: drop a from this's list and this from a's opposite list.
01977         if (b1 == this) {
01978           done_any = true;
01979           it.extract();
01980           a->RemovePartner(!upper, this);
01981           break;
01982         }
01983         ColPartition_C_IT it2(partners);
01984         for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {
01985           ColPartition* b2 = it2.data();
01986           if (b1 == b2) {
01987             // Jackpot! b2 should not be a partner of this.
01988             it2.extract();
01989             b2->RemovePartner(!upper, this);
01990             done_any = true;
01991             // That potentially invalidated all the iterators, so break out
01992             // and start again.
01993             break;
01994           }
01995         }
01996         if (done_any)
01997           break;
01998       }
01999       if (done_any)
02000         break;
02001     }
02002   } while (done_any && !partners->empty() && !partners->singleton());
02003 }
02004 
02005 // Cleans up the partners above if upper is true, else below.
02006 // If multiple text partners can be merged, (with each other, NOT with this),
02007 // then do so.
02008 // If desperate is true, then an increase in overlap with the merge is
02009 // allowed. If the overlap increases, then the desperately_merged_ flag
02010 // is set, indicating that the textlines probably need to be regenerated
02011 // by aggressive line fitting/splitting, as there are probably vertically
02012 // joined blobs that cross textlines.
      // Only the first partner in the list is ever used as the merge target;
      // the loop stops as soon as no acceptable candidate is found for it.
      // Within this function upper is used only for the debug message.
02013 void ColPartition::RefineTextPartnersByMerge(bool upper, bool desperate,
02014                                              ColPartition_CLIST* partners,
02015                                              ColPartitionGrid* grid) {
02016   bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
02017                                          bounding_box_.bottom());
02018   if (debug) {
02019     tprintf("Refining %d %s partners by merge for:\n",
02020             partners->length(), upper ? "Upper" : "Lower");
02021     Print();
02022   }
02023   while (!partners->empty() && !partners->singleton()) {
02024     // Absorb will mess up the iterators, so we have to merge one partition
02025     // at a time and rebuild the iterators each time.
02026     ColPartition_C_IT it(partners);
02027     ColPartition* part = it.data();
02028     // Gather a list of merge candidates, from the list of partners, that
02029     // are all in the same single column. See general scenario comment above.
02030     ColPartition_CLIST candidates;
02031     ColPartition_C_IT cand_it(&candidates);
          // Start from the second partner and stop on wrapping back to the first.
02032     for (it.forward(); !it.at_first(); it.forward()) {
02033       ColPartition* candidate = it.data();
            // NOTE(review): first is compared with last and last with first. If
            // first_column_ <= last_column_ holds for both partitions, this can
            // only be true when all four values are equal - confirm invariant.
02034       if (part->first_column_ == candidate->last_column_ &&
02035           part->last_column_ == candidate->first_column_)
02036         cand_it.add_after_then_move(it.data());
02037     }
02038     int overlap_increase;
02039     ColPartition* candidate = grid->BestMergeCandidate(part, &candidates, debug,
02040                                                        NULL, &overlap_increase);
          // A merge that increases overlap is accepted only in desperate mode.
02041     if (candidate != NULL && (overlap_increase <= 0 || desperate)) {
02042       if (debug) {
02043         tprintf("Merging:hoverlap=%d, voverlap=%d, OLI=%d\n",
02044                 part->HCoreOverlap(*candidate), part->VCoreOverlap(*candidate),
02045                 overlap_increase);
02046       }
02047       // Remove before merge and re-insert to keep the integrity of the grid.
02048       grid->RemoveBBox(candidate);
02049       grid->RemoveBBox(part);
02050       part->Absorb(candidate, NULL);
02051       // We modified the box of part, so re-insert it into the grid.
02052       grid->InsertBBox(true, true, part);
02053       if (overlap_increase > 0)
02054         part->desperately_merged_ = true;
02055     } else {
02056       break;  // Can't merge.
02057     }
02058   }
02059 }
02060 
02061 // Cleans up the partners above if upper is true, else below.
02062 // Keep the partner with the biggest overlap.
      // Overlap is the horizontal overlap of the two bounding boxes. The first
      // partner is kept when no partner has a positive overlap, and the earliest
      // partner wins a tie. partners must not be empty: the first element is
      // read unconditionally.
02063 void ColPartition::RefinePartnersByOverlap(bool upper,
02064                                            ColPartition_CLIST* partners) {
02065   bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
02066                                          bounding_box_.bottom());
02067   if (debug) {
02068     tprintf("Refining %d %s partners by overlap for:\n",
02069             partners->length(), upper ? "Upper" : "Lower");
02070     Print();
02071   }
02072   ColPartition_C_IT it(partners);
02073   ColPartition* best_partner = it.data();
02074   // Find the partner with the best overlap.
02075   int best_overlap = 0;
02076   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
02077     ColPartition* partner = it.data();
          // Negative when the boxes do not overlap horizontally.
02078     int overlap = MIN(bounding_box_.right(), partner->bounding_box_.right())
02079                 - MAX(bounding_box_.left(), partner->bounding_box_.left());
02080     if (overlap > best_overlap) {
02081       best_overlap = overlap;
02082       best_partner = partner;
02083     }
02084   }
02085   // Keep only the best partner.
02086   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
02087     ColPartition* partner = it.data();
02088     if (partner != best_partner) {
02089       if (debug) {
02090         tprintf("Removing partner:");
02091         partner->Print();
02092       }
            // Break the partnership on both sides.
02093       partner->RemovePartner(!upper, this);
02094       it.extract();
02095     }
02096   }
02097 }
02098 
02099 // Return true if bbox belongs better in this than other.
      // Decision order: (1) false if the box sticks out of this partition's
      // margins; (2) true if it sticks out of other's margins; (3) the smaller
      // "miss" wins, where miss is the median band height (median_top_ -
      // median_bottom_) minus the vertical overlap of the box with that band;
      // (4) the larger overlap wins; (5) on a full tie, true when
      // median_top_ >= other.median_top_.
02100 bool ColPartition::ThisPartitionBetter(BLOBNBOX* bbox,
02101                                        const ColPartition& other) {
02102   TBOX box = bbox->bounding_box();
02103   // Margins take priority.
02104   int left = box.left();
02105   int right = box.right();
02106   if (left < left_margin_ || right > right_margin_)
02107     return false;
02108   if (left < other.left_margin_ || right > other.right_margin_)
02109     return true;
02110   int top = box.top();
02111   int bottom = box.bottom();
        // Vertical overlap of the box with each partition's median band;
        // negative when the box lies entirely outside the band.
02112   int this_overlap = MIN(top, median_top_) - MAX(bottom, median_bottom_);
02113   int other_overlap = MIN(top, other.median_top_) -
02114                       MAX(bottom, other.median_bottom_);
02115   int this_miss = median_top_ - median_bottom_ - this_overlap;
02116   int other_miss = other.median_top_ - other.median_bottom_ - other_overlap;
02117   if (TabFind::WithinTestRegion(3, box.left(), box.bottom())) {
02118     tprintf("Unique on (%d,%d)->(%d,%d) overlap %d/%d, miss %d/%d, mt=%d/%d\n",
02119             box.left(), box.bottom(), box.right(), box.top(),
02120             this_overlap, other_overlap, this_miss, other_miss,
02121             median_top_, other.median_top_);
02122   }
02123   if (this_miss < other_miss)
02124     return true;
02125   if (this_miss > other_miss)
02126     return false;
02127   if (this_overlap > other_overlap)
02128     return true;
02129   if (this_overlap < other_overlap)
02130     return false;
02131   return median_top_ >= other.median_top_;
02132 }
02133 
02134 // Returns the median of the top and bottom line-spacings of the partitions
02135 // from the current position of it until the iterator reports a completed
02136 // cycle. it is taken by value, so the caller's iterator does not move.
02137 static int MedianSpacing(int page_height, ColPartition_IT it) {
02138   STATS spacing_stats(0, page_height);
02139   for (; !it.cycled_list(); it.forward()) {
02140     ColPartition* part = it.data();
02141     spacing_stats.add(part->bottom_spacing(), 1);
02142     spacing_stats.add(part->top_spacing(), 1);
02143   }
02144   const double median = spacing_stats.median();
02145   return static_cast<int>(median + 0.5);
02146 }
02148 
02149 // Returns true if the column range [first_column_, last_column_] of this
02150 // overlaps that of part, ie the two partitions share a page column. Only
02151 // works after SetPartitionType has been called on both partitions. Useful
02152 // for a SideSearch that should stay within the same page column.
02153 //
02154 // Currently called by the table detection code to identify if potential table
02155 // partitions exist in the same column.
02156 bool ColPartition::IsInSameColumnAs(const ColPartition& part) const {
02157   // The ranges are disjoint only if one ends before the other begins.
02158   const bool ends_before = last_column_ < part.first_column_;
02159   const bool starts_after = first_column_ > part.last_column_;
02160   return !ends_before && !starts_after;
02161 }
02163 
02164 // Smooths the spacings in parts, in place, into groups of equal linespacing.
02165 // resolution is the resolution of the original image, used as a basis
02166 // for thresholds in change of spacing. page_height is in pixels.
02167 void ColPartition::SmoothSpacings(int resolution, int page_height,
02168                                   ColPartition_LIST* parts) {
02169   // The task would be trivial if we didn't have to allow for blips -
02170   // occasional offsets in spacing caused by anomalous text, such as all
02171   // caps, groups of descenders, joined words, Arabic etc.
02172   // The neighbourhood stores a consecutive group of partitions so that
02173   // blips can be detected correctly, yet conservatively enough to not
02174   // mistake genuine spacing changes for blips. See example below.
02175   ColPartition* neighbourhood[PN_COUNT];  // Sliding window, indexed by PN_*.
02176   ColPartition_IT it(parts);
02177   it.mark_cycle_pt();
02178   // Although we know nothing about the spacings in this list, the median is
02179   // used as an approximation to allow blips.
02180   // If parts of this block aren't spaced to the median, then we can't
02181   // accept blips in those parts, but we'll recalculate it each time we
02182   // split the block, so the median becomes more likely to match all the text.
02183   int median_space = MedianSpacing(page_height, it);  // Works on a copy of it.
02184   ColPartition_IT start_it(it);  // First partition of the current group.
02185   ColPartition_IT end_it(it);  // Follows PN_LOWER; becomes the next start_it.
02186   for (int i = 0; i < PN_COUNT; ++i) {  // Prime the window.
02187     if (i < PN_UPPER || it.cycled_list()) {
02188       neighbourhood[i] = NULL;  // Slot before the first line, or list used up.
02189     } else {
02190       if (i == PN_LOWER)
02191         end_it = it;
02192       neighbourhood[i] = it.data();
02193       it.forward();
02194     }
02195   }
02196   while (neighbourhood[PN_UPPER] != NULL) {
02197     // Test for end of a group. Normally SpacingsEqual is true within a group,
02198     // but in the case of a blip, it will be false. Here is an example:
02199     // Line enum   Spacing below (spacing between tops of lines)
02200     //  1   ABOVE2    20
02201     //  2   ABOVE1    20
02202     //  3   UPPER     15
02203     //  4   LOWER     25
02204     //  5   BELOW1    20
02205     //  6   BELOW2    20
02206     // Line 4 is all in caps (regular caps), so the spacing between line 3
02207     // and line 4 (looking at the tops) is smaller than normal, and the
02208     // spacing between line 4 and line 5 is larger than normal, but the
02209     // two of them add to twice the normal spacing.
02210     // The following if has to accept unequal spacings 3 times to pass the
02211     // blip (20/15, 15/25 and 25/20). When the blip is in the middle,
02212     // OKSpacingBlip tests that one of ABOVE1 and BELOW1 matches the median.
02213     // The first time, everything is shifted down 1, so OKSpacingBlip gets
02214     // neighbourhood+1 and we check that PN_UPPER matches the median. The last
02215     // time, everything is shifted up 1, so it gets neighbourhood-1 and we check
02216     // PN_LOWER. NOTE(review): neighbourhood-1 points before the array start;
02217     // OKSpacingBlip must never read parts[0] - confirm against the PN_* enum.
02218     if (neighbourhood[PN_LOWER] == NULL ||
02219         (!neighbourhood[PN_UPPER]->SpacingsEqual(*neighbourhood[PN_LOWER],
02220                                                  resolution) &&
02221          !OKSpacingBlip(resolution, median_space, neighbourhood) &&
02222          (!OKSpacingBlip(resolution, median_space, neighbourhood - 1) ||
02223           !neighbourhood[PN_LOWER]->SpacingEqual(median_space, resolution)) &&
02224          (!OKSpacingBlip(resolution, median_space, neighbourhood + 1) ||
02225           !neighbourhood[PN_UPPER]->SpacingEqual(median_space, resolution)))) {
02226       // The group has ended. PN_UPPER is the last member.
02227       // Compute the mean spacing over the group.
02228       ColPartition_IT sum_it(start_it);
02229       ColPartition* last_part = neighbourhood[PN_UPPER];
02230       double total_bottom = 0.0;
02231       double total_top = 0.0;
02232       int total_count = 0;
02233       ColPartition* upper = sum_it.data();
02234       // We do not process last_part, as its spacing is different.
02235       while (upper != last_part) {
02236         total_bottom += upper->bottom_spacing();
02237         total_top += upper->top_spacing();
02238         ++total_count;
02239         sum_it.forward();
02240         upper = sum_it.data();
02241       }
02242       if (total_count > 0) {
02243         // There were at least 2 lines, so set them all to the mean.
02244         int top_spacing = static_cast<int>(total_top / total_count + 0.5);
02245         int bottom_spacing = static_cast<int>(total_bottom / total_count + 0.5);
02246         if (textord_debug_tabfind) {
02247           tprintf("Spacing run ended. Cause:");
02248           if (neighbourhood[PN_LOWER] == NULL) {
02249             tprintf("No more lines\n");
02250           } else {
02251             tprintf("Spacing change. Spacings:\n");
02252             for (int i = 0; i < PN_COUNT; ++i) {
02253               if (neighbourhood[i] == NULL) {
02254                 tprintf("NULL");
02255                 if (i > 0 && neighbourhood[i - 1] != NULL) {
02256                   if (neighbourhood[i - 1]->SingletonPartner(false) != NULL) {
02257                     tprintf(" Lower partner:");
02258                     neighbourhood[i - 1]->SingletonPartner(false)->Print();
02259                   } else {
02260                     tprintf(" NULL lower partner:\n");
02261                   }
02262                 } else {
02263                   tprintf("\n");
02264                 }
02265               } else {
02266                 tprintf("Top = %d, bottom = %d\n",
02267                         neighbourhood[i]->top_spacing(),
02268                         neighbourhood[i]->bottom_spacing());
02269               }
02270             }
02271           }
02272           tprintf("Mean spacing = %d/%d\n", top_spacing, bottom_spacing);
02273         }
02274         sum_it = start_it;  // Second pass over the same partitions.
02275         upper = sum_it.data();
02276         while (upper != last_part) {
02277           upper->set_top_spacing(top_spacing);
02278           upper->set_bottom_spacing(bottom_spacing);
02279           if (textord_debug_tabfind) {
02280             tprintf("Setting mean on:");
02281             upper->Print();
02282           }
02283           sum_it.forward();
02284           upper = sum_it.data();
02285         }
02286       }
02287       // PN_LOWER starts the next group and end_it is the next start_it.
02288       start_it = end_it;
02289       // Recalculate the median spacing to maximize the chances of detecting
02290       // spacing blips.
02291       median_space = MedianSpacing(page_height, end_it);
02292     }
02293     // Shuffle pointers: slide the window one partition along the list.
02294     for (int j = 1; j < PN_COUNT; ++j) {
02295       neighbourhood[j - 1] = neighbourhood[j];
02296     }
02297     if (it.cycled_list()) {
02298       neighbourhood[PN_COUNT - 1] = NULL;
02299     } else {
02300       neighbourhood[PN_COUNT - 1] = it.data();
02301       it.forward();
02302     }
02303     end_it.forward();  // Keep end_it in step with the shifted window.
02304   }
02305 }
02306 
02307 // Tests whether the neighbourhood in parts qualifies as a spacing blip, as
02308 // described in SmoothSpacings: upper and lower must both exist and sum to an
02309 // OK spacing, and at least one of above1 and below1 must match the median.
02310 bool ColPartition::OKSpacingBlip(int resolution, int median_spacing,
02311                                  ColPartition** parts) {
02312   ColPartition* upper = parts[PN_UPPER];
02313   ColPartition* lower = parts[PN_LOWER];
02314   if (upper == NULL || lower == NULL) return false;
02315   if (!upper->SummedSpacingOK(*lower, median_spacing, resolution)) return false;
02316   // A neighbour on either side at the median spacing confirms the blip.
02317   ColPartition* above1 = parts[PN_ABOVE1];
02318   if (above1 != NULL && above1->SpacingEqual(median_spacing, resolution))
02319     return true;
02320   ColPartition* below1 = parts[PN_BELOW1];
02321   return below1 != NULL && below1->SpacingEqual(median_spacing, resolution);
02322 }
02323 
02324 // Returns true if the bottom and top spacings of this are each within their
02325 // resolution-dependent margin of the given spacing.
02326 bool ColPartition::SpacingEqual(int spacing, int resolution) const {
02327   if (!NearlyEqual(bottom_spacing_, spacing, BottomSpacingMargin(resolution)))
02328     return false;
02329   return NearlyEqual(top_spacing_, spacing, TopSpacingMargin(resolution));
02330 }
02332 
02333 // Returns true if the bottom spacings of this and other match and the top
02334 // spacings either match or sum to twice this bottom spacing, within margins.
02335 bool ColPartition::SpacingsEqual(const ColPartition& other,
02336                                  int resolution) const {
02337   // b_tol/t_tol: the larger of the two partitions' bottom/top margins.
02338   const int b_tol = MAX(BottomSpacingMargin(resolution),
02339                         other.BottomSpacingMargin(resolution));
02340   const int t_tol = MAX(TopSpacingMargin(resolution),
02341                         other.TopSpacingMargin(resolution));
02342   if (!NearlyEqual(bottom_spacing_, other.bottom_spacing_, b_tol)) return false;
02343   if (NearlyEqual(top_spacing_, other.top_spacing_, t_tol)) return true;
02344   return NearlyEqual(top_spacing_ + other.top_spacing_, bottom_spacing_ * 2,
02345                      b_tol);
02346 }
02346 
02347 // Returns true if the summed spacings of this and other match the given
02348 // spacing, or twice the given spacing, for both bottoms and tops, to within
02349 // margins dictated by the image resolution.
02350 bool ColPartition::SummedSpacingOK(const ColPartition& other,
02351                                    int spacing, int resolution) const {
02352   const int bottom_tol = MAX(BottomSpacingMargin(resolution),
02353                              other.BottomSpacingMargin(resolution));
02354   const int top_tol = MAX(TopSpacingMargin(resolution),
02355                           other.TopSpacingMargin(resolution));
02356   const int bottom_sum = bottom_spacing_ + other.bottom_spacing_;
02357   const int top_sum = top_spacing_ + other.top_spacing_;
02358   const bool single_ok = NearlyEqual(spacing, bottom_sum, bottom_tol) &&
02359                          NearlyEqual(spacing, top_sum, top_tol);
02360   return single_ok || (NearlyEqual(spacing * 2, bottom_sum, bottom_tol) &&
02361                        NearlyEqual(spacing * 2, top_sum, top_tol));
02362 }
02363 
02364 // Returns the spacing margin to apply to bottoms of text lines:
02365 // kMaxSpacingDrift scaled by resolution and rounded, plus side_step_.
02366 int ColPartition::BottomSpacingMargin(int resolution) const {
02367   return side_step_ + static_cast<int>(kMaxSpacingDrift * resolution + 0.5);
02368 }
02369 
02370 // Returns the spacing margin to apply to tops of text lines: the bottom
02371 // margin for resolution widened by a rounded fraction of median_size_.
02372 int ColPartition::TopSpacingMargin(int resolution) const {
02373   return BottomSpacingMargin(resolution) +
02374          static_cast<int>(kMaxTopSpacingFraction * median_size_ + 0.5);
02375 }
02376 
02377 // Returns true if neither median text size of this and other exceeds the
02378 // other one multiplied by kMaxSizeRatio.
02379 bool ColPartition::SizesSimilar(const ColPartition& other) const {
02380   if (median_size_ > other.median_size_ * kMaxSizeRatio) return false;
02381   return other.median_size_ <= median_size_ * kMaxSizeRatio;
02382 }
02383 
02384 // Helper that narrows [*margin_left, *margin_right], the bounds (as sort
02385 // keys) of the left margin of part of a block, to take in part. Returns false
02386 // and leaves the bounds alone if part's left margin is disjoint from them.
02387 static bool UpdateLeftMargin(const ColPartition& part,
02388                              int* margin_left, int* margin_right) {
02389   const TBOX& box = part.bounding_box();
02390   const int y_top = box.top();
02391   const int y_bottom = box.bottom();
02392   // Sort keys of the margin and of the box's left edge at both box heights.
02393   const int margin_key_top = part.SortKey(part.left_margin(), y_top);
02394   const int edge_key_top = part.SortKey(box.left(), y_top);
02395   const int margin_key_bottom = part.SortKey(part.left_margin(), y_bottom);
02396   const int edge_key_bottom = part.SortKey(box.left(), y_bottom);
02397   const int gap_low = MAX(margin_key_top, margin_key_bottom);
02398   const int gap_high = MIN(edge_key_top, edge_key_bottom);
02399   if (gap_low > *margin_right || gap_high < *margin_left)
02400     return false;  // Disjoint from the established margin.
02401   // Compatible: shrink the bounds to the intersection.
02402   if (gap_high < *margin_right) *margin_right = gap_high;
02403   if (gap_low > *margin_left) *margin_left = gap_low;
02404   return true;
02405 }
02406 
02407 // Computes a line segment, returned in start and end, from a group of left
02408 // edges of partitions iterated forwards from part_it. The group is the run
02409 // over which the intersection of the left margins is non-empty, ie the
02410 // rightmost left margin is to the left of the leftmost left bounding box
02411 // edge. On return the iterator is at the start of the next run.
02412 void ColPartition::LeftEdgeRun(ColPartition_IT* part_it,
02413                                ICOORD* start, ICOORD* end) {
02414   ColPartition* const first_part = part_it->data();
02415   ColPartition* cur = first_part;
02416   int start_y = cur->bounding_box_.top();
02417   if (!part_it->at_first()) {
02418     // Take the previous bottom edge if it is smaller, else meet it halfway.
02419     const int prev_bottom = part_it->data_relative(-1)->bounding_box_.bottom();
02420     if (prev_bottom != start_y)
02421       start_y = prev_bottom < start_y ? prev_bottom
02422                                       : (start_y + prev_bottom) / 2;
02423   }
02424   int margin_left = -MAX_INT32;
02425   int margin_right = MAX_INT32;
02426   UpdateLeftMargin(*cur, &margin_left, &margin_right);
02427   do {
02428     part_it->forward();
02429     cur = part_it->data();
02430   } while (!part_it->at_first() &&
02431            UpdateLeftMargin(*cur, &margin_left, &margin_right));
02432   // The run ended. If we were pushed inwards, compute the next run and grow
02433   // it backwards into this one, so that this run ends on a tight box.
02434   int next_left = -MAX_INT32;
02435   int next_right = MAX_INT32;
02436   UpdateLeftMargin(*cur, &next_left, &next_right);
02437   if (next_left > margin_right) {
02438     ColPartition_IT ahead_it(*part_it);
02439     do {
02440       ahead_it.forward();
02441       cur = ahead_it.data();
02442     } while (!ahead_it.at_first() &&
02443              UpdateLeftMargin(*cur, &next_left, &next_right));
02444     // Step back through the original run while it fits the next run.
02445     do {
02446       part_it->backward();
02447       cur = part_it->data();
02448     } while (cur != first_part &&
02449              UpdateLeftMargin(*cur, &next_left, &next_right));
02450     part_it->forward();
02451   }
02452   // The partition before the iterator is the last of this run.
02453   cur = part_it->data_relative(-1);
02454   int end_y = cur->bounding_box_.bottom();
02455   if (!part_it->at_first()) {
02456     const int next_run_top = part_it->data()->bounding_box_.top();
02457     if (next_run_top < end_y) end_y = (end_y + next_run_top) / 2;
02458   }
02459   start->set_y(start_y);
02460   start->set_x(cur->XAtY(margin_right, start_y));
02461   end->set_y(end_y);
02462   end->set_x(cur->XAtY(margin_right, end_y));
02463   if (textord_debug_tabfind && !part_it->at_first())
02464     tprintf("Left run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
02465             start_y, end_y, cur->XAtY(margin_left, end_y),
02466             end->x(), cur->left_margin_, cur->bounding_box_.left());
02467 }
02469 
02470 // Helper that narrows [*margin_left, *margin_right], the bounds (as sort
02471 // keys) of the right margin of part of a block, to take in part. Returns
02472 // false and leaves the bounds alone if part's right margin is disjoint.
02473 static bool UpdateRightMargin(const ColPartition& part,
02474                               int* margin_left, int* margin_right) {
02475   const TBOX& box = part.bounding_box();
02476   const int y_top = box.top();
02477   const int y_bottom = box.bottom();
02478   // Sort keys of the box's right edge and of the margin at both box heights.
02479   const int edge_key_top = part.SortKey(box.right(), y_top);
02480   const int margin_key_top = part.SortKey(part.right_margin(), y_top);
02481   const int edge_key_bottom = part.SortKey(box.right(), y_bottom);
02482   const int margin_key_bottom = part.SortKey(part.right_margin(), y_bottom);
02483   const int gap_low = MAX(edge_key_top, edge_key_bottom);
02484   const int gap_high = MIN(margin_key_top, margin_key_bottom);
02485   if (gap_low > *margin_right || gap_high < *margin_left)
02486     return false;  // Disjoint from the established margin.
02487   // Compatible: shrink the bounds to the intersection.
02488   if (gap_high < *margin_right) *margin_right = gap_high;
02489   if (gap_low > *margin_left) *margin_left = gap_low;
02490   return true;
02491 }
02492 
02493 // Computes a line segment, returned in start and end, from a group of right
02494 // edges of partitions iterated backwards from part_it. The group is the run
02495 // over which the intersection of the right margins is non-empty, ie the
02496 // leftmost right margin is to the right of the rightmost right bounding box
02497 // edge. On return the iterator is at the start of the next run.
02498 void ColPartition::RightEdgeRun(ColPartition_IT* part_it,
02499                                 ICOORD* start, ICOORD* end) {
02500   ColPartition* const first_part = part_it->data();
02501   ColPartition* cur = first_part;
02502   int start_y = cur->bounding_box_.bottom();
02503   if (!part_it->at_last()) {
02504     // Take the following top edge if it is larger, else meet it halfway.
02505     const int next_top = part_it->data_relative(1)->bounding_box_.top();
02506     if (next_top != start_y)
02507       start_y = next_top > start_y ? next_top : (start_y + next_top) / 2;
02508   }
02509   int margin_left = -MAX_INT32;
02510   int margin_right = MAX_INT32;
02511   UpdateRightMargin(*cur, &margin_left, &margin_right);
02512   do {
02513     part_it->backward();
02514     cur = part_it->data();
02515   } while (!part_it->at_last() &&
02516            UpdateRightMargin(*cur, &margin_left, &margin_right));
02517   // The run ended. If we were pushed inwards, compute the next run and grow
02518   // it back into this one, so that this run ends on a tight box.
02519   int next_left = -MAX_INT32;
02520   int next_right = MAX_INT32;
02521   UpdateRightMargin(*cur, &next_left, &next_right);
02522   if (next_right < margin_left) {
02523     ColPartition_IT ahead_it(*part_it);
02524     do {
02525       ahead_it.backward();
02526       cur = ahead_it.data();
02527     } while (!ahead_it.at_last() &&
02528              UpdateRightMargin(*cur, &next_left, &next_right));
02529     // Step forwards through the original run while it fits the next run.
02530     do {
02531       part_it->forward();
02532       cur = part_it->data();
02533     } while (cur != first_part &&
02534              UpdateRightMargin(*cur, &next_left, &next_right));
02535     part_it->backward();
02536   }
02537   // The partition after the iterator is the last of this run.
02538   cur = part_it->data_relative(1);
02539   int end_y = cur->bounding_box().top();
02540   if (!part_it->at_last()) {
02541     const int next_run_bottom = part_it->data()->bounding_box_.bottom();
02542     if (next_run_bottom > end_y) end_y = (end_y + next_run_bottom) / 2;
02543   }
02544   start->set_y(start_y);
02545   start->set_x(cur->XAtY(margin_left, start_y));
02546   end->set_y(end_y);
02547   end->set_x(cur->XAtY(margin_left, end_y));
02548   if (textord_debug_tabfind && !part_it->at_last())
02549     tprintf("Right run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
02550             start_y, end_y, end->x(), cur->XAtY(margin_right, end_y),
02551             cur->bounding_box_.right(), cur->right_margin_);
02552 }
02558 
02559 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines