tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/textord/colfind.cpp
Go to the documentation of this file.
00001 
00002 // File:        colfind.cpp
00003 // Description: Class to hold BLOBNBOXs in a grid for fast access
00004 //              to neighbours.
00005 // Author:      Ray Smith
00006 // Created:     Wed Jun 06 17:22:01 PDT 2007
00007 //
00008 // (C) Copyright 2007, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #ifdef _MSC_VER
00022 #pragma warning(disable:4244)  // Conversion warnings
00023 #endif
00024 
00025 // Include automatically generated configuration file if running autoconf.
00026 #ifdef HAVE_CONFIG_H
00027 #include "config_auto.h"
00028 #endif
00029 
00030 #include "colfind.h"
00031 
00032 #include "ccnontextdetect.h"
00033 #include "colpartition.h"
00034 #include "colpartitionset.h"
00035 #include "equationdetectbase.h"
00036 #include "linefind.h"
00037 #include "normalis.h"
00038 #include "strokewidth.h"
00039 #include "blobbox.h"
00040 #include "scrollview.h"
00041 #include "tablefind.h"
00042 #include "params.h"
00043 #include "workingpartset.h"
00044 
00045 namespace tesseract {
00046 
00047 // Minimum width (in pixels) to be considered when making columns.
00048 // TODO(rays) convert to inches, dependent on resolution.
00049 const int kMinColumnWidth = 100;
00050 // When assigning columns, the max number of misfit grid rows/ColPartitionSets
00051 // that can be ignored.
00052 const int kMaxIncompatibleColumnCount = 2;
00053 // Min fraction of ColPartition height to be overlapping for margin purposes.
00054 const double kMarginOverlapFraction = 0.25;
00055 // Max fraction of mean_column_gap_ for the gap between two partitions within a
00056 // column to allow them to merge.
00057 const double kHorizontalGapMergeFraction = 0.5;
00058 // Min fraction of grid size to not be considered likely noise.
00059 const double kMinNonNoiseFraction = 0.5;
00060 // Minimum gutter width as a fraction of gridsize; sets min_gutter_width_.
00061 const double kMinGutterWidthGrid = 0.5;
00062 // Max multiple of a partition's median size as a distance threshold for
00063 // adding noise blobs.
00064 const double kMaxDistToPartSizeRatio = 1.5;
00065 // Debug display switches and behaviour flags for column finding.
00066 BOOL_VAR(textord_tabfind_show_initial_partitions,
00067          false, "Show partition bounds");
00068 BOOL_VAR(textord_tabfind_show_reject_blobs,
00069          false, "Show blobs rejected as noise");
00070 INT_VAR(textord_tabfind_show_partitions, 0,
00071         "Show partition bounds, waiting if >1");
00072 BOOL_VAR(textord_tabfind_show_columns, false, "Show column bounds");
00073 BOOL_VAR(textord_tabfind_show_blocks, false, "Show final block bounds");
00074 BOOL_VAR(textord_tabfind_find_tables, true, "run table detection");
00075 // Debug window for the output blocks; created on demand by DisplayBlocks.
00076 ScrollView* ColumnFinder::blocks_win_ = NULL;
00077 
00078 // Gridsize is an estimate of the text size in the image. A suitable value
00079 // is in TO_BLOCK::line_size after find_components has been used to make
00080 // the blobs.
00081 // bleft and tright are the bounds of the image (or rectangle) being processed.
00082 // vlines is a (possibly empty) list of TabVector and vertical_x and y are
00083 // the sum logical vertical vector produced by LineFinder::FindVerticalLines.
00084 ColumnFinder::ColumnFinder(int gridsize,
00085                            const ICOORD& bleft, const ICOORD& tright,
00086                            int resolution, bool cjk_script,
00087                            TabVector_LIST* vlines, TabVector_LIST* hlines,
00088                            int vertical_x, int vertical_y)
00089   : TabFind(gridsize, bleft, tright, vlines, vertical_x, vertical_y,
00090             resolution),
00091     cjk_script_(cjk_script),
00092     min_gutter_width_(static_cast<int>(kMinGutterWidthGrid * gridsize)),
00093     mean_column_gap_(tright.x() - bleft.x()),
00094     reskew_(1.0f, 0.0f), rotation_(1.0f, 0.0f), rerotate_(1.0f, 0.0f),
00095     best_columns_(NULL), stroke_width_(NULL),
00096     part_grid_(gridsize, bleft, tright), nontext_map_(NULL),
00097     projection_(resolution),
00098     denorm_(NULL), input_blobs_win_(NULL), equation_detect_(NULL) {
00099   TabVector_IT h_it(&horizontal_lines_);
00100   h_it.add_list_after(hlines);
00101 }
00102 
00103 ColumnFinder::~ColumnFinder() {
00104   column_sets_.delete_data_pointers();
00105   if (best_columns_ != NULL) {
00106     delete [] best_columns_;
00107   }
00108   if (stroke_width_ != NULL)
00109     delete stroke_width_;
00110   delete input_blobs_win_;
00111   pixDestroy(&nontext_map_);
00112   while (denorm_ != NULL) {
00113     DENORM* dead_denorm = denorm_;
00114     denorm_ = const_cast<DENORM*>(denorm_->predecessor());
00115     delete dead_denorm;
00116   }
00117 
00118   // The ColPartitions are destroyed automatically, but any boxes in
00119   // the noise_parts_ list are owned and need to be deleted explicitly.
00120   ColPartition_IT part_it(&noise_parts_);
00121   for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {
00122     ColPartition* part = part_it.data();
00123     part->DeleteBoxes();
00124   }
00125   // Likewise any boxes in the good_parts_ list need to be deleted.
00126   // These are just the image parts. Text parts have already given their
00127   // boxes on to the TO_BLOCK, and have empty lists.
00128   part_it.set_to_list(&good_parts_);
00129   for (part_it.mark_cycle_pt(); !part_it.cycled_list(); part_it.forward()) {
00130     ColPartition* part = part_it.data();
00131     part->DeleteBoxes();
00132   }
00133   // Also, any blobs on the image_bblobs_ list need to have their cblobs
00134   // deleted. This only happens if there has been an early return from
00135   // FindColumns, as in a normal return, the blobs go into the grid and
00136   // end up in noise_parts_, good_parts_ or the output blocks.
00137   BLOBNBOX_IT bb_it(&image_bblobs_);
00138   for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
00139     BLOBNBOX* bblob = bb_it.data();
00140     delete bblob->cblob();
00141   }
00142 }
00143 
00144 // Performs initial processing on the blobs in the input_block:
00145 // Setup the part_grid, stroke_width_, nontext_map.
00146 // Obvious noise blobs are filtered out and used to mark the nontext_map_.
00147 // Initial stroke-width analysis is used to get local text alignment
00148 // direction, so the textline projection_ map can be setup.
00149 // On return, IsVerticallyAlignedText may be called (now optionally) to
00150 // determine the gross textline alignment of the page.
00151 void ColumnFinder::SetupAndFilterNoise(Pix* photo_mask_pix,
00152                                        TO_BLOCK* input_block) {
00153   part_grid_.Init(gridsize(), bleft(), tright());
00154   if (stroke_width_ != NULL)
00155     delete stroke_width_;
00156   stroke_width_ = new StrokeWidth(gridsize(), bleft(), tright());
00157   min_gutter_width_ = static_cast<int>(kMinGutterWidthGrid * gridsize());
00158   input_block->ReSetAndReFilterBlobs();
00159   #ifndef GRAPHICS_DISABLED
00160   if (textord_tabfind_show_blocks) {
00161     input_blobs_win_ = MakeWindow(0, 0, "Filtered Input Blobs");
00162     input_block->plot_graded_blobs(input_blobs_win_);
00163   }
00164   #endif  // GRAPHICS_DISABLED
00165   SetBlockRuleEdges(input_block);
00166   pixDestroy(&nontext_map_);
00167   // Run a preliminary strokewidth neighbour detection on the medium blobs.
00168   stroke_width_->SetNeighboursOnMediumBlobs(input_block);
00169   CCNonTextDetect nontext_detect(gridsize(), bleft(), tright());
00170   // Remove obvious noise and make the initial non-text map.
00171   nontext_map_ = nontext_detect.ComputeNonTextMask(textord_debug_tabfind,
00172                                                    photo_mask_pix, input_block);
00173   stroke_width_->FindTextlineDirectionAndFixBrokenCJK(cjk_script_, input_block);
00174   // Clear the strokewidth grid ready for rotation or leader finding.
00175   stroke_width_->Clear();
00176 }
00177 
00178 // Tests for vertical alignment of text (returning true if so), and generates
00179 // a list of blobs of moderate aspect ratio, in the most frequent writing
00180 // direction (in osd_blobs) for orientation and script detection to test
00181 // the character orientation.
00182 // block is the single block for the whole page or rectangle to be OCRed.
00183 // Note that the vertical alignment may be due to text whose writing direction
00184 // is vertical, like say Japanese, or due to text whose writing direction is
00185 // horizontal but whose text appears vertically aligned because the image is
00186 // not the right way up.
00187 bool ColumnFinder::IsVerticallyAlignedText(TO_BLOCK* block,
00188                                            BLOBNBOX_CLIST* osd_blobs) {
00189   return stroke_width_->TestVerticalTextDirection(block, osd_blobs);
00190 }
00191 
00192 // Rotates the blobs and the TabVectors so that the gross writing direction
00193 // (text lines) are horizontal and lines are read down the page.
00194 // Applied rotation stored in rotation_.
00195 // A second rotation is calculated for application during recognition to
00196 // make the rotated blobs upright for recognition.
00197 // Subsequent rotation stored in text_rotation_.
00198 //
00199 // Arguments:
00200 //   vertical_text_lines true if the text lines are vertical.
00201 //   recognition_rotation [0..3] is the number of anti-clockwise 90 degree
00202 //   rotations from osd required for the text to be upright and readable.
00203 void ColumnFinder::CorrectOrientation(TO_BLOCK* block,
00204                                       bool vertical_text_lines,
00205                                       int recognition_rotation) {
00206   const FCOORD anticlockwise90(0.0f, 1.0f);
00207   const FCOORD clockwise90(0.0f, -1.0f);
00208   const FCOORD rotation180(-1.0f, 0.0f);
00209   const FCOORD norotation(1.0f, 0.0f);
00210 
00211   text_rotation_ = norotation;
00212   // Rotate the page to make the text upright, as implied by
00213   // recognition_rotation.
00214   rotation_ = norotation;
00215   if (recognition_rotation == 1) {
00216     rotation_ = anticlockwise90;
00217   } else if (recognition_rotation == 2) {
00218     rotation_ = rotation180;
00219   } else if (recognition_rotation == 3) {
00220     rotation_ = clockwise90;
00221   }
00222   // We infer text writing direction to be vertical if there are several
00223   // vertical text lines detected, and horizontal if not. But if the page
00224   // orientation was determined to be 90 or 270 degrees, the true writing
00225   // direction is the opposite of what we inferred.
00226   if (recognition_rotation & 1) {
00227     vertical_text_lines = !vertical_text_lines;
00228   }
00229   // If we still believe the writing direction is vertical, we use the
00230   // convention of rotating the page ccw 90 degrees to make the text lines
00231   // horizontal, and mark the blobs for rotation cw 90 degrees for
00232   // classification so that the text order is correct after recognition.
00233   if (vertical_text_lines) {
00234     rotation_.rotate(anticlockwise90);
00235     text_rotation_.rotate(clockwise90);
00236   }
00237   // Set rerotate_ to the inverse of rotation_.
00238   rerotate_ = FCOORD(rotation_.x(), -rotation_.y());
00239   if (rotation_.x() != 1.0f || rotation_.y() != 0.0f) {
00240     // Rotate all the blobs and tab vectors.
00241     RotateBlobList(rotation_, &block->large_blobs);
00242     RotateBlobList(rotation_, &block->blobs);
00243     RotateBlobList(rotation_, &block->small_blobs);
00244     RotateBlobList(rotation_, &block->noise_blobs);
00245     TabFind::ResetForVerticalText(rotation_, rerotate_, &horizontal_lines_,
00246                                   &min_gutter_width_);
00247     part_grid_.Init(gridsize(), bleft(), tright());
00248     // Reset all blobs to initial state and filter by size.
00249     // Since they have rotated, the list they belong on could have changed.
00250     block->ReSetAndReFilterBlobs();
00251     SetBlockRuleEdges(block);
00252     stroke_width_->CorrectForRotation(rerotate_, &part_grid_);
00253   }
00254   if (textord_debug_tabfind) {
00255     tprintf("Vertical=%d, orientation=%d, final rotation=(%f, %f)+(%f,%f)\n",
00256             vertical_text_lines, recognition_rotation,
00257             rotation_.x(), rotation_.y(),
00258             text_rotation_.x(), text_rotation_.y());
00259   }
00260   // Setup the denormalization.
00261   ASSERT_HOST(denorm_ == NULL);
00262   denorm_ = new DENORM;
00263   denorm_->SetupNormalization(NULL, &rotation_, NULL,
00264                               0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f);
00265 }
00266 
00267 // Finds blocks of text, image, rule line, table etc, returning them in the
00268 // blocks and to_blocks
00269 // (Each TO_BLOCK points to the basic BLOCK and adds more information.)
00270 // Image blocks are generated by a combination of photo_mask_pix (which may
00271 // NOT be NULL) and the rejected text found during preliminary textline
00272 // finding.
00273 // The input_block is the result of a call to find_components, and contains
00274 // the blobs found in the image or rectangle to be OCRed. These blobs will be
00275 // removed and placed in the output blocks, while unused ones will be deleted.
00276 // If single_column is true, the input is treated as single column, but
00277 // it is still divided into blocks of equal line spacing/text size.
00278 // scaled_color is scaled down by scaled_factor from the input color image,
00279 // and may be NULL if the input was not color.
00280 // grey_pix is optional, but if present must match the photo_mask_pix in size,
00281 // and must be a *real* grey image instead of binary_pix * 255.
00282 // thresholds_pix is expected to be present iff grey_pix is present and
00283 // can be an integer factor reduction of the grey_pix. It represents the
00284 // thresholds that were used to create the binary_pix from the grey_pix.
00285 // Returns -1 if the user hits the 'd' key in the blocks window while running
00286 // in debug mode, which requests a retry with more debug info.
00287 int ColumnFinder::FindBlocks(PageSegMode pageseg_mode,
00288                              Pix* scaled_color, int scaled_factor,
00289                              TO_BLOCK* input_block, Pix* photo_mask_pix,
00290                              Pix* thresholds_pix, Pix* grey_pix,
00291                              BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) {
00292   pixOr(photo_mask_pix, photo_mask_pix, nontext_map_);
00293   stroke_width_->FindLeaderPartitions(input_block, &part_grid_);
00294   stroke_width_->RemoveLineResidue(&big_parts_);
00295   FindInitialTabVectors(NULL, min_gutter_width_, input_block);
00296   SetBlockRuleEdges(input_block);
00297   stroke_width_->GradeBlobsIntoPartitions(rerotate_, input_block, nontext_map_,
00298                                           denorm_, cjk_script_, &projection_,
00299                                           &part_grid_, &big_parts_);
00300   if (!PSM_SPARSE(pageseg_mode)) {
00301     ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_,
00302                                    input_block, this, &part_grid_, &big_parts_);
00303     ImageFind::TransferImagePartsToImageMask(rerotate_, &part_grid_,
00304                                              photo_mask_pix);
00305     ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_,
00306                                    input_block, this, &part_grid_, &big_parts_);
00307   }
00308   part_grid_.ReTypeBlobs(&image_bblobs_);
00309   TidyBlobs(input_block);
00310   Reset();
00311   // TODO(rays) need to properly handle big_parts_.
00312   ColPartition_IT p_it(&big_parts_);
00313   for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward())
00314     p_it.data()->DisownBoxesNoAssert();
00315   big_parts_.clear();
00316   delete stroke_width_;
00317   stroke_width_ = NULL;
00318   // Compute the edge offsets whether or not there is a grey_pix. It is done
00319   // here as the c_blobs haven't been touched by rotation or anything yet,
00320   // so no denorm is required, yet the text has been separated from image, so
00321   // no time is wasted running it on image blobs.
00322   input_block->ComputeEdgeOffsets(thresholds_pix, grey_pix);
00323 
00324   // A note about handling right-to-left scripts (Hebrew/Arabic):
00325   // The columns must be reversed and come out in right-to-left instead of
00326   // the normal left-to-right order. Because the left-to-right ordering
00327   // is implicit in many data structures, it is simpler to fool the algorithms
00328   // into thinking they are dealing with left-to-right text.
00329   // To do this, we reflect the needed data in the y-axis and then reflect
00330   // the blocks back after they have been created. This is a temporary
00331   // arrangment that is confined to this function only, so the reflection
00332   // is completely invisible in the output blocks.
00333   // The only objects reflected are:
00334   // The vertical separator lines that have already been found;
00335   // The bounding boxes of all BLOBNBOXES on all lists on the input_block
00336   // plus the image_bblobs. The outlines are not touched, since they are
00337   // not looked at.
00338   bool input_is_rtl = input_block->block->right_to_left();
00339   if (input_is_rtl) {
00340     // Reflect the vertical separator lines (member of TabFind).
00341     ReflectInYAxis();
00342     // Reflect the blob boxes.
00343     ReflectForRtl(input_block, &image_bblobs_);
00344     part_grid_.ReflectInYAxis();
00345   }
00346 
00347   if (!PSM_SPARSE(pageseg_mode)) {
00348     if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
00349       // No tab stops needed. Just the grid that FindTabVectors makes.
00350       DontFindTabVectors(&image_bblobs_, input_block, &deskew_, &reskew_);
00351     } else {
00352       SetBlockRuleEdges(input_block);
00353       // Find the tab stops, estimate skew, and deskew the tabs, blobs and
00354       // part_grid_.
00355       FindTabVectors(&horizontal_lines_, &image_bblobs_, input_block,
00356                      min_gutter_width_, &part_grid_, &deskew_, &reskew_);
00357       // Add the deskew to the denorm_.
00358       DENORM* new_denorm = new DENORM;
00359       new_denorm->SetupNormalization(NULL, &deskew_, denorm_,
00360                                      0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f);
00361       denorm_ = new_denorm;
00362     }
00363     SetBlockRuleEdges(input_block);
00364     part_grid_.SetTabStops(this);
00365 
00366     // Make the column_sets_.
00367     if (!MakeColumns(false)) {
00368       tprintf("Empty page!!\n");
00369       part_grid_.DeleteParts();
00370       return 0;  // This is an empty page.
00371     }
00372 
00373     // Refill the grid using rectangular spreading, and get the benefit
00374     // of the completed tab vectors marking the rule edges of each blob.
00375     Clear();
00376     #ifndef GRAPHICS_DISABLED
00377     if (textord_tabfind_show_reject_blobs) {
00378       ScrollView* rej_win = MakeWindow(500, 300, "Rejected blobs");
00379       input_block->plot_graded_blobs(rej_win);
00380     }
00381     #endif  // GRAPHICS_DISABLED
00382     InsertBlobsToGrid(false, false, &image_bblobs_, this);
00383     InsertBlobsToGrid(true, true, &input_block->blobs, this);
00384 
00385     part_grid_.GridFindMargins(best_columns_);
00386     // Split and merge the partitions by looking at local neighbours.
00387     GridSplitPartitions();
00388     // Resolve unknown partitions by adding to an existing partition, fixing
00389     // the type, or declaring them noise.
00390     part_grid_.GridFindMargins(best_columns_);
00391     GridMergePartitions();
00392     // Insert any unused noise blobs that are close enough to an appropriate
00393     // partition.
00394     InsertRemainingNoise(input_block);
00395     // Add horizontal line separators as partitions.
00396     GridInsertHLinePartitions();
00397     GridInsertVLinePartitions();
00398     // Recompute margins based on a local neighbourhood search.
00399     part_grid_.GridFindMargins(best_columns_);
00400     SetPartitionTypes();
00401   }
00402   if (textord_tabfind_show_initial_partitions) {
00403     ScrollView* part_win = MakeWindow(100, 300, "InitialPartitions");
00404     part_grid_.DisplayBoxes(part_win);
00405     DisplayTabVectors(part_win);
00406   }
00407 
00408   if (!PSM_SPARSE(pageseg_mode)) {
00409     if (equation_detect_) {
00410       equation_detect_->FindEquationParts(&part_grid_, best_columns_);
00411     }
00412     if (textord_tabfind_find_tables) {
00413       TableFinder table_finder;
00414       table_finder.Init(gridsize(), bleft(), tright());
00415       table_finder.set_resolution(resolution_);
00416       table_finder.set_left_to_right_language(
00417           !input_block->block->right_to_left());
00418       // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
00419       // insert dot-like noise into period_grid_
00420       table_finder.InsertCleanPartitions(&part_grid_, input_block);
00421       // Get Table Regions
00422       table_finder.LocateTables(&part_grid_, best_columns_, WidthCB(), reskew_);
00423     }
00424     GridRemoveUnderlinePartitions();
00425     part_grid_.DeleteUnknownParts(input_block);
00426 
00427     // Build the partitions into chains that belong in the same block and
00428     // refine into one-to-one links, then smooth the types within each chain.
00429     part_grid_.FindPartitionPartners();
00430     part_grid_.FindFigureCaptions();
00431     part_grid_.RefinePartitionPartners(true);
00432     SmoothPartnerRuns();
00433 
00434     #ifndef GRAPHICS_DISABLED
00435     if (textord_tabfind_show_partitions) {
00436       ScrollView* window = MakeWindow(400, 300, "Partitions");
00437       if (textord_debug_images)
00438         window->Image(AlignedBlob::textord_debug_pix().string(),
00439                       image_origin().x(), image_origin().y());
00440       part_grid_.DisplayBoxes(window);
00441       if (!textord_debug_printable)
00442         DisplayTabVectors(window);
00443       if (window != NULL && textord_tabfind_show_partitions > 1) {
00444         delete window->AwaitEvent(SVET_DESTROY);
00445       }
00446     }
00447     #endif  // GRAPHICS_DISABLED
00448     part_grid_.AssertNoDuplicates();
00449   }
00450   // Ownership of the ColPartitions moves from part_sets_ to part_grid_ here,
00451   // and ownership of the BLOBNBOXes moves to the ColPartitions.
00452   // (They were previously owned by the block or the image_bblobs list.)
00453   ReleaseBlobsAndCleanupUnused(input_block);
00454   // Ownership of the ColPartitions moves from part_grid_ to good_parts_ and
00455   // noise_parts_ here. In text blocks, ownership of the BLOBNBOXes moves
00456   // from the ColPartitions to the output TO_BLOCK. In non-text, the
00457   // BLOBNBOXes stay with the ColPartitions and get deleted in the destructor.
00458   if (PSM_SPARSE(pageseg_mode))
00459     part_grid_.ExtractPartitionsAsBlocks(blocks, to_blocks);
00460   else
00461     TransformToBlocks(blocks, to_blocks);
00462   if (textord_debug_tabfind) {
00463     tprintf("Found %d blocks, %d to_blocks\n",
00464             blocks->length(), to_blocks->length());
00465   }
00466 
00467   DisplayBlocks(blocks);
00468   RotateAndReskewBlocks(input_is_rtl, to_blocks);
00469   int result = 0;
00470   #ifndef GRAPHICS_DISABLED
00471   if (blocks_win_ != NULL) {
00472     bool waiting = false;
00473     do {
00474       waiting = false;
00475       SVEvent* event = blocks_win_->AwaitEvent(SVET_ANY);
00476       if (event->type == SVET_INPUT && event->parameter != NULL) {
00477         if (*event->parameter == 'd')
00478           result = -1;
00479         else
00480           blocks->clear();
00481       } else if (event->type == SVET_DESTROY) {
00482         blocks_win_ = NULL;
00483       } else {
00484         waiting = true;
00485       }
00486       delete event;
00487     } while (waiting);
00488   }
00489   #endif  // GRAPHICS_DISABLED
00490   return result;
00491 }
00492 
00493 // Get the rotation required to deskew, and its inverse rotation.
00494 void ColumnFinder::GetDeskewVectors(FCOORD* deskew, FCOORD* reskew) {
00495   *reskew = reskew_;
00496   *deskew = reskew_;
00497   deskew->set_y(-deskew->y());
00498 }
00499 
00500 void ColumnFinder::SetEquationDetect(EquationDetectBase* detect) {
00501   equation_detect_ = detect;
00502 }
00503 
00505 
00506 // Displays the blob and block bounding boxes in a window called Blocks.
00507 void ColumnFinder::DisplayBlocks(BLOCK_LIST* blocks) {
00508 #ifndef GRAPHICS_DISABLED
00509   if (textord_tabfind_show_blocks) {
00510     if (blocks_win_ == NULL)
00511       blocks_win_ = MakeWindow(700, 300, "Blocks");
00512     else
00513       blocks_win_->Clear();
00514     if (textord_debug_images)
00515       blocks_win_->Image(AlignedBlob::textord_debug_pix().string(),
00516                          image_origin().x(), image_origin().y());
00517     else
00518       DisplayBoxes(blocks_win_);
00519     BLOCK_IT block_it(blocks);
00520     int serial = 1;
00521     for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00522          block_it.forward()) {
00523       BLOCK* block = block_it.data();
00524       block->plot(blocks_win_, serial++,
00525                   textord_debug_printable ? ScrollView::BLUE
00526                                           : ScrollView::GREEN);
00527     }
00528     blocks_win_->Update();
00529   }
00530 #endif
00531 }
00532 
00533 // Displays the column edges at each grid y coordinate defined by
00534 // best_columns_.
00535 void ColumnFinder::DisplayColumnBounds(PartSetVector* sets) {
00536 #ifndef GRAPHICS_DISABLED
00537   ScrollView* col_win = MakeWindow(50, 300, "Columns");
00538   if (textord_debug_images)
00539     col_win->Image(AlignedBlob::textord_debug_pix().string(),
00540                    image_origin().x(), image_origin().y());
00541   else
00542     DisplayBoxes(col_win);
00543   col_win->Pen(textord_debug_printable ? ScrollView::BLUE : ScrollView::GREEN);
00544   for (int i = 0; i < gridheight_; ++i) {
00545     ColPartitionSet* columns = best_columns_[i];
00546     if (columns != NULL)
00547       columns->DisplayColumnEdges(i * gridsize_, (i + 1) * gridsize_, col_win);
00548   }
00549 #endif
00550 }
00551 
00552 // Sets up column_sets_ (the determined column layout at each horizontal
00553 // slice). Returns false if the page is empty.
00554 bool ColumnFinder::MakeColumns(bool single_column) {
00555   // The part_sets_ are a temporary structure used during column creation,
00556   // and is a vector of ColPartitionSets, representing ColPartitions found
00557   // at horizontal slices through the page.
00558   PartSetVector part_sets;
00559   if (!single_column) {
00560     if (!part_grid_.MakeColPartSets(&part_sets))
00561       return false;  // Empty page.
00562     ASSERT_HOST(part_grid_.gridheight() == gridheight_);
00563     // Try using only the good parts first.
00564     bool good_only = true;
00565     do {
00566       for (int i = 0; i < gridheight_; ++i) {
00567         ColPartitionSet* line_set = part_sets.get(i);
00568         if (line_set != NULL && line_set->LegalColumnCandidate()) {
00569           ColPartitionSet* column_candidate = line_set->Copy(good_only);
00570           if (column_candidate != NULL)
00571             column_candidate->AddToColumnSetsIfUnique(&column_sets_, WidthCB());
00572         }
00573       }
00574       good_only = !good_only;
00575     } while (column_sets_.empty() && !good_only);
00576     if (textord_debug_tabfind)
00577       PrintColumnCandidates("Column candidates");
00578     // Improve the column candidates against themselves.
00579     ImproveColumnCandidates(&column_sets_, &column_sets_);
00580     if (textord_debug_tabfind)
00581       PrintColumnCandidates("Improved columns");
00582     // Improve the column candidates using the part_sets_.
00583     ImproveColumnCandidates(&part_sets, &column_sets_);
00584   }
00585   ColPartitionSet* single_column_set =
00586       part_grid_.MakeSingleColumnSet(WidthCB());
00587   if (single_column_set != NULL) {
00588     // Always add the single column set as a backup even if not in
00589     // single column mode.
00590     single_column_set->AddToColumnSetsIfUnique(&column_sets_, WidthCB());
00591   }
00592   if (textord_debug_tabfind)
00593     PrintColumnCandidates("Final Columns");
00594   bool has_columns = !column_sets_.empty();
00595   if (has_columns) {
00596     // Divide the page into sections of uniform column layout.
00597     AssignColumns(part_sets);
00598     if (textord_tabfind_show_columns) {
00599       DisplayColumnBounds(&part_sets);
00600     }
00601     ComputeMeanColumnGap();
00602   }
00603   for (int i = 0; i < part_sets.size(); ++i) {
00604     ColPartitionSet* line_set = part_sets.get(i);
00605     if (line_set != NULL) {
00606       line_set->RelinquishParts();
00607       delete line_set;
00608     }
00609   }
00610   return has_columns;
00611 }
00612 
00613 // Attempt to improve the column_candidates by expanding the columns
00614 // and adding new partitions from the partition sets in src_sets.
00615 // Src_sets may be equal to column_candidates, in which case it will
00616 // use them as a source to improve themselves.
00617 void ColumnFinder::ImproveColumnCandidates(PartSetVector* src_sets,
00618                                            PartSetVector* column_sets) {
00619   PartSetVector temp_cols;
00620   temp_cols.move(column_sets);
00621   if (src_sets == column_sets)
00622     src_sets = &temp_cols;
00623   int set_size = temp_cols.size();
00624   // Try using only the good parts first.
00625   bool good_only = true;
00626   do {
00627     for (int i = 0; i < set_size; ++i) {
00628       ColPartitionSet* column_candidate = temp_cols.get(i);
00629       ASSERT_HOST(column_candidate != NULL);
00630       ColPartitionSet* improved = column_candidate->Copy(good_only);
00631       if (improved != NULL) {
00632         improved->ImproveColumnCandidate(WidthCB(), src_sets);
00633         improved->AddToColumnSetsIfUnique(column_sets, WidthCB());
00634       }
00635     }
00636     good_only = !good_only;
00637   } while (column_sets->empty() && !good_only);
00638   if (column_sets->empty())
00639     column_sets->move(&temp_cols);
00640   else
00641     temp_cols.delete_data_pointers();
00642 }
00643 
00644 // Prints debug information on the column candidates.
00645 void ColumnFinder::PrintColumnCandidates(const char* title) {
00646   int set_size =  column_sets_.size();
00647   tprintf("Found %d %s:\n", set_size, title);
00648   if (textord_debug_tabfind >= 3) {
00649     for (int i = 0; i < set_size; ++i) {
00650       ColPartitionSet* column_set = column_sets_.get(i);
00651       column_set->Print();
00652     }
00653   }
00654 }
00655 
00656 // Finds the optimal set of columns that cover the entire image with as
00657 // few changes in column partition as possible.
00658 // NOTE: this could be thought of as an optimization problem, but a simple
00659 // greedy algorithm is used instead. The algorithm repeatedly finds the modal
00660 // compatible column in an unassigned region and uses that with the extra
00661 // tweak of extending the modal region over small breaks in compatibility.
00662 // Where modal regions overlap, the boundary is chosen so as to minimize
00663 // the cost in terms of ColPartitions not fitting an approved column.
00664 void ColumnFinder::AssignColumns(const PartSetVector& part_sets) {
00665   int set_count = part_sets.size();
00666   ASSERT_HOST(set_count == gridheight());
00667   // Allocate and init the best_columns_.
00668   best_columns_ = new ColPartitionSet*[set_count];
00669   for (int y = 0; y < set_count; ++y)
00670     best_columns_[y] = NULL;
00671   int column_count = column_sets_.size();
00672   // column_set_costs[part_sets_ index][column_sets_ index] is
00673   // < MAX_INT32 if the partition set is compatible with the column set,
00674   // in which case its value is the cost for that set used in deciding
00675   // which competing set to assign.
00676   // any_columns_possible[part_sets_ index] is true if any of
00677   // possible_column_sets[part_sets_ index][*] is < MAX_INT32.
00678   // assigned_costs[part_sets_ index] is set to the column_set_costs
00679   // of the assigned column_sets_ index or MAX_INT32 if none is set.
00680   // On return the best_columns_ member is set.
00681   bool* any_columns_possible = new bool[set_count];
00682   int* assigned_costs = new int[set_count];
00683   int** column_set_costs = new int*[set_count];
00684   // Set possible column_sets to indicate whether each set is compatible
00685   // with each column.
00686   for (int part_i = 0; part_i < set_count; ++part_i) {
00687     ColPartitionSet* line_set = part_sets.get(part_i);
00688     bool debug = line_set != NULL &&
00689                  WithinTestRegion(2, line_set->bounding_box().left(),
00690                                   line_set->bounding_box().bottom());
00691     column_set_costs[part_i] = new int[column_count];
00692     any_columns_possible[part_i] = false;
00693     assigned_costs[part_i] = MAX_INT32;
00694     for (int col_i = 0; col_i < column_count; ++col_i) {
00695       if (line_set != NULL &&
00696           column_sets_.get(col_i)->CompatibleColumns(debug, line_set,
00697                                                      WidthCB())) {
00698         column_set_costs[part_i][col_i] =
00699             column_sets_.get(col_i)->UnmatchedWidth(line_set);
00700         any_columns_possible[part_i] = true;
00701       } else {
00702         column_set_costs[part_i][col_i] = MAX_INT32;
00703         if (debug)
00704           tprintf("Set id %d did not match at y=%d, lineset =%p\n",
00705                   col_i, part_i, line_set);
00706       }
00707     }
00708   }
00709   // Assign a column set to each vertical grid position.
00710   // While there is an unassigned range, find its mode.
00711   int start, end;
00712   while (BiggestUnassignedRange(set_count, any_columns_possible,
00713                                 &start, &end)) {
00714     if (textord_debug_tabfind >= 2)
00715       tprintf("Biggest unassigned range = %d- %d\n", start, end);
00716     // Find the modal column_set_id in the range.
00717     int column_set_id = RangeModalColumnSet(column_set_costs,
00718                                             assigned_costs, start, end);
00719     if (textord_debug_tabfind >= 2) {
00720       tprintf("Range modal column id = %d\n", column_set_id);
00721       column_sets_.get(column_set_id)->Print();
00722     }
00723     // Now find the longest run of the column_set_id in the range.
00724     ShrinkRangeToLongestRun(column_set_costs, assigned_costs,
00725                             any_columns_possible,
00726                             column_set_id, &start, &end);
00727     if (textord_debug_tabfind >= 2)
00728       tprintf("Shrunk range = %d- %d\n", start, end);
00729     // Extend the start and end past the longest run, while there are
00730     // only small gaps in compatibility that can be overcome by larger
00731     // regions of compatibility beyond.
00732     ExtendRangePastSmallGaps(column_set_costs, assigned_costs,
00733                              any_columns_possible,
00734                              column_set_id, -1, -1, &start);
00735     --end;
00736     ExtendRangePastSmallGaps(column_set_costs, assigned_costs,
00737                              any_columns_possible,
00738                              column_set_id, 1, set_count, &end);
00739     ++end;
00740     if (textord_debug_tabfind)
00741       tprintf("Column id %d applies to range = %d - %d\n",
00742               column_set_id, start, end);
00743     // Assign the column to the range, which now may overlap with other ranges.
00744     AssignColumnToRange(column_set_id, start, end, column_set_costs,
00745                         assigned_costs);
00746   }
00747   // If anything remains unassigned, the whole lot is unassigned, so
00748   // arbitrarily assign id 0.
00749   if (best_columns_[0] == NULL) {
00750     AssignColumnToRange(0, 0, gridheight_, column_set_costs, assigned_costs);
00751   }
00752   // Free memory.
00753   for (int i = 0; i < set_count; ++i) {
00754     delete [] column_set_costs[i];
00755   }
00756   delete [] assigned_costs;
00757   delete [] any_columns_possible;
00758   delete [] column_set_costs;
00759 }
00760 
00761 // Finds the biggest range in part_sets_ that has no assigned column, but
00762 // column assignment is possible.
00763 bool ColumnFinder::BiggestUnassignedRange(int set_count,
00764                                           const bool* any_columns_possible,
00765                                           int* best_start, int* best_end) {
00766   int best_range_size = 0;
00767   *best_start = set_count;
00768   *best_end = set_count;
00769   int end = set_count;
00770   for (int start = 0; start < gridheight_; start = end) {
00771     // Find the first unassigned index in start.
00772     while (start < set_count) {
00773       if (best_columns_[start] == NULL && any_columns_possible[start])
00774         break;
00775       ++start;
00776     }
00777     // Find the first past the end and count the good ones in between.
00778     int range_size = 1;  // Number of non-null, but unassigned line sets.
00779     end = start + 1;
00780     while (end < set_count) {
00781       if (best_columns_[end] != NULL)
00782         break;
00783       if (any_columns_possible[end])
00784         ++range_size;
00785       ++end;
00786     }
00787     if (start < set_count && range_size > best_range_size) {
00788       best_range_size = range_size;
00789       *best_start = start;
00790       *best_end = end;
00791     }
00792   }
00793   return *best_start < *best_end;
00794 }
00795 
00796 // Finds the modal compatible column_set_ index within the given range.
00797 int ColumnFinder::RangeModalColumnSet(int** column_set_costs,
00798                                       const int* assigned_costs,
00799                                       int start, int end) {
00800   int column_count = column_sets_.size();
00801   STATS column_stats(0, column_count);
00802   for (int part_i = start; part_i < end; ++part_i) {
00803     for (int col_j = 0; col_j < column_count; ++col_j) {
00804       if (column_set_costs[part_i][col_j] < assigned_costs[part_i])
00805         column_stats.add(col_j, 1);
00806     }
00807   }
00808   ASSERT_HOST(column_stats.get_total() > 0);
00809   return column_stats.mode();
00810 }
00811 
00812 // Given that there are many column_set_id compatible columns in the range,
00813 // shrinks the range to the longest contiguous run of compatibility, allowing
00814 // gaps where no columns are possible, but not where competing columns are
00815 // possible.
00816 void ColumnFinder::ShrinkRangeToLongestRun(int** column_set_costs,
00817                                            const int* assigned_costs,
00818                                            const bool* any_columns_possible,
00819                                            int column_set_id,
00820                                            int* best_start, int* best_end) {
00821   // orig_start and orig_end are the maximum range we will look at.
00822   int orig_start = *best_start;
00823   int orig_end = *best_end;
00824   int best_range_size = 0;
00825   *best_start = orig_end;
00826   *best_end = orig_end;
00827   int end = orig_end;
00828   for (int start = orig_start; start < orig_end; start = end) {
00829     // Find the first possible
00830     while (start < orig_end) {
00831       if (column_set_costs[start][column_set_id] < assigned_costs[start] ||
00832           !any_columns_possible[start])
00833         break;
00834       ++start;
00835     }
00836     // Find the first past the end.
00837     end = start + 1;
00838     while (end < orig_end) {
00839       if (column_set_costs[end][column_set_id] >= assigned_costs[start] &&
00840           any_columns_possible[end])
00841           break;
00842       ++end;
00843     }
00844     if (start < orig_end && end - start > best_range_size) {
00845       best_range_size = end - start;
00846       *best_start = start;
00847       *best_end = end;
00848     }
00849   }
00850 }
00851 
00852 // Moves start in the direction of step, upto, but not including end while
00853 // the only incompatible regions are no more than kMaxIncompatibleColumnCount
00854 // in size, and the compatible regions beyond are bigger.
00855 void ColumnFinder::ExtendRangePastSmallGaps(int** column_set_costs,
00856                                             const int* assigned_costs,
00857                                             const bool* any_columns_possible,
00858                                             int column_set_id,
00859                                             int step, int end, int* start) {
00860   if (textord_debug_tabfind > 2)
00861     tprintf("Starting expansion at %d, step=%d, limit=%d\n",
00862             *start, step, end);
00863   if (*start == end)
00864     return;  // Cannot be expanded.
00865 
00866   int barrier_size = 0;
00867   int good_size = 0;
00868   do {
00869     // Find the size of the incompatible barrier.
00870     barrier_size = 0;
00871     int i;
00872     for (i = *start + step; i != end; i += step) {
00873       if (column_set_costs[i][column_set_id] < assigned_costs[i])
00874         break;  // We are back on.
00875       // Locations where none are possible don't count.
00876       if (any_columns_possible[i])
00877         ++barrier_size;
00878     }
00879     if (textord_debug_tabfind > 2)
00880       tprintf("At %d, Barrier size=%d\n", i, barrier_size);
00881     if (barrier_size > kMaxIncompatibleColumnCount)
00882       return;  // Barrier too big.
00883     if (i == end) {
00884       // We can't go any further, but the barrier was small, so go to the end.
00885       *start = i - step;
00886       return;
00887     }
00888     // Now find the size of the good region on the other side.
00889     good_size = 1;
00890     for (i += step; i != end; i += step) {
00891       if (column_set_costs[i][column_set_id] < assigned_costs[i])
00892         ++good_size;
00893       else if (any_columns_possible[i])
00894         break;
00895     }
00896     if (textord_debug_tabfind > 2)
00897       tprintf("At %d, good size = %d\n", i, good_size);
00898     // If we had enough good ones we can extend the start and keep looking.
00899     if (good_size >= barrier_size)
00900       *start = i - step;
00901   } while (good_size >= barrier_size);
00902 }
00903 
00904 // Assigns the given column_set_id to the given range.
00905 void ColumnFinder::AssignColumnToRange(int column_set_id, int start, int end,
00906                                        int** column_set_costs,
00907                                        int* assigned_costs) {
00908   ColPartitionSet* column_set = column_sets_.get(column_set_id);
00909   for (int i = start; i < end; ++i) {
00910     assigned_costs[i] = column_set_costs[i][column_set_id];
00911     best_columns_[i] = column_set;
00912   }
00913 }
00914 
00915 // Computes the mean_column_gap_.
00916 void ColumnFinder::ComputeMeanColumnGap() {
00917   int total_gap = 0;
00918   int total_width = 0;
00919   int gap_samples = 0;
00920   int width_samples = 0;
00921   for (int i = 0; i < gridheight_; ++i) {
00922     ASSERT_HOST(best_columns_[i] != NULL);
00923     best_columns_[i]->AccumulateColumnWidthsAndGaps(&total_width,
00924                                                     &width_samples,
00925                                                     &total_gap,
00926                                                     &gap_samples);
00927   }
00928   mean_column_gap_ = gap_samples > 0 ? total_gap / gap_samples
00929                                      : total_width / width_samples;
00930 }
00931 
00934 
00935 // Helper to delete all the deletable blobs on the list. Owned blobs are
00936 // extracted from the list, but not deleted, leaving them owned by the owner().
00937 static void ReleaseAllBlobsAndDeleteUnused(BLOBNBOX_LIST* blobs) {
00938   for (BLOBNBOX_IT blob_it(blobs); !blob_it.empty(); blob_it.forward()) {
00939     BLOBNBOX* blob = blob_it.extract();
00940     if (blob->owner() == NULL) {
00941       delete blob->cblob();
00942       delete blob;
00943     }
00944   }
00945 }
00946 
00947 // Hoovers up all un-owned blobs and deletes them.
00948 // The rest get released from the block so the ColPartitions can pass
00949 // ownership to the output blocks.
00950 void ColumnFinder::ReleaseBlobsAndCleanupUnused(TO_BLOCK* block) {
00951   ReleaseAllBlobsAndDeleteUnused(&block->blobs);
00952   ReleaseAllBlobsAndDeleteUnused(&block->small_blobs);
00953   ReleaseAllBlobsAndDeleteUnused(&block->noise_blobs);
00954   ReleaseAllBlobsAndDeleteUnused(&block->large_blobs);
00955   ReleaseAllBlobsAndDeleteUnused(&image_bblobs_);
00956 }
00957 
00958 // Splits partitions that cross columns where they have nothing in the gap.
00959 void ColumnFinder::GridSplitPartitions() {
00960   // Iterate the ColPartitions in the grid.
00961   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
00962     gsearch(&part_grid_);
00963   gsearch.StartFullSearch();
00964   ColPartition* dont_repeat = NULL;
00965   ColPartition* part;
00966   while ((part = gsearch.NextFullSearch()) != NULL) {
00967     if (part->blob_type() < BRT_UNKNOWN || part == dont_repeat)
00968       continue;  // Only applies to text partitions.
00969     ColPartitionSet* column_set = best_columns_[gsearch.GridY()];
00970     int first_col = -1;
00971     int last_col = -1;
00972     // Find which columns the partition spans.
00973     part->ColumnRange(resolution_, column_set, &first_col, &last_col);
00974     if (first_col > 0)
00975       --first_col;
00976     // Convert output column indices to physical column indices.
00977     first_col /= 2;
00978     last_col /= 2;
00979     // We will only consider cases where a partition spans two columns,
00980     // since a heading that spans more columns than that is most likely
00981     // genuine.
00982     if (last_col != first_col + 1)
00983       continue;
00984     // Set up a rectangle search x-bounded by the column gap and y by the part.
00985     int y = part->MidY();
00986     TBOX margin_box = part->bounding_box();
00987     bool debug = AlignedBlob::WithinTestRegion(2, margin_box.left(),
00988                                                margin_box.bottom());
00989     if (debug) {
00990       tprintf("Considering partition for GridSplit:");
00991       part->Print();
00992     }
00993     ColPartition* column = column_set->GetColumnByIndex(first_col);
00994     if (column == NULL)
00995       continue;
00996     margin_box.set_left(column->RightAtY(y) + 2);
00997     column = column_set->GetColumnByIndex(last_col);
00998     if (column == NULL)
00999       continue;
01000     margin_box.set_right(column->LeftAtY(y) - 2);
01001     // TODO(rays) Decide whether to keep rectangular filling or not in the
01002     // main grid and therefore whether we need a fancier search here.
01003     // Now run the rect search on the main blob grid.
01004     GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> rectsearch(this);
01005     if (debug) {
01006       tprintf("Searching box (%d,%d)->(%d,%d)\n",
01007               margin_box.left(), margin_box.bottom(),
01008               margin_box.right(), margin_box.top());
01009       part->Print();
01010     }
01011     rectsearch.StartRectSearch(margin_box);
01012     BLOBNBOX* bbox;
01013     while ((bbox = rectsearch.NextRectSearch()) != NULL) {
01014       if (bbox->bounding_box().overlap(margin_box))
01015         break;
01016     }
01017     if (bbox == NULL) {
01018       // There seems to be nothing in the hole, so split the partition.
01019       gsearch.RemoveBBox();
01020       int x_middle = (margin_box.left() + margin_box.right()) / 2;
01021       if (debug) {
01022         tprintf("Splitting part at %d:", x_middle);
01023         part->Print();
01024       }
01025       ColPartition* split_part = part->SplitAt(x_middle);
01026       if (split_part != NULL) {
01027         if (debug) {
01028           tprintf("Split result:");
01029           part->Print();
01030           split_part->Print();
01031         }
01032         part_grid_.InsertBBox(true, true, split_part);
01033       } else {
01034         // Split had no effect
01035         if (debug)
01036           tprintf("Split had no effect\n");
01037         dont_repeat = part;
01038       }
01039       part_grid_.InsertBBox(true, true, part);
01040       gsearch.RepositionIterator();
01041     } else if (debug) {
01042       tprintf("Part cannot be split: blob (%d,%d)->(%d,%d) in column gap\n",
01043               bbox->bounding_box().left(), bbox->bounding_box().bottom(),
01044               bbox->bounding_box().right(), bbox->bounding_box().top());
01045     }
01046   }
01047 }
01048 
01049 // Merges partitions where there is vertical overlap, within a single column,
01050 // and the horizontal gap is small enough.
01051 void ColumnFinder::GridMergePartitions() {
01052   // Iterate the ColPartitions in the grid.
01053   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01054     gsearch(&part_grid_);
01055   gsearch.StartFullSearch();
01056   ColPartition* part;
01057   while ((part = gsearch.NextFullSearch()) != NULL) {
01058     if (part->IsUnMergeableType())
01059       continue;
01060     // Set up a rectangle search x-bounded by the column and y by the part.
01061     ColPartitionSet* columns = best_columns_[gsearch.GridY()];
01062     TBOX box = part->bounding_box();
01063     bool debug = AlignedBlob::WithinTestRegion(1, box.left(), box.bottom());
01064     if (debug) {
01065       tprintf("Considering part for merge at:");
01066       part->Print();
01067     }
01068     int y = part->MidY();
01069     ColPartition* left_column = columns->ColumnContaining(box.left(), y);
01070     ColPartition* right_column = columns->ColumnContaining(box.right(), y);
01071     if (left_column == NULL || right_column != left_column) {
01072       if (debug)
01073         tprintf("In different columns\n");
01074       continue;
01075     }
01076     box.set_left(left_column->LeftAtY(y));
01077     box.set_right(right_column->RightAtY(y));
01078     // Now run the rect search.
01079     bool modified_box = false;
01080     GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01081       rsearch(&part_grid_);
01082     rsearch.SetUniqueMode(true);
01083     rsearch.StartRectSearch(box);
01084     ColPartition* neighbour;
01085 
01086     while ((neighbour = rsearch.NextRectSearch()) != NULL) {
01087       if (neighbour == part || neighbour->IsUnMergeableType())
01088         continue;
01089       const TBOX& neighbour_box = neighbour->bounding_box();
01090       if (debug) {
01091         tprintf("Considering merge with neighbour at:");
01092         neighbour->Print();
01093       }
01094       if (neighbour_box.right() < box.left() ||
01095           neighbour_box.left() > box.right())
01096         continue;  // Not within the same column.
01097       if (part->VSignificantCoreOverlap(*neighbour) &&
01098           part->TypesMatch(*neighbour)) {
01099         // There is vertical overlap and the gross types match, but only
01100         // merge if the horizontal gap is small enough, as one of the
01101         // partitions may be a figure caption within a column.
01102         // If there is only one column, then the mean_column_gap_ is large
01103         // enough to allow almost any merge, by being the mean column width.
01104         const TBOX& part_box = part->bounding_box();
01105         // Don't merge if there is something else in the way. Use the margin
01106         // to decide, and check both to allow a bit of overlap.
01107         if (neighbour_box.left() > part->right_margin() &&
01108             part_box.right() < neighbour->left_margin())
01109           continue;  // Neighbour is too far to the right.
01110         if (neighbour_box.right() < part->left_margin() &&
01111             part_box.left() > neighbour->right_margin())
01112           continue;  // Neighbour is too far to the left.
01113         int h_gap = MAX(part_box.left(), neighbour_box.left()) -
01114                     MIN(part_box.right(), neighbour_box.right());
01115         if (h_gap < mean_column_gap_ * kHorizontalGapMergeFraction ||
01116             part_box.width() < mean_column_gap_ ||
01117             neighbour_box.width() < mean_column_gap_) {
01118           if (debug) {
01119             tprintf("Running grid-based merge between:\n");
01120             part->Print();
01121             neighbour->Print();
01122           }
01123           rsearch.RemoveBBox();
01124           gsearch.RepositionIterator();
01125           part->Absorb(neighbour, WidthCB());
01126           modified_box = true;
01127         } else if (debug) {
01128           tprintf("Neighbour failed hgap test\n");
01129         }
01130       } else if (debug) {
01131         tprintf("Neighbour failed overlap or typesmatch test\n");
01132       }
01133     }
01134     if (modified_box) {
01135       // We modified the box of part, so re-insert it into the grid.
01136       // This does no harm in the current cell, as it already exists there,
01137       // but it needs to exist in all the cells covered by its bounding box,
01138       // or it will never be found by a full search.
01139       // Because the box has changed, it has to be removed first, otherwise
01140       // add_sorted may fail to keep a single copy of the pointer.
01141       gsearch.RemoveBBox();
01142       part_grid_.InsertBBox(true, true, part);
01143       gsearch.RepositionIterator();
01144     }
01145   }
01146 }
01147 
01148 // Inserts remaining noise blobs into the most applicable partition if any.
01149 // If there is no applicable partition, then the blobs are deleted.
01150 void ColumnFinder::InsertRemainingNoise(TO_BLOCK* block) {
01151   BLOBNBOX_IT blob_it(&block->noise_blobs);
01152   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
01153     BLOBNBOX* blob = blob_it.data();
01154     if (blob->owner() != NULL) continue;
01155     TBOX search_box(blob->bounding_box());
01156     bool debug = WithinTestRegion(2, search_box.left(), search_box.bottom());
01157     search_box.pad(gridsize(), gridsize());
01158     // Setup a rectangle search to find the best partition to merge with.
01159     ColPartitionGridSearch rsearch(&part_grid_);
01160     rsearch.SetUniqueMode(true);
01161     rsearch.StartRectSearch(search_box);
01162     ColPartition* part;
01163     ColPartition* best_part = NULL;
01164     int best_distance = 0;
01165     while ((part = rsearch.NextRectSearch()) != NULL) {
01166       if (part->IsUnMergeableType())
01167         continue;
01168       int distance = projection_.DistanceOfBoxFromPartition(
01169           blob->bounding_box(), *part, denorm_, debug);
01170       if (best_part == NULL || distance < best_distance) {
01171         best_part = part;
01172         best_distance = distance;
01173       }
01174     }
01175     if (best_part != NULL &&
01176         best_distance < kMaxDistToPartSizeRatio * best_part->median_size()) {
01177       // Close enough to merge.
01178       if (debug) {
01179         tprintf("Adding noise blob with distance %d, thr=%g:box:",
01180                 best_distance,
01181                 kMaxDistToPartSizeRatio * best_part->median_size());
01182         blob->bounding_box().print();
01183         tprintf("To partition:");
01184         best_part->Print();
01185       }
01186       part_grid_.RemoveBBox(best_part);
01187       best_part->AddBox(blob);
01188       part_grid_.InsertBBox(true, true, best_part);
01189       blob->set_owner(best_part);
01190       blob->set_flow(best_part->flow());
01191       blob->set_region_type(best_part->blob_type());
01192     } else {
01193       // Mark the blob for deletion.
01194       blob->set_region_type(BRT_NOISE);
01195     }
01196   }
01197   // Delete the marked blobs, clearing neighbour references.
01198   block->DeleteUnownedNoise();
01199 }
01200 
01201 // Helper makes a box from a horizontal line.
01202 static TBOX BoxFromHLine(const TabVector* hline) {
01203   int top = MAX(hline->startpt().y(), hline->endpt().y());
01204   int bottom = MIN(hline->startpt().y(), hline->endpt().y());
01205   top += hline->mean_width();
01206   if (top == bottom) {
01207     if (bottom > 0)
01208       --bottom;
01209     else
01210       ++top;
01211   }
01212   return TBOX(hline->startpt().x(), bottom, hline->endpt().x(), top);
01213 }
01214 
01215 // Remove partitions that come from horizontal lines that look like
01216 // underlines, but are not part of a table.
01217 void ColumnFinder::GridRemoveUnderlinePartitions() {
01218   TabVector_IT hline_it(&horizontal_lines_);
01219   for (hline_it.mark_cycle_pt(); !hline_it.cycled_list(); hline_it.forward()) {
01220     TabVector* hline = hline_it.data();
01221     if (hline->intersects_other_lines())
01222       continue;
01223     TBOX line_box = BoxFromHLine(hline);
01224     TBOX search_box = line_box;
01225     search_box.pad(0, line_box.height());
01226     ColPartitionGridSearch part_search(&part_grid_);
01227     part_search.SetUniqueMode(true);
01228     part_search.StartRectSearch(search_box);
01229     ColPartition* covered;
01230     bool touched_table = false;
01231     bool touched_text = false;
01232     ColPartition* line_part = NULL;
01233     while ((covered = part_search.NextRectSearch()) != NULL) {
01234       if (covered->type() == PT_TABLE) {
01235         touched_table = true;
01236         break;
01237       } else if (covered->IsTextType()) {
01238         // TODO(rays) Add a list of underline sections to ColPartition.
01239         int text_bottom = covered->median_bottom();
01240         if (line_box.bottom() <= text_bottom && text_bottom <= search_box.top())
01241           touched_text = true;
01242       } else if (covered->blob_type() == BRT_HLINE &&
01243           line_box.contains(covered->bounding_box())) {
01244         line_part = covered;
01245       }
01246     }
01247     if (line_part != NULL && !touched_table && touched_text) {
01248       part_grid_.RemoveBBox(line_part);
01249       delete line_part;
01250     }
01251   }
01252 }
01253 
01254 // Add horizontal line separators as partitions.
01255 void ColumnFinder::GridInsertHLinePartitions() {
01256   TabVector_IT hline_it(&horizontal_lines_);
01257   for (hline_it.mark_cycle_pt(); !hline_it.cycled_list(); hline_it.forward()) {
01258     TabVector* hline = hline_it.data();
01259     TBOX line_box = BoxFromHLine(hline);
01260     ColPartition* part = ColPartition::MakeLinePartition(
01261         BRT_HLINE, vertical_skew_,
01262         line_box.left(), line_box.bottom(), line_box.right(), line_box.top());
01263     part->set_type(PT_HORZ_LINE);
01264     bool any_image = false;
01265     ColPartitionGridSearch part_search(&part_grid_);
01266     part_search.SetUniqueMode(true);
01267     part_search.StartRectSearch(line_box);
01268     ColPartition* covered;
01269     while ((covered = part_search.NextRectSearch()) != NULL) {
01270       if (covered->IsImageType()) {
01271         any_image = true;
01272         break;
01273       }
01274     }
01275     if (!any_image)
01276       part_grid_.InsertBBox(true, true, part);
01277     else
01278       delete part;
01279   }
01280 }
01281 
01282 // Add horizontal line separators as partitions.
01283 void ColumnFinder::GridInsertVLinePartitions() {
01284   TabVector_IT vline_it(dead_vectors());
01285   for (vline_it.mark_cycle_pt(); !vline_it.cycled_list(); vline_it.forward()) {
01286     TabVector* vline = vline_it.data();
01287     if (!vline->IsSeparator())
01288       continue;
01289     int left = MIN(vline->startpt().x(), vline->endpt().x());
01290     int right = MAX(vline->startpt().x(), vline->endpt().x());
01291     right += vline->mean_width();
01292     if (left == right) {
01293       if (left > 0)
01294         --left;
01295       else
01296         ++right;
01297     }
01298     ColPartition* part = ColPartition::MakeLinePartition(
01299         BRT_VLINE, vertical_skew_,
01300         left, vline->startpt().y(), right, vline->endpt().y());
01301     part->set_type(PT_VERT_LINE);
01302     bool any_image = false;
01303     ColPartitionGridSearch part_search(&part_grid_);
01304     part_search.SetUniqueMode(true);
01305     part_search.StartRectSearch(part->bounding_box());
01306     ColPartition* covered;
01307     while ((covered = part_search.NextRectSearch()) != NULL) {
01308       if (covered->IsImageType()) {
01309         any_image = true;
01310         break;
01311       }
01312     }
01313     if (!any_image)
01314       part_grid_.InsertBBox(true, true, part);
01315     else
01316       delete part;
01317   }
01318 }
01319 
01320 // For every ColPartition in the grid, sets its type based on position
01321 // in the columns.
01322 void ColumnFinder::SetPartitionTypes() {
01323   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01324     gsearch(&part_grid_);
01325   gsearch.StartFullSearch();
01326   ColPartition* part;
01327   while ((part = gsearch.NextFullSearch()) != NULL) {
01328     part->SetPartitionType(resolution_, best_columns_[gsearch.GridY()]);
01329   }
01330 }
01331 
01332 // Only images remain with multiple types in a run of partners.
01333 // Sets the type of all in the group to the maximum of the group.
01334 void ColumnFinder::SmoothPartnerRuns() {
01335   // Iterate the ColPartitions in the grid.
01336   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01337     gsearch(&part_grid_);
01338   gsearch.StartFullSearch();
01339   ColPartition* part;
01340   while ((part = gsearch.NextFullSearch()) != NULL) {
01341     ColPartition* partner = part->SingletonPartner(true);
01342     if (partner != NULL) {
01343       if (partner->SingletonPartner(false) != part) {
01344         tprintf("Ooops! Partition:(%d partners)",
01345                 part->upper_partners()->length());
01346         part->Print();
01347         tprintf("has singleton partner:(%d partners",
01348                 partner->lower_partners()->length());
01349         partner->Print();
01350         tprintf("but its singleton partner is:");
01351         if (partner->SingletonPartner(false) == NULL)
01352           tprintf("NULL\n");
01353         else
01354           partner->SingletonPartner(false)->Print();
01355       }
01356       ASSERT_HOST(partner->SingletonPartner(false) == part);
01357     } else if (part->SingletonPartner(false) != NULL) {
01358       ColPartitionSet* column_set = best_columns_[gsearch.GridY()];
01359       int column_count = column_set->ColumnCount();
01360       part->SmoothPartnerRun(column_count * 2 + 1);
01361     }
01362   }
01363 }
01364 
01365 // Helper functions for TransformToBlocks.
01366 // Add the part to the temp list in the correct order.
01367 void ColumnFinder::AddToTempPartList(ColPartition* part,
01368                                      ColPartition_CLIST* temp_list) {
01369   int mid_y = part->MidY();
01370   ColPartition_C_IT it(temp_list);
01371   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01372     ColPartition* test_part = it.data();
01373     if (part->type() == PT_NOISE || test_part->type() == PT_NOISE)
01374       continue;  // Noise stays in sequence.
01375     if (test_part == part->SingletonPartner(false))
01376       break;  // Insert before its lower partner.
01377     int neighbour_bottom = test_part->median_bottom();
01378     int neighbour_top = test_part->median_top();
01379     int neighbour_y = (neighbour_bottom + neighbour_top) / 2;
01380     if (neighbour_y < mid_y)
01381       break;  // part is above test_part so insert it.
01382     if (!part->HOverlaps(*test_part) && !part->WithinSameMargins(*test_part))
01383       continue;  // Incompatibles stay in order
01384   }
01385   if (it.cycled_list()) {
01386     it.add_to_end(part);
01387   } else {
01388     it.add_before_stay_put(part);
01389   }
01390 }
01391 
01392 // Add everything from the temp list to the work_set assuming correct order.
01393 void ColumnFinder::EmptyTempPartList(ColPartition_CLIST* temp_list,
01394                                      WorkingPartSet_LIST* work_set) {
01395   ColPartition_C_IT it(temp_list);
01396   while (!it.empty()) {
01397     it.extract()->AddToWorkingSet(bleft_, tright_, resolution_,
01398                           &good_parts_, work_set);
01399     it.forward();
01400   }
01401 }
01402 
01403 // Transform the grid of partitions to the output blocks.
01404 void ColumnFinder::TransformToBlocks(BLOCK_LIST* blocks,
01405                                      TO_BLOCK_LIST* to_blocks) {
01406   WorkingPartSet_LIST work_set;
01407   ColPartitionSet* column_set = NULL;
01408   ColPartition_IT noise_it(&noise_parts_);
01409   // The temp_part_list holds a list of parts at the same grid y coord
01410   // so they can be added in the correct order. This prevents thin objects
01411   // like horizontal lines going before the text lines above them.
01412   ColPartition_CLIST temp_part_list;
01413   // Iterate the ColPartitions in the grid. It starts at the top
01414   GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT>
01415     gsearch(&part_grid_);
01416   gsearch.StartFullSearch();
01417   int prev_grid_y = -1;
01418   ColPartition* part;
01419   while ((part = gsearch.NextFullSearch()) != NULL) {
01420     int grid_y = gsearch.GridY();
01421     if (grid_y != prev_grid_y) {
01422       EmptyTempPartList(&temp_part_list, &work_set);
01423       prev_grid_y = grid_y;
01424     }
01425     if (best_columns_[grid_y] != column_set) {
01426       column_set = best_columns_[grid_y];
01427       // Every line should have a non-null best column.
01428       ASSERT_HOST(column_set != NULL);
01429       column_set->ChangeWorkColumns(bleft_, tright_, resolution_,
01430                                     &good_parts_, &work_set);
01431       if (textord_debug_tabfind)
01432         tprintf("Changed column groups at grid index %d, y=%d\n",
01433                 gsearch.GridY(), gsearch.GridY() * gridsize());
01434     }
01435     if (part->type() == PT_NOISE) {
01436       noise_it.add_to_end(part);
01437     } else {
01438       AddToTempPartList(part, &temp_part_list);
01439     }
01440   }
01441   EmptyTempPartList(&temp_part_list, &work_set);
01442   // Now finish all working sets and transfer ColPartitionSets to block_sets.
01443   WorkingPartSet_IT work_it(&work_set);
01444   while (!work_it.empty()) {
01445     WorkingPartSet* working_set = work_it.extract();
01446     working_set->ExtractCompletedBlocks(bleft_, tright_, resolution_,
01447                                         &good_parts_, blocks, to_blocks);
01448     delete working_set;
01449     work_it.forward();
01450   }
01451 }
01452 
01453 // Helper reflects a list of blobs in the y-axis.
01454 // Only reflects the BLOBNBOX bounding box. Not the blobs or outlines below.
01455 static void ReflectBlobList(BLOBNBOX_LIST* bblobs) {
01456   BLOBNBOX_IT it(bblobs);
01457   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01458     it.data()->reflect_box_in_y_axis();
01459   }
01460 }
01461 
01462 // Reflect the blob boxes (but not the outlines) in the y-axis so that
01463 // the blocks get created in the correct RTL order. Reflects the blobs
01464 // in the input_block and the bblobs list.
01465 // The reflection is undone in RotateAndReskewBlocks by
01466 // reflecting the blocks themselves, and then recomputing the blob bounding
01467 // boxes.
01468 void ColumnFinder::ReflectForRtl(TO_BLOCK* input_block, BLOBNBOX_LIST* bblobs) {
01469   ReflectBlobList(bblobs);
01470   ReflectBlobList(&input_block->blobs);
01471   ReflectBlobList(&input_block->small_blobs);
01472   ReflectBlobList(&input_block->noise_blobs);
01473   ReflectBlobList(&input_block->large_blobs);
01474   // Update the denorm with the reflection.
01475   DENORM* new_denorm = new DENORM;
01476   new_denorm->SetupNormalization(NULL, NULL, denorm_,
01477                                  0.0f, 0.0f, -1.0f, 1.0f, 0.0f, 0.0f);
01478   denorm_ = new_denorm;
01479 }
01480 
01481 // Helper fixes up blobs and cblobs to match the desired rotation,
01482 // exploding multi-outline blobs back to single blobs and accumulating
01483 // the bounding box widths and heights.
01484 static void RotateAndExplodeBlobList(const FCOORD& blob_rotation,
01485                                      BLOBNBOX_LIST* bblobs,
01486                                      STATS* widths,
01487                                      STATS* heights) {
01488   BLOBNBOX_IT it(bblobs);
01489   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01490     BLOBNBOX* blob = it.data();
01491     C_BLOB* cblob = blob->cblob();
01492     C_OUTLINE_LIST* outlines = cblob->out_list();
01493     C_OUTLINE_IT ol_it(outlines);
01494     if (!outlines->singleton()) {
01495       // This blob has multiple outlines from CJK repair.
01496       // Explode the blob back into individual outlines.
01497       for (;!ol_it.empty(); ol_it.forward()) {
01498         C_OUTLINE* outline = ol_it.extract();
01499         BLOBNBOX* new_blob = BLOBNBOX::RealBlob(outline);
01500         // This blob will be revisited later since we add_after_stay_put here.
01501         // This means it will get rotated and have its width/height added to
01502         // the stats below.
01503         it.add_after_stay_put(new_blob);
01504       }
01505       it.extract();
01506       delete cblob;
01507       delete blob;
01508     } else {
01509       if (blob_rotation.x() != 1.0f || blob_rotation.y() != 0.0f) {
01510         cblob->rotate(blob_rotation);
01511       }
01512       blob->compute_bounding_box();
01513       widths->add(blob->bounding_box().width(), 1);
01514       heights->add(blob->bounding_box().height(), 1);
01515     }
01516   }
01517 }
01518 
01519 // Undo the deskew that was done in FindTabVectors, as recognition is done
01520 // without correcting blobs or blob outlines for skew.
01521 // Reskew the completed blocks to put them back to the original rotated coords
01522 // that were created by CorrectOrientation.
01523 // If the input_is_rtl, then reflect the blocks in the y-axis to undo the
01524 // reflection that was done before FindTabVectors.
01525 // Blocks that were identified as vertical text (relative to the rotated
01526 // coordinates) are further rotated so the text lines are horizontal.
01527 // blob polygonal outlines are rotated to match the position of the blocks
01528 // that they are in, and their bounding boxes are recalculated to be accurate.
01529 // Record appropriate inverse transformations and required
01530 // classifier transformation in the blocks.
01531 void ColumnFinder::RotateAndReskewBlocks(bool input_is_rtl,
01532                                          TO_BLOCK_LIST* blocks) {
01533   if (input_is_rtl) {
01534     // The skew is backwards because of the reflection.
01535     FCOORD tmp = deskew_;
01536     deskew_ = reskew_;
01537     reskew_ = tmp;
01538   }
01539   TO_BLOCK_IT it(blocks);
01540   int block_index = 1;
01541   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
01542     TO_BLOCK* to_block = it.data();
01543     BLOCK* block = to_block->block;
01544     // Blocks are created on the deskewed blob outlines in TransformToBlocks()
01545     // so we need to reskew them back to page coordinates.
01546     if (input_is_rtl) {
01547       block->reflect_polygon_in_y_axis();
01548     }
01549     block->rotate(reskew_);
01550     // Copy the right_to_left flag to the created block.
01551     block->set_right_to_left(input_is_rtl);
01552     // Save the skew angle in the block for baseline computations.
01553     block->set_skew(reskew_);
01554     block->set_index(block_index++);
01555     FCOORD blob_rotation = ComputeBlockAndClassifyRotation(block);
01556     // Rotate all the blobs if needed and recompute the bounding boxes.
01557     // Compute the block median blob width and height as we go.
01558     STATS widths(0, block->bounding_box().width());
01559     STATS heights(0, block->bounding_box().height());
01560     RotateAndExplodeBlobList(blob_rotation, &to_block->blobs,
01561                              &widths, &heights);
01562     TO_ROW_IT row_it(to_block->get_rows());
01563     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
01564       TO_ROW* row = row_it.data();
01565       RotateAndExplodeBlobList(blob_rotation, row->blob_list(),
01566                                &widths, &heights);
01567     }
01568     block->set_median_size(static_cast<int>(widths.median() + 0.5),
01569                            static_cast<int>(heights.median() + 0.5));
01570     if (textord_debug_tabfind >= 2)
01571       tprintf("Block median size = (%d, %d)\n",
01572               block->median_size().x(), block->median_size().y());
01573   }
01574 }
01575 
01576 // Computes the rotations for the block (to make textlines horizontal) and
01577 // for the blobs (for classification) and sets the appropriate members
01578 // of the given block.
01579 // Returns the rotation that needs to be applied to the blobs to make
01580 // them sit in the rotated block.
01581 FCOORD ColumnFinder::ComputeBlockAndClassifyRotation(BLOCK* block) {
01582   // The text_rotation_ tells us the gross page text rotation that needs
01583   // to be applied for classification
01584   // TODO(rays) find block-level classify rotation by orientation detection.
01585   // In the mean time, assume that "up" for text printed in the minority
01586   // direction (PT_VERTICAL_TEXT) is perpendicular to the line of reading.
01587   // Accomplish this by zero-ing out the text rotation.  This covers the
01588   // common cases of image credits in documents written in Latin scripts
01589   // and page headings for predominantly vertically written CJK books.
01590   FCOORD classify_rotation(text_rotation_);
01591   FCOORD block_rotation(1.0f, 0.0f);
01592   if (block->poly_block()->isA() == PT_VERTICAL_TEXT) {
01593     // Vertical text needs to be 90 degrees rotated relative to the rest.
01594     // If the rest has a 90 degree rotation already, use the inverse, making
01595     // the vertical text the original way up. Otherwise use 90 degrees
01596     // clockwise.
01597     if (rerotate_.x() == 0.0f)
01598       block_rotation = rerotate_;
01599     else
01600       block_rotation = FCOORD(0.0f, -1.0f);
01601     block->rotate(block_rotation);
01602     classify_rotation = FCOORD(1.0f, 0.0f);
01603   }
01604   block_rotation.rotate(rotation_);
01605   // block_rotation is now what we have done to the blocks. Now do the same
01606   // thing to the blobs, but save the inverse rotation in the block, as that
01607   // is what we need to DENORM back to the image coordinates.
01608   FCOORD blob_rotation(block_rotation);
01609   block_rotation.set_y(-block_rotation.y());
01610   block->set_re_rotation(block_rotation);
01611   block->set_classify_rotation(classify_rotation);
01612   if (textord_debug_tabfind) {
01613     tprintf("Blk %d, type %d rerotation(%.2f, %.2f), char(%.2f,%.2f), box:",
01614             block->index(), block->poly_block()->isA(),
01615             block->re_rotation().x(), block->re_rotation().y(),
01616             classify_rotation.x(), classify_rotation.y());
01617     block->bounding_box().print();
01618   }
01619   return blob_rotation;
01620 }
01621 
01622 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines