// tesseract 3.03
00001 00002 // File: colfind.h 00003 // Description: Class to find columns in the grid of BLOBNBOXes. 00004 // Author: Ray Smith 00005 // Created: Thu Feb 21 14:04:01 PST 2008 00006 // 00007 // (C) Copyright 2008, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifndef TESSERACT_TEXTORD_COLFIND_H__ 00021 #define TESSERACT_TEXTORD_COLFIND_H__ 00022 00023 #include "tabfind.h" 00024 #include "imagefind.h" 00025 #include "colpartitiongrid.h" 00026 #include "colpartitionset.h" 00027 #include "ocrblock.h" 00028 #include "textlineprojection.h" 00029 00030 class BLOCK_LIST; 00031 struct Boxa; 00032 struct Pixa; 00033 class DENORM; 00034 class ScrollView; 00035 class STATS; 00036 class TO_BLOCK; 00037 00038 namespace tesseract { 00039 00040 extern BOOL_VAR_H(textord_tabfind_find_tables, false, "run table detection"); 00041 00042 class ColPartitionSet; 00043 class ColPartitionSet_LIST; 00044 class ColSegment_LIST; 00045 class ColumnGroup_LIST; 00046 class LineSpacing; 00047 class StrokeWidth; 00048 class TempColumn_LIST; 00049 class EquationDetectBase; 00050 00051 // The ColumnFinder class finds columns in the grid. 00052 class ColumnFinder : public TabFind { 00053 public: 00054 // Gridsize is an estimate of the text size in the image. A suitable value 00055 // is in TO_BLOCK::line_size after find_components has been used to make 00056 // the blobs. 
00057 // bleft and tright are the bounds of the image (rectangle) being processed. 00058 // vlines is a (possibly empty) list of TabVector and vertical_x and y are 00059 // the sum logical vertical vector produced by LineFinder::FindVerticalLines. 00060 ColumnFinder(int gridsize, const ICOORD& bleft, const ICOORD& tright, 00061 int resolution, bool cjk_script, TabVector_LIST* vlines, 00062 TabVector_LIST* hlines, int vertical_x, int vertical_y); 00063 virtual ~ColumnFinder(); 00064 00065 // Accessors for testing 00066 const DENORM* denorm() const { 00067 return denorm_; 00068 } 00069 const TextlineProjection* projection() const { 00070 return &projection_; 00071 } 00072 void set_cjk_script(bool is_cjk) { 00073 cjk_script_ = is_cjk; 00074 } 00075 00076 // ====================================================================== 00077 // The main function of ColumnFinder is broken into pieces to facilitate 00078 // optional insertion of orientation and script detection in an efficient 00079 // way. The calling sequence IS MANDATORY however, whether or not 00080 // OSD is being used: 00081 // 1. Construction. 00082 // 2. SetupAndFilterNoise. 00083 // 3. IsVerticallyAlignedText. 00084 // 4. CorrectOrientation. 00085 // 5. FindBlocks. 00086 // 6. Destruction. Use of a single column finder for multiple images does not 00087 // make sense. 00088 // Throughout these steps, the ColPartitions are owned by part_grid_, which 00089 // means that that it must be kept correct. Exception: big_parts_ owns its 00090 // own ColPartitions. 00091 // The BLOBNBOXes are owned by the input TO_BLOCK for the whole time, except 00092 // for a phase in FindBlocks before TransformToBlocks, when they become 00093 // owned by the ColPartitions. The owner() ColPartition of a BLOBNBOX 00094 // indicates more of a betrothal for the majority of layout analysis, ie 00095 // which ColPartition will take ownership when the blobs are release from 00096 // the input TO_BLOCK. 
Exception: image_bblobs_ owns the fake blobs that 00097 // are part of the image regions, as they are not on any TO_BLOCK list. 00098 // TODO(rays) break up column finder further into smaller classes, as 00099 // there is a lot more to it than column finding now. 00100 // ====================================================================== 00101 00102 // Performs initial processing on the blobs in the input_block: 00103 // Setup the part_grid, stroke_width_, nontext_map_. 00104 // Obvious noise blobs are filtered out and used to mark the nontext_map_. 00105 // Initial stroke-width analysis is used to get local text alignment 00106 // direction, so the textline projection_ map can be setup. 00107 // On return, IsVerticallyAlignedText may be called (now optionally) to 00108 // determine the gross textline alignment of the page. 00109 void SetupAndFilterNoise(Pix* photo_mask_pix, TO_BLOCK* input_block); 00110 00111 // Tests for vertical alignment of text (returning true if so), and generates 00112 // a list of blobs (in osd_blobs) for orientation and script detection. 00113 // block is the single block for the whole page or rectangle to be OCRed. 00114 // Note that the vertical alignment may be due to text whose writing direction 00115 // is vertical, like say Japanese, or due to text whose writing direction is 00116 // horizontal but whose text appears vertically aligned because the image is 00117 // not the right way up. 00118 bool IsVerticallyAlignedText(TO_BLOCK* block, BLOBNBOX_CLIST* osd_blobs); 00119 00120 // Rotates the blobs and the TabVectors so that the gross writing direction 00121 // (text lines) are horizontal and lines are read down the page. 00122 // Applied rotation stored in rotation_. 00123 // A second rotation is calculated for application during recognition to 00124 // make the rotated blobs upright for recognition. 00125 // Subsequent rotation stored in text_rotation_. 
00126 // 00127 // Arguments: 00128 // vertical_text_lines is true if the text lines are vertical. 00129 // recognition_rotation [0..3] is the number of anti-clockwise 90 degree 00130 // rotations from osd required for the text to be upright and readable. 00131 void CorrectOrientation(TO_BLOCK* block, bool vertical_text_lines, 00132 int recognition_rotation); 00133 00134 // Finds blocks of text, image, rule line, table etc, returning them in the 00135 // blocks and to_blocks 00136 // (Each TO_BLOCK points to the basic BLOCK and adds more information.) 00137 // Image blocks are generated by a combination of photo_mask_pix (which may 00138 // NOT be NULL) and the rejected text found during preliminary textline 00139 // finding. 00140 // The input_block is the result of a call to find_components, and contains 00141 // the blobs found in the image or rectangle to be OCRed. These blobs will be 00142 // removed and placed in the output blocks, while unused ones will be deleted. 00143 // If single_column is true, the input is treated as single column, but 00144 // it is still divided into blocks of equal line spacing/text size. 00145 // scaled_color is scaled down by scaled_factor from the input color image, 00146 // and may be NULL if the input was not color. 00147 // grey_pix is optional, but if present must match the photo_mask_pix in size, 00148 // and must be a *real* grey image instead of binary_pix * 255. 00149 // thresholds_pix is expected to be present iff grey_pix is present and 00150 // can be an integer factor reduction of the grey_pix. It represents the 00151 // thresholds that were used to create the binary_pix from the grey_pix. 00152 // Returns -1 if the user hits the 'd' key in the blocks window while running 00153 // in debug mode, which requests a retry with more debug info. 
00154 int FindBlocks(PageSegMode pageseg_mode, 00155 Pix* scaled_color, int scaled_factor, 00156 TO_BLOCK* block, Pix* photo_mask_pix, 00157 Pix* thresholds_pix, Pix* grey_pix, 00158 BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks); 00159 00160 // Get the rotation required to deskew, and its inverse rotation. 00161 void GetDeskewVectors(FCOORD* deskew, FCOORD* reskew); 00162 00163 // Set the equation detection pointer. 00164 void SetEquationDetect(EquationDetectBase* detect); 00165 00166 private: 00167 // Displays the blob and block bounding boxes in a window called Blocks. 00168 void DisplayBlocks(BLOCK_LIST* blocks); 00169 // Displays the column edges at each grid y coordinate defined by 00170 // best_columns_. 00171 void DisplayColumnBounds(PartSetVector* sets); 00172 00174 00175 // Sets up column_sets_ (the determined column layout at each horizontal 00176 // slice). Returns false if the page is empty. 00177 bool MakeColumns(bool single_column); 00178 // Attempt to improve the column_candidates by expanding the columns 00179 // and adding new partitions from the partition sets in src_sets. 00180 // Src_sets may be equal to column_candidates, in which case it will 00181 // use them as a source to improve themselves. 00182 void ImproveColumnCandidates(PartSetVector* src_sets, 00183 PartSetVector* column_sets); 00184 // Prints debug information on the column candidates. 00185 void PrintColumnCandidates(const char* title); 00186 // Finds the optimal set of columns that cover the entire image with as 00187 // few changes in column partition as possible. 00188 void AssignColumns(const PartSetVector& part_sets); 00189 // Finds the biggest range in part_sets_ that has no assigned column, but 00190 // column assignment is possible. 00191 bool BiggestUnassignedRange(int set_count, const bool* any_columns_possible, 00192 int* start, int* end); 00193 // Finds the modal compatible column_set_ index within the given range. 
00194 int RangeModalColumnSet(int** column_set_costs, const int* assigned_costs, 00195 int start, int end); 00196 // Given that there are many column_set_id compatible columns in the range, 00197 // shrinks the range to the longest contiguous run of compatibility, allowing 00198 // gaps where no columns are possible, but not where competing columns are 00199 // possible. 00200 void ShrinkRangeToLongestRun(int** column_set_costs, 00201 const int* assigned_costs, 00202 const bool* any_columns_possible, 00203 int column_set_id, 00204 int* best_start, int* best_end); 00205 // Moves start in the direction of step, upto, but not including end while 00206 // the only incompatible regions are no more than kMaxIncompatibleColumnCount 00207 // in size, and the compatible regions beyond are bigger. 00208 void ExtendRangePastSmallGaps(int** column_set_costs, 00209 const int* assigned_costs, 00210 const bool* any_columns_possible, 00211 int column_set_id, 00212 int step, int end, int* start); 00213 // Assigns the given column_set_id to the part_sets_ in the given range. 00214 void AssignColumnToRange(int column_set_id, int start, int end, 00215 int** column_set_costs, int* assigned_costs); 00216 00217 // Computes the mean_column_gap_. 00218 void ComputeMeanColumnGap(); 00219 00222 00223 // Hoovers up all un-owned blobs and deletes them. 00224 // The rest get released from the block so the ColPartitions can pass 00225 // ownership to the output blocks. 00226 void ReleaseBlobsAndCleanupUnused(TO_BLOCK* block); 00227 // Splits partitions that cross columns where they have nothing in the gap. 00228 void GridSplitPartitions(); 00229 // Merges partitions where there is vertical overlap, within a single column, 00230 // and the horizontal gap is small enough. 00231 void GridMergePartitions(); 00232 // Inserts remaining noise blobs into the most applicable partition if any. 00233 // If there is no applicable partition, then the blobs are deleted. 
00234 void InsertRemainingNoise(TO_BLOCK* block); 00235 // Remove partitions that come from horizontal lines that look like 00236 // underlines, but are not part of a table. 00237 void GridRemoveUnderlinePartitions(); 00238 // Add horizontal line separators as partitions. 00239 void GridInsertHLinePartitions(); 00240 // Add vertical line separators as partitions. 00241 void GridInsertVLinePartitions(); 00242 // For every ColPartition in the grid, sets its type based on position 00243 // in the columns. 00244 void SetPartitionTypes(); 00245 // Only images remain with multiple types in a run of partners. 00246 // Sets the type of all in the group to the maximum of the group. 00247 void SmoothPartnerRuns(); 00248 00250 00251 // Helper functions for TransformToBlocks. 00252 // Add the part to the temp list in the correct order. 00253 void AddToTempPartList(ColPartition* part, ColPartition_CLIST* temp_list); 00254 // Add everything from the temp list to the work_set assuming correct order. 00255 void EmptyTempPartList(ColPartition_CLIST* temp_list, 00256 WorkingPartSet_LIST* work_set); 00257 00258 // Transform the grid of partitions to the output blocks. 00259 void TransformToBlocks(BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks); 00260 00261 // Reflect the blob boxes (but not the outlines) in the y-axis so that 00262 // the blocks get created in the correct RTL order. Rotates the blobs 00263 // in the input_block and the bblobs list. 00264 // The reflection is undone in RotateAndReskewBlocks by 00265 // reflecting the blocks themselves, and then recomputing the blob bounding 00266 // boxes. 00267 void ReflectForRtl(TO_BLOCK* input_block, BLOBNBOX_LIST* bblobs); 00268 00269 // Undo the deskew that was done in FindTabVectors, as recognition is done 00270 // without correcting blobs or blob outlines for skew. 00271 // Reskew the completed blocks to put them back to the original rotated coords 00272 // that were created by CorrectOrientation. 
00273 // If the input_is_rtl, then reflect the blocks in the y-axis to undo the 00274 // reflection that was done before FindTabVectors. 00275 // Blocks that were identified as vertical text (relative to the rotated 00276 // coordinates) are further rotated so the text lines are horizontal. 00277 // blob polygonal outlines are rotated to match the position of the blocks 00278 // that they are in, and their bounding boxes are recalculated to be accurate. 00279 // Record appropriate inverse transformations and required 00280 // classifier transformation in the blocks. 00281 void RotateAndReskewBlocks(bool input_is_rtl, TO_BLOCK_LIST* to_blocks); 00282 00283 // Computes the rotations for the block (to make textlines horizontal) and 00284 // for the blobs (for classification) and sets the appropriate members 00285 // of the given block. 00286 // Returns the rotation that needs to be applied to the blobs to make 00287 // them sit in the rotated block. 00288 FCOORD ComputeBlockAndClassifyRotation(BLOCK* block); 00289 00290 // True if this is most likely a cjk page with rectangular characters. 00291 bool cjk_script_; 00292 // The minimum gutter width to apply for finding columns. 00293 // Modified when vertical text is detected to prevent detection of 00294 // vertical text lines as columns. 00295 int min_gutter_width_; 00296 // The mean gap between columns over the page. 00297 int mean_column_gap_; 00298 // The rotation vector needed to convert original coords to deskewed. 00299 FCOORD deskew_; 00300 // The rotation vector needed to convert deskewed back to original coords. 00301 FCOORD reskew_; 00302 // The rotation vector used to rotate vertically oriented pages. 00303 FCOORD rotation_; 00304 // The rotation vector needed to convert the rotated back to original coords. 00305 FCOORD rerotate_; 00306 // The additional rotation vector needed to rotate text for recognition. 
00307 FCOORD text_rotation_; 00308 // The column_sets_ contain the ordered candidate ColPartitionSets that 00309 // define the possible divisions of the page into columns. 00310 PartSetVector column_sets_; 00311 // A simple array of pointers to the best assigned column division at 00312 // each grid y coordinate. 00313 ColPartitionSet** best_columns_; 00314 // The grid used for creating initial partitions with strokewidth. 00315 StrokeWidth* stroke_width_; 00316 // The grid used to hold ColPartitions after the columns have been determined. 00317 ColPartitionGrid part_grid_; 00318 // List of ColPartitions that are no longer needed after they have been 00319 // turned into regions, but are kept around because they are referenced 00320 // by the part_grid_. 00321 ColPartition_LIST good_parts_; 00322 // List of ColPartitions that are big and might be dropcap or vertically 00323 // joined. 00324 ColPartition_LIST big_parts_; 00325 // List of ColPartitions that have been declared noise. 00326 ColPartition_LIST noise_parts_; 00327 // The fake blobs that are made from the images. 00328 BLOBNBOX_LIST image_bblobs_; 00329 // Horizontal line separators. 00330 TabVector_LIST horizontal_lines_; 00331 // Image map of photo/noise areas on the page. 00332 Pix* nontext_map_; 00333 // Textline projection map. 00334 TextlineProjection projection_; 00335 // Sequence of DENORMS that indicate how to get back to the original image 00336 // coordinate space. The destructor must delete all the DENORMs in the chain. 00337 DENORM* denorm_; 00338 00339 // Various debug windows that automatically go away on completion. 00340 ScrollView* input_blobs_win_; 00341 00342 // The equation region detector pointer. Note: This pointer is passed in by 00343 // member function SetEquationDetect, and releasing it is NOT owned by this 00344 // class. 00345 EquationDetectBase* equation_detect_; 00346 00347 // Allow a subsequent instance to reuse the blocks window. 
00348 // Not thread-safe, but multiple threads shouldn't be using windows anyway. 00349 static ScrollView* blocks_win_; 00350 }; 00351 00352 } // namespace tesseract. 00353 00354 #endif // TESSERACT_TEXTORD_COLFIND_H__