tesseract
3.03
|
00001 /********************************************************************** 00002 * File: devanagari_processing.cpp 00003 * Description: Methods to process images containing devanagari symbols, 00004 * prior to classification. 00005 * Author: Shobhit Saxena 00006 * Created: Mon Nov 17 20:26:01 IST 2008 00007 * 00008 * (C) Copyright 2008, Google Inc. 00009 ** Licensed under the Apache License, Version 2.0 (the "License"); 00010 ** you may not use this file except in compliance with the License. 00011 ** You may obtain a copy of the License at 00012 ** http://www.apache.org/licenses/LICENSE-2.0 00013 ** Unless required by applicable law or agreed to in writing, software 00014 ** distributed under the License is distributed on an "AS IS" BASIS, 00015 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 ** See the License for the specific language governing permissions and 00017 ** limitations under the License. 00018 * 00019 **********************************************************************/ 00020 00021 #ifdef HAVE_CONFIG_H 00022 #include "config_auto.h" 00023 #endif 00024 00025 #include "devanagari_processing.h" 00026 #include "allheaders.h" 00027 #include "tordmain.h" 00028 #include "statistc.h" 00029 00030 // Flags controlling the debugging information for shiro-rekha splitting 00031 // strategies. 00032 INT_VAR(devanagari_split_debuglevel, 0, 00033 "Debug level for split shiro-rekha process."); 00034 00035 BOOL_VAR(devanagari_split_debugimage, 0, 00036 "Whether to create a debug image for split shiro-rekha process."); 00037 00038 namespace tesseract { 00039 00040 ShiroRekhaSplitter::ShiroRekhaSplitter() { 00041 orig_pix_ = NULL; 00042 segmentation_block_list_ = NULL; 00043 splitted_image_ = NULL; 00044 global_xheight_ = kUnspecifiedXheight; 00045 perform_close_ = false; 00046 debug_image_ = NULL; 00047 pageseg_split_strategy_ = NO_SPLIT; 00048 ocr_split_strategy_ = NO_SPLIT; 00049 } 00050 00051 ShiroRekhaSplitter::~ShiroRekhaSplitter() { 00052 Clear(); 00053 } 00054 00055 void ShiroRekhaSplitter::Clear() { 00056 pixDestroy(&orig_pix_); 00057 pixDestroy(&splitted_image_); 00058 pageseg_split_strategy_ = NO_SPLIT; 00059 ocr_split_strategy_ = NO_SPLIT; 00060 pixDestroy(&debug_image_); 00061 segmentation_block_list_ = NULL; 00062 global_xheight_ = kUnspecifiedXheight; 00063 perform_close_ = false; 00064 } 00065 00066 // This method dumps a debug image to the specified location. 00067 void ShiroRekhaSplitter::DumpDebugImage(const char* filename) const { 00068 pixWrite(filename, debug_image_, IFF_PNG); 00069 } 00070 00071 // On setting the input image, a clone of it is owned by this class. 00072 void ShiroRekhaSplitter::set_orig_pix(Pix* pix) { 00073 if (orig_pix_) { 00074 pixDestroy(&orig_pix_); 00075 } 00076 orig_pix_ = pixClone(pix); 00077 } 00078 00079 // Top-level method to perform splitting based on current settings. 00080 // Returns true if a split was actually performed. 00081 // split_for_pageseg should be true if the splitting is being done prior to 00082 // page segmentation. This mode uses the flag 00083 // pageseg_devanagari_split_strategy to determine the splitting strategy. 00084 bool ShiroRekhaSplitter::Split(bool split_for_pageseg) { 00085 SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_ : 00086 ocr_split_strategy_; 00087 if (split_strategy == NO_SPLIT) { 00088 return false; // Nothing to do. 00089 } 00090 ASSERT_HOST(split_strategy == MINIMAL_SPLIT || 00091 split_strategy == MAXIMAL_SPLIT); 00092 ASSERT_HOST(orig_pix_); 00093 if (devanagari_split_debuglevel > 0) { 00094 tprintf("Splitting shiro-rekha ...\n"); 00095 tprintf("Split strategy = %s\n", 00096 split_strategy == MINIMAL_SPLIT ? "Minimal" : "Maximal"); 00097 tprintf("Initial pageseg available = %s\n", 00098 segmentation_block_list_ ? "yes" : "no"); 00099 } 00100 // Create a copy of original image to store the splitting output. 00101 pixDestroy(&splitted_image_); 00102 splitted_image_ = pixCopy(NULL, orig_pix_); 00103 00104 // Initialize debug image if required. 00105 if (devanagari_split_debugimage) { 00106 pixDestroy(&debug_image_); 00107 debug_image_ = pixConvertTo32(orig_pix_); 00108 } 00109 00110 // Determine all connected components in the input image. A close operation 00111 // may be required prior to this, depending on the current settings. 00112 Pix* pix_for_ccs = pixClone(orig_pix_); 00113 if (perform_close_ && global_xheight_ != kUnspecifiedXheight && 00114 !segmentation_block_list_) { 00115 if (devanagari_split_debuglevel > 0) { 00116 tprintf("Performing a global close operation..\n"); 00117 } 00118 // A global measure is available for xheight, but no local information 00119 // exists. 00120 pixDestroy(&pix_for_ccs); 00121 pix_for_ccs = pixCopy(NULL, orig_pix_); 00122 PerformClose(pix_for_ccs, global_xheight_); 00123 } 00124 Pixa* ccs; 00125 Boxa* tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8); 00126 boxaDestroy(&tmp_boxa); 00127 pixDestroy(&pix_for_ccs); 00128 00129 // Iterate over all connected components. Get their bounding boxes and clip 00130 // out the image regions corresponding to these boxes from the original image. 00131 // Conditionally run splitting on each of them. 00132 Boxa* regions_to_clear = boxaCreate(0); 00133 for (int i = 0; i < pixaGetCount(ccs); ++i) { 00134 Box* box = ccs->boxa->box[i]; 00135 Pix* word_pix = pixClipRectangle(orig_pix_, box, NULL); 00136 ASSERT_HOST(word_pix); 00137 int xheight = GetXheightForCC(box); 00138 if (xheight == kUnspecifiedXheight && segmentation_block_list_ && 00139 devanagari_split_debugimage) { 00140 pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0); 00141 } 00142 // If some xheight measure is available, attempt to pre-eliminate small 00143 // blobs from the shiro-rekha process. This is primarily to save the CCs 00144 // corresponding to punctuation marks/small dots etc which are part of 00145 // larger graphemes. 00146 if (xheight == kUnspecifiedXheight || 00147 (box->w > xheight / 3 && box->h > xheight / 2)) { 00148 SplitWordShiroRekha(split_strategy, word_pix, xheight, 00149 box->x, box->y, regions_to_clear); 00150 } else if (devanagari_split_debuglevel > 0) { 00151 tprintf("CC dropped from splitting: %d,%d (%d, %d)\n", 00152 box->x, box->y, box->w, box->h); 00153 } 00154 pixDestroy(&word_pix); 00155 } 00156 // Actually clear the boxes now. 00157 for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) { 00158 Box* box = boxaGetBox(regions_to_clear, i, L_CLONE); 00159 pixClearInRect(splitted_image_, box); 00160 boxDestroy(&box); 00161 } 00162 boxaDestroy(®ions_to_clear); 00163 pixaDestroy(&ccs); 00164 if (devanagari_split_debugimage) { 00165 DumpDebugImage(split_for_pageseg ? "pageseg_split_debug.png" : 00166 "ocr_split_debug.png"); 00167 } 00168 return true; 00169 } 00170 00171 // Method to perform a close operation on the input image. The xheight 00172 // estimate decides the size of sel used. 00173 void ShiroRekhaSplitter::PerformClose(Pix* pix, int xheight_estimate) { 00174 pixCloseBrick(pix, pix, xheight_estimate / 8, xheight_estimate / 3); 00175 } 00176 00177 // This method resolves the cc bbox to a particular row and returns the row's 00178 // xheight. 00179 int ShiroRekhaSplitter::GetXheightForCC(Box* cc_bbox) { 00180 if (!segmentation_block_list_) { 00181 return global_xheight_; 00182 } 00183 // Compute the box coordinates in Tesseract's coordinate system. 00184 TBOX bbox(cc_bbox->x, 00185 pixGetHeight(orig_pix_) - cc_bbox->y - cc_bbox->h - 1, 00186 cc_bbox->x + cc_bbox->w, 00187 pixGetHeight(orig_pix_) - cc_bbox->y - 1); 00188 // Iterate over all blocks. 00189 BLOCK_IT block_it(segmentation_block_list_); 00190 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { 00191 BLOCK* block = block_it.data(); 00192 // Iterate over all rows in the block. 00193 ROW_IT row_it(block->row_list()); 00194 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00195 ROW* row = row_it.data(); 00196 if (!row->bounding_box().major_overlap(bbox)) { 00197 continue; 00198 } 00199 // Row could be skewed, warped, etc. Use the position of the box to 00200 // determine the baseline position of the row for that x-coordinate. 00201 // Create a square TBOX whose baseline's mid-point lies at this point 00202 // and side is row's xheight. Take the overlap of this box with the input 00203 // box and check if it is a 'major overlap'. If so, this box lies in this 00204 // row. In that case, return the xheight for this row. 00205 float box_middle = 0.5 * (bbox.left() + bbox.right()); 00206 int baseline = static_cast<int>(row->base_line(box_middle) + 0.5); 00207 TBOX test_box(box_middle - row->x_height() / 2, 00208 baseline, 00209 box_middle + row->x_height() / 2, 00210 static_cast<int>(baseline + row->x_height())); 00211 // Compute overlap. If it is is a major overlap, this is the right row. 00212 if (bbox.major_overlap(test_box)) { 00213 return row->x_height(); 00214 } 00215 } 00216 } 00217 // No row found for this bbox. 00218 return kUnspecifiedXheight; 00219 } 00220 00221 // Returns a list of regions (boxes) which should be cleared in the original 00222 // image so as to perform shiro-rekha splitting. Pix is assumed to carry one 00223 // (or less) word only. Xheight measure could be the global estimate, the row 00224 // estimate, or unspecified. If unspecified, over splitting may occur, since a 00225 // conservative estimate of stroke width along with an associated multiplier 00226 // is used in its place. It is advisable to have a specified xheight when 00227 // splitting for classification/training. 00228 // A vertical projection histogram of all the on-pixels in the input pix is 00229 // computed. The maxima of this histogram is regarded as an approximate location 00230 // of the shiro-rekha. By descending on the maxima's peak on both sides, 00231 // stroke width of shiro-rekha is estimated. 00232 // A horizontal projection histogram is computed for a sub-image of the input 00233 // image, which extends from just below the shiro-rekha down to a certain 00234 // leeway. The leeway depends on the input xheight, if provided, else a 00235 // conservative multiplier on approximate stroke width is used (which may lead 00236 // to over-splitting). 00237 void ShiroRekhaSplitter::SplitWordShiroRekha(SplitStrategy split_strategy, 00238 Pix* pix, 00239 int xheight, 00240 int word_left, 00241 int word_top, 00242 Boxa* regions_to_clear) { 00243 if (split_strategy == NO_SPLIT) { 00244 return; 00245 } 00246 int width = pixGetWidth(pix); 00247 int height = pixGetHeight(pix); 00248 // Statistically determine the yextents of the shiro-rekha. 00249 int shirorekha_top, shirorekha_bottom, shirorekha_ylevel; 00250 GetShiroRekhaYExtents(pix, &shirorekha_top, &shirorekha_bottom, 00251 &shirorekha_ylevel); 00252 // Since the shiro rekha is also a stroke, its width is equal to the stroke 00253 // width. 00254 int stroke_width = shirorekha_bottom - shirorekha_top + 1; 00255 00256 // Some safeguards to protect CCs we do not want to be split. 00257 // These are particularly useful when the word wasn't eliminated earlier 00258 // because xheight information was unavailable. 00259 if (shirorekha_ylevel > height / 2) { 00260 // Shirorekha shouldn't be in the bottom half of the word. 00261 if (devanagari_split_debuglevel > 0) { 00262 tprintf("Skipping splitting CC at (%d, %d): shirorekha in lower half..\n", 00263 word_left, word_top); 00264 } 00265 return; 00266 } 00267 if (stroke_width > height / 3) { 00268 // Even the boldest of fonts shouldn't do this. 00269 if (devanagari_split_debuglevel > 0) { 00270 tprintf("Skipping splitting CC at (%d, %d): stroke width too huge..\n", 00271 word_left, word_top); 00272 } 00273 return; 00274 } 00275 00276 // Clear the ascender and descender regions of the word. 00277 // Obtain a vertical projection histogram for the resulting image. 00278 Box* box_to_clear = boxCreate(0, shirorekha_top - stroke_width / 3, 00279 width, 5 * stroke_width / 3); 00280 Pix* word_in_xheight = pixCopy(NULL, pix); 00281 pixClearInRect(word_in_xheight, box_to_clear); 00282 // Also clear any pixels which are below shirorekha_bottom + some leeway. 00283 // The leeway is set to xheight if the information is available, else it is a 00284 // multiplier applied to the stroke width. 00285 int leeway_to_keep = stroke_width * 3; 00286 if (xheight != kUnspecifiedXheight) { 00287 // This is because the xheight-region typically includes the shiro-rekha 00288 // inside it, i.e., the top of the xheight range corresponds to the top of 00289 // shiro-rekha. 00290 leeway_to_keep = xheight - stroke_width; 00291 } 00292 box_to_clear->y = shirorekha_bottom + leeway_to_keep; 00293 box_to_clear->h = height - box_to_clear->y; 00294 pixClearInRect(word_in_xheight, box_to_clear); 00295 boxDestroy(&box_to_clear); 00296 00297 PixelHistogram vert_hist; 00298 vert_hist.ConstructVerticalCountHist(word_in_xheight); 00299 pixDestroy(&word_in_xheight); 00300 00301 // If the number of black pixel in any column of the image is less than a 00302 // fraction of the stroke width, treat it as noise / a stray mark. Perform 00303 // these changes inside the vert_hist data itself, as that is used later on as 00304 // a bit vector for the final split decision at every column. 00305 for (int i = 0; i < width; ++i) { 00306 if (vert_hist.hist()[i] <= stroke_width / 4) 00307 vert_hist.hist()[i] = 0; 00308 else 00309 vert_hist.hist()[i] = 1; 00310 } 00311 // In order to split the line at any point, we make sure that the width of the 00312 // gap is atleast half the stroke width. 00313 int i = 0; 00314 int cur_component_width = 0; 00315 while (i < width) { 00316 if (!vert_hist.hist()[i]) { 00317 int j = 0; 00318 while (i + j < width && !vert_hist.hist()[i+j]) 00319 ++j; 00320 if (j >= stroke_width / 2 && cur_component_width >= stroke_width / 2) { 00321 // Perform a shiro-rekha split. The intervening region lies from i to 00322 // i+j-1. 00323 // A minimal single-pixel split makes the estimation of intra- and 00324 // inter-word spacing easier during page layout analysis, 00325 // whereas a maximal split may be needed for OCR, depending on 00326 // how the engine was trained. 00327 bool minimal_split = (split_strategy == MINIMAL_SPLIT); 00328 int split_width = minimal_split ? 1 : j; 00329 int split_left = minimal_split ? i + (j / 2) - (split_width / 2) : i; 00330 if (!minimal_split || (i != 0 && i + j != width)) { 00331 Box* box_to_clear = 00332 boxCreate(word_left + split_left, 00333 word_top + shirorekha_top - stroke_width / 3, 00334 split_width, 00335 5 * stroke_width / 3); 00336 if (box_to_clear) { 00337 boxaAddBox(regions_to_clear, box_to_clear, L_CLONE); 00338 // Mark this in the debug image if needed. 00339 if (devanagari_split_debugimage) { 00340 pixRenderBoxArb(debug_image_, box_to_clear, 1, 128, 255, 128); 00341 } 00342 boxDestroy(&box_to_clear); 00343 cur_component_width = 0; 00344 } 00345 } 00346 } 00347 i += j; 00348 } else { 00349 ++i; 00350 ++cur_component_width; 00351 } 00352 } 00353 } 00354 00355 // Refreshes the words in the segmentation block list by using blobs in the 00356 // input block list. 00357 // The segmentation block list must be set. 00358 void ShiroRekhaSplitter::RefreshSegmentationWithNewBlobs( 00359 C_BLOB_LIST* new_blobs) { 00360 // The segmentation block list must have been specified. 00361 ASSERT_HOST(segmentation_block_list_); 00362 if (devanagari_split_debuglevel > 0) { 00363 tprintf("Before refreshing blobs:\n"); 00364 PrintSegmentationStats(segmentation_block_list_); 00365 tprintf("New Blobs found: %d\n", new_blobs->length()); 00366 } 00367 00368 C_BLOB_LIST not_found_blobs; 00369 RefreshWordBlobsFromNewBlobs(segmentation_block_list_, 00370 new_blobs, 00371 ((devanagari_split_debugimage && debug_image_) ? 00372 ¬_found_blobs : NULL)); 00373 00374 if (devanagari_split_debuglevel > 0) { 00375 tprintf("After refreshing blobs:\n"); 00376 PrintSegmentationStats(segmentation_block_list_); 00377 } 00378 if (devanagari_split_debugimage && debug_image_) { 00379 // Plot out the original blobs for which no match was found in the new 00380 // all_blobs list. 00381 C_BLOB_IT not_found_it(¬_found_blobs); 00382 for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); 00383 not_found_it.forward()) { 00384 C_BLOB* not_found = not_found_it.data(); 00385 TBOX not_found_box = not_found->bounding_box(); 00386 Box* box_to_plot = GetBoxForTBOX(not_found_box); 00387 pixRenderBoxArb(debug_image_, box_to_plot, 1, 255, 0, 255); 00388 boxDestroy(&box_to_plot); 00389 } 00390 00391 // Plot out the blobs unused from all blobs. 00392 C_BLOB_IT all_blobs_it(new_blobs); 00393 for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); 00394 all_blobs_it.forward()) { 00395 C_BLOB* a_blob = all_blobs_it.data(); 00396 Box* box_to_plot = GetBoxForTBOX(a_blob->bounding_box()); 00397 pixRenderBoxArb(debug_image_, box_to_plot, 3, 0, 127, 0); 00398 boxDestroy(&box_to_plot); 00399 } 00400 } 00401 } 00402 00403 // Returns a new box object for the corresponding TBOX, based on the original 00404 // image's coordinate system. 00405 Box* ShiroRekhaSplitter::GetBoxForTBOX(const TBOX& tbox) const { 00406 return boxCreate(tbox.left(), pixGetHeight(orig_pix_) - tbox.top() - 1, 00407 tbox.width(), tbox.height()); 00408 } 00409 00410 // This method returns the computed mode-height of blobs in the pix. 00411 // It also prunes very small blobs from calculation. 00412 int ShiroRekhaSplitter::GetModeHeight(Pix* pix) { 00413 Boxa* boxa = pixConnComp(pix, NULL, 8); 00414 STATS heights(0, pixGetHeight(pix)); 00415 heights.clear(); 00416 for (int i = 0; i < boxaGetCount(boxa); ++i) { 00417 Box* box = boxaGetBox(boxa, i, L_CLONE); 00418 if (box->h >= 3 || box->w >= 3) { 00419 heights.add(box->h, 1); 00420 } 00421 boxDestroy(&box); 00422 } 00423 boxaDestroy(&boxa); 00424 return heights.mode(); 00425 } 00426 00427 // This method returns y-extents of the shiro-rekha computed from the input 00428 // word image. 00429 void ShiroRekhaSplitter::GetShiroRekhaYExtents(Pix* word_pix, 00430 int* shirorekha_top, 00431 int* shirorekha_bottom, 00432 int* shirorekha_ylevel) { 00433 // Compute a histogram from projecting the word on a vertical line. 00434 PixelHistogram hist_horiz; 00435 hist_horiz.ConstructHorizontalCountHist(word_pix); 00436 // Get the ylevel where the top-line exists. This is basically the global 00437 // maxima in the horizontal histogram. 00438 int topline_onpixel_count = 0; 00439 int topline_ylevel = hist_horiz.GetHistogramMaximum(&topline_onpixel_count); 00440 00441 // Get the upper and lower extents of the shiro rekha. 00442 int thresh = (topline_onpixel_count * 70) / 100; 00443 int ulimit = topline_ylevel; 00444 int llimit = topline_ylevel; 00445 while (ulimit > 0 && hist_horiz.hist()[ulimit] >= thresh) 00446 --ulimit; 00447 while (llimit < pixGetHeight(word_pix) && hist_horiz.hist()[llimit] >= thresh) 00448 ++llimit; 00449 00450 if (shirorekha_top) *shirorekha_top = ulimit; 00451 if (shirorekha_bottom) *shirorekha_bottom = llimit; 00452 if (shirorekha_ylevel) *shirorekha_ylevel = topline_ylevel; 00453 } 00454 00455 // This method returns the global-maxima for the histogram. The frequency of 00456 // the global maxima is returned in count, if specified. 00457 int PixelHistogram::GetHistogramMaximum(int* count) const { 00458 int best_value = 0; 00459 for (int i = 0; i < length_; ++i) { 00460 if (hist_[i] > hist_[best_value]) { 00461 best_value = i; 00462 } 00463 } 00464 if (count) { 00465 *count = hist_[best_value]; 00466 } 00467 return best_value; 00468 } 00469 00470 // Methods to construct histograms from images. 00471 void PixelHistogram::ConstructVerticalCountHist(Pix* pix) { 00472 Clear(); 00473 int width = pixGetWidth(pix); 00474 int height = pixGetHeight(pix); 00475 hist_ = new int[width]; 00476 length_ = width; 00477 int wpl = pixGetWpl(pix); 00478 l_uint32 *data = pixGetData(pix); 00479 for (int i = 0; i < width; ++i) 00480 hist_[i] = 0; 00481 for (int i = 0; i < height; ++i) { 00482 l_uint32 *line = data + i * wpl; 00483 for (int j = 0; j < width; ++j) 00484 if (GET_DATA_BIT(line, j)) 00485 ++(hist_[j]); 00486 } 00487 } 00488 00489 void PixelHistogram::ConstructHorizontalCountHist(Pix* pix) { 00490 Clear(); 00491 Numa* counts = pixCountPixelsByRow(pix, NULL); 00492 length_ = numaGetCount(counts); 00493 hist_ = new int[length_]; 00494 for (int i = 0; i < length_; ++i) { 00495 l_int32 val = 0; 00496 numaGetIValue(counts, i, &val); 00497 hist_[i] = val; 00498 } 00499 numaDestroy(&counts); 00500 } 00501 00502 } // namespace tesseract.