tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/textord/devanagari_processing.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        devanagari_processing.cpp
00003  * Description: Methods to process images containing devanagari symbols,
00004  *              prior to classification.
00005  * Author:      Shobhit Saxena
00006  * Created:     Mon Nov 17 20:26:01 IST 2008
00007  *
00008  * (C) Copyright 2008, Google Inc.
00009  ** Licensed under the Apache License, Version 2.0 (the "License");
00010  ** you may not use this file except in compliance with the License.
00011  ** You may obtain a copy of the License at
00012  ** http://www.apache.org/licenses/LICENSE-2.0
00013  ** Unless required by applicable law or agreed to in writing, software
00014  ** distributed under the License is distributed on an "AS IS" BASIS,
00015  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016  ** See the License for the specific language governing permissions and
00017  ** limitations under the License.
00018  *
00019  **********************************************************************/
00020 
00021 #ifdef HAVE_CONFIG_H
00022 #include "config_auto.h"
00023 #endif
00024 
00025 #include "devanagari_processing.h"
00026 #include "allheaders.h"
00027 #include "tordmain.h"
00028 #include "statistc.h"
00029 
00030 // Flags controlling the debugging information for shiro-rekha splitting
00031 // strategies.
00032 INT_VAR(devanagari_split_debuglevel, 0,
00033         "Debug level for split shiro-rekha process.");
00034 
00035 BOOL_VAR(devanagari_split_debugimage, 0,
00036          "Whether to create a debug image for split shiro-rekha process.");
00037 
00038 namespace tesseract {
00039 
00040 ShiroRekhaSplitter::ShiroRekhaSplitter() {
00041   orig_pix_ = NULL;
00042   segmentation_block_list_ = NULL;
00043   splitted_image_ = NULL;
00044   global_xheight_ = kUnspecifiedXheight;
00045   perform_close_ = false;
00046   debug_image_ = NULL;
00047   pageseg_split_strategy_ = NO_SPLIT;
00048   ocr_split_strategy_ = NO_SPLIT;
00049 }
00050 
00051 ShiroRekhaSplitter::~ShiroRekhaSplitter() {
00052   Clear();
00053 }
00054 
00055 void ShiroRekhaSplitter::Clear() {
00056   pixDestroy(&orig_pix_);
00057   pixDestroy(&splitted_image_);
00058   pageseg_split_strategy_ = NO_SPLIT;
00059   ocr_split_strategy_ = NO_SPLIT;
00060   pixDestroy(&debug_image_);
00061   segmentation_block_list_ = NULL;
00062   global_xheight_ = kUnspecifiedXheight;
00063   perform_close_ = false;
00064 }
00065 
00066 // This method dumps a debug image to the specified location.
00067 void ShiroRekhaSplitter::DumpDebugImage(const char* filename) const {
00068   pixWrite(filename, debug_image_, IFF_PNG);
00069 }
00070 
00071 // On setting the input image, a clone of it is owned by this class.
00072 void ShiroRekhaSplitter::set_orig_pix(Pix* pix) {
00073   if (orig_pix_) {
00074     pixDestroy(&orig_pix_);
00075   }
00076   orig_pix_ = pixClone(pix);
00077 }
00078 
00079 // Top-level method to perform splitting based on current settings.
00080 // Returns true if a split was actually performed.
00081 // split_for_pageseg should be true if the splitting is being done prior to
00082 // page segmentation. This mode uses the flag
00083 // pageseg_devanagari_split_strategy to determine the splitting strategy.
00084 bool ShiroRekhaSplitter::Split(bool split_for_pageseg) {
00085   SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_ :
00086       ocr_split_strategy_;
00087   if (split_strategy == NO_SPLIT) {
00088     return false;  // Nothing to do.
00089   }
00090   ASSERT_HOST(split_strategy == MINIMAL_SPLIT ||
00091               split_strategy == MAXIMAL_SPLIT);
00092   ASSERT_HOST(orig_pix_);
00093   if (devanagari_split_debuglevel > 0) {
00094     tprintf("Splitting shiro-rekha ...\n");
00095     tprintf("Split strategy = %s\n",
00096             split_strategy == MINIMAL_SPLIT ? "Minimal" : "Maximal");
00097     tprintf("Initial pageseg available = %s\n",
00098             segmentation_block_list_ ? "yes" : "no");
00099   }
00100   // Create a copy of original image to store the splitting output.
00101   pixDestroy(&splitted_image_);
00102   splitted_image_ = pixCopy(NULL, orig_pix_);
00103 
00104   // Initialize debug image if required.
00105   if (devanagari_split_debugimage) {
00106     pixDestroy(&debug_image_);
00107     debug_image_ = pixConvertTo32(orig_pix_);
00108   }
00109 
00110   // Determine all connected components in the input image. A close operation
00111   // may be required prior to this, depending on the current settings.
00112   Pix* pix_for_ccs = pixClone(orig_pix_);
00113   if (perform_close_ && global_xheight_ != kUnspecifiedXheight &&
00114       !segmentation_block_list_) {
00115     if (devanagari_split_debuglevel > 0) {
00116       tprintf("Performing a global close operation..\n");
00117     }
00118     // A global measure is available for xheight, but no local information
00119     // exists.
00120     pixDestroy(&pix_for_ccs);
00121     pix_for_ccs = pixCopy(NULL, orig_pix_);
00122     PerformClose(pix_for_ccs, global_xheight_);
00123   }
00124   Pixa* ccs;
00125   Boxa* tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8);
00126   boxaDestroy(&tmp_boxa);
00127   pixDestroy(&pix_for_ccs);
00128 
00129   // Iterate over all connected components. Get their bounding boxes and clip
00130   // out the image regions corresponding to these boxes from the original image.
00131   // Conditionally run splitting on each of them.
00132   Boxa* regions_to_clear = boxaCreate(0);
00133   for (int i = 0; i < pixaGetCount(ccs); ++i) {
00134     Box* box = ccs->boxa->box[i];
00135     Pix* word_pix = pixClipRectangle(orig_pix_, box, NULL);
00136     ASSERT_HOST(word_pix);
00137     int xheight = GetXheightForCC(box);
00138     if (xheight == kUnspecifiedXheight && segmentation_block_list_ &&
00139         devanagari_split_debugimage) {
00140       pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0);
00141     }
00142     // If some xheight measure is available, attempt to pre-eliminate small
00143     // blobs from the shiro-rekha process. This is primarily to save the CCs
00144     // corresponding to punctuation marks/small dots etc which are part of
00145     // larger graphemes.
00146     if (xheight == kUnspecifiedXheight ||
00147         (box->w > xheight / 3 && box->h > xheight / 2)) {
00148       SplitWordShiroRekha(split_strategy, word_pix, xheight,
00149                           box->x, box->y, regions_to_clear);
00150     } else if (devanagari_split_debuglevel > 0) {
00151       tprintf("CC dropped from splitting: %d,%d (%d, %d)\n",
00152               box->x, box->y, box->w, box->h);
00153     }
00154     pixDestroy(&word_pix);
00155   }
00156   // Actually clear the boxes now.
00157   for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) {
00158     Box* box = boxaGetBox(regions_to_clear, i, L_CLONE);
00159     pixClearInRect(splitted_image_, box);
00160     boxDestroy(&box);
00161   }
00162   boxaDestroy(&regions_to_clear);
00163   pixaDestroy(&ccs);
00164   if (devanagari_split_debugimage) {
00165     DumpDebugImage(split_for_pageseg ? "pageseg_split_debug.png" :
00166                    "ocr_split_debug.png");
00167   }
00168   return true;
00169 }
00170 
00171 // Method to perform a close operation on the input image. The xheight
00172 // estimate decides the size of sel used.
00173 void ShiroRekhaSplitter::PerformClose(Pix* pix, int xheight_estimate) {
00174   pixCloseBrick(pix, pix, xheight_estimate / 8, xheight_estimate / 3);
00175 }
00176 
00177 // This method resolves the cc bbox to a particular row and returns the row's
00178 // xheight.
00179 int ShiroRekhaSplitter::GetXheightForCC(Box* cc_bbox) {
00180   if (!segmentation_block_list_) {
00181     return global_xheight_;
00182   }
00183   // Compute the box coordinates in Tesseract's coordinate system.
00184   TBOX bbox(cc_bbox->x,
00185             pixGetHeight(orig_pix_) - cc_bbox->y - cc_bbox->h - 1,
00186             cc_bbox->x + cc_bbox->w,
00187             pixGetHeight(orig_pix_) - cc_bbox->y - 1);
00188   // Iterate over all blocks.
00189   BLOCK_IT block_it(segmentation_block_list_);
00190   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
00191     BLOCK* block = block_it.data();
00192     // Iterate over all rows in the block.
00193     ROW_IT row_it(block->row_list());
00194     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00195       ROW* row = row_it.data();
00196       if (!row->bounding_box().major_overlap(bbox)) {
00197         continue;
00198       }
00199       // Row could be skewed, warped, etc. Use the position of the box to
00200       // determine the baseline position of the row for that x-coordinate.
00201       // Create a square TBOX whose baseline's mid-point lies at this point
00202       // and side is row's xheight. Take the overlap of this box with the input
00203       // box and check if it is a 'major overlap'. If so, this box lies in this
00204       // row. In that case, return the xheight for this row.
00205       float box_middle = 0.5 * (bbox.left() + bbox.right());
00206       int baseline = static_cast<int>(row->base_line(box_middle) + 0.5);
00207       TBOX test_box(box_middle - row->x_height() / 2,
00208                     baseline,
00209                     box_middle + row->x_height() / 2,
00210                     static_cast<int>(baseline + row->x_height()));
00211       // Compute overlap. If it is is a major overlap, this is the right row.
00212       if (bbox.major_overlap(test_box)) {
00213         return row->x_height();
00214       }
00215     }
00216   }
00217   // No row found for this bbox.
00218   return kUnspecifiedXheight;
00219 }
00220 
00221 // Returns a list of regions (boxes) which should be cleared in the original
00222 // image so as to perform shiro-rekha splitting. Pix is assumed to carry one
00223 // (or less) word only. Xheight measure could be the global estimate, the row
00224 // estimate, or unspecified. If unspecified, over splitting may occur, since a
00225 // conservative estimate of stroke width along with an associated multiplier
00226 // is used in its place. It is advisable to have a specified xheight when
00227 // splitting for classification/training.
00228 // A vertical projection histogram of all the on-pixels in the input pix is
00229 // computed. The maxima of this histogram is regarded as an approximate location
00230 // of the shiro-rekha. By descending on the maxima's peak on both sides,
00231 // stroke width of shiro-rekha is estimated.
00232 // A horizontal projection histogram is computed for a sub-image of the input
00233 // image, which extends from just below the shiro-rekha down to a certain
00234 // leeway. The leeway depends on the input xheight, if provided, else a
00235 // conservative multiplier on approximate stroke width is used (which may lead
00236 // to over-splitting).
00237 void ShiroRekhaSplitter::SplitWordShiroRekha(SplitStrategy split_strategy,
00238                                              Pix* pix,
00239                                              int xheight,
00240                                              int word_left,
00241                                              int word_top,
00242                                              Boxa* regions_to_clear) {
00243   if (split_strategy == NO_SPLIT) {
00244     return;
00245   }
00246   int width = pixGetWidth(pix);
00247   int height = pixGetHeight(pix);
00248   // Statistically determine the yextents of the shiro-rekha.
00249   int shirorekha_top, shirorekha_bottom, shirorekha_ylevel;
00250   GetShiroRekhaYExtents(pix, &shirorekha_top, &shirorekha_bottom,
00251                         &shirorekha_ylevel);
00252   // Since the shiro rekha is also a stroke, its width is equal to the stroke
00253   // width.
00254   int stroke_width = shirorekha_bottom - shirorekha_top + 1;
00255 
00256   // Some safeguards to protect CCs we do not want to be split.
00257   // These are particularly useful when the word wasn't eliminated earlier
00258   // because xheight information was unavailable.
00259   if (shirorekha_ylevel > height / 2) {
00260     // Shirorekha shouldn't be in the bottom half of the word.
00261     if (devanagari_split_debuglevel > 0) {
00262       tprintf("Skipping splitting CC at (%d, %d): shirorekha in lower half..\n",
00263               word_left, word_top);
00264     }
00265     return;
00266   }
00267   if (stroke_width > height / 3) {
00268     // Even the boldest of fonts shouldn't do this.
00269     if (devanagari_split_debuglevel > 0) {
00270       tprintf("Skipping splitting CC at (%d, %d): stroke width too huge..\n",
00271               word_left, word_top);
00272     }
00273     return;
00274   }
00275 
00276   // Clear the ascender and descender regions of the word.
00277   // Obtain a vertical projection histogram for the resulting image.
00278   Box* box_to_clear = boxCreate(0, shirorekha_top - stroke_width / 3,
00279                                 width, 5 * stroke_width / 3);
00280   Pix* word_in_xheight = pixCopy(NULL, pix);
00281   pixClearInRect(word_in_xheight, box_to_clear);
00282   // Also clear any pixels which are below shirorekha_bottom + some leeway.
00283   // The leeway is set to xheight if the information is available, else it is a
00284   // multiplier applied to the stroke width.
00285   int leeway_to_keep = stroke_width * 3;
00286   if (xheight != kUnspecifiedXheight) {
00287     // This is because the xheight-region typically includes the shiro-rekha
00288     // inside it, i.e., the top of the xheight range corresponds to the top of
00289     // shiro-rekha.
00290     leeway_to_keep = xheight - stroke_width;
00291   }
00292   box_to_clear->y = shirorekha_bottom + leeway_to_keep;
00293   box_to_clear->h = height - box_to_clear->y;
00294   pixClearInRect(word_in_xheight, box_to_clear);
00295   boxDestroy(&box_to_clear);
00296 
00297   PixelHistogram vert_hist;
00298   vert_hist.ConstructVerticalCountHist(word_in_xheight);
00299   pixDestroy(&word_in_xheight);
00300 
00301   // If the number of black pixel in any column of the image is less than a
00302   // fraction of the stroke width, treat it as noise / a stray mark. Perform
00303   // these changes inside the vert_hist data itself, as that is used later on as
00304   // a bit vector for the final split decision at every column.
00305   for (int i = 0; i < width; ++i) {
00306     if (vert_hist.hist()[i] <= stroke_width / 4)
00307       vert_hist.hist()[i] = 0;
00308     else
00309       vert_hist.hist()[i] = 1;
00310   }
00311   // In order to split the line at any point, we make sure that the width of the
00312   // gap is atleast half the stroke width.
00313   int i = 0;
00314   int cur_component_width = 0;
00315   while (i < width) {
00316     if (!vert_hist.hist()[i]) {
00317       int j = 0;
00318       while (i + j < width && !vert_hist.hist()[i+j])
00319         ++j;
00320       if (j >= stroke_width / 2 && cur_component_width >= stroke_width / 2) {
00321         // Perform a shiro-rekha split. The intervening region lies from i to
00322         // i+j-1.
00323         // A minimal single-pixel split makes the estimation of intra- and
00324         // inter-word spacing easier during page layout analysis,
00325         // whereas a maximal split may be needed for OCR, depending on
00326         // how the engine was trained.
00327         bool minimal_split = (split_strategy == MINIMAL_SPLIT);
00328         int split_width = minimal_split ? 1 : j;
00329         int split_left = minimal_split ? i + (j / 2) - (split_width / 2) : i;
00330         if (!minimal_split || (i != 0 && i + j != width)) {
00331           Box* box_to_clear =
00332               boxCreate(word_left + split_left,
00333                         word_top + shirorekha_top - stroke_width / 3,
00334                         split_width,
00335                         5 * stroke_width / 3);
00336           if (box_to_clear) {
00337             boxaAddBox(regions_to_clear, box_to_clear, L_CLONE);
00338             // Mark this in the debug image if needed.
00339             if (devanagari_split_debugimage) {
00340               pixRenderBoxArb(debug_image_, box_to_clear, 1, 128, 255, 128);
00341             }
00342             boxDestroy(&box_to_clear);
00343             cur_component_width = 0;
00344           }
00345         }
00346       }
00347       i += j;
00348     } else {
00349       ++i;
00350       ++cur_component_width;
00351     }
00352   }
00353 }
00354 
00355 // Refreshes the words in the segmentation block list by using blobs in the
00356 // input block list.
00357 // The segmentation block list must be set.
00358 void ShiroRekhaSplitter::RefreshSegmentationWithNewBlobs(
00359     C_BLOB_LIST* new_blobs) {
00360   // The segmentation block list must have been specified.
00361   ASSERT_HOST(segmentation_block_list_);
00362   if (devanagari_split_debuglevel > 0) {
00363     tprintf("Before refreshing blobs:\n");
00364     PrintSegmentationStats(segmentation_block_list_);
00365     tprintf("New Blobs found: %d\n", new_blobs->length());
00366   }
00367 
00368   C_BLOB_LIST not_found_blobs;
00369   RefreshWordBlobsFromNewBlobs(segmentation_block_list_,
00370                                new_blobs,
00371                                ((devanagari_split_debugimage && debug_image_) ?
00372                                 &not_found_blobs : NULL));
00373 
00374   if (devanagari_split_debuglevel > 0) {
00375     tprintf("After refreshing blobs:\n");
00376     PrintSegmentationStats(segmentation_block_list_);
00377   }
00378   if (devanagari_split_debugimage && debug_image_) {
00379     // Plot out the original blobs for which no match was found in the new
00380     // all_blobs list.
00381     C_BLOB_IT not_found_it(&not_found_blobs);
00382     for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list();
00383          not_found_it.forward()) {
00384       C_BLOB* not_found = not_found_it.data();
00385       TBOX not_found_box = not_found->bounding_box();
00386       Box* box_to_plot = GetBoxForTBOX(not_found_box);
00387       pixRenderBoxArb(debug_image_, box_to_plot, 1, 255, 0, 255);
00388       boxDestroy(&box_to_plot);
00389     }
00390 
00391     // Plot out the blobs unused from all blobs.
00392     C_BLOB_IT all_blobs_it(new_blobs);
00393     for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list();
00394          all_blobs_it.forward()) {
00395       C_BLOB* a_blob = all_blobs_it.data();
00396       Box* box_to_plot = GetBoxForTBOX(a_blob->bounding_box());
00397       pixRenderBoxArb(debug_image_, box_to_plot, 3, 0, 127, 0);
00398       boxDestroy(&box_to_plot);
00399     }
00400   }
00401 }
00402 
00403 // Returns a new box object for the corresponding TBOX, based on the original
00404 // image's coordinate system.
00405 Box* ShiroRekhaSplitter::GetBoxForTBOX(const TBOX& tbox) const {
00406   return boxCreate(tbox.left(), pixGetHeight(orig_pix_) - tbox.top() - 1,
00407                    tbox.width(), tbox.height());
00408 }
00409 
00410 // This method returns the computed mode-height of blobs in the pix.
00411 // It also prunes very small blobs from calculation.
00412 int ShiroRekhaSplitter::GetModeHeight(Pix* pix) {
00413   Boxa* boxa = pixConnComp(pix, NULL, 8);
00414   STATS heights(0, pixGetHeight(pix));
00415   heights.clear();
00416   for (int i = 0; i < boxaGetCount(boxa); ++i) {
00417     Box* box = boxaGetBox(boxa, i, L_CLONE);
00418     if (box->h >= 3 || box->w >= 3) {
00419       heights.add(box->h, 1);
00420     }
00421     boxDestroy(&box);
00422   }
00423   boxaDestroy(&boxa);
00424   return heights.mode();
00425 }
00426 
00427 // This method returns y-extents of the shiro-rekha computed from the input
00428 // word image.
00429 void ShiroRekhaSplitter::GetShiroRekhaYExtents(Pix* word_pix,
00430                                                int* shirorekha_top,
00431                                                int* shirorekha_bottom,
00432                                                int* shirorekha_ylevel) {
00433   // Compute a histogram from projecting the word on a vertical line.
00434   PixelHistogram hist_horiz;
00435   hist_horiz.ConstructHorizontalCountHist(word_pix);
00436   // Get the ylevel where the top-line exists. This is basically the global
00437   // maxima in the horizontal histogram.
00438   int topline_onpixel_count = 0;
00439   int topline_ylevel = hist_horiz.GetHistogramMaximum(&topline_onpixel_count);
00440 
00441   // Get the upper and lower extents of the shiro rekha.
00442   int thresh = (topline_onpixel_count * 70) / 100;
00443   int ulimit = topline_ylevel;
00444   int llimit = topline_ylevel;
00445   while (ulimit > 0 && hist_horiz.hist()[ulimit] >= thresh)
00446     --ulimit;
00447   while (llimit < pixGetHeight(word_pix) && hist_horiz.hist()[llimit] >= thresh)
00448     ++llimit;
00449 
00450   if (shirorekha_top) *shirorekha_top = ulimit;
00451   if (shirorekha_bottom) *shirorekha_bottom = llimit;
00452   if (shirorekha_ylevel) *shirorekha_ylevel = topline_ylevel;
00453 }
00454 
00455 // This method returns the global-maxima for the histogram. The frequency of
00456 // the global maxima is returned in count, if specified.
00457 int PixelHistogram::GetHistogramMaximum(int* count) const {
00458   int best_value = 0;
00459   for (int i = 0; i < length_; ++i) {
00460     if (hist_[i] > hist_[best_value]) {
00461       best_value = i;
00462     }
00463   }
00464   if (count) {
00465     *count = hist_[best_value];
00466   }
00467   return best_value;
00468 }
00469 
00470 // Methods to construct histograms from images.
00471 void PixelHistogram::ConstructVerticalCountHist(Pix* pix) {
00472   Clear();
00473   int width = pixGetWidth(pix);
00474   int height = pixGetHeight(pix);
00475   hist_ = new int[width];
00476   length_ = width;
00477   int wpl = pixGetWpl(pix);
00478   l_uint32 *data = pixGetData(pix);
00479   for (int i = 0; i < width; ++i)
00480     hist_[i] = 0;
00481   for (int i = 0; i < height; ++i) {
00482     l_uint32 *line = data + i * wpl;
00483     for (int j = 0; j < width; ++j)
00484       if (GET_DATA_BIT(line, j))
00485         ++(hist_[j]);
00486   }
00487 }
00488 
00489 void PixelHistogram::ConstructHorizontalCountHist(Pix* pix) {
00490   Clear();
00491   Numa* counts = pixCountPixelsByRow(pix, NULL);
00492   length_ = numaGetCount(counts);
00493   hist_ = new int[length_];
00494   for (int i = 0; i < length_; ++i) {
00495     l_int32 val = 0;
00496     numaGetIValue(counts, i, &val);
00497     hist_[i] = val;
00498   }
00499   numaDestroy(&counts);
00500 }
00501 
00502 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines