tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/pagesegmain.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        pagesegmain.cpp
00003  * Description: Top-level page segmenter for Tesseract.
00004  * Author:      Ray Smith
00005  * Created:     Thu Sep 25 17:12:01 PDT 2008
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef _WIN32
00021 #ifndef __GNUC__
00022 #include <windows.h>
00023 #endif  // __GNUC__
00024 #ifndef unlink
00025 #include <io.h>
00026 #endif
00027 #else
00028 #include <unistd.h>
00029 #endif  // _WIN32
00030 #ifdef _MSC_VER
00031 #pragma warning(disable:4244)  // Conversion warnings
00032 #endif
00033 
00034 // Include automatically generated configuration file if running autoconf.
00035 #ifdef HAVE_CONFIG_H
00036 #include "config_auto.h"
00037 #endif
00038 
00039 #include "allheaders.h"
00040 #include "blobbox.h"
00041 #include "blread.h"
00042 #include "colfind.h"
00043 #include "equationdetect.h"
00044 #include "imagefind.h"
00045 #include "linefind.h"
00046 #include "makerow.h"
00047 #include "osdetect.h"
00048 #include "tabvector.h"
00049 #include "tesseractclass.h"
00050 #include "tessvars.h"
00051 #include "textord.h"
00052 #include "tordmain.h"
00053 #include "wordseg.h"
00054 
00055 namespace tesseract {
00056 
// Lowest resolution value regarded as credible.
// NOTE(review): presumably in pixels per inch; the code that consults this
// constant is not part of this file section - confirm.
const int kMinCredibleResolution = 70;
// Resolution assumed by default.
// NOTE(review): presumably substituted when the stated resolution is below
// kMinCredibleResolution - confirm against the user of this constant.
const int kDefaultResolution = 300;
// Exclusive upper bound on the erosion loop counter used when removing an
// enclosing circle from an image.
const int kMaxCircleErosions = 8;
00063 
00064 // Helper to remove an enclosing circle from an image.
00065 // If there isn't one, then the image will most likely get badly mangled.
00066 // The returned pix must be pixDestroyed after use. NULL may be returned
00067 // if the image doesn't meet the trivial conditions that it uses to determine
00068 // success.
00069 static Pix* RemoveEnclosingCircle(Pix* pixs) {
00070   Pix* pixsi = pixInvert(NULL, pixs);
00071   Pix* pixc = pixCreateTemplate(pixs);
00072   pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET);
00073   pixSeedfillBinary(pixc, pixc, pixsi, 4);
00074   pixInvert(pixc, pixc);
00075   pixDestroy(&pixsi);
00076   Pix* pixt = pixAnd(NULL, pixs, pixc);
00077   l_int32 max_count;
00078   pixCountConnComp(pixt, 8, &max_count);
00079   // The count has to go up before we start looking for the minimum.
00080   l_int32 min_count = MAX_INT32;
00081   Pix* pixout = NULL;
00082   for (int i = 1; i < kMaxCircleErosions; i++) {
00083     pixDestroy(&pixt);
00084     pixErodeBrick(pixc, pixc, 3, 3);
00085     pixt = pixAnd(NULL, pixs, pixc);
00086     l_int32 count;
00087     pixCountConnComp(pixt, 8, &count);
00088     if (i == 1 || count > max_count) {
00089       max_count = count;
00090       min_count = count;
00091     } else if (i > 1 && count < min_count) {
00092       min_count = count;
00093       pixDestroy(&pixout);
00094       pixout = pixCopy(NULL, pixt);  // Save the best.
00095     } else if (count >= min_count) {
00096       break;  // We have passed by the best.
00097     }
00098   }
00099   pixDestroy(&pixt);
00100   pixDestroy(&pixc);
00101   return pixout;
00102 }
00103 
00109 int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
00110                            Tesseract* osd_tess, OSResults* osr) {
00111   ASSERT_HOST(pix_binary_ != NULL);
00112   int width = pixGetWidth(pix_binary_);
00113   int height = pixGetHeight(pix_binary_);
00114   // Get page segmentation mode.
00115   PageSegMode pageseg_mode = static_cast<PageSegMode>(
00116       static_cast<int>(tessedit_pageseg_mode));
00117   // If a UNLV zone file can be found, use that instead of segmentation.
00118   if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
00119       input_file != NULL && input_file->length() > 0) {
00120     STRING name = *input_file;
00121     const char* lastdot = strrchr(name.string(), '.');
00122     if (lastdot != NULL)
00123       name[lastdot - name.string()] = '\0';
00124     read_unlv_file(name, width, height, blocks);
00125   }
00126   if (blocks->empty()) {
00127     // No UNLV file present. Work according to the PageSegMode.
00128     // First make a single block covering the whole image.
00129     BLOCK_IT block_it(blocks);
00130     BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
00131     block->set_right_to_left(right_to_left());
00132     block_it.add_to_end(block);
00133   } else {
00134     // UNLV file present. Use PSM_SINGLE_BLOCK.
00135     pageseg_mode = PSM_SINGLE_BLOCK;
00136   }
00137   int auto_page_seg_ret_val = 0;
00138   TO_BLOCK_LIST to_blocks;
00139   if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
00140       PSM_SPARSE(pageseg_mode)) {
00141     auto_page_seg_ret_val =
00142         AutoPageSeg(pageseg_mode, blocks, &to_blocks, osd_tess, osr);
00143     if (pageseg_mode == PSM_OSD_ONLY)
00144       return auto_page_seg_ret_val;
00145     // To create blobs from the image region bounds uncomment this line:
00146     //  to_blocks.clear();  // Uncomment to go back to the old mode.
00147   } else {
00148     deskew_ = FCOORD(1.0f, 0.0f);
00149     reskew_ = FCOORD(1.0f, 0.0f);
00150     if (pageseg_mode == PSM_CIRCLE_WORD) {
00151       Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
00152       if (pixcleaned != NULL) {
00153         pixDestroy(&pix_binary_);
00154         pix_binary_ = pixcleaned;
00155       }
00156     }
00157   }
00158 
00159   if (auto_page_seg_ret_val < 0) {
00160     return -1;
00161   }
00162 
00163   if (blocks->empty()) {
00164     if (textord_debug_tabfind)
00165       tprintf("Empty page\n");
00166     return 0;  // AutoPageSeg found an empty page.
00167   }
00168   bool splitting =
00169       pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
00170   bool cjk_mode = textord_use_cjk_fp_model;
00171 
00172   textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
00173                        pix_thresholds_, pix_grey_, splitting || cjk_mode,
00174                        blocks, &to_blocks);
00175   return auto_page_seg_ret_val;
00176 }
00177 
00178 // Helper writes a grey image to a file for use by scrollviewer.
00179 // Normally for speed we don't display the image in the layout debug windows.
00180 // If textord_debug_images is true, we draw the image as a background to some
00181 // of the debug windows. printable determines whether these
00182 // images are optimized for printing instead of screen display.
00183 static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
00184   Pix* grey_pix = pixCreate(pixGetWidth(pix_binary),
00185                             pixGetHeight(pix_binary), 8);
00186   // Printable images are light grey on white, but for screen display
00187   // they are black on dark grey so the other colors show up well.
00188   if (printable) {
00189     pixSetAll(grey_pix);
00190     pixSetMasked(grey_pix, pix_binary, 192);
00191   } else {
00192     pixSetAllArbitrary(grey_pix, 64);
00193     pixSetMasked(grey_pix, pix_binary, 0);
00194   }
00195   AlignedBlob::IncrementDebugPix();
00196   pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG);
00197   pixDestroy(&grey_pix);
00198 }
00199 
00200 
00220 int Tesseract::AutoPageSeg(PageSegMode pageseg_mode,
00221                            BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
00222                            Tesseract* osd_tess, OSResults* osr) {
00223   if (textord_debug_images) {
00224     WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
00225   }
00226   Pix* photomask_pix = NULL;
00227   Pix* musicmask_pix = NULL;
00228   // The blocks made by the ColumnFinder. Moved to blocks before return.
00229   BLOCK_LIST found_blocks;
00230   TO_BLOCK_LIST temp_blocks;
00231 
00232   bool single_column = !PSM_COL_FIND_ENABLED(pageseg_mode);
00233   bool osd_enabled = PSM_OSD_ENABLED(pageseg_mode);
00234   bool osd_only = pageseg_mode == PSM_OSD_ONLY;
00235   ColumnFinder* finder = SetupPageSegAndDetectOrientation(
00236       single_column, osd_enabled, osd_only, blocks, osd_tess, osr,
00237       &temp_blocks, &photomask_pix, &musicmask_pix);
00238   int result = 0;
00239   if (finder != NULL) {
00240     TO_BLOCK_IT to_block_it(&temp_blocks);
00241     TO_BLOCK* to_block = to_block_it.data();
00242     if (musicmask_pix != NULL) {
00243       // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
00244       // blocks separately. For now combine with photomask_pix.
00245       pixOr(photomask_pix, photomask_pix, musicmask_pix);
00246     }
00247     if (equ_detect_) {
00248       finder->SetEquationDetect(equ_detect_);
00249     }
00250     result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
00251                                 to_block, photomask_pix,
00252                                 pix_thresholds_, pix_grey_,
00253                                 &found_blocks, to_blocks);
00254     if (result >= 0)
00255       finder->GetDeskewVectors(&deskew_, &reskew_);
00256     delete finder;
00257   }
00258   pixDestroy(&photomask_pix);
00259   pixDestroy(&musicmask_pix);
00260   if (result < 0) return result;
00261 
00262   blocks->clear();
00263   BLOCK_IT block_it(blocks);
00264   // Move the found blocks to the input/output blocks.
00265   block_it.add_list_after(&found_blocks);
00266 
00267   if (textord_debug_images) {
00268     // The debug image is no longer needed so delete it.
00269     unlink(AlignedBlob::textord_debug_pix().string());
00270   }
00271   return result;
00272 }
00273 
00287 ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
00288     bool single_column, bool osd, bool only_osd,
00289     BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr,
00290     TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, Pix** music_mask_pix) {
00291   int vertical_x = 0;
00292   int vertical_y = 1;
00293   TabVector_LIST v_lines;
00294   TabVector_LIST h_lines;
00295   ICOORD bleft(0, 0);
00296 
00297   ASSERT_HOST(pix_binary_ != NULL);
00298   if (tessedit_dump_pageseg_images) {
00299     pixWrite("tessinput.png", pix_binary_, IFF_PNG);
00300   }
00301   // Leptonica is used to find the rule/separator lines in the input.
00302   LineFinder::FindAndRemoveLines(source_resolution_,
00303                                  textord_tabfind_show_vlines, pix_binary_,
00304                                  &vertical_x, &vertical_y, music_mask_pix,
00305                                  &v_lines, &h_lines);
00306   if (tessedit_dump_pageseg_images)
00307     pixWrite("tessnolines.png", pix_binary_, IFF_PNG);
00308   // Leptonica is used to find a mask of the photo regions in the input.
00309   *photo_mask_pix = ImageFind::FindImages(pix_binary_);
00310   if (tessedit_dump_pageseg_images)
00311     pixWrite("tessnoimages.png", pix_binary_, IFF_PNG);
00312   if (single_column)
00313     v_lines.clear();
00314 
00315   // The rest of the algorithm uses the usual connected components.
00316   textord_.find_components(pix_binary_, blocks, to_blocks);
00317 
00318   TO_BLOCK_IT to_block_it(to_blocks);
00319   // There must be exactly one input block.
00320   // TODO(rays) handle new textline finding with a UNLV zone file.
00321   ASSERT_HOST(to_blocks->singleton());
00322   TO_BLOCK* to_block = to_block_it.data();
00323   TBOX blkbox = to_block->block->bounding_box();
00324   ColumnFinder* finder = NULL;
00325 
00326   if (to_block->line_size >= 2) {
00327     finder = new ColumnFinder(static_cast<int>(to_block->line_size),
00328                               blkbox.botleft(), blkbox.topright(),
00329                               source_resolution_, textord_use_cjk_fp_model,
00330                               &v_lines, &h_lines, vertical_x, vertical_y);
00331 
00332     finder->SetupAndFilterNoise(*photo_mask_pix, to_block);
00333 
00334     if (equ_detect_) {
00335       equ_detect_->LabelSpecialText(to_block);
00336     }
00337 
00338     BLOBNBOX_CLIST osd_blobs;
00339     // osd_orientation is the number of 90 degree rotations to make the
00340     // characters upright. (See osdetect.h for precise definition.)
00341     // We want the text lines horizontal, (vertical text indicates vertical
00342     // textlines) which may conflict (eg vertically written CJK).
00343     int osd_orientation = 0;
00344     bool vertical_text = finder->IsVerticallyAlignedText(to_block, &osd_blobs);
00345     if (osd && osd_tess != NULL && osr != NULL) {
00346       os_detect_blobs(&osd_blobs, osr, osd_tess);
00347       if (only_osd) {
00348         delete finder;
00349         return NULL;
00350       }
00351       osd_orientation = osr->best_result.orientation_id;
00352       double osd_score = osr->orientations[osd_orientation];
00353       double osd_margin = min_orientation_margin * 2;
00354       for (int i = 0; i < 4; ++i) {
00355         if (i != osd_orientation &&
00356             osd_score - osr->orientations[i] < osd_margin) {
00357           osd_margin = osd_score - osr->orientations[i];
00358         }
00359       }
00360       int best_script_id = osr->best_result.script_id;
00361       const char* best_script_str =
00362           osd_tess->unicharset.get_script_from_script_id(best_script_id);
00363       bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
00364           best_script_id == osd_tess->unicharset.hiragana_sid() ||
00365           best_script_id == osd_tess->unicharset.katakana_sid() ||
00366           strcmp("Japanese", best_script_str) == 0 ||
00367           strcmp("Korean", best_script_str) == 0 ||
00368           strcmp("Hangul", best_script_str) == 0;
00369       if (cjk) {
00370         finder->set_cjk_script(true);
00371       }
00372       if (osd_margin < min_orientation_margin) {
00373         // The margin is weak.
00374         if (!cjk && !vertical_text && osd_orientation == 2) {
00375           // upside down latin text is improbable with such a weak margin.
00376           tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
00377                   "Don't rotate.\n", osd_margin);
00378           osd_orientation = 0;
00379         } else {
00380           tprintf("OSD: Weak margin (%.2f) for %d blob text block, "
00381                   "but using orientation anyway: %d\n",
00382                   osd_blobs.length(), osd_margin, osd_orientation);
00383         }
00384       }
00385     }
00386     osd_blobs.shallow_clear();
00387     finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
00388   }
00389 
00390   return finder;
00391 }
00392 
00393 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines