tesseract
3.03
|
00001 /********************************************************************** 00002 * File: pagesegmain.cpp 00003 * Description: Top-level page segmenter for Tesseract. 00004 * Author: Ray Smith 00005 * Created: Thu Sep 25 17:12:01 PDT 2008 00006 * 00007 * (C) Copyright 2008, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _WIN32 00021 #ifndef __GNUC__ 00022 #include <windows.h> 00023 #endif // __GNUC__ 00024 #ifndef unlink 00025 #include <io.h> 00026 #endif 00027 #else 00028 #include <unistd.h> 00029 #endif // _WIN32 00030 #ifdef _MSC_VER 00031 #pragma warning(disable:4244) // Conversion warnings 00032 #endif 00033 00034 // Include automatically generated configuration file if running autoconf. 00035 #ifdef HAVE_CONFIG_H 00036 #include "config_auto.h" 00037 #endif 00038 00039 #include "allheaders.h" 00040 #include "blobbox.h" 00041 #include "blread.h" 00042 #include "colfind.h" 00043 #include "equationdetect.h" 00044 #include "imagefind.h" 00045 #include "linefind.h" 00046 #include "makerow.h" 00047 #include "osdetect.h" 00048 #include "tabvector.h" 00049 #include "tesseractclass.h" 00050 #include "tessvars.h" 00051 #include "textord.h" 00052 #include "tordmain.h" 00053 #include "wordseg.h" 00054 00055 namespace tesseract { 00056 00058 const int kMinCredibleResolution = 70; 00060 const int kDefaultResolution = 300; 00061 // Max erosions to perform in removing an enclosing circle. 00062 const int kMaxCircleErosions = 8; 00063 00064 // Helper to remove an enclosing circle from an image. 00065 // If there isn't one, then the image will most likely get badly mangled. 00066 // The returned pix must be pixDestroyed after use. NULL may be returned 00067 // if the image doesn't meet the trivial conditions that it uses to determine 00068 // success. 00069 static Pix* RemoveEnclosingCircle(Pix* pixs) { 00070 Pix* pixsi = pixInvert(NULL, pixs); 00071 Pix* pixc = pixCreateTemplate(pixs); 00072 pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET); 00073 pixSeedfillBinary(pixc, pixc, pixsi, 4); 00074 pixInvert(pixc, pixc); 00075 pixDestroy(&pixsi); 00076 Pix* pixt = pixAnd(NULL, pixs, pixc); 00077 l_int32 max_count; 00078 pixCountConnComp(pixt, 8, &max_count); 00079 // The count has to go up before we start looking for the minimum. 00080 l_int32 min_count = MAX_INT32; 00081 Pix* pixout = NULL; 00082 for (int i = 1; i < kMaxCircleErosions; i++) { 00083 pixDestroy(&pixt); 00084 pixErodeBrick(pixc, pixc, 3, 3); 00085 pixt = pixAnd(NULL, pixs, pixc); 00086 l_int32 count; 00087 pixCountConnComp(pixt, 8, &count); 00088 if (i == 1 || count > max_count) { 00089 max_count = count; 00090 min_count = count; 00091 } else if (i > 1 && count < min_count) { 00092 min_count = count; 00093 pixDestroy(&pixout); 00094 pixout = pixCopy(NULL, pixt); // Save the best. 00095 } else if (count >= min_count) { 00096 break; // We have passed by the best. 00097 } 00098 } 00099 pixDestroy(&pixt); 00100 pixDestroy(&pixc); 00101 return pixout; 00102 } 00103 00109 int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, 00110 Tesseract* osd_tess, OSResults* osr) { 00111 ASSERT_HOST(pix_binary_ != NULL); 00112 int width = pixGetWidth(pix_binary_); 00113 int height = pixGetHeight(pix_binary_); 00114 // Get page segmentation mode. 00115 PageSegMode pageseg_mode = static_cast<PageSegMode>( 00116 static_cast<int>(tessedit_pageseg_mode)); 00117 // If a UNLV zone file can be found, use that instead of segmentation. 00118 if (!PSM_COL_FIND_ENABLED(pageseg_mode) && 00119 input_file != NULL && input_file->length() > 0) { 00120 STRING name = *input_file; 00121 const char* lastdot = strrchr(name.string(), '.'); 00122 if (lastdot != NULL) 00123 name[lastdot - name.string()] = '\0'; 00124 read_unlv_file(name, width, height, blocks); 00125 } 00126 if (blocks->empty()) { 00127 // No UNLV file present. Work according to the PageSegMode. 00128 // First make a single block covering the whole image. 00129 BLOCK_IT block_it(blocks); 00130 BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height); 00131 block->set_right_to_left(right_to_left()); 00132 block_it.add_to_end(block); 00133 } else { 00134 // UNLV file present. Use PSM_SINGLE_BLOCK. 00135 pageseg_mode = PSM_SINGLE_BLOCK; 00136 } 00137 int auto_page_seg_ret_val = 0; 00138 TO_BLOCK_LIST to_blocks; 00139 if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) || 00140 PSM_SPARSE(pageseg_mode)) { 00141 auto_page_seg_ret_val = 00142 AutoPageSeg(pageseg_mode, blocks, &to_blocks, osd_tess, osr); 00143 if (pageseg_mode == PSM_OSD_ONLY) 00144 return auto_page_seg_ret_val; 00145 // To create blobs from the image region bounds uncomment this line: 00146 // to_blocks.clear(); // Uncomment to go back to the old mode. 00147 } else { 00148 deskew_ = FCOORD(1.0f, 0.0f); 00149 reskew_ = FCOORD(1.0f, 0.0f); 00150 if (pageseg_mode == PSM_CIRCLE_WORD) { 00151 Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_); 00152 if (pixcleaned != NULL) { 00153 pixDestroy(&pix_binary_); 00154 pix_binary_ = pixcleaned; 00155 } 00156 } 00157 } 00158 00159 if (auto_page_seg_ret_val < 0) { 00160 return -1; 00161 } 00162 00163 if (blocks->empty()) { 00164 if (textord_debug_tabfind) 00165 tprintf("Empty page\n"); 00166 return 0; // AutoPageSeg found an empty page. 00167 } 00168 bool splitting = 00169 pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT; 00170 bool cjk_mode = textord_use_cjk_fp_model; 00171 00172 textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_, 00173 pix_thresholds_, pix_grey_, splitting || cjk_mode, 00174 blocks, &to_blocks); 00175 return auto_page_seg_ret_val; 00176 } 00177 00178 // Helper writes a grey image to a file for use by scrollviewer. 00179 // Normally for speed we don't display the image in the layout debug windows. 00180 // If textord_debug_images is true, we draw the image as a background to some 00181 // of the debug windows. printable determines whether these 00182 // images are optimized for printing instead of screen display. 00183 static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { 00184 Pix* grey_pix = pixCreate(pixGetWidth(pix_binary), 00185 pixGetHeight(pix_binary), 8); 00186 // Printable images are light grey on white, but for screen display 00187 // they are black on dark grey so the other colors show up well. 00188 if (printable) { 00189 pixSetAll(grey_pix); 00190 pixSetMasked(grey_pix, pix_binary, 192); 00191 } else { 00192 pixSetAllArbitrary(grey_pix, 64); 00193 pixSetMasked(grey_pix, pix_binary, 0); 00194 } 00195 AlignedBlob::IncrementDebugPix(); 00196 pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG); 00197 pixDestroy(&grey_pix); 00198 } 00199 00200 00220 int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, 00221 BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, 00222 Tesseract* osd_tess, OSResults* osr) { 00223 if (textord_debug_images) { 00224 WriteDebugBackgroundImage(textord_debug_printable, pix_binary_); 00225 } 00226 Pix* photomask_pix = NULL; 00227 Pix* musicmask_pix = NULL; 00228 // The blocks made by the ColumnFinder. Moved to blocks before return. 00229 BLOCK_LIST found_blocks; 00230 TO_BLOCK_LIST temp_blocks; 00231 00232 bool single_column = !PSM_COL_FIND_ENABLED(pageseg_mode); 00233 bool osd_enabled = PSM_OSD_ENABLED(pageseg_mode); 00234 bool osd_only = pageseg_mode == PSM_OSD_ONLY; 00235 ColumnFinder* finder = SetupPageSegAndDetectOrientation( 00236 single_column, osd_enabled, osd_only, blocks, osd_tess, osr, 00237 &temp_blocks, &photomask_pix, &musicmask_pix); 00238 int result = 0; 00239 if (finder != NULL) { 00240 TO_BLOCK_IT to_block_it(&temp_blocks); 00241 TO_BLOCK* to_block = to_block_it.data(); 00242 if (musicmask_pix != NULL) { 00243 // TODO(rays) pass the musicmask_pix into FindBlocks and mark music 00244 // blocks separately. For now combine with photomask_pix. 00245 pixOr(photomask_pix, photomask_pix, musicmask_pix); 00246 } 00247 if (equ_detect_) { 00248 finder->SetEquationDetect(equ_detect_); 00249 } 00250 result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, 00251 to_block, photomask_pix, 00252 pix_thresholds_, pix_grey_, 00253 &found_blocks, to_blocks); 00254 if (result >= 0) 00255 finder->GetDeskewVectors(&deskew_, &reskew_); 00256 delete finder; 00257 } 00258 pixDestroy(&photomask_pix); 00259 pixDestroy(&musicmask_pix); 00260 if (result < 0) return result; 00261 00262 blocks->clear(); 00263 BLOCK_IT block_it(blocks); 00264 // Move the found blocks to the input/output blocks. 00265 block_it.add_list_after(&found_blocks); 00266 00267 if (textord_debug_images) { 00268 // The debug image is no longer needed so delete it. 00269 unlink(AlignedBlob::textord_debug_pix().string()); 00270 } 00271 return result; 00272 } 00273 00287 ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation( 00288 bool single_column, bool osd, bool only_osd, 00289 BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr, 00290 TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, Pix** music_mask_pix) { 00291 int vertical_x = 0; 00292 int vertical_y = 1; 00293 TabVector_LIST v_lines; 00294 TabVector_LIST h_lines; 00295 ICOORD bleft(0, 0); 00296 00297 ASSERT_HOST(pix_binary_ != NULL); 00298 if (tessedit_dump_pageseg_images) { 00299 pixWrite("tessinput.png", pix_binary_, IFF_PNG); 00300 } 00301 // Leptonica is used to find the rule/separator lines in the input. 00302 LineFinder::FindAndRemoveLines(source_resolution_, 00303 textord_tabfind_show_vlines, pix_binary_, 00304 &vertical_x, &vertical_y, music_mask_pix, 00305 &v_lines, &h_lines); 00306 if (tessedit_dump_pageseg_images) 00307 pixWrite("tessnolines.png", pix_binary_, IFF_PNG); 00308 // Leptonica is used to find a mask of the photo regions in the input. 00309 *photo_mask_pix = ImageFind::FindImages(pix_binary_); 00310 if (tessedit_dump_pageseg_images) 00311 pixWrite("tessnoimages.png", pix_binary_, IFF_PNG); 00312 if (single_column) 00313 v_lines.clear(); 00314 00315 // The rest of the algorithm uses the usual connected components. 00316 textord_.find_components(pix_binary_, blocks, to_blocks); 00317 00318 TO_BLOCK_IT to_block_it(to_blocks); 00319 // There must be exactly one input block. 00320 // TODO(rays) handle new textline finding with a UNLV zone file. 00321 ASSERT_HOST(to_blocks->singleton()); 00322 TO_BLOCK* to_block = to_block_it.data(); 00323 TBOX blkbox = to_block->block->bounding_box(); 00324 ColumnFinder* finder = NULL; 00325 00326 if (to_block->line_size >= 2) { 00327 finder = new ColumnFinder(static_cast<int>(to_block->line_size), 00328 blkbox.botleft(), blkbox.topright(), 00329 source_resolution_, textord_use_cjk_fp_model, 00330 &v_lines, &h_lines, vertical_x, vertical_y); 00331 00332 finder->SetupAndFilterNoise(*photo_mask_pix, to_block); 00333 00334 if (equ_detect_) { 00335 equ_detect_->LabelSpecialText(to_block); 00336 } 00337 00338 BLOBNBOX_CLIST osd_blobs; 00339 // osd_orientation is the number of 90 degree rotations to make the 00340 // characters upright. (See osdetect.h for precise definition.) 00341 // We want the text lines horizontal, (vertical text indicates vertical 00342 // textlines) which may conflict (eg vertically written CJK). 00343 int osd_orientation = 0; 00344 bool vertical_text = finder->IsVerticallyAlignedText(to_block, &osd_blobs); 00345 if (osd && osd_tess != NULL && osr != NULL) { 00346 os_detect_blobs(&osd_blobs, osr, osd_tess); 00347 if (only_osd) { 00348 delete finder; 00349 return NULL; 00350 } 00351 osd_orientation = osr->best_result.orientation_id; 00352 double osd_score = osr->orientations[osd_orientation]; 00353 double osd_margin = min_orientation_margin * 2; 00354 for (int i = 0; i < 4; ++i) { 00355 if (i != osd_orientation && 00356 osd_score - osr->orientations[i] < osd_margin) { 00357 osd_margin = osd_score - osr->orientations[i]; 00358 } 00359 } 00360 int best_script_id = osr->best_result.script_id; 00361 const char* best_script_str = 00362 osd_tess->unicharset.get_script_from_script_id(best_script_id); 00363 bool cjk = best_script_id == osd_tess->unicharset.han_sid() || 00364 best_script_id == osd_tess->unicharset.hiragana_sid() || 00365 best_script_id == osd_tess->unicharset.katakana_sid() || 00366 strcmp("Japanese", best_script_str) == 0 || 00367 strcmp("Korean", best_script_str) == 0 || 00368 strcmp("Hangul", best_script_str) == 0; 00369 if (cjk) { 00370 finder->set_cjk_script(true); 00371 } 00372 if (osd_margin < min_orientation_margin) { 00373 // The margin is weak. 00374 if (!cjk && !vertical_text && osd_orientation == 2) { 00375 // upside down latin text is improbable with such a weak margin. 00376 tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: " 00377 "Don't rotate.\n", osd_margin); 00378 osd_orientation = 0; 00379 } else { 00380 tprintf("OSD: Weak margin (%.2f) for %d blob text block, " 00381 "but using orientation anyway: %d\n", 00382 osd_blobs.length(), osd_margin, osd_orientation); 00383 } 00384 } 00385 } 00386 osd_blobs.shallow_clear(); 00387 finder->CorrectOrientation(to_block, vertical_text, osd_orientation); 00388 } 00389 00390 return finder; 00391 } 00392 00393 } // namespace tesseract.