tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/textord/tordmain.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        tordmain.cpp  (Formerly textordp.c)
00003  * Description: C++ top level textord code.
00004  * Author:                  Ray Smith
00005  * Created:                 Tue Jul 28 17:12:33 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifdef HAVE_CONFIG_H
00021 #include "config_auto.h"
00022 #endif
00023 
00024 #ifdef __UNIX__
00025 #include <assert.h>
00026 #endif
00027 #include "stderr.h"
00028 #include "globaloc.h"
00029 #include "blread.h"
00030 #include "blobbox.h"
00031 #include "ccstruct.h"
00032 #include "edgblob.h"
00033 #include "drawtord.h"
00034 #include "makerow.h"
00035 #include "wordseg.h"
00036 #include "textord.h"
00037 #include "tordmain.h"
00038 #include "secname.h"
00039 
00040 #include "allheaders.h"
00041 
00042 const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block";
00043 
00044 #undef EXTERN
00045 #define EXTERN
00046 
00047 #define MAX_NEAREST_DIST  600    // upper bound handed to the STATS blob-height histogram in filter_noise_blobs (originally noted as: for block skew stats)
00048 
00049 /**********************************************************************
00050  * SetBlobStrokeWidth
00051  *
00052  * Set the horizontal and vertical stroke widths in the blob.
00053  **********************************************************************/
00054 void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) {
00055   // Cut the blob rectangle into a Pix.
00056   int pix_height = pixGetHeight(pix);
00057   const TBOX& box = blob->bounding_box();
00058   int width = box.width();
00059   int height = box.height();
00060   Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(),
00061                                 width, height);
00062   Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, NULL);
00063   boxDestroy(&blob_pix_box);
00064   Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);
00065   pixDestroy(&pix_blob);
00066   // Compute the stroke widths.
00067   uinT32* data = pixGetData(dist_pix);
00068   int wpl = pixGetWpl(dist_pix);
00069   // Horizontal width of stroke.
00070   STATS h_stats(0, width + 1);
00071   for (int y = 0; y < height; ++y) {
00072     uinT32* pixels = data + y*wpl;
00073     int prev_pixel = 0;
00074     int pixel = GET_DATA_BYTE(pixels, 0);
00075     for (int x = 1; x < width; ++x) {
00076       int next_pixel = GET_DATA_BYTE(pixels, x);
00077       // We are looking for a pixel that is equal to its vertical neighbours,
00078       // yet greater than its left neighbour.
00079       if (prev_pixel < pixel &&
00080           (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
00081           (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
00082         if (pixel > next_pixel) {
00083           // Single local max, so an odd width.
00084           h_stats.add(pixel * 2 - 1, 1);
00085         } else if (pixel == next_pixel && x + 1 < width &&
00086                  pixel > GET_DATA_BYTE(pixels, x + 1)) {
00087           // Double local max, so an even width.
00088           h_stats.add(pixel * 2, 1);
00089         }
00090       }
00091       prev_pixel = pixel;
00092       pixel = next_pixel;
00093     }
00094   }
00095   // Vertical width of stroke.
00096   STATS v_stats(0, height + 1);
00097   for (int x = 0; x < width; ++x) {
00098     int prev_pixel = 0;
00099     int pixel = GET_DATA_BYTE(data, x);
00100     for (int y = 1; y < height; ++y) {
00101       uinT32* pixels = data + y*wpl;
00102       int next_pixel = GET_DATA_BYTE(pixels, x);
00103       // We are looking for a pixel that is equal to its horizontal neighbours,
00104       // yet greater than its upper neighbour.
00105       if (prev_pixel < pixel &&
00106           (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
00107           (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
00108         if (pixel > next_pixel) {
00109           // Single local max, so an odd width.
00110           v_stats.add(pixel * 2 - 1, 1);
00111         } else if (pixel == next_pixel && y + 1 < height &&
00112                  pixel > GET_DATA_BYTE(pixels + wpl, x)) {
00113           // Double local max, so an even width.
00114           v_stats.add(pixel * 2, 1);
00115         }
00116       }
00117       prev_pixel = pixel;
00118       pixel = next_pixel;
00119     }
00120   }
00121   pixDestroy(&dist_pix);
00122   // Store the horizontal and vertical width in the blob, keeping both
00123   // widths if there is enough information, otherwse only the one with
00124   // the most samples.
00125   // If there are insufficent samples, store zero, rather than using
00126   // 2*area/perimeter, as the numbers that gives do not match the numbers
00127   // from the distance method.
00128   if (h_stats.get_total() >= (width + height) / 4) {
00129     blob->set_horz_stroke_width(h_stats.ile(0.5f));
00130     if (v_stats.get_total() >= (width + height) / 4)
00131       blob->set_vert_stroke_width(v_stats.ile(0.5f));
00132     else
00133       blob->set_vert_stroke_width(0.0f);
00134   } else {
00135     if (v_stats.get_total() >= (width + height) / 4 ||
00136         v_stats.get_total() > h_stats.get_total()) {
00137       blob->set_horz_stroke_width(0.0f);
00138       blob->set_vert_stroke_width(v_stats.ile(0.5f));
00139     } else {
00140       blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f)
00141                                                           : 0.0f);
00142       blob->set_vert_stroke_width(0.0f);
00143     }
00144   }
00145 }
00146 
00147 
00148 /**********************************************************************
00149  * assign_blobs_to_blocks2
00150  *
00151  * Make a list of TO_BLOCKs for portrait and landscape orientation.
00152  **********************************************************************/
00153 
00154 void assign_blobs_to_blocks2(Pix* pix,
00155                              BLOCK_LIST *blocks,          // blocks to process
00156                              TO_BLOCK_LIST *port_blocks) {  // output list
00157   BLOCK *block;                  // current block
00158   BLOBNBOX *newblob;             // created blob
00159   C_BLOB *blob;                  // current blob
00160   BLOCK_IT block_it = blocks;
00161   C_BLOB_IT blob_it;             // iterator
00162   BLOBNBOX_IT port_box_it;       // iterator
00163                                  // destination iterator
00164   TO_BLOCK_IT port_block_it = port_blocks;
00165   TO_BLOCK *port_block;          // created block
00166 
00167   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
00168     block = block_it.data();
00169     port_block = new TO_BLOCK(block);
00170 
00171     // Convert the good outlines to block->blob_list
00172     port_box_it.set_to_list(&port_block->blobs);
00173     blob_it.set_to_list(block->blob_list());
00174     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00175       blob = blob_it.extract();
00176       newblob = new BLOBNBOX(blob);  // Convert blob to BLOBNBOX.
00177       SetBlobStrokeWidth(pix, newblob);
00178       port_box_it.add_after_then_move(newblob);
00179     }
00180 
00181     // Put the rejected outlines in block->noise_blobs, which allows them to
00182     // be reconsidered and sorted back into rows and recover outlines mistakenly
00183     // rejected.
00184     port_box_it.set_to_list(&port_block->noise_blobs);
00185     blob_it.set_to_list(block->reject_blobs());
00186     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00187       blob = blob_it.extract();
00188       newblob = new BLOBNBOX(blob);  // Convert blob to BLOBNBOX.
00189       SetBlobStrokeWidth(pix, newblob);
00190       port_box_it.add_after_then_move(newblob);
00191     }
00192 
00193     port_block_it.add_after_then_move(port_block);
00194   }
00195 }
00196 
00197 namespace tesseract {
00198 /**********************************************************************
00199  * find_components
00200  *
00201  * Find the C_OUTLINEs of the connected components in each block, put them
00202  * in C_BLOBs, and filter them by size, putting the different size
00203  * grades on different lists in the matching TO_BLOCK in to_blocks.
00204  **********************************************************************/
00205 
00206 void Textord::find_components(Pix* pix, BLOCK_LIST *blocks,
00207                               TO_BLOCK_LIST *to_blocks) {
00208   int width = pixGetWidth(pix);
00209   int height = pixGetHeight(pix);
00210   if (width > MAX_INT16 || height > MAX_INT16) {
00211     tprintf("Input image too large! (%d, %d)\n", width, height);
00212     return;  // Can't handle it.
00213   }
00214 
00215   set_global_loc_code(LOC_EDGE_PROG);
00216 
00217   BLOCK_IT block_it(blocks);    // iterator
00218   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00219        block_it.forward()) {
00220     BLOCK* block = block_it.data();
00221     if (block->poly_block() == NULL || block->poly_block()->IsText()) {
00222       extract_edges(pix, block);
00223     }
00224   }
00225 
00226   assign_blobs_to_blocks2(pix, blocks, to_blocks);
00227   ICOORD page_tr(width, height);
00228   filter_blobs(page_tr, to_blocks, !textord_test_landscape);
00229 }
00230 
00231 /**********************************************************************
00232  * filter_blobs
00233  *
00234  * Sort the blobs into sizes in all the blocks for later work.
00235  **********************************************************************/
00236 
00237 void Textord::filter_blobs(ICOORD page_tr,         // top right
00238                            TO_BLOCK_LIST *blocks,  // output list
00239                            BOOL8 testing_on) {     // for plotting
00240   TO_BLOCK_IT block_it = blocks;          // destination iterator
00241   TO_BLOCK *block;                        // created block
00242 
00243   #ifndef GRAPHICS_DISABLED
00244   if (to_win != NULL)
00245     to_win->Clear();
00246   #endif  // GRAPHICS_DISABLED
00247 
00248   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00249        block_it.forward()) {
00250     block = block_it.data();
00251     block->line_size = filter_noise_blobs(&block->blobs,
00252       &block->noise_blobs,
00253       &block->small_blobs,
00254       &block->large_blobs);
00255     block->line_spacing = block->line_size *
00256         (tesseract::CCStruct::kDescenderFraction +
00257          tesseract::CCStruct::kXHeightFraction +
00258          2 * tesseract::CCStruct::kAscenderFraction) /
00259          tesseract::CCStruct::kXHeightFraction;
00260     block->line_size *= textord_min_linesize;
00261     block->max_blob_size = block->line_size * textord_excess_blobsize;
00262 
00263     #ifndef GRAPHICS_DISABLED
00264     if (textord_show_blobs && testing_on) {
00265       if (to_win == NULL)
00266         create_to_win(page_tr);
00267       block->plot_graded_blobs(to_win);
00268     }
00269     if (textord_show_boxes && testing_on) {
00270       if (to_win == NULL)
00271         create_to_win(page_tr);
00272       plot_box_list(to_win, &block->noise_blobs, ScrollView::WHITE);
00273       plot_box_list(to_win, &block->small_blobs, ScrollView::WHITE);
00274       plot_box_list(to_win, &block->large_blobs, ScrollView::WHITE);
00275       plot_box_list(to_win, &block->blobs, ScrollView::WHITE);
00276     }
00277     #endif  // GRAPHICS_DISABLED
00278   }
00279 }
00280 
00281 /**********************************************************************
00282  * filter_noise_blobs
00283  *
00284  * Move small blobs to a separate list.
00285  **********************************************************************/
00286 
00287 float Textord::filter_noise_blobs(
00288     BLOBNBOX_LIST *src_list,      // original list
00289     BLOBNBOX_LIST *noise_list,    // noise list
00290     BLOBNBOX_LIST *small_list,    // small blobs
00291     BLOBNBOX_LIST *large_list) {  // large blobs
00292   inT16 height;                  //height of blob
00293   inT16 width;                   //of blob
00294   BLOBNBOX *blob;                //current blob
00295   float initial_x;               //first guess
00296   BLOBNBOX_IT src_it = src_list; //iterators
00297   BLOBNBOX_IT noise_it = noise_list;
00298   BLOBNBOX_IT small_it = small_list;
00299   BLOBNBOX_IT large_it = large_list;
00300   STATS size_stats (0, MAX_NEAREST_DIST);
00301   //blob heights
00302   float min_y;                   //size limits
00303   float max_y;
00304   float max_x;
00305   float max_height;              //of good blobs
00306 
00307   for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
00308     blob = src_it.data();
00309     if (blob->bounding_box().height() < textord_max_noise_size)
00310       noise_it.add_after_then_move(src_it.extract());
00311     else if (blob->enclosed_area() >= blob->bounding_box().height()
00312       * blob->bounding_box().width() * textord_noise_area_ratio)
00313       small_it.add_after_then_move(src_it.extract());
00314   }
00315   for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
00316     size_stats.add(src_it.data()->bounding_box().height(), 1);
00317   }
00318   initial_x = size_stats.ile(textord_initialx_ile);
00319   max_y = ceil(initial_x *
00320                (tesseract::CCStruct::kDescenderFraction +
00321                 tesseract::CCStruct::kXHeightFraction +
00322                 2 * tesseract::CCStruct::kAscenderFraction) /
00323                tesseract::CCStruct::kXHeightFraction);
00324   min_y = floor (initial_x / 2);
00325   max_x = ceil (initial_x * textord_width_limit);
00326   small_it.move_to_first ();
00327   for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
00328   small_it.forward ()) {
00329     height = small_it.data()->bounding_box().height();
00330     if (height > max_y)
00331       large_it.add_after_then_move(small_it.extract ());
00332     else if (height >= min_y)
00333       src_it.add_after_then_move(small_it.extract ());
00334   }
00335   size_stats.clear ();
00336   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00337     height = src_it.data ()->bounding_box ().height ();
00338     width = src_it.data ()->bounding_box ().width ();
00339     if (height < min_y)
00340       small_it.add_after_then_move (src_it.extract ());
00341     else if (height > max_y || width > max_x)
00342       large_it.add_after_then_move (src_it.extract ());
00343     else
00344       size_stats.add (height, 1);
00345   }
00346   max_height = size_stats.ile (textord_initialasc_ile);
00347   //      tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,",
00348   //              max_y,min_y,initial_x,max_height);
00349   max_height *= tesseract::CCStruct::kXHeightCapRatio;
00350   if (max_height > initial_x)
00351     initial_x = max_height;
00352   //      tprintf(" ret=%g\n",initial_x);
00353   return initial_x;
00354 }
00355 
00356 // Fixes the block so it obeys all the rules:
00357 // Must have at least one ROW.
00358 // Must have at least one WERD.
00359 // WERDs contain a fake blob.
00360 void Textord::cleanup_nontext_block(BLOCK* block) {
00361   // Non-text blocks must contain at least one row.
00362   ROW_IT row_it(block->row_list());
00363   if (row_it.empty()) {
00364     float height = block->bounding_box().height();
00365     inT32 zero = 0;
00366     ROW* row = new ROW(0, &zero, NULL, height / 2.0f, height / 4.0f,
00367                        height / 4.0f, 0, 1);
00368     row_it.add_after_then_move(row);
00369   }
00370   // Each row must contain at least one word.
00371   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00372     ROW* row = row_it.data();
00373     WERD_IT w_it(row->word_list());
00374     if (w_it.empty()) {
00375       // Make a fake blob to put in the word.
00376       TBOX box = block->row_list()->singleton() ? block->bounding_box()
00377                                                 : row->bounding_box();
00378       C_BLOB* blob = C_BLOB::FakeBlob(box);
00379       C_BLOB_LIST blobs;
00380       C_BLOB_IT blob_it(&blobs);
00381       blob_it.add_after_then_move(blob);
00382       WERD* word = new WERD(&blobs, 0, NULL);
00383       w_it.add_after_then_move(word);
00384     }
00385     // Each word must contain a fake blob.
00386     for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
00387       WERD* word = w_it.data();
00388       // Just assert that this is true, as it would be useful to find
00389       // out why it isn't.
00390       ASSERT_HOST(!word->cblob_list()->empty());
00391     }
00392     row->recalc_bounding_box();
00393   }
00394 }
00395 
00396 /**********************************************************************
00397  * cleanup_blocks
00398  *
00399  * Delete empty blocks, rows from the page.
00400  **********************************************************************/
00401 
00402 void Textord::cleanup_blocks(                    //remove empties
00403                              BLOCK_LIST *blocks  //list
00404                             ) {
00405   BLOCK_IT block_it = blocks;    //iterator
00406   ROW_IT row_it;                 //row iterator
00407 
00408   int num_rows = 0;
00409   int num_rows_all = 0;
00410   int num_blocks = 0;
00411   int num_blocks_all = 0;
00412   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00413        block_it.forward()) {
00414     BLOCK* block = block_it.data();
00415     if (block->poly_block() != NULL && !block->poly_block()->IsText()) {
00416       cleanup_nontext_block(block);
00417       continue;
00418     }
00419     num_rows = 0;
00420     num_rows_all = 0;
00421     row_it.set_to_list(block->row_list());
00422     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
00423       ++num_rows_all;
00424       clean_small_noise_from_words(row_it.data());
00425       if ((textord_noise_rejrows && !row_it.data()->word_list()->empty() &&
00426            clean_noise_from_row(row_it.data())) ||
00427           row_it.data()->word_list()->empty()) {
00428         delete row_it.extract();  // lose empty row.
00429       } else {
00430         if (textord_noise_rejwords)
00431           clean_noise_from_words(row_it.data());
00432         if (textord_blshift_maxshift >= 0)
00433           tweak_row_baseline(row_it.data(),
00434                              textord_blshift_maxshift,
00435                              textord_blshift_xfraction);
00436         ++num_rows;
00437       }
00438     }
00439     if (block->row_list()->empty()) {
00440       delete block_it.extract();  // Lose empty text blocks.
00441     } else {
00442       ++num_blocks;
00443     }
00444     ++num_blocks_all;
00445     if (textord_noise_debug)
00446       tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all);
00447   }
00448   if (textord_noise_debug)
00449     tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all);
00450 }
00451 
00452 
00453 /**********************************************************************
00454  * clean_noise_from_row
00455  *
00456  * Move blobs of words from rows of garbage into the reject blobs list.
00457  **********************************************************************/
00458 
00459 BOOL8 Textord::clean_noise_from_row(          //remove empties
00460                                     ROW *row  //row to clean
00461                                    ) {
00462   BOOL8 testing_on;
00463   TBOX blob_box;                  //bounding box
00464   C_BLOB *blob;                  //current blob
00465   C_OUTLINE *outline;            //current outline
00466   WERD *word;                    //current word
00467   inT32 blob_size;               //biggest size
00468   inT32 trans_count = 0;         //no of transitions
00469   inT32 trans_threshold;         //noise tolerance
00470   inT32 dot_count;               //small objects
00471   inT32 norm_count;              //normal objects
00472   inT32 super_norm_count;        //real char-like
00473                                  //words of row
00474   WERD_IT word_it = row->word_list ();
00475   C_BLOB_IT blob_it;             //blob iterator
00476   C_OUTLINE_IT out_it;           //outline iterator
00477 
00478   if (textord_test_y > row->base_line (textord_test_x)
00479     && textord_show_blobs
00480     && textord_test_y < row->base_line (textord_test_x) + row->x_height ())
00481     testing_on = TRUE;
00482   else
00483     testing_on = FALSE;
00484   dot_count = 0;
00485   norm_count = 0;
00486   super_norm_count = 0;
00487   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00488     word = word_it.data ();      //current word
00489                                  //blobs in word
00490     blob_it.set_to_list (word->cblob_list ());
00491     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00492     blob_it.forward ()) {
00493       blob = blob_it.data ();
00494       if (!word->flag (W_DONT_CHOP)) {
00495                                  //get outlines
00496         out_it.set_to_list (blob->out_list ());
00497         for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
00498         out_it.forward ()) {
00499           outline = out_it.data ();
00500           blob_box = outline->bounding_box ();
00501           blob_size =
00502             blob_box.width () >
00503             blob_box.height ()? blob_box.width () : blob_box.
00504             height();
00505           if (blob_size < textord_noise_sizelimit * row->x_height ())
00506             dot_count++;         //count smal outlines
00507           if (!outline->child ()->empty ()
00508             && blob_box.height () <
00509             (1 + textord_noise_syfract) * row->x_height ()
00510             && blob_box.height () >
00511             (1 - textord_noise_syfract) * row->x_height ()
00512             && blob_box.width () <
00513             (1 + textord_noise_sxfract) * row->x_height ()
00514             && blob_box.width () >
00515             (1 - textord_noise_sxfract) * row->x_height ())
00516             super_norm_count++;  //count smal outlines
00517         }
00518       }
00519       else
00520         super_norm_count++;
00521       blob_box = blob->bounding_box ();
00522       blob_size =
00523         blob_box.width () >
00524         blob_box.height ()? blob_box.width () : blob_box.height ();
00525       if (blob_size >= textord_noise_sizelimit * row->x_height ()
00526           && blob_size < row->x_height () * 2) {
00527         trans_threshold = blob_size / textord_noise_sizefraction;
00528         trans_count = blob->count_transitions (trans_threshold);
00529         if (trans_count < textord_noise_translimit)
00530           norm_count++;
00531       }
00532       else if (blob_box.height () > row->x_height () * 2
00533         && (!word_it.at_first () || !blob_it.at_first ()))
00534         dot_count += 2;
00535       #ifndef SECURE_NAMES
00536       if (testing_on) {
00537         tprintf
00538           ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
00539           blob_box.left (), blob_box.bottom (), blob_box.right (),
00540           blob_box.top (), blob->out_list ()->length (), trans_count,
00541           blob_box.bottom () - row->base_line (blob_box.left ()));
00542       }
00543       #endif
00544     }
00545   }
00546   #ifndef SECURE_NAMES
00547   if (textord_noise_debug) {
00548     tprintf ("Row ending at (%d,%g):",
00549       blob_box.right (), row->base_line (blob_box.right ()));
00550     tprintf (" R=%g, dc=%d, nc=%d, %s\n",
00551       norm_count > 0 ? (float) dot_count / norm_count : 9999,
00552       dot_count, norm_count,
00553       dot_count > norm_count * textord_noise_normratio
00554       && dot_count > 2 ? "REJECTED" : "ACCEPTED");
00555   }
00556   #endif
00557   return super_norm_count < textord_noise_sncount
00558     && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
00559 }
00560 
00561 /**********************************************************************
00562  * clean_noise_from_words
00563  *
00564  * Move blobs of words from rows of garbage into the reject blobs list.
00565  **********************************************************************/
00566 
00567 void Textord::clean_noise_from_words(          //remove empties
00568                                      ROW *row  //row to clean
00569                                     ) {
00570   TBOX blob_box;                  //bounding box
00571   inT8 *word_dud;                //was it chucked
00572   C_BLOB *blob;                  //current blob
00573   C_OUTLINE *outline;            //current outline
00574   WERD *word;                    //current word
00575   inT32 blob_size;               //biggest size
00576   inT32 trans_count;             //no of transitions
00577   inT32 trans_threshold;         //noise tolerance
00578   inT32 dot_count;               //small objects
00579   inT32 norm_count;              //normal objects
00580   inT32 dud_words;               //number discarded
00581   inT32 ok_words;                //number remaining
00582   inT32 word_index;              //current word
00583                                  //words of row
00584   WERD_IT word_it = row->word_list ();
00585   C_BLOB_IT blob_it;             //blob iterator
00586   C_OUTLINE_IT out_it;           //outline iterator
00587 
00588   ok_words = word_it.length ();
00589   if (ok_words == 0 || textord_no_rejects)
00590     return;
00591   word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8));
00592   dud_words = 0;
00593   ok_words = 0;
00594   word_index = 0;
00595   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00596     word = word_it.data ();      //current word
00597     dot_count = 0;
00598     norm_count = 0;
00599                                  //blobs in word
00600     blob_it.set_to_list (word->cblob_list ());
00601     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00602     blob_it.forward ()) {
00603       blob = blob_it.data ();
00604       if (!word->flag (W_DONT_CHOP)) {
00605                                  //get outlines
00606         out_it.set_to_list (blob->out_list ());
00607         for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
00608         out_it.forward ()) {
00609           outline = out_it.data ();
00610           blob_box = outline->bounding_box ();
00611           blob_size =
00612             blob_box.width () >
00613             blob_box.height ()? blob_box.width () : blob_box.
00614             height();
00615           if (blob_size < textord_noise_sizelimit * row->x_height ())
00616             dot_count++;         //count smal outlines
00617           if (!outline->child ()->empty ()
00618             && blob_box.height () <
00619             (1 + textord_noise_syfract) * row->x_height ()
00620             && blob_box.height () >
00621             (1 - textord_noise_syfract) * row->x_height ()
00622             && blob_box.width () <
00623             (1 + textord_noise_sxfract) * row->x_height ()
00624             && blob_box.width () >
00625             (1 - textord_noise_sxfract) * row->x_height ())
00626             norm_count++;        //count smal outlines
00627         }
00628       }
00629       else
00630         norm_count++;
00631       blob_box = blob->bounding_box ();
00632       blob_size =
00633         blob_box.width () >
00634         blob_box.height ()? blob_box.width () : blob_box.height ();
00635       if (blob_size >= textord_noise_sizelimit * row->x_height ()
00636       && blob_size < row->x_height () * 2) {
00637         trans_threshold = blob_size / textord_noise_sizefraction;
00638         trans_count = blob->count_transitions (trans_threshold);
00639         if (trans_count < textord_noise_translimit)
00640           norm_count++;
00641       }
00642       else if (blob_box.height () > row->x_height () * 2
00643         && (!word_it.at_first () || !blob_it.at_first ()))
00644         dot_count += 2;
00645     }
00646     if (dot_count > 2) {
00647       if (dot_count > norm_count * textord_noise_normratio * 2)
00648         word_dud[word_index] = 2;
00649       else if (dot_count > norm_count * textord_noise_normratio)
00650         word_dud[word_index] = 1;
00651       else
00652         word_dud[word_index] = 0;
00653     }
00654     else
00655       word_dud[word_index] = 0;
00656     if (word_dud[word_index] == 2)
00657       dud_words++;
00658     else
00659       ok_words++;
00660     word_index++;
00661   }
00662 
00663   word_index = 0;
00664   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00665     if (word_dud[word_index] == 2
00666     || (word_dud[word_index] == 1 && dud_words > ok_words)) {
00667       word = word_it.data ();    //current word
00668                                  //rejected blobs
00669       blob_it.set_to_list (word->rej_cblob_list ());
00670                                  //move from blobs
00671       blob_it.add_list_after (word->cblob_list ());
00672     }
00673     word_index++;
00674   }
00675   free_mem(word_dud);
00676 }
00677 
00678 // Remove outlines that are a tiny fraction in either width or height
00679 // of the word height.
00680 void Textord::clean_small_noise_from_words(ROW *row) {
00681   WERD_IT word_it(row->word_list());
00682   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
00683     WERD* word = word_it.data();
00684     int min_size = static_cast<int>(
00685       textord_noise_hfract * word->bounding_box().height() + 0.5);
00686     C_BLOB_IT blob_it(word->cblob_list());
00687     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
00688       C_BLOB* blob = blob_it.data();
00689       C_OUTLINE_IT out_it(blob->out_list());
00690       for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
00691         C_OUTLINE* outline = out_it.data();
00692         outline->RemoveSmallRecursive(min_size, &out_it);
00693       }
00694       if (blob->out_list()->empty()) {
00695         delete blob_it.extract();
00696       }
00697     }
00698     if (word->cblob_list()->empty()) {
00699       if (!word_it.at_last()) {
00700         // The next word is no longer a fuzzy non space if it was before,
00701         // since the word before is about to be deleted.
00702         WERD* next_word = word_it.data_relative(1);
00703         if (next_word->flag(W_FUZZY_NON)) {
00704           next_word->set_flag(W_FUZZY_NON, false);
00705         }
00706       }
00707       delete word_it.extract();
00708     }
00709   }
00710 }
00711 }  // tesseract
00712 
00713 /**********************************************************************
00714  * tweak_row_baseline
00715  *
00716  * Shift baseline to fit the blobs more accurately where they are
00717  * close enough.
00718  **********************************************************************/
00719 
00720 void tweak_row_baseline(ROW *row,
00721                         double blshift_maxshift,
00722                         double blshift_xfraction) {
00723   TBOX blob_box;                 //bounding box
00724   C_BLOB *blob;                  //current blob
00725   WERD *word;                    //current word
00726   inT32 blob_count;              //no of blobs
00727   inT32 src_index;               //source segment
00728   inT32 dest_index;              //destination segment
00729   inT32 *xstarts;                //spline segments
00730   double *coeffs;                //spline coeffs
00731   float ydiff;                   //baseline error
00732   float x_centre;                //centre of blob
00733                                  //words of row
00734   WERD_IT word_it = row->word_list ();
00735   C_BLOB_IT blob_it;             //blob iterator
00736 
00737   blob_count = 0;
00738   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00739     word = word_it.data ();      //current word
00740                                  //get total blobs
00741     blob_count += word->cblob_list ()->length ();
00742   }
00743   if (blob_count == 0)
00744     return;
00745   xstarts =
00746     (inT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) *
00747     sizeof (inT32));
00748   coeffs =
00749     (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 *
00750     sizeof (double));
00751 
00752   src_index = 0;
00753   dest_index = 0;
00754   xstarts[0] = row->baseline.xcoords[0];
00755   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00756     word = word_it.data ();      //current word
00757                                  //blobs in word
00758     blob_it.set_to_list (word->cblob_list ());
00759     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00760     blob_it.forward ()) {
00761       blob = blob_it.data ();
00762       blob_box = blob->bounding_box ();
00763       x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
00764       ydiff = blob_box.bottom () - row->base_line (x_centre);
00765       if (ydiff < 0)
00766         ydiff = -ydiff / row->x_height ();
00767       else
00768         ydiff = ydiff / row->x_height ();
00769       if (ydiff < blshift_maxshift
00770         && blob_box.height () / row->x_height () > blshift_xfraction) {
00771         if (xstarts[dest_index] >= x_centre)
00772           xstarts[dest_index] = blob_box.left ();
00773         coeffs[dest_index * 3] = 0;
00774         coeffs[dest_index * 3 + 1] = 0;
00775         coeffs[dest_index * 3 + 2] = blob_box.bottom ();
00776         //shift it
00777         dest_index++;
00778         xstarts[dest_index] = blob_box.right () + 1;
00779       }
00780       else {
00781         if (xstarts[dest_index] <= x_centre) {
00782           while (row->baseline.xcoords[src_index + 1] <= x_centre
00783           && src_index < row->baseline.segments - 1) {
00784             if (row->baseline.xcoords[src_index + 1] >
00785             xstarts[dest_index]) {
00786               coeffs[dest_index * 3] =
00787                 row->baseline.quadratics[src_index].a;
00788               coeffs[dest_index * 3 + 1] =
00789                 row->baseline.quadratics[src_index].b;
00790               coeffs[dest_index * 3 + 2] =
00791                 row->baseline.quadratics[src_index].c;
00792               dest_index++;
00793               xstarts[dest_index] =
00794                 row->baseline.xcoords[src_index + 1];
00795             }
00796             src_index++;
00797           }
00798           coeffs[dest_index * 3] =
00799             row->baseline.quadratics[src_index].a;
00800           coeffs[dest_index * 3 + 1] =
00801             row->baseline.quadratics[src_index].b;
00802           coeffs[dest_index * 3 + 2] =
00803             row->baseline.quadratics[src_index].c;
00804           dest_index++;
00805           xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
00806         }
00807       }
00808     }
00809   }
00810   while (src_index < row->baseline.segments
00811     && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
00812     src_index++;
00813   while (src_index < row->baseline.segments) {
00814     coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
00815     coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
00816     coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
00817     dest_index++;
00818     src_index++;
00819     xstarts[dest_index] = row->baseline.xcoords[src_index];
00820   }
00821                                  //turn to spline
00822   row->baseline = QSPLINE (dest_index, xstarts, coeffs);
00823   free_mem(xstarts);
00824   free_mem(coeffs);
00825 }
00826 
00827 /**********************************************************************
00828  * blob_y_order
00829  *
00830  * Sort function to sort blobs in y from page top.
00831  **********************************************************************/
00832 
00833 inT32 blob_y_order(              //sort function
00834                    void *item1,  //items to compare
00835                    void *item2) {
00836                                  //converted ptr
00837   BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
00838                                  //converted ptr
00839   BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
00840 
00841   if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ())
00842     return -1;
00843   else if (blob1->bounding_box ().bottom () <
00844     blob2->bounding_box ().bottom ())
00845     return 1;
00846   else {
00847     if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
00848       return -1;
00849     else if (blob1->bounding_box ().left () >
00850       blob2->bounding_box ().left ())
00851       return 1;
00852     else
00853       return 0;
00854   }
00855 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines