// tesseract 3.03
00001 /********************************************************************** 00002 * File: tordmain.cpp (Formerly textordp.c) 00003 * Description: C++ top level textord code. 00004 * Author: Ray Smith 00005 * Created: Tue Jul 28 17:12:33 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef HAVE_CONFIG_H 00021 #include "config_auto.h" 00022 #endif 00023 00024 #ifdef __UNIX__ 00025 #include <assert.h> 00026 #endif 00027 #include "stderr.h" 00028 #include "globaloc.h" 00029 #include "blread.h" 00030 #include "blobbox.h" 00031 #include "ccstruct.h" 00032 #include "edgblob.h" 00033 #include "drawtord.h" 00034 #include "makerow.h" 00035 #include "wordseg.h" 00036 #include "textord.h" 00037 #include "tordmain.h" 00038 #include "secname.h" 00039 00040 #include "allheaders.h" 00041 00042 const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block"; 00043 00044 #undef EXTERN 00045 #define EXTERN 00046 00047 #define MAX_NEAREST_DIST 600 //for block skew stats 00048 00049 /********************************************************************** 00050 * SetBlobStrokeWidth 00051 * 00052 * Set the horizontal and vertical stroke widths in the blob. 
00053 **********************************************************************/ 00054 void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) { 00055 // Cut the blob rectangle into a Pix. 00056 int pix_height = pixGetHeight(pix); 00057 const TBOX& box = blob->bounding_box(); 00058 int width = box.width(); 00059 int height = box.height(); 00060 Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(), 00061 width, height); 00062 Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, NULL); 00063 boxDestroy(&blob_pix_box); 00064 Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG); 00065 pixDestroy(&pix_blob); 00066 // Compute the stroke widths. 00067 uinT32* data = pixGetData(dist_pix); 00068 int wpl = pixGetWpl(dist_pix); 00069 // Horizontal width of stroke. 00070 STATS h_stats(0, width + 1); 00071 for (int y = 0; y < height; ++y) { 00072 uinT32* pixels = data + y*wpl; 00073 int prev_pixel = 0; 00074 int pixel = GET_DATA_BYTE(pixels, 0); 00075 for (int x = 1; x < width; ++x) { 00076 int next_pixel = GET_DATA_BYTE(pixels, x); 00077 // We are looking for a pixel that is equal to its vertical neighbours, 00078 // yet greater than its left neighbour. 00079 if (prev_pixel < pixel && 00080 (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) && 00081 (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) { 00082 if (pixel > next_pixel) { 00083 // Single local max, so an odd width. 00084 h_stats.add(pixel * 2 - 1, 1); 00085 } else if (pixel == next_pixel && x + 1 < width && 00086 pixel > GET_DATA_BYTE(pixels, x + 1)) { 00087 // Double local max, so an even width. 00088 h_stats.add(pixel * 2, 1); 00089 } 00090 } 00091 prev_pixel = pixel; 00092 pixel = next_pixel; 00093 } 00094 } 00095 // Vertical width of stroke. 
00096 STATS v_stats(0, height + 1); 00097 for (int x = 0; x < width; ++x) { 00098 int prev_pixel = 0; 00099 int pixel = GET_DATA_BYTE(data, x); 00100 for (int y = 1; y < height; ++y) { 00101 uinT32* pixels = data + y*wpl; 00102 int next_pixel = GET_DATA_BYTE(pixels, x); 00103 // We are looking for a pixel that is equal to its horizontal neighbours, 00104 // yet greater than its upper neighbour. 00105 if (prev_pixel < pixel && 00106 (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) && 00107 (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) { 00108 if (pixel > next_pixel) { 00109 // Single local max, so an odd width. 00110 v_stats.add(pixel * 2 - 1, 1); 00111 } else if (pixel == next_pixel && y + 1 < height && 00112 pixel > GET_DATA_BYTE(pixels + wpl, x)) { 00113 // Double local max, so an even width. 00114 v_stats.add(pixel * 2, 1); 00115 } 00116 } 00117 prev_pixel = pixel; 00118 pixel = next_pixel; 00119 } 00120 } 00121 pixDestroy(&dist_pix); 00122 // Store the horizontal and vertical width in the blob, keeping both 00123 // widths if there is enough information, otherwse only the one with 00124 // the most samples. 00125 // If there are insufficent samples, store zero, rather than using 00126 // 2*area/perimeter, as the numbers that gives do not match the numbers 00127 // from the distance method. 00128 if (h_stats.get_total() >= (width + height) / 4) { 00129 blob->set_horz_stroke_width(h_stats.ile(0.5f)); 00130 if (v_stats.get_total() >= (width + height) / 4) 00131 blob->set_vert_stroke_width(v_stats.ile(0.5f)); 00132 else 00133 blob->set_vert_stroke_width(0.0f); 00134 } else { 00135 if (v_stats.get_total() >= (width + height) / 4 || 00136 v_stats.get_total() > h_stats.get_total()) { 00137 blob->set_horz_stroke_width(0.0f); 00138 blob->set_vert_stroke_width(v_stats.ile(0.5f)); 00139 } else { 00140 blob->set_horz_stroke_width(h_stats.get_total() > 2 ? 
h_stats.ile(0.5f) 00141 : 0.0f); 00142 blob->set_vert_stroke_width(0.0f); 00143 } 00144 } 00145 } 00146 00147 00148 /********************************************************************** 00149 * assign_blobs_to_blocks2 00150 * 00151 * Make a list of TO_BLOCKs for portrait and landscape orientation. 00152 **********************************************************************/ 00153 00154 void assign_blobs_to_blocks2(Pix* pix, 00155 BLOCK_LIST *blocks, // blocks to process 00156 TO_BLOCK_LIST *port_blocks) { // output list 00157 BLOCK *block; // current block 00158 BLOBNBOX *newblob; // created blob 00159 C_BLOB *blob; // current blob 00160 BLOCK_IT block_it = blocks; 00161 C_BLOB_IT blob_it; // iterator 00162 BLOBNBOX_IT port_box_it; // iterator 00163 // destination iterator 00164 TO_BLOCK_IT port_block_it = port_blocks; 00165 TO_BLOCK *port_block; // created block 00166 00167 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { 00168 block = block_it.data(); 00169 port_block = new TO_BLOCK(block); 00170 00171 // Convert the good outlines to block->blob_list 00172 port_box_it.set_to_list(&port_block->blobs); 00173 blob_it.set_to_list(block->blob_list()); 00174 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { 00175 blob = blob_it.extract(); 00176 newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX. 00177 SetBlobStrokeWidth(pix, newblob); 00178 port_box_it.add_after_then_move(newblob); 00179 } 00180 00181 // Put the rejected outlines in block->noise_blobs, which allows them to 00182 // be reconsidered and sorted back into rows and recover outlines mistakenly 00183 // rejected. 00184 port_box_it.set_to_list(&port_block->noise_blobs); 00185 blob_it.set_to_list(block->reject_blobs()); 00186 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { 00187 blob = blob_it.extract(); 00188 newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX. 
00189 SetBlobStrokeWidth(pix, newblob); 00190 port_box_it.add_after_then_move(newblob); 00191 } 00192 00193 port_block_it.add_after_then_move(port_block); 00194 } 00195 } 00196 00197 namespace tesseract { 00198 /********************************************************************** 00199 * find_components 00200 * 00201 * Find the C_OUTLINEs of the connected components in each block, put them 00202 * in C_BLOBs, and filter them by size, putting the different size 00203 * grades on different lists in the matching TO_BLOCK in to_blocks. 00204 **********************************************************************/ 00205 00206 void Textord::find_components(Pix* pix, BLOCK_LIST *blocks, 00207 TO_BLOCK_LIST *to_blocks) { 00208 int width = pixGetWidth(pix); 00209 int height = pixGetHeight(pix); 00210 if (width > MAX_INT16 || height > MAX_INT16) { 00211 tprintf("Input image too large! (%d, %d)\n", width, height); 00212 return; // Can't handle it. 00213 } 00214 00215 set_global_loc_code(LOC_EDGE_PROG); 00216 00217 BLOCK_IT block_it(blocks); // iterator 00218 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 00219 block_it.forward()) { 00220 BLOCK* block = block_it.data(); 00221 if (block->poly_block() == NULL || block->poly_block()->IsText()) { 00222 extract_edges(pix, block); 00223 } 00224 } 00225 00226 assign_blobs_to_blocks2(pix, blocks, to_blocks); 00227 ICOORD page_tr(width, height); 00228 filter_blobs(page_tr, to_blocks, !textord_test_landscape); 00229 } 00230 00231 /********************************************************************** 00232 * filter_blobs 00233 * 00234 * Sort the blobs into sizes in all the blocks for later work. 
00235 **********************************************************************/ 00236 00237 void Textord::filter_blobs(ICOORD page_tr, // top right 00238 TO_BLOCK_LIST *blocks, // output list 00239 BOOL8 testing_on) { // for plotting 00240 TO_BLOCK_IT block_it = blocks; // destination iterator 00241 TO_BLOCK *block; // created block 00242 00243 #ifndef GRAPHICS_DISABLED 00244 if (to_win != NULL) 00245 to_win->Clear(); 00246 #endif // GRAPHICS_DISABLED 00247 00248 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 00249 block_it.forward()) { 00250 block = block_it.data(); 00251 block->line_size = filter_noise_blobs(&block->blobs, 00252 &block->noise_blobs, 00253 &block->small_blobs, 00254 &block->large_blobs); 00255 block->line_spacing = block->line_size * 00256 (tesseract::CCStruct::kDescenderFraction + 00257 tesseract::CCStruct::kXHeightFraction + 00258 2 * tesseract::CCStruct::kAscenderFraction) / 00259 tesseract::CCStruct::kXHeightFraction; 00260 block->line_size *= textord_min_linesize; 00261 block->max_blob_size = block->line_size * textord_excess_blobsize; 00262 00263 #ifndef GRAPHICS_DISABLED 00264 if (textord_show_blobs && testing_on) { 00265 if (to_win == NULL) 00266 create_to_win(page_tr); 00267 block->plot_graded_blobs(to_win); 00268 } 00269 if (textord_show_boxes && testing_on) { 00270 if (to_win == NULL) 00271 create_to_win(page_tr); 00272 plot_box_list(to_win, &block->noise_blobs, ScrollView::WHITE); 00273 plot_box_list(to_win, &block->small_blobs, ScrollView::WHITE); 00274 plot_box_list(to_win, &block->large_blobs, ScrollView::WHITE); 00275 plot_box_list(to_win, &block->blobs, ScrollView::WHITE); 00276 } 00277 #endif // GRAPHICS_DISABLED 00278 } 00279 } 00280 00281 /********************************************************************** 00282 * filter_noise_blobs 00283 * 00284 * Move small blobs to a separate list. 
00285 **********************************************************************/ 00286 00287 float Textord::filter_noise_blobs( 00288 BLOBNBOX_LIST *src_list, // original list 00289 BLOBNBOX_LIST *noise_list, // noise list 00290 BLOBNBOX_LIST *small_list, // small blobs 00291 BLOBNBOX_LIST *large_list) { // large blobs 00292 inT16 height; //height of blob 00293 inT16 width; //of blob 00294 BLOBNBOX *blob; //current blob 00295 float initial_x; //first guess 00296 BLOBNBOX_IT src_it = src_list; //iterators 00297 BLOBNBOX_IT noise_it = noise_list; 00298 BLOBNBOX_IT small_it = small_list; 00299 BLOBNBOX_IT large_it = large_list; 00300 STATS size_stats (0, MAX_NEAREST_DIST); 00301 //blob heights 00302 float min_y; //size limits 00303 float max_y; 00304 float max_x; 00305 float max_height; //of good blobs 00306 00307 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { 00308 blob = src_it.data(); 00309 if (blob->bounding_box().height() < textord_max_noise_size) 00310 noise_it.add_after_then_move(src_it.extract()); 00311 else if (blob->enclosed_area() >= blob->bounding_box().height() 00312 * blob->bounding_box().width() * textord_noise_area_ratio) 00313 small_it.add_after_then_move(src_it.extract()); 00314 } 00315 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { 00316 size_stats.add(src_it.data()->bounding_box().height(), 1); 00317 } 00318 initial_x = size_stats.ile(textord_initialx_ile); 00319 max_y = ceil(initial_x * 00320 (tesseract::CCStruct::kDescenderFraction + 00321 tesseract::CCStruct::kXHeightFraction + 00322 2 * tesseract::CCStruct::kAscenderFraction) / 00323 tesseract::CCStruct::kXHeightFraction); 00324 min_y = floor (initial_x / 2); 00325 max_x = ceil (initial_x * textord_width_limit); 00326 small_it.move_to_first (); 00327 for (small_it.mark_cycle_pt (); !small_it.cycled_list (); 00328 small_it.forward ()) { 00329 height = small_it.data()->bounding_box().height(); 00330 if (height > max_y) 00331 
large_it.add_after_then_move(small_it.extract ()); 00332 else if (height >= min_y) 00333 src_it.add_after_then_move(small_it.extract ()); 00334 } 00335 size_stats.clear (); 00336 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { 00337 height = src_it.data ()->bounding_box ().height (); 00338 width = src_it.data ()->bounding_box ().width (); 00339 if (height < min_y) 00340 small_it.add_after_then_move (src_it.extract ()); 00341 else if (height > max_y || width > max_x) 00342 large_it.add_after_then_move (src_it.extract ()); 00343 else 00344 size_stats.add (height, 1); 00345 } 00346 max_height = size_stats.ile (textord_initialasc_ile); 00347 // tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,", 00348 // max_y,min_y,initial_x,max_height); 00349 max_height *= tesseract::CCStruct::kXHeightCapRatio; 00350 if (max_height > initial_x) 00351 initial_x = max_height; 00352 // tprintf(" ret=%g\n",initial_x); 00353 return initial_x; 00354 } 00355 00356 // Fixes the block so it obeys all the rules: 00357 // Must have at least one ROW. 00358 // Must have at least one WERD. 00359 // WERDs contain a fake blob. 00360 void Textord::cleanup_nontext_block(BLOCK* block) { 00361 // Non-text blocks must contain at least one row. 00362 ROW_IT row_it(block->row_list()); 00363 if (row_it.empty()) { 00364 float height = block->bounding_box().height(); 00365 inT32 zero = 0; 00366 ROW* row = new ROW(0, &zero, NULL, height / 2.0f, height / 4.0f, 00367 height / 4.0f, 0, 1); 00368 row_it.add_after_then_move(row); 00369 } 00370 // Each row must contain at least one word. 00371 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00372 ROW* row = row_it.data(); 00373 WERD_IT w_it(row->word_list()); 00374 if (w_it.empty()) { 00375 // Make a fake blob to put in the word. 00376 TBOX box = block->row_list()->singleton() ? 
block->bounding_box() 00377 : row->bounding_box(); 00378 C_BLOB* blob = C_BLOB::FakeBlob(box); 00379 C_BLOB_LIST blobs; 00380 C_BLOB_IT blob_it(&blobs); 00381 blob_it.add_after_then_move(blob); 00382 WERD* word = new WERD(&blobs, 0, NULL); 00383 w_it.add_after_then_move(word); 00384 } 00385 // Each word must contain a fake blob. 00386 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { 00387 WERD* word = w_it.data(); 00388 // Just assert that this is true, as it would be useful to find 00389 // out why it isn't. 00390 ASSERT_HOST(!word->cblob_list()->empty()); 00391 } 00392 row->recalc_bounding_box(); 00393 } 00394 } 00395 00396 /********************************************************************** 00397 * cleanup_blocks 00398 * 00399 * Delete empty blocks, rows from the page. 00400 **********************************************************************/ 00401 00402 void Textord::cleanup_blocks( //remove empties 00403 BLOCK_LIST *blocks //list 00404 ) { 00405 BLOCK_IT block_it = blocks; //iterator 00406 ROW_IT row_it; //row iterator 00407 00408 int num_rows = 0; 00409 int num_rows_all = 0; 00410 int num_blocks = 0; 00411 int num_blocks_all = 0; 00412 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 00413 block_it.forward()) { 00414 BLOCK* block = block_it.data(); 00415 if (block->poly_block() != NULL && !block->poly_block()->IsText()) { 00416 cleanup_nontext_block(block); 00417 continue; 00418 } 00419 num_rows = 0; 00420 num_rows_all = 0; 00421 row_it.set_to_list(block->row_list()); 00422 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 00423 ++num_rows_all; 00424 clean_small_noise_from_words(row_it.data()); 00425 if ((textord_noise_rejrows && !row_it.data()->word_list()->empty() && 00426 clean_noise_from_row(row_it.data())) || 00427 row_it.data()->word_list()->empty()) { 00428 delete row_it.extract(); // lose empty row. 
00429 } else { 00430 if (textord_noise_rejwords) 00431 clean_noise_from_words(row_it.data()); 00432 if (textord_blshift_maxshift >= 0) 00433 tweak_row_baseline(row_it.data(), 00434 textord_blshift_maxshift, 00435 textord_blshift_xfraction); 00436 ++num_rows; 00437 } 00438 } 00439 if (block->row_list()->empty()) { 00440 delete block_it.extract(); // Lose empty text blocks. 00441 } else { 00442 ++num_blocks; 00443 } 00444 ++num_blocks_all; 00445 if (textord_noise_debug) 00446 tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all); 00447 } 00448 if (textord_noise_debug) 00449 tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all); 00450 } 00451 00452 00453 /********************************************************************** 00454 * clean_noise_from_row 00455 * 00456 * Move blobs of words from rows of garbage into the reject blobs list. 00457 **********************************************************************/ 00458 00459 BOOL8 Textord::clean_noise_from_row( //remove empties 00460 ROW *row //row to clean 00461 ) { 00462 BOOL8 testing_on; 00463 TBOX blob_box; //bounding box 00464 C_BLOB *blob; //current blob 00465 C_OUTLINE *outline; //current outline 00466 WERD *word; //current word 00467 inT32 blob_size; //biggest size 00468 inT32 trans_count = 0; //no of transitions 00469 inT32 trans_threshold; //noise tolerance 00470 inT32 dot_count; //small objects 00471 inT32 norm_count; //normal objects 00472 inT32 super_norm_count; //real char-like 00473 //words of row 00474 WERD_IT word_it = row->word_list (); 00475 C_BLOB_IT blob_it; //blob iterator 00476 C_OUTLINE_IT out_it; //outline iterator 00477 00478 if (textord_test_y > row->base_line (textord_test_x) 00479 && textord_show_blobs 00480 && textord_test_y < row->base_line (textord_test_x) + row->x_height ()) 00481 testing_on = TRUE; 00482 else 00483 testing_on = FALSE; 00484 dot_count = 0; 00485 norm_count = 0; 00486 super_norm_count = 0; 00487 for (word_it.mark_cycle_pt (); 
!word_it.cycled_list (); word_it.forward ()) { 00488 word = word_it.data (); //current word 00489 //blobs in word 00490 blob_it.set_to_list (word->cblob_list ()); 00491 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); 00492 blob_it.forward ()) { 00493 blob = blob_it.data (); 00494 if (!word->flag (W_DONT_CHOP)) { 00495 //get outlines 00496 out_it.set_to_list (blob->out_list ()); 00497 for (out_it.mark_cycle_pt (); !out_it.cycled_list (); 00498 out_it.forward ()) { 00499 outline = out_it.data (); 00500 blob_box = outline->bounding_box (); 00501 blob_size = 00502 blob_box.width () > 00503 blob_box.height ()? blob_box.width () : blob_box. 00504 height(); 00505 if (blob_size < textord_noise_sizelimit * row->x_height ()) 00506 dot_count++; //count smal outlines 00507 if (!outline->child ()->empty () 00508 && blob_box.height () < 00509 (1 + textord_noise_syfract) * row->x_height () 00510 && blob_box.height () > 00511 (1 - textord_noise_syfract) * row->x_height () 00512 && blob_box.width () < 00513 (1 + textord_noise_sxfract) * row->x_height () 00514 && blob_box.width () > 00515 (1 - textord_noise_sxfract) * row->x_height ()) 00516 super_norm_count++; //count smal outlines 00517 } 00518 } 00519 else 00520 super_norm_count++; 00521 blob_box = blob->bounding_box (); 00522 blob_size = 00523 blob_box.width () > 00524 blob_box.height ()? 
blob_box.width () : blob_box.height (); 00525 if (blob_size >= textord_noise_sizelimit * row->x_height () 00526 && blob_size < row->x_height () * 2) { 00527 trans_threshold = blob_size / textord_noise_sizefraction; 00528 trans_count = blob->count_transitions (trans_threshold); 00529 if (trans_count < textord_noise_translimit) 00530 norm_count++; 00531 } 00532 else if (blob_box.height () > row->x_height () * 2 00533 && (!word_it.at_first () || !blob_it.at_first ())) 00534 dot_count += 2; 00535 #ifndef SECURE_NAMES 00536 if (testing_on) { 00537 tprintf 00538 ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n", 00539 blob_box.left (), blob_box.bottom (), blob_box.right (), 00540 blob_box.top (), blob->out_list ()->length (), trans_count, 00541 blob_box.bottom () - row->base_line (blob_box.left ())); 00542 } 00543 #endif 00544 } 00545 } 00546 #ifndef SECURE_NAMES 00547 if (textord_noise_debug) { 00548 tprintf ("Row ending at (%d,%g):", 00549 blob_box.right (), row->base_line (blob_box.right ())); 00550 tprintf (" R=%g, dc=%d, nc=%d, %s\n", 00551 norm_count > 0 ? (float) dot_count / norm_count : 9999, 00552 dot_count, norm_count, 00553 dot_count > norm_count * textord_noise_normratio 00554 && dot_count > 2 ? "REJECTED" : "ACCEPTED"); 00555 } 00556 #endif 00557 return super_norm_count < textord_noise_sncount 00558 && dot_count > norm_count * textord_noise_rowratio && dot_count > 2; 00559 } 00560 00561 /********************************************************************** 00562 * clean_noise_from_words 00563 * 00564 * Move blobs of words from rows of garbage into the reject blobs list. 
00565 **********************************************************************/ 00566 00567 void Textord::clean_noise_from_words( //remove empties 00568 ROW *row //row to clean 00569 ) { 00570 TBOX blob_box; //bounding box 00571 inT8 *word_dud; //was it chucked 00572 C_BLOB *blob; //current blob 00573 C_OUTLINE *outline; //current outline 00574 WERD *word; //current word 00575 inT32 blob_size; //biggest size 00576 inT32 trans_count; //no of transitions 00577 inT32 trans_threshold; //noise tolerance 00578 inT32 dot_count; //small objects 00579 inT32 norm_count; //normal objects 00580 inT32 dud_words; //number discarded 00581 inT32 ok_words; //number remaining 00582 inT32 word_index; //current word 00583 //words of row 00584 WERD_IT word_it = row->word_list (); 00585 C_BLOB_IT blob_it; //blob iterator 00586 C_OUTLINE_IT out_it; //outline iterator 00587 00588 ok_words = word_it.length (); 00589 if (ok_words == 0 || textord_no_rejects) 00590 return; 00591 word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8)); 00592 dud_words = 0; 00593 ok_words = 0; 00594 word_index = 0; 00595 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00596 word = word_it.data (); //current word 00597 dot_count = 0; 00598 norm_count = 0; 00599 //blobs in word 00600 blob_it.set_to_list (word->cblob_list ()); 00601 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); 00602 blob_it.forward ()) { 00603 blob = blob_it.data (); 00604 if (!word->flag (W_DONT_CHOP)) { 00605 //get outlines 00606 out_it.set_to_list (blob->out_list ()); 00607 for (out_it.mark_cycle_pt (); !out_it.cycled_list (); 00608 out_it.forward ()) { 00609 outline = out_it.data (); 00610 blob_box = outline->bounding_box (); 00611 blob_size = 00612 blob_box.width () > 00613 blob_box.height ()? blob_box.width () : blob_box. 
00614 height(); 00615 if (blob_size < textord_noise_sizelimit * row->x_height ()) 00616 dot_count++; //count smal outlines 00617 if (!outline->child ()->empty () 00618 && blob_box.height () < 00619 (1 + textord_noise_syfract) * row->x_height () 00620 && blob_box.height () > 00621 (1 - textord_noise_syfract) * row->x_height () 00622 && blob_box.width () < 00623 (1 + textord_noise_sxfract) * row->x_height () 00624 && blob_box.width () > 00625 (1 - textord_noise_sxfract) * row->x_height ()) 00626 norm_count++; //count smal outlines 00627 } 00628 } 00629 else 00630 norm_count++; 00631 blob_box = blob->bounding_box (); 00632 blob_size = 00633 blob_box.width () > 00634 blob_box.height ()? blob_box.width () : blob_box.height (); 00635 if (blob_size >= textord_noise_sizelimit * row->x_height () 00636 && blob_size < row->x_height () * 2) { 00637 trans_threshold = blob_size / textord_noise_sizefraction; 00638 trans_count = blob->count_transitions (trans_threshold); 00639 if (trans_count < textord_noise_translimit) 00640 norm_count++; 00641 } 00642 else if (blob_box.height () > row->x_height () * 2 00643 && (!word_it.at_first () || !blob_it.at_first ())) 00644 dot_count += 2; 00645 } 00646 if (dot_count > 2) { 00647 if (dot_count > norm_count * textord_noise_normratio * 2) 00648 word_dud[word_index] = 2; 00649 else if (dot_count > norm_count * textord_noise_normratio) 00650 word_dud[word_index] = 1; 00651 else 00652 word_dud[word_index] = 0; 00653 } 00654 else 00655 word_dud[word_index] = 0; 00656 if (word_dud[word_index] == 2) 00657 dud_words++; 00658 else 00659 ok_words++; 00660 word_index++; 00661 } 00662 00663 word_index = 0; 00664 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00665 if (word_dud[word_index] == 2 00666 || (word_dud[word_index] == 1 && dud_words > ok_words)) { 00667 word = word_it.data (); //current word 00668 //rejected blobs 00669 blob_it.set_to_list (word->rej_cblob_list ()); 00670 //move from blobs 00671 
blob_it.add_list_after (word->cblob_list ()); 00672 } 00673 word_index++; 00674 } 00675 free_mem(word_dud); 00676 } 00677 00678 // Remove outlines that are a tiny fraction in either width or height 00679 // of the word height. 00680 void Textord::clean_small_noise_from_words(ROW *row) { 00681 WERD_IT word_it(row->word_list()); 00682 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { 00683 WERD* word = word_it.data(); 00684 int min_size = static_cast<int>( 00685 textord_noise_hfract * word->bounding_box().height() + 0.5); 00686 C_BLOB_IT blob_it(word->cblob_list()); 00687 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { 00688 C_BLOB* blob = blob_it.data(); 00689 C_OUTLINE_IT out_it(blob->out_list()); 00690 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { 00691 C_OUTLINE* outline = out_it.data(); 00692 outline->RemoveSmallRecursive(min_size, &out_it); 00693 } 00694 if (blob->out_list()->empty()) { 00695 delete blob_it.extract(); 00696 } 00697 } 00698 if (word->cblob_list()->empty()) { 00699 if (!word_it.at_last()) { 00700 // The next word is no longer a fuzzy non space if it was before, 00701 // since the word before is about to be deleted. 00702 WERD* next_word = word_it.data_relative(1); 00703 if (next_word->flag(W_FUZZY_NON)) { 00704 next_word->set_flag(W_FUZZY_NON, false); 00705 } 00706 } 00707 delete word_it.extract(); 00708 } 00709 } 00710 } 00711 } // tesseract 00712 00713 /********************************************************************** 00714 * tweak_row_baseline 00715 * 00716 * Shift baseline to fit the blobs more accurately where they are 00717 * close enough. 
00718 **********************************************************************/ 00719 00720 void tweak_row_baseline(ROW *row, 00721 double blshift_maxshift, 00722 double blshift_xfraction) { 00723 TBOX blob_box; //bounding box 00724 C_BLOB *blob; //current blob 00725 WERD *word; //current word 00726 inT32 blob_count; //no of blobs 00727 inT32 src_index; //source segment 00728 inT32 dest_index; //destination segment 00729 inT32 *xstarts; //spline segments 00730 double *coeffs; //spline coeffs 00731 float ydiff; //baseline error 00732 float x_centre; //centre of blob 00733 //words of row 00734 WERD_IT word_it = row->word_list (); 00735 C_BLOB_IT blob_it; //blob iterator 00736 00737 blob_count = 0; 00738 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00739 word = word_it.data (); //current word 00740 //get total blobs 00741 blob_count += word->cblob_list ()->length (); 00742 } 00743 if (blob_count == 0) 00744 return; 00745 xstarts = 00746 (inT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) * 00747 sizeof (inT32)); 00748 coeffs = 00749 (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 * 00750 sizeof (double)); 00751 00752 src_index = 0; 00753 dest_index = 0; 00754 xstarts[0] = row->baseline.xcoords[0]; 00755 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00756 word = word_it.data (); //current word 00757 //blobs in word 00758 blob_it.set_to_list (word->cblob_list ()); 00759 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); 00760 blob_it.forward ()) { 00761 blob = blob_it.data (); 00762 blob_box = blob->bounding_box (); 00763 x_centre = (blob_box.left () + blob_box.right ()) / 2.0; 00764 ydiff = blob_box.bottom () - row->base_line (x_centre); 00765 if (ydiff < 0) 00766 ydiff = -ydiff / row->x_height (); 00767 else 00768 ydiff = ydiff / row->x_height (); 00769 if (ydiff < blshift_maxshift 00770 && blob_box.height () / row->x_height () > blshift_xfraction) { 00771 if 
(xstarts[dest_index] >= x_centre) 00772 xstarts[dest_index] = blob_box.left (); 00773 coeffs[dest_index * 3] = 0; 00774 coeffs[dest_index * 3 + 1] = 0; 00775 coeffs[dest_index * 3 + 2] = blob_box.bottom (); 00776 //shift it 00777 dest_index++; 00778 xstarts[dest_index] = blob_box.right () + 1; 00779 } 00780 else { 00781 if (xstarts[dest_index] <= x_centre) { 00782 while (row->baseline.xcoords[src_index + 1] <= x_centre 00783 && src_index < row->baseline.segments - 1) { 00784 if (row->baseline.xcoords[src_index + 1] > 00785 xstarts[dest_index]) { 00786 coeffs[dest_index * 3] = 00787 row->baseline.quadratics[src_index].a; 00788 coeffs[dest_index * 3 + 1] = 00789 row->baseline.quadratics[src_index].b; 00790 coeffs[dest_index * 3 + 2] = 00791 row->baseline.quadratics[src_index].c; 00792 dest_index++; 00793 xstarts[dest_index] = 00794 row->baseline.xcoords[src_index + 1]; 00795 } 00796 src_index++; 00797 } 00798 coeffs[dest_index * 3] = 00799 row->baseline.quadratics[src_index].a; 00800 coeffs[dest_index * 3 + 1] = 00801 row->baseline.quadratics[src_index].b; 00802 coeffs[dest_index * 3 + 2] = 00803 row->baseline.quadratics[src_index].c; 00804 dest_index++; 00805 xstarts[dest_index] = row->baseline.xcoords[src_index + 1]; 00806 } 00807 } 00808 } 00809 } 00810 while (src_index < row->baseline.segments 00811 && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index]) 00812 src_index++; 00813 while (src_index < row->baseline.segments) { 00814 coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; 00815 coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; 00816 coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; 00817 dest_index++; 00818 src_index++; 00819 xstarts[dest_index] = row->baseline.xcoords[src_index]; 00820 } 00821 //turn to spline 00822 row->baseline = QSPLINE (dest_index, xstarts, coeffs); 00823 free_mem(xstarts); 00824 free_mem(coeffs); 00825 } 00826 00827 
/********************************************************************** 00828 * blob_y_order 00829 * 00830 * Sort function to sort blobs in y from page top. 00831 **********************************************************************/ 00832 00833 inT32 blob_y_order( //sort function 00834 void *item1, //items to compare 00835 void *item2) { 00836 //converted ptr 00837 BLOBNBOX *blob1 = *(BLOBNBOX **) item1; 00838 //converted ptr 00839 BLOBNBOX *blob2 = *(BLOBNBOX **) item2; 00840 00841 if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ()) 00842 return -1; 00843 else if (blob1->bounding_box ().bottom () < 00844 blob2->bounding_box ().bottom ()) 00845 return 1; 00846 else { 00847 if (blob1->bounding_box ().left () < blob2->bounding_box ().left ()) 00848 return -1; 00849 else if (blob1->bounding_box ().left () > 00850 blob2->bounding_box ().left ()) 00851 return 1; 00852 else 00853 return 0; 00854 } 00855 }