tesseract
3.03
|
00001 /********************************************************************** 00002 * File: wordseg.cpp (Formerly wspace.c) 00003 * Description: Code to segment the blobs into words. 00004 * Author: Ray Smith 00005 * Created: Fri Oct 16 11:32:28 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef __UNIX__ 00021 #include <assert.h> 00022 #endif 00023 #include "stderr.h" 00024 #include "blobbox.h" 00025 #include "statistc.h" 00026 #include "drawtord.h" 00027 #include "makerow.h" 00028 #include "pitsync1.h" 00029 #include "tovars.h" 00030 #include "topitch.h" 00031 #include "cjkpitch.h" 00032 #include "textord.h" 00033 #include "fpchop.h" 00034 #include "wordseg.h" 00035 00036 // Include automatically generated configuration file if running autoconf. 00037 #ifdef HAVE_CONFIG_H 00038 #include "config_auto.h" 00039 #endif 00040 00041 #define EXTERN 00042 00043 EXTERN BOOL_VAR(textord_fp_chopping, TRUE, "Do fixed pitch chopping"); 00044 EXTERN BOOL_VAR(textord_force_make_prop_words, FALSE, 00045 "Force proportional word segmentation on all rows"); 00046 EXTERN BOOL_VAR(textord_chopper_test, FALSE, 00047 "Chopper is being tested."); 00048 00049 #define FIXED_WIDTH_MULTIPLE 5 00050 #define BLOCK_STATS_CLUSTERS 10 00051 00052 00060 void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) { 00061 TO_ROW_IT to_row_it(rows); 00062 ROW_IT row_it(real_rows); 00063 for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); 00064 to_row_it.forward()) { 00065 TO_ROW* row = to_row_it.data(); 00066 // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready 00067 // to create the word. 00068 C_BLOB_LIST cblobs; 00069 C_BLOB_IT cblob_it(&cblobs); 00070 BLOBNBOX_IT box_it(row->blob_list()); 00071 for (;!box_it.empty(); box_it.forward()) { 00072 BLOBNBOX* bblob= box_it.extract(); 00073 if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) { 00074 if (bblob->cblob() != NULL) { 00075 C_OUTLINE_IT cout_it(cblob_it.data()->out_list()); 00076 cout_it.move_to_last(); 00077 cout_it.add_list_after(bblob->cblob()->out_list()); 00078 delete bblob->cblob(); 00079 } 00080 } else { 00081 if (bblob->cblob() != NULL) 00082 cblob_it.add_after_then_move(bblob->cblob()); 00083 } 00084 delete bblob; 00085 } 00086 // Convert the TO_ROW to a ROW. 00087 ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size), 00088 static_cast<inT16>(row->space_size)); 00089 WERD_IT word_it(real_row->word_list()); 00090 WERD* word = new WERD(&cblobs, 0, NULL); 00091 word->set_flag(W_BOL, TRUE); 00092 word->set_flag(W_EOL, TRUE); 00093 word->set_flag(W_DONT_CHOP, one_blob); 00094 word_it.add_after_then_move(word); 00095 row_it.add_after_then_move(real_row); 00096 } 00097 } 00098 00104 void make_words(tesseract::Textord *textord, 00105 ICOORD page_tr, // top right 00106 float gradient, // page skew 00107 BLOCK_LIST *blocks, // block list 00108 TO_BLOCK_LIST *port_blocks) { // output list 00109 TO_BLOCK_IT block_it; // iterator 00110 TO_BLOCK *block; // current block 00111 00112 if (textord->use_cjk_fp_model()) { 00113 compute_fixed_pitch_cjk(page_tr, port_blocks); 00114 } else { 00115 compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f), 00116 !(BOOL8) textord_test_landscape); 00117 } 00118 textord->to_spacing(page_tr, port_blocks); 00119 block_it.set_to_list(port_blocks); 00120 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { 00121 block = block_it.data(); 00122 make_real_words(textord, block, FCOORD(1.0f, 0.0f)); 00123 } 00124 } 00125 00126 00134 void set_row_spaces( //find space sizes 00135 TO_BLOCK *block, //block to do 00136 FCOORD rotation, //for drawing 00137 BOOL8 testing_on //correct orientation 00138 ) { 00139 TO_ROW *row; //current row 00140 TO_ROW_IT row_it = block->get_rows (); 00141 00142 if (row_it.empty ()) 00143 return; //empty block 00144 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00145 row = row_it.data (); 00146 if (row->fixed_pitch == 0) { 00147 row->min_space = 00148 (inT32) ceil (row->pr_space - 00149 (row->pr_space - 00150 row->pr_nonsp) * textord_words_definite_spread); 00151 row->max_nonspace = 00152 (inT32) floor (row->pr_nonsp + 00153 (row->pr_space - 00154 row->pr_nonsp) * textord_words_definite_spread); 00155 if (testing_on && textord_show_initial_words) { 00156 tprintf ("Assigning defaults %d non, %d space to row at %g\n", 00157 row->max_nonspace, row->min_space, row->intercept ()); 00158 } 00159 row->space_threshold = (row->max_nonspace + row->min_space) / 2; 00160 row->space_size = row->pr_space; 00161 row->kern_size = row->pr_nonsp; 00162 } 00163 #ifndef GRAPHICS_DISABLED 00164 if (textord_show_initial_words && testing_on) { 00165 plot_word_decisions (to_win, (inT16) row->fixed_pitch, row); 00166 } 00167 #endif 00168 } 00169 } 00170 00171 00178 inT32 row_words( //compute space size 00179 TO_BLOCK *block, //block it came from 00180 TO_ROW *row, //row to operate on 00181 inT32 maxwidth, //max expected space size 00182 FCOORD rotation, //for drawing 00183 BOOL8 testing_on //for debug 00184 ) { 00185 BOOL8 testing_row; //contains testpt 00186 BOOL8 prev_valid; //if decent size 00187 BOOL8 this_valid; //current blob big enough 00188 inT32 prev_x; //end of prev blob 00189 inT32 min_gap; //min interesting gap 00190 inT32 cluster_count; //no of clusters 00191 inT32 gap_index; //which cluster 00192 inT32 smooth_factor; //for smoothing stats 00193 BLOBNBOX *blob; //current blob 00194 float lower, upper; //clustering parameters 00195 float gaps[3]; //gap clusers 00196 ICOORD testpt; 00197 TBOX blob_box; //bounding box 00198 //iterator 00199 BLOBNBOX_IT blob_it = row->blob_list (); 00200 STATS gap_stats (0, maxwidth); 00201 STATS cluster_stats[4]; //clusters 00202 00203 testpt = ICOORD (textord_test_x, textord_test_y); 00204 smooth_factor = 00205 (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5); 00206 // if (testing_on) 00207 // tprintf("Row smooth factor=%d\n",smooth_factor); 00208 prev_valid = FALSE; 00209 prev_x = -MAX_INT32; 00210 testing_row = FALSE; 00211 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { 00212 blob = blob_it.data (); 00213 blob_box = blob->bounding_box (); 00214 if (blob_box.contains (testpt)) 00215 testing_row = TRUE; 00216 gap_stats.add (blob_box.width (), 1); 00217 } 00218 min_gap = (inT32) floor (gap_stats.ile (textord_words_width_ile)); 00219 gap_stats.clear (); 00220 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { 00221 blob = blob_it.data (); 00222 if (!blob->joined_to_prev ()) { 00223 blob_box = blob->bounding_box (); 00224 // this_valid=blob_box.width()>=min_gap; 00225 this_valid = TRUE; 00226 if (this_valid && prev_valid 00227 && blob_box.left () - prev_x < maxwidth) { 00228 gap_stats.add (blob_box.left () - prev_x, 1); 00229 } 00230 prev_x = blob_box.right (); 00231 prev_valid = this_valid; 00232 } 00233 } 00234 if (gap_stats.get_total () == 0) { 00235 row->min_space = 0; //no evidence 00236 row->max_nonspace = 0; 00237 return 0; 00238 } 00239 gap_stats.smooth (smooth_factor); 00240 lower = row->xheight * textord_words_initial_lower; 00241 upper = row->xheight * textord_words_initial_upper; 00242 cluster_count = gap_stats.cluster (lower, upper, 00243 textord_spacesize_ratioprop, 3, 00244 cluster_stats); 00245 while (cluster_count < 2 && ceil (lower) < floor (upper)) { 00246 //shrink gap 00247 upper = (upper * 3 + lower) / 4; 00248 lower = (lower * 3 + upper) / 4; 00249 cluster_count = gap_stats.cluster (lower, upper, 00250 textord_spacesize_ratioprop, 3, 00251 cluster_stats); 00252 } 00253 if (cluster_count < 2) { 00254 row->min_space = 0; //no evidence 00255 row->max_nonspace = 0; 00256 return 0; 00257 } 00258 for (gap_index = 0; gap_index < cluster_count; gap_index++) 00259 gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5); 00260 //get medians 00261 if (cluster_count > 2) { 00262 if (testing_on && textord_show_initial_words) { 00263 tprintf ("Row at %g has 3 sizes of gap:%g,%g,%g\n", 00264 row->intercept (), 00265 cluster_stats[1].ile (0.5), 00266 cluster_stats[2].ile (0.5), cluster_stats[3].ile (0.5)); 00267 } 00268 lower = gaps[0]; 00269 if (gaps[1] > lower) { 00270 upper = gaps[1]; //prefer most frequent 00271 if (upper < block->xheight * textord_words_min_minspace 00272 && gaps[2] > gaps[1]) { 00273 upper = gaps[2]; 00274 } 00275 } 00276 else if (gaps[2] > lower 00277 && gaps[2] >= block->xheight * textord_words_min_minspace) 00278 upper = gaps[2]; 00279 else if (lower >= block->xheight * textord_words_min_minspace) { 00280 upper = lower; //not nice 00281 lower = gaps[1]; 00282 if (testing_on && textord_show_initial_words) { 00283 tprintf ("Had to switch most common from lower to upper!!\n"); 00284 gap_stats.print(); 00285 } 00286 } 00287 else { 00288 row->min_space = 0; //no evidence 00289 row->max_nonspace = 0; 00290 return 0; 00291 } 00292 } 00293 else { 00294 if (gaps[1] < gaps[0]) { 00295 if (testing_on && textord_show_initial_words) { 00296 tprintf ("Had to switch most common from lower to upper!!\n"); 00297 gap_stats.print(); 00298 } 00299 lower = gaps[1]; 00300 upper = gaps[0]; 00301 } 00302 else { 00303 upper = gaps[1]; 00304 lower = gaps[0]; 00305 } 00306 } 00307 if (upper < block->xheight * textord_words_min_minspace) { 00308 row->min_space = 0; //no evidence 00309 row->max_nonspace = 0; 00310 return 0; 00311 } 00312 if (upper * 3 < block->min_space * 2 + block->max_nonspace 00313 || lower * 3 > block->min_space * 2 + block->max_nonspace) { 00314 if (testing_on && textord_show_initial_words) { 00315 tprintf ("Disagreement between block and row at %g!!\n", 00316 row->intercept ()); 00317 tprintf ("Lower=%g, upper=%g, Stats:\n", lower, upper); 00318 gap_stats.print(); 00319 } 00320 } 00321 row->min_space = 00322 (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread); 00323 row->max_nonspace = 00324 (inT32) floor (lower + (upper - lower) * textord_words_definite_spread); 00325 row->space_threshold = (row->max_nonspace + row->min_space) / 2; 00326 row->space_size = upper; 00327 row->kern_size = lower; 00328 if (testing_on && textord_show_initial_words) { 00329 if (testing_row) { 00330 tprintf ("GAP STATS\n"); 00331 gap_stats.print(); 00332 tprintf ("SPACE stats\n"); 00333 cluster_stats[2].print_summary(); 00334 tprintf ("NONSPACE stats\n"); 00335 cluster_stats[1].print_summary(); 00336 } 00337 tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", 00338 row->intercept (), row->min_space, upper, 00339 row->max_nonspace, lower); 00340 } 00341 return cluster_stats[2].get_total (); 00342 } 00343 00344 00351 inT32 row_words2( //compute space size 00352 TO_BLOCK *block, //block it came from 00353 TO_ROW *row, //row to operate on 00354 inT32 maxwidth, //max expected space size 00355 FCOORD rotation, //for drawing 00356 BOOL8 testing_on //for debug 00357 ) { 00358 BOOL8 testing_row; //contains testpt 00359 BOOL8 prev_valid; //if decent size 00360 BOOL8 this_valid; //current blob big enough 00361 inT32 prev_x; //end of prev blob 00362 inT32 min_width; //min interesting width 00363 inT32 valid_count; //good gaps 00364 inT32 total_count; //total gaps 00365 inT32 cluster_count; //no of clusters 00366 inT32 prev_count; //previous cluster_count 00367 inT32 gap_index; //which cluster 00368 inT32 smooth_factor; //for smoothing stats 00369 BLOBNBOX *blob; //current blob 00370 float lower, upper; //clustering parameters 00371 ICOORD testpt; 00372 TBOX blob_box; //bounding box 00373 //iterator 00374 BLOBNBOX_IT blob_it = row->blob_list (); 00375 STATS gap_stats (0, maxwidth); 00376 //gap sizes 00377 float gaps[BLOCK_STATS_CLUSTERS]; 00378 STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1]; 00379 //clusters 00380 00381 testpt = ICOORD (textord_test_x, textord_test_y); 00382 smooth_factor = 00383 (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5); 00384 // if (testing_on) 00385 // tprintf("Row smooth factor=%d\n",smooth_factor); 00386 prev_valid = FALSE; 00387 prev_x = -MAX_INT16; 00388 testing_row = FALSE; 00389 //min blob size 00390 min_width = (inT32) block->pr_space; 00391 total_count = 0; 00392 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { 00393 blob = blob_it.data (); 00394 if (!blob->joined_to_prev ()) { 00395 blob_box = blob->bounding_box (); 00396 this_valid = blob_box.width () >= min_width; 00397 this_valid = TRUE; 00398 if (this_valid && prev_valid 00399 && blob_box.left () - prev_x < maxwidth) { 00400 gap_stats.add (blob_box.left () - prev_x, 1); 00401 } 00402 total_count++; //count possibles 00403 prev_x = blob_box.right (); 00404 prev_valid = this_valid; 00405 } 00406 } 00407 valid_count = gap_stats.get_total (); 00408 if (valid_count < total_count * textord_words_minlarge) { 00409 gap_stats.clear (); 00410 prev_x = -MAX_INT16; 00411 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); 00412 blob_it.forward ()) { 00413 blob = blob_it.data (); 00414 if (!blob->joined_to_prev ()) { 00415 blob_box = blob->bounding_box (); 00416 if (blob_box.left () - prev_x < maxwidth) { 00417 gap_stats.add (blob_box.left () - prev_x, 1); 00418 } 00419 prev_x = blob_box.right (); 00420 } 00421 } 00422 } 00423 if (gap_stats.get_total () == 0) { 00424 row->min_space = 0; //no evidence 00425 row->max_nonspace = 0; 00426 return 0; 00427 } 00428 00429 cluster_count = 0; 00430 lower = block->xheight * words_initial_lower; 00431 upper = block->xheight * words_initial_upper; 00432 gap_stats.smooth (smooth_factor); 00433 do { 00434 prev_count = cluster_count; 00435 cluster_count = gap_stats.cluster (lower, upper, 00436 textord_spacesize_ratioprop, 00437 BLOCK_STATS_CLUSTERS, cluster_stats); 00438 } 00439 while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS); 00440 if (cluster_count < 1) { 00441 row->min_space = 0; 00442 row->max_nonspace = 0; 00443 return 0; 00444 } 00445 for (gap_index = 0; gap_index < cluster_count; gap_index++) 00446 gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5); 00447 //get medians 00448 if (testing_on) { 00449 tprintf ("cluster_count=%d:", cluster_count); 00450 for (gap_index = 0; gap_index < cluster_count; gap_index++) 00451 tprintf (" %g(%d)", gaps[gap_index], 00452 cluster_stats[gap_index + 1].get_total ()); 00453 tprintf ("\n"); 00454 } 00455 00456 //Try to find proportional non-space and space for row. 00457 for (gap_index = 0; gap_index < cluster_count 00458 && gaps[gap_index] > block->max_nonspace; gap_index++); 00459 if (gap_index < cluster_count) 00460 lower = gaps[gap_index]; //most frequent below 00461 else { 00462 if (testing_on) 00463 tprintf ("No cluster below block threshold!, using default=%g\n", 00464 block->pr_nonsp); 00465 lower = block->pr_nonsp; 00466 } 00467 for (gap_index = 0; gap_index < cluster_count 00468 && gaps[gap_index] <= block->max_nonspace; gap_index++); 00469 if (gap_index < cluster_count) 00470 upper = gaps[gap_index]; //most frequent above 00471 else { 00472 if (testing_on) 00473 tprintf ("No cluster above block threshold!, using default=%g\n", 00474 block->pr_space); 00475 upper = block->pr_space; 00476 } 00477 row->min_space = 00478 (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread); 00479 row->max_nonspace = 00480 (inT32) floor (lower + (upper - lower) * textord_words_definite_spread); 00481 row->space_threshold = (row->max_nonspace + row->min_space) / 2; 00482 row->space_size = upper; 00483 row->kern_size = lower; 00484 if (testing_on) { 00485 if (testing_row) { 00486 tprintf ("GAP STATS\n"); 00487 gap_stats.print(); 00488 tprintf ("SPACE stats\n"); 00489 cluster_stats[2].print_summary(); 00490 tprintf ("NONSPACE stats\n"); 00491 cluster_stats[1].print_summary(); 00492 } 00493 tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", 00494 row->intercept (), row->min_space, upper, 00495 row->max_nonspace, lower); 00496 } 00497 return 1; 00498 } 00499 00500 00507 void make_real_words( 00508 tesseract::Textord *textord, 00509 TO_BLOCK *block, //block to do 00510 FCOORD rotation //for drawing 00511 ) { 00512 TO_ROW *row; //current row 00513 TO_ROW_IT row_it = block->get_rows (); 00514 ROW *real_row = NULL; //output row 00515 ROW_IT real_row_it = block->block->row_list (); 00516 00517 if (row_it.empty ()) 00518 return; //empty block 00519 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00520 row = row_it.data (); 00521 if (row->blob_list ()->empty () && !row->rep_words.empty ()) { 00522 real_row = make_rep_words (row, block); 00523 } else if (!row->blob_list()->empty()) { 00524 // In a fixed pitch document, some lines may be detected as fixed pitch 00525 // while others don't, and will go through different path. 00526 // For non-space delimited language like CJK, fixed pitch chop always 00527 // leave the entire line as one word. We can force consistent chopping 00528 // with force_make_prop_words flag. 00529 POLY_BLOCK* pb = block->block->poly_block(); 00530 if (textord_chopper_test) { 00531 real_row = textord->make_blob_words (row, rotation); 00532 } else if (textord_force_make_prop_words || 00533 (pb != NULL && !pb->IsText()) || 00534 row->pitch_decision == PITCH_DEF_PROP || 00535 row->pitch_decision == PITCH_CORR_PROP) { 00536 real_row = textord->make_prop_words (row, rotation); 00537 } else if (row->pitch_decision == PITCH_DEF_FIXED || 00538 row->pitch_decision == PITCH_CORR_FIXED) { 00539 real_row = fixed_pitch_words (row, rotation); 00540 } else { 00541 ASSERT_HOST(FALSE); 00542 } 00543 } 00544 if (real_row != NULL) { 00545 //put row in block 00546 real_row_it.add_after_then_move (real_row); 00547 } 00548 } 00549 block->block->set_stats (block->fixed_pitch == 0, (inT16) block->kern_size, 00550 (inT16) block->space_size, 00551 (inT16) block->fixed_pitch); 00552 block->block->check_pitch (); 00553 } 00554 00555 00563 ROW *make_rep_words( //make a row 00564 TO_ROW *row, //row to convert 00565 TO_BLOCK *block //block it lives in 00566 ) { 00567 ROW *real_row; //output row 00568 TBOX word_box; //bounding box 00569 //iterator 00570 WERD_IT word_it = &row->rep_words; 00571 00572 if (word_it.empty ()) 00573 return NULL; 00574 word_box = word_it.data ()->bounding_box (); 00575 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) 00576 word_box += word_it.data ()->bounding_box (); 00577 row->xheight = block->xheight; 00578 real_row = new ROW(row, 00579 (inT16) block->kern_size, (inT16) block->space_size); 00580 word_it.set_to_list (real_row->word_list ()); 00581 //put words in row 00582 word_it.add_list_after (&row->rep_words); 00583 real_row->recalc_bounding_box (); 00584 return real_row; 00585 } 00586 00587 00595 WERD *make_real_word(BLOBNBOX_IT *box_it, //iterator 00596 inT32 blobcount, //no of blobs to use 00597 BOOL8 bol, //start of line 00598 uinT8 blanks //no of blanks 00599 ) { 00600 C_OUTLINE_IT cout_it; 00601 C_BLOB_LIST cblobs; 00602 C_BLOB_IT cblob_it = &cblobs; 00603 WERD *word; // new word 00604 BLOBNBOX *bblob; // current blob 00605 inT32 blobindex; // in row 00606 00607 for (blobindex = 0; blobindex < blobcount; blobindex++) { 00608 bblob = box_it->extract(); 00609 if (bblob->joined_to_prev()) { 00610 if (bblob->cblob() != NULL) { 00611 cout_it.set_to_list(cblob_it.data()->out_list()); 00612 cout_it.move_to_last(); 00613 cout_it.add_list_after(bblob->cblob()->out_list()); 00614 delete bblob->cblob(); 00615 } 00616 } 00617 else { 00618 if (bblob->cblob() != NULL) 00619 cblob_it.add_after_then_move(bblob->cblob()); 00620 } 00621 delete bblob; 00622 box_it->forward(); // next one 00623 } 00624 00625 if (blanks < 1) 00626 blanks = 1; 00627 00628 word = new WERD(&cblobs, blanks, NULL); 00629 00630 if (bol) 00631 word->set_flag(W_BOL, TRUE); 00632 if (box_it->at_first()) 00633 word->set_flag(W_EOL, TRUE); // at end of line 00634 00635 return word; 00636 }