tesseract
3.03
|
00001 /********************************************************************** 00002 * File: topitch.cpp (Formerly to_pitch.c) 00003 * Description: Code to determine fixed pitchness and the pitch if fixed. 00004 * Author: Ray Smith 00005 * Created: Tue Aug 24 16:57:29 BST 1993 00006 * 00007 * (C) Copyright 1993, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef __UNIX__ 00021 #include <assert.h> 00022 #endif 00023 #include "stderr.h" 00024 #include "blobbox.h" 00025 #include "statistc.h" 00026 #include "drawtord.h" 00027 #include "makerow.h" 00028 #include "pitsync1.h" 00029 #include "pithsync.h" 00030 #include "tovars.h" 00031 #include "wordseg.h" 00032 #include "topitch.h" 00033 #include "secname.h" 00034 #include "helpers.h" 00035 00036 // Include automatically generated configuration file if running autoconf. 00037 #ifdef HAVE_CONFIG_H 00038 #include "config_auto.h" 00039 #endif 00040 00041 #define EXTERN 00042 00043 EXTERN BOOL_VAR (textord_all_prop, FALSE, "All doc is proportial text"); 00044 EXTERN BOOL_VAR (textord_debug_pitch_test, FALSE, 00045 "Debug on fixed pitch test"); 00046 EXTERN BOOL_VAR (textord_disable_pitch_test, FALSE, 00047 "Turn off dp fixed pitch algorithm"); 00048 EXTERN BOOL_VAR (textord_fast_pitch_test, FALSE, 00049 "Do even faster pitch algorithm"); 00050 EXTERN BOOL_VAR (textord_debug_pitch_metric, FALSE, 00051 "Write full metric stuff"); 00052 EXTERN BOOL_VAR (textord_show_row_cuts, FALSE, "Draw row-level cuts"); 00053 EXTERN BOOL_VAR (textord_show_page_cuts, FALSE, "Draw page-level cuts"); 00054 EXTERN BOOL_VAR (textord_pitch_cheat, FALSE, 00055 "Use correct answer for fixed/prop"); 00056 EXTERN BOOL_VAR (textord_blockndoc_fixed, FALSE, 00057 "Attempt whole doc/block fixed pitch"); 00058 EXTERN double_VAR (textord_projection_scale, 0.200, "Ding rate for mid-cuts"); 00059 EXTERN double_VAR (textord_balance_factor, 1.0, 00060 "Ding rate for unbalanced char cells"); 00061 00062 #define FIXED_WIDTH_MULTIPLE 5 00063 #define BLOCK_STATS_CLUSTERS 10 00064 #define MAX_ALLOWED_PITCH 100 //max pixel pitch. 00065 00066 /********************************************************************** 00067 * compute_fixed_pitch 00068 * 00069 * Decide whether each row is fixed pitch individually. 00070 * Correlate definite and uncertain results to obtain an individual 00071 * result for each row in the TO_ROW class. 00072 **********************************************************************/ 00073 00074 void compute_fixed_pitch(ICOORD page_tr, // top right 00075 TO_BLOCK_LIST *port_blocks, // input list 00076 float gradient, // page skew 00077 FCOORD rotation, // for drawing 00078 BOOL8 testing_on) { // correct orientation 00079 TO_BLOCK_IT block_it; //iterator 00080 TO_BLOCK *block; //current block; 00081 TO_ROW_IT row_it; //row iterator 00082 TO_ROW *row; //current row 00083 int block_index; //block number 00084 int row_index; //row number 00085 00086 #ifndef GRAPHICS_DISABLED 00087 if (textord_show_initial_words && testing_on) { 00088 if (to_win == NULL) 00089 create_to_win(page_tr); 00090 } 00091 #endif 00092 00093 block_it.set_to_list (port_blocks); 00094 block_index = 1; 00095 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00096 block_it.forward ()) { 00097 block = block_it.data (); 00098 compute_block_pitch(block, rotation, block_index, testing_on); 00099 block_index++; 00100 } 00101 00102 if (!try_doc_fixed (page_tr, port_blocks, gradient)) { 00103 block_index = 1; 00104 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00105 block_it.forward ()) { 00106 block = block_it.data (); 00107 if (!try_block_fixed (block, block_index)) 00108 try_rows_fixed(block, block_index, testing_on); 00109 block_index++; 00110 } 00111 } 00112 00113 block_index = 1; 00114 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 00115 block_it.forward()) { 00116 block = block_it.data (); 00117 POLY_BLOCK* pb = block->block->poly_block(); 00118 if (pb != NULL && !pb->IsText()) continue; // Non-text doesn't exist! 00119 row_it.set_to_list (block->get_rows ()); 00120 row_index = 1; 00121 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00122 row = row_it.data (); 00123 fix_row_pitch(row, block, port_blocks, row_index, block_index); 00124 row_index++; 00125 } 00126 block_index++; 00127 } 00128 #ifndef GRAPHICS_DISABLED 00129 if (textord_show_initial_words && testing_on) { 00130 ScrollView::Update(); 00131 } 00132 #endif 00133 } 00134 00135 00136 /********************************************************************** 00137 * fix_row_pitch 00138 * 00139 * Get a pitch_decision for this row by voting among similar rows in the 00140 * block, then similar rows over all the page, or any other rows at all. 00141 **********************************************************************/ 00142 00143 void fix_row_pitch(TO_ROW *bad_row, // row to fix 00144 TO_BLOCK *bad_block, // block of bad_row 00145 TO_BLOCK_LIST *blocks, // blocks to scan 00146 inT32 row_target, // number of row 00147 inT32 block_target) { // number of block 00148 inT16 mid_cuts; 00149 int block_votes; //votes in block 00150 int like_votes; //votes over page 00151 int other_votes; //votes of unlike blocks 00152 int block_index; //number of block 00153 int row_index; //number of row 00154 int maxwidth; //max pitch 00155 TO_BLOCK_IT block_it = blocks; //block iterator 00156 TO_ROW_IT row_it; 00157 TO_BLOCK *block; //current block 00158 TO_ROW *row; //current row 00159 float sp_sd; //space deviation 00160 STATS block_stats; //pitches in block 00161 STATS like_stats; //pitches in page 00162 00163 block_votes = like_votes = other_votes = 0; 00164 maxwidth = (inT32) ceil (bad_row->xheight * textord_words_maxspace); 00165 if (bad_row->pitch_decision != PITCH_DEF_FIXED 00166 && bad_row->pitch_decision != PITCH_DEF_PROP) { 00167 block_stats.set_range (0, maxwidth); 00168 like_stats.set_range (0, maxwidth); 00169 block_index = 1; 00170 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); 00171 block_it.forward()) { 00172 block = block_it.data(); 00173 POLY_BLOCK* pb = block->block->poly_block(); 00174 if (pb != NULL && !pb->IsText()) continue; // Non text doesn't exist! 00175 row_index = 1; 00176 row_it.set_to_list (block->get_rows ()); 00177 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); 00178 row_it.forward ()) { 00179 row = row_it.data (); 00180 if ((bad_row->all_caps 00181 && row->xheight + row->ascrise 00182 < 00183 (bad_row->xheight + bad_row->ascrise) * (1 + 00184 textord_pitch_rowsimilarity) 00185 && row->xheight + row->ascrise > 00186 (bad_row->xheight + bad_row->ascrise) * (1 - 00187 textord_pitch_rowsimilarity)) 00188 || (!bad_row->all_caps 00189 && row->xheight < 00190 bad_row->xheight * (1 + textord_pitch_rowsimilarity) 00191 && row->xheight > 00192 bad_row->xheight * (1 - textord_pitch_rowsimilarity))) { 00193 if (block_index == block_target) { 00194 if (row->pitch_decision == PITCH_DEF_FIXED) { 00195 block_votes += textord_words_veto_power; 00196 block_stats.add ((inT32) row->fixed_pitch, 00197 textord_words_veto_power); 00198 } 00199 else if (row->pitch_decision == PITCH_MAYBE_FIXED 00200 || row->pitch_decision == PITCH_CORR_FIXED) { 00201 block_votes++; 00202 block_stats.add ((inT32) row->fixed_pitch, 1); 00203 } 00204 else if (row->pitch_decision == PITCH_DEF_PROP) 00205 block_votes -= textord_words_veto_power; 00206 else if (row->pitch_decision == PITCH_MAYBE_PROP 00207 || row->pitch_decision == PITCH_CORR_PROP) 00208 block_votes--; 00209 } 00210 else { 00211 if (row->pitch_decision == PITCH_DEF_FIXED) { 00212 like_votes += textord_words_veto_power; 00213 like_stats.add ((inT32) row->fixed_pitch, 00214 textord_words_veto_power); 00215 } 00216 else if (row->pitch_decision == PITCH_MAYBE_FIXED 00217 || row->pitch_decision == PITCH_CORR_FIXED) { 00218 like_votes++; 00219 like_stats.add ((inT32) row->fixed_pitch, 1); 00220 } 00221 else if (row->pitch_decision == PITCH_DEF_PROP) 00222 like_votes -= textord_words_veto_power; 00223 else if (row->pitch_decision == PITCH_MAYBE_PROP 00224 || row->pitch_decision == PITCH_CORR_PROP) 00225 like_votes--; 00226 } 00227 } 00228 else { 00229 if (row->pitch_decision == PITCH_DEF_FIXED) 00230 other_votes += textord_words_veto_power; 00231 else if (row->pitch_decision == PITCH_MAYBE_FIXED 00232 || row->pitch_decision == PITCH_CORR_FIXED) 00233 other_votes++; 00234 else if (row->pitch_decision == PITCH_DEF_PROP) 00235 other_votes -= textord_words_veto_power; 00236 else if (row->pitch_decision == PITCH_MAYBE_PROP 00237 || row->pitch_decision == PITCH_CORR_PROP) 00238 other_votes--; 00239 } 00240 row_index++; 00241 } 00242 block_index++; 00243 } 00244 if (block_votes > textord_words_veto_power) { 00245 bad_row->fixed_pitch = block_stats.ile (0.5); 00246 bad_row->pitch_decision = PITCH_CORR_FIXED; 00247 } 00248 else if (block_votes <= textord_words_veto_power && like_votes > 0) { 00249 bad_row->fixed_pitch = like_stats.ile (0.5); 00250 bad_row->pitch_decision = PITCH_CORR_FIXED; 00251 } 00252 else { 00253 bad_row->pitch_decision = PITCH_CORR_PROP; 00254 #ifndef SECURE_NAMES 00255 if (block_votes == 0 && like_votes == 0 && other_votes > 0 00256 && (textord_debug_pitch_test || textord_debug_pitch_metric)) 00257 tprintf 00258 ("Warning:row %d of block %d set prop with no like rows against trend\n", 00259 row_target, block_target); 00260 #endif 00261 } 00262 } 00263 if (textord_debug_pitch_metric) { 00264 tprintf(":b_votes=%d:l_votes=%d:o_votes=%d", 00265 block_votes, like_votes, other_votes); 00266 tprintf("x=%g:asc=%g\n", bad_row->xheight, bad_row->ascrise); 00267 } 00268 if (bad_row->pitch_decision == PITCH_CORR_FIXED) { 00269 if (bad_row->fixed_pitch < textord_min_xheight) { 00270 if (block_votes > 0) 00271 bad_row->fixed_pitch = block_stats.ile (0.5); 00272 else if (block_votes == 0 && like_votes > 0) 00273 bad_row->fixed_pitch = like_stats.ile (0.5); 00274 else { 00275 tprintf 00276 ("Warning:guessing pitch as xheight on row %d, block %d\n", 00277 row_target, block_target); 00278 bad_row->fixed_pitch = bad_row->xheight; 00279 } 00280 } 00281 if (bad_row->fixed_pitch < textord_min_xheight) 00282 bad_row->fixed_pitch = (float) textord_min_xheight; 00283 bad_row->kern_size = bad_row->fixed_pitch / 4; 00284 bad_row->min_space = (inT32) (bad_row->fixed_pitch * 0.6); 00285 bad_row->max_nonspace = (inT32) (bad_row->fixed_pitch * 0.4); 00286 bad_row->space_threshold = 00287 (bad_row->min_space + bad_row->max_nonspace) / 2; 00288 bad_row->space_size = bad_row->fixed_pitch; 00289 if (bad_row->char_cells.empty ()) 00290 tune_row_pitch (bad_row, &bad_row->projection, 00291 bad_row->projection_left, bad_row->projection_right, 00292 (bad_row->fixed_pitch + 00293 bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch, 00294 sp_sd, mid_cuts, &bad_row->char_cells, FALSE); 00295 } 00296 else if (bad_row->pitch_decision == PITCH_CORR_PROP 00297 || bad_row->pitch_decision == PITCH_DEF_PROP) { 00298 bad_row->fixed_pitch = 0.0f; 00299 bad_row->char_cells.clear (); 00300 } 00301 } 00302 00303 00304 /********************************************************************** 00305 * compute_block_pitch 00306 * 00307 * Decide whether each block is fixed pitch individually. 00308 **********************************************************************/ 00309 00310 void compute_block_pitch(TO_BLOCK *block, // input list 00311 FCOORD rotation, // for drawing 00312 inT32 block_index, // block number 00313 BOOL8 testing_on) { // correct orientation 00314 TBOX block_box; //bounding box 00315 00316 block_box = block->block->bounding_box (); 00317 if (testing_on && textord_debug_pitch_test) { 00318 tprintf ("Block %d at (%d,%d)->(%d,%d)\n", 00319 block_index, 00320 block_box.left (), block_box.bottom (), 00321 block_box.right (), block_box.top ()); 00322 } 00323 block->min_space = (inT32) floor (block->xheight 00324 * textord_words_default_minspace); 00325 block->max_nonspace = (inT32) ceil (block->xheight 00326 * textord_words_default_nonspace); 00327 block->fixed_pitch = 0.0f; 00328 block->space_size = (float) block->min_space; 00329 block->kern_size = (float) block->max_nonspace; 00330 block->pr_nonsp = block->xheight * words_default_prop_nonspace; 00331 block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop; 00332 if (!block->get_rows ()->empty ()) { 00333 ASSERT_HOST (block->xheight > 0); 00334 find_repeated_chars(block, textord_show_initial_words && testing_on); 00335 #ifndef GRAPHICS_DISABLED 00336 if (textord_show_initial_words && testing_on) 00337 //overlap_picture_ops(TRUE); 00338 ScrollView::Update(); 00339 #endif 00340 compute_rows_pitch(block, 00341 block_index, 00342 textord_debug_pitch_test &&testing_on); 00343 } 00344 } 00345 00346 00347 /********************************************************************** 00348 * compute_rows_pitch 00349 * 00350 * Decide whether each row is fixed pitch individually. 00351 **********************************************************************/ 00352 00353 BOOL8 compute_rows_pitch( //find line stats 00354 TO_BLOCK *block, //block to do 00355 inT32 block_index, //block number 00356 BOOL8 testing_on //correct orientation 00357 ) { 00358 inT32 maxwidth; //of spaces 00359 TO_ROW *row; //current row 00360 inT32 row_index; //row number. 00361 float lower, upper; //cluster thresholds 00362 TO_ROW_IT row_it = block->get_rows (); 00363 00364 row_index = 1; 00365 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00366 row = row_it.data (); 00367 ASSERT_HOST (row->xheight > 0); 00368 row->compute_vertical_projection (); 00369 maxwidth = (inT32) ceil (row->xheight * textord_words_maxspace); 00370 if (row_pitch_stats (row, maxwidth, testing_on) 00371 && find_row_pitch (row, maxwidth, 00372 textord_dotmatrix_gap + 1, block, block_index, 00373 row_index, testing_on)) { 00374 if (row->fixed_pitch == 0) { 00375 lower = row->pr_nonsp; 00376 upper = row->pr_space; 00377 row->space_size = upper; 00378 row->kern_size = lower; 00379 } 00380 } 00381 else { 00382 row->fixed_pitch = 0.0f; //insufficient data 00383 row->pitch_decision = PITCH_DUNNO; 00384 } 00385 row_index++; 00386 } 00387 return FALSE; 00388 } 00389 00390 00391 /********************************************************************** 00392 * try_doc_fixed 00393 * 00394 * Attempt to call the entire document fixed pitch. 00395 **********************************************************************/ 00396 00397 BOOL8 try_doc_fixed( //determine pitch 00398 ICOORD page_tr, //top right 00399 TO_BLOCK_LIST *port_blocks, //input list 00400 float gradient //page skew 00401 ) { 00402 inT16 master_x; //uniform shifts 00403 inT16 pitch; //median pitch. 00404 int x; //profile coord 00405 int prop_blocks; //correct counts 00406 int fixed_blocks; 00407 int total_row_count; //total in page 00408 //iterator 00409 TO_BLOCK_IT block_it = port_blocks; 00410 TO_BLOCK *block; //current block; 00411 TO_ROW_IT row_it; //row iterator 00412 TO_ROW *row; //current row 00413 inT16 projection_left; //edges 00414 inT16 projection_right; 00415 inT16 row_left; //edges of row 00416 inT16 row_right; 00417 ICOORDELT_LIST *master_cells; //cells for page 00418 float master_y; //uniform shifts 00419 float shift_factor; //page skew correction 00420 float row_shift; //shift for row 00421 float final_pitch; //output pitch 00422 float row_y; //baseline 00423 STATS projection; //entire page 00424 STATS pitches (0, MAX_ALLOWED_PITCH); 00425 //for median 00426 float sp_sd; //space sd 00427 inT16 mid_cuts; //no of cheap cuts 00428 float pitch_sd; //sync rating 00429 00430 if (block_it.empty () 00431 // || block_it.data()==block_it.data_relative(1) 00432 || !textord_blockndoc_fixed) 00433 return FALSE; 00434 shift_factor = gradient / (gradient * gradient + 1); 00435 row_it.set_to_list (block_it.data ()->get_rows ()); 00436 master_x = row_it.data ()->projection_left; 00437 master_y = row_it.data ()->baseline.y (master_x); 00438 projection_left = MAX_INT16; 00439 projection_right = -MAX_INT16; 00440 prop_blocks = 0; 00441 fixed_blocks = 0; 00442 total_row_count = 0; 00443 00444 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00445 block_it.forward ()) { 00446 block = block_it.data (); 00447 row_it.set_to_list (block->get_rows ()); 00448 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00449 row = row_it.data (); 00450 total_row_count++; 00451 if (row->fixed_pitch > 0) 00452 pitches.add ((inT32) (row->fixed_pitch), 1); 00453 //find median 00454 row_y = row->baseline.y (master_x); 00455 row_left = 00456 (inT16) (row->projection_left - 00457 shift_factor * (master_y - row_y)); 00458 row_right = 00459 (inT16) (row->projection_right - 00460 shift_factor * (master_y - row_y)); 00461 if (row_left < projection_left) 00462 projection_left = row_left; 00463 if (row_right > projection_right) 00464 projection_right = row_right; 00465 } 00466 } 00467 if (pitches.get_total () == 0) 00468 return FALSE; 00469 projection.set_range (projection_left, projection_right); 00470 00471 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00472 block_it.forward ()) { 00473 block = block_it.data (); 00474 row_it.set_to_list (block->get_rows ()); 00475 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00476 row = row_it.data (); 00477 row_y = row->baseline.y (master_x); 00478 row_left = 00479 (inT16) (row->projection_left - 00480 shift_factor * (master_y - row_y)); 00481 for (x = row->projection_left; x < row->projection_right; 00482 x++, row_left++) { 00483 projection.add (row_left, row->projection.pile_count (x)); 00484 } 00485 } 00486 } 00487 00488 row_it.set_to_list (block_it.data ()->get_rows ()); 00489 row = row_it.data (); 00490 #ifndef GRAPHICS_DISABLED 00491 if (textord_show_page_cuts && to_win != NULL) 00492 projection.plot (to_win, projection_left, 00493 row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); 00494 #endif 00495 final_pitch = pitches.ile (0.5); 00496 pitch = (inT16) final_pitch; 00497 pitch_sd = 00498 tune_row_pitch (row, &projection, projection_left, projection_right, 00499 pitch * 0.75, final_pitch, sp_sd, mid_cuts, 00500 &row->char_cells, FALSE); 00501 00502 if (textord_debug_pitch_metric) 00503 tprintf 00504 ("try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n", 00505 prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd, 00506 pitch_sd / total_row_count, pitch_sd / pitch, 00507 pitch_sd / total_row_count / pitch); 00508 00509 #ifndef GRAPHICS_DISABLED 00510 if (textord_show_page_cuts && to_win != NULL) { 00511 master_cells = &row->char_cells; 00512 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00513 block_it.forward ()) { 00514 block = block_it.data (); 00515 row_it.set_to_list (block->get_rows ()); 00516 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); 00517 row_it.forward ()) { 00518 row = row_it.data (); 00519 row_y = row->baseline.y (master_x); 00520 row_shift = shift_factor * (master_y - row_y); 00521 plot_row_cells(to_win, ScrollView::GOLDENROD, row, row_shift, master_cells); 00522 } 00523 } 00524 } 00525 #endif 00526 row->char_cells.clear (); 00527 return FALSE; 00528 } 00529 00530 00531 /********************************************************************** 00532 * try_block_fixed 00533 * 00534 * Try to call the entire block fixed. 00535 **********************************************************************/ 00536 00537 BOOL8 try_block_fixed( //find line stats 00538 TO_BLOCK *block, //block to do 00539 inT32 block_index //block number 00540 ) { 00541 return FALSE; 00542 } 00543 00544 00545 /********************************************************************** 00546 * try_rows_fixed 00547 * 00548 * Decide whether each row is fixed pitch individually. 00549 **********************************************************************/ 00550 00551 BOOL8 try_rows_fixed( //find line stats 00552 TO_BLOCK *block, //block to do 00553 inT32 block_index, //block number 00554 BOOL8 testing_on //correct orientation 00555 ) { 00556 TO_ROW *row; //current row 00557 inT32 row_index; //row number. 00558 inT32 def_fixed = 0; //counters 00559 inT32 def_prop = 0; 00560 inT32 maybe_fixed = 0; 00561 inT32 maybe_prop = 0; 00562 inT32 dunno = 0; 00563 inT32 corr_fixed = 0; 00564 inT32 corr_prop = 0; 00565 float lower, upper; //cluster thresholds 00566 TO_ROW_IT row_it = block->get_rows (); 00567 00568 row_index = 1; 00569 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00570 row = row_it.data (); 00571 ASSERT_HOST (row->xheight > 0); 00572 if (row->fixed_pitch > 0 && 00573 fixed_pitch_row(row, block->block, block_index)) { 00574 if (row->fixed_pitch == 0) { 00575 lower = row->pr_nonsp; 00576 upper = row->pr_space; 00577 row->space_size = upper; 00578 row->kern_size = lower; 00579 } 00580 } 00581 row_index++; 00582 } 00583 count_block_votes(block, 00584 def_fixed, 00585 def_prop, 00586 maybe_fixed, 00587 maybe_prop, 00588 corr_fixed, 00589 corr_prop, 00590 dunno); 00591 if (testing_on 00592 && (textord_debug_pitch_test 00593 || textord_blocksall_prop || textord_blocksall_fixed)) { 00594 tprintf ("Initially:"); 00595 print_block_counts(block, block_index); 00596 } 00597 if (def_fixed > def_prop * textord_words_veto_power) 00598 block->pitch_decision = PITCH_DEF_FIXED; 00599 else if (def_prop > def_fixed * textord_words_veto_power) 00600 block->pitch_decision = PITCH_DEF_PROP; 00601 else if (def_fixed > 0 || def_prop > 0) 00602 block->pitch_decision = PITCH_DUNNO; 00603 else if (maybe_fixed > maybe_prop * textord_words_veto_power) 00604 block->pitch_decision = PITCH_MAYBE_FIXED; 00605 else if (maybe_prop > maybe_fixed * textord_words_veto_power) 00606 block->pitch_decision = PITCH_MAYBE_PROP; 00607 else 00608 block->pitch_decision = PITCH_DUNNO; 00609 return FALSE; 00610 } 00611 00612 00613 /********************************************************************** 00614 * print_block_counts 00615 * 00616 * Count up how many rows have what decision and print the results. 00617 **********************************************************************/ 00618 00619 void print_block_counts( //find line stats 00620 TO_BLOCK *block, //block to do 00621 inT32 block_index //block number 00622 ) { 00623 inT32 def_fixed = 0; //counters 00624 inT32 def_prop = 0; 00625 inT32 maybe_fixed = 0; 00626 inT32 maybe_prop = 0; 00627 inT32 dunno = 0; 00628 inT32 corr_fixed = 0; 00629 inT32 corr_prop = 0; 00630 00631 count_block_votes(block, 00632 def_fixed, 00633 def_prop, 00634 maybe_fixed, 00635 maybe_prop, 00636 corr_fixed, 00637 corr_prop, 00638 dunno); 00639 tprintf ("Block %d has (%d,%d,%d)", 00640 block_index, def_fixed, maybe_fixed, corr_fixed); 00641 if (textord_blocksall_prop && (def_fixed || maybe_fixed || corr_fixed)) 00642 tprintf (" (Wrongly)"); 00643 tprintf (" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop); 00644 if (textord_blocksall_fixed && (def_prop || maybe_prop || corr_prop)) 00645 tprintf (" (Wrongly)"); 00646 tprintf (" prop, %d dunno\n", dunno); 00647 } 00648 00649 00650 /********************************************************************** 00651 * count_block_votes 00652 * 00653 * Count the number of rows in the block with each kind of pitch_decision. 00654 **********************************************************************/ 00655 00656 void count_block_votes( //find line stats 00657 TO_BLOCK *block, //block to do 00658 inT32 &def_fixed, //add to counts 00659 inT32 &def_prop, 00660 inT32 &maybe_fixed, 00661 inT32 &maybe_prop, 00662 inT32 &corr_fixed, 00663 inT32 &corr_prop, 00664 inT32 &dunno) { 00665 TO_ROW *row; //current row 00666 TO_ROW_IT row_it = block->get_rows (); 00667 00668 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00669 row = row_it.data (); 00670 switch (row->pitch_decision) { 00671 case PITCH_DUNNO: 00672 dunno++; 00673 break; 00674 case PITCH_DEF_PROP: 00675 def_prop++; 00676 break; 00677 case PITCH_MAYBE_PROP: 00678 maybe_prop++; 00679 break; 00680 case PITCH_DEF_FIXED: 00681 def_fixed++; 00682 break; 00683 case PITCH_MAYBE_FIXED: 00684 maybe_fixed++; 00685 break; 00686 case PITCH_CORR_PROP: 00687 corr_prop++; 00688 break; 00689 case PITCH_CORR_FIXED: 00690 corr_fixed++; 00691 break; 00692 } 00693 } 00694 } 00695 00696 00697 /********************************************************************** 00698 * row_pitch_stats 00699 * 00700 * Decide whether each row is fixed pitch individually. 00701 **********************************************************************/ 00702 00703 BOOL8 row_pitch_stats( //find line stats 00704 TO_ROW *row, //current row 00705 inT32 maxwidth, //of spaces 00706 BOOL8 testing_on //correct orientation 00707 ) { 00708 BLOBNBOX *blob; //current blob 00709 int gap_index; //current gap 00710 inT32 prev_x; //end of prev blob 00711 inT32 cluster_count; //no of clusters 00712 inT32 prev_count; //of clusters 00713 inT32 smooth_factor; //for smoothing stats 00714 TBOX blob_box; //bounding box 00715 float lower, upper; //cluster thresholds 00716 //gap sizes 00717 float gaps[BLOCK_STATS_CLUSTERS]; 00718 //blobs 00719 BLOBNBOX_IT blob_it = row->blob_list (); 00720 STATS gap_stats (0, maxwidth); 00721 STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1]; 00722 //clusters 00723 00724 smooth_factor = 00725 (inT32) (row->xheight * textord_wordstats_smooth_factor + 1.5); 00726 if (!blob_it.empty ()) { 00727 prev_x = blob_it.data ()->bounding_box ().right (); 00728 blob_it.forward (); 00729 while (!blob_it.at_first ()) { 00730 blob = blob_it.data (); 00731 if (!blob->joined_to_prev ()) { 00732 blob_box = blob->bounding_box (); 00733 if (blob_box.left () - prev_x < maxwidth) 00734 gap_stats.add (blob_box.left () - prev_x, 1); 00735 prev_x = blob_box.right (); 00736 } 00737 blob_it.forward (); 00738 } 00739 } 00740 if (gap_stats.get_total () == 0) { 00741 return FALSE; 00742 } 00743 cluster_count = 0; 00744 lower = row->xheight * words_initial_lower; 00745 upper = row->xheight * words_initial_upper; 00746 gap_stats.smooth (smooth_factor); 00747 do { 00748 prev_count = cluster_count; 00749 cluster_count = gap_stats.cluster (lower, upper, 00750 textord_spacesize_ratioprop, 00751 BLOCK_STATS_CLUSTERS, cluster_stats); 00752 } 00753 while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS); 00754 if (cluster_count < 1) { 00755 return FALSE; 00756 } 00757 for (gap_index = 0; gap_index < cluster_count; gap_index++) 00758 gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5); 00759 //get medians 00760 if (testing_on) { 00761 tprintf ("cluster_count=%d:", cluster_count); 00762 for (gap_index = 0; gap_index < cluster_count; gap_index++) 00763 tprintf (" %g(%d)", gaps[gap_index], 00764 cluster_stats[gap_index + 1].get_total ()); 00765 tprintf ("\n"); 00766 } 00767 qsort (gaps, cluster_count, sizeof (float), sort_floats); 00768 00769 //Try to find proportional non-space and space for row. 00770 lower = row->xheight * words_default_prop_nonspace; 00771 upper = row->xheight * textord_words_min_minspace; 00772 for (gap_index = 0; gap_index < cluster_count 00773 && gaps[gap_index] < lower; gap_index++); 00774 if (gap_index == 0) { 00775 if (testing_on) 00776 tprintf ("No clusters below nonspace threshold!!\n"); 00777 if (cluster_count > 1) { 00778 row->pr_nonsp = gaps[0]; 00779 row->pr_space = gaps[1]; 00780 } 00781 else { 00782 row->pr_nonsp = lower; 00783 row->pr_space = gaps[0]; 00784 } 00785 } 00786 else { 00787 row->pr_nonsp = gaps[gap_index - 1]; 00788 while (gap_index < cluster_count && gaps[gap_index] < upper) 00789 gap_index++; 00790 if (gap_index == cluster_count) { 00791 if (testing_on) 00792 tprintf ("No clusters above nonspace threshold!!\n"); 00793 row->pr_space = lower * textord_spacesize_ratioprop; 00794 } 00795 else 00796 row->pr_space = gaps[gap_index]; 00797 } 00798 00799 //Now try to find the fixed pitch space and non-space. 00800 upper = row->xheight * words_default_fixed_space; 00801 for (gap_index = 0; gap_index < cluster_count 00802 && gaps[gap_index] < upper; gap_index++); 00803 if (gap_index == 0) { 00804 if (testing_on) 00805 tprintf ("No clusters below space threshold!!\n"); 00806 row->fp_nonsp = upper; 00807 row->fp_space = gaps[0]; 00808 } 00809 else { 00810 row->fp_nonsp = gaps[gap_index - 1]; 00811 if (gap_index == cluster_count) { 00812 if (testing_on) 00813 tprintf ("No clusters above space threshold!!\n"); 00814 row->fp_space = row->xheight; 00815 } 00816 else 00817 row->fp_space = gaps[gap_index]; 00818 } 00819 if (testing_on) { 00820 tprintf 00821 ("Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, fp_space=%g\n", 00822 row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space); 00823 } 00824 return TRUE; //computed some stats 00825 } 00826 00827 00828 /********************************************************************** 00829 * find_row_pitch 00830 * 00831 * Check to see if this row could be fixed pitch using the given spacings. 00832 * Blobs with gaps smaller than the lower threshold are assumed to be one. 00833 * The larger threshold is the word gap threshold. 00834 **********************************************************************/ 00835 00836 BOOL8 find_row_pitch( //find lines 00837 TO_ROW *row, //row to do 00838 inT32 maxwidth, //max permitted space 00839 inT32 dm_gap, //ignorable gaps 00840 TO_BLOCK *block, //block of row 00841 inT32 block_index, //block_number 00842 inT32 row_index, //number of row 00843 BOOL8 testing_on //correct orientation 00844 ) { 00845 BOOL8 used_dm_model; //looks lik dot matrix 00846 float min_space; //estimate threshold 00847 float non_space; //gap size 00848 float gap_iqr; //interquartile range 00849 float pitch_iqr; 00850 float dm_gap_iqr; //interquartile range 00851 float dm_pitch_iqr; 00852 float dm_pitch; //pitch with dm on 00853 float pitch; //revised estimate 00854 float initial_pitch; //guess at pitch 00855 STATS gap_stats (0, maxwidth); 00856 //centre-centre 00857 STATS pitch_stats (0, maxwidth); 00858 00859 row->fixed_pitch = 0.0f; 00860 initial_pitch = row->fp_space; 00861 if (initial_pitch > row->xheight * (1 + words_default_fixed_limit)) 00862 initial_pitch = row->xheight;//keep pitch decent 00863 non_space = row->fp_nonsp; 00864 if (non_space > initial_pitch) 00865 non_space = initial_pitch; 00866 min_space = (initial_pitch + non_space) / 2; 00867 00868 if (!count_pitch_stats (row, &gap_stats, &pitch_stats, 00869 initial_pitch, min_space, TRUE, FALSE, dm_gap)) { 00870 dm_gap_iqr = 0.0001; 00871 dm_pitch_iqr = maxwidth * 2.0f; 00872 dm_pitch = initial_pitch; 00873 } 00874 else { 00875 dm_gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25); 00876 dm_pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25); 00877 dm_pitch = pitch_stats.ile (0.5); 00878 } 00879 gap_stats.clear (); 00880 pitch_stats.clear (); 00881 if (!count_pitch_stats (row, &gap_stats, &pitch_stats, 00882 initial_pitch, min_space, TRUE, FALSE, 0)) { 00883 gap_iqr = 0.0001; 00884 pitch_iqr = maxwidth * 3.0f; 00885 } 00886 else { 00887 gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25); 00888 pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25); 00889 if (testing_on) 00890 tprintf 00891 ("First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n", 00892 initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5)); 00893 initial_pitch = pitch_stats.ile (0.5); 00894 if (min_space > initial_pitch 00895 && count_pitch_stats (row, &gap_stats, &pitch_stats, 00896 initial_pitch, initial_pitch, TRUE, FALSE, 0)) { 00897 min_space = initial_pitch; 00898 gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25); 00899 pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25); 00900 if (testing_on) 00901 tprintf 00902 ("Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n", 00903 initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5)); 00904 initial_pitch = pitch_stats.ile (0.5); 00905 } 00906 } 00907 if (textord_debug_pitch_metric) 00908 tprintf("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:", 00909 block_index, row_index, 'X', 00910 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr, 00911 pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth ? 'D' : 00912 (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr ? 'S' : 'M')); 00913 if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) { 00914 row->pitch_decision = PITCH_DUNNO; 00915 if (textord_debug_pitch_metric) 00916 tprintf ("\n"); 00917 return FALSE; //insufficient data 00918 } 00919 if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) { 00920 if (testing_on) 00921 tprintf 00922 ("Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n", 00923 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr); 00924 gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25); 00925 pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25); 00926 pitch = pitch_stats.ile (0.5); 00927 used_dm_model = FALSE; 00928 } 00929 else { 00930 if (testing_on) 00931 tprintf 00932 ("Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n", 00933 pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr); 00934 gap_iqr = dm_gap_iqr; 00935 pitch_iqr = dm_pitch_iqr; 00936 pitch = dm_pitch; 00937 used_dm_model = TRUE; 00938 } 00939 if (textord_debug_pitch_metric) { 00940 tprintf ("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:", 00941 pitch_iqr, gap_iqr, pitch); 00942 tprintf ("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:", 00943 pitch_iqr / gap_iqr, pitch_iqr / block->xheight, 00944 pitch_iqr < gap_iqr * textord_fpiqr_ratio 00945 && pitch_iqr < block->xheight * textord_max_pitch_iqr 00946 && pitch < block->xheight * textord_words_default_maxspace 00947 ? 'F' : 'P'); 00948 } 00949 if (pitch_iqr < gap_iqr * textord_fpiqr_ratio 00950 && pitch_iqr < block->xheight * textord_max_pitch_iqr 00951 && pitch < block->xheight * textord_words_default_maxspace) 00952 row->pitch_decision = PITCH_MAYBE_FIXED; 00953 else 00954 row->pitch_decision = PITCH_MAYBE_PROP; 00955 row->fixed_pitch = pitch; 00956 row->kern_size = gap_stats.ile (0.5); 00957 row->min_space = (inT32) (row->fixed_pitch + non_space) / 2; 00958 if (row->min_space > row->fixed_pitch) 00959 row->min_space = (inT32) row->fixed_pitch; 00960 row->max_nonspace = row->min_space; 00961 row->space_size = row->fixed_pitch; 00962 row->space_threshold = (row->max_nonspace + row->min_space) / 2; 00963 row->used_dm_model = used_dm_model; 00964 return TRUE; 00965 } 00966 00967 00968 /********************************************************************** 00969 * fixed_pitch_row 00970 * 00971 * Check to see if this row could be fixed pitch using the given spacings. 00972 * Blobs with gaps smaller than the lower threshold are assumed to be one. 00973 * The larger threshold is the word gap threshold. 00974 **********************************************************************/ 00975 00976 BOOL8 fixed_pitch_row(TO_ROW *row, // row to do 00977 BLOCK* block, 00978 inT32 block_index // block_number 00979 ) { 00980 const char *res_string; // pitch result 00981 inT16 mid_cuts; // no of cheap cuts 00982 float non_space; // gap size 00983 float pitch_sd; // error on pitch 00984 float sp_sd = 0.0f; // space sd 00985 00986 non_space = row->fp_nonsp; 00987 if (non_space > row->fixed_pitch) 00988 non_space = row->fixed_pitch; 00989 POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL; 00990 if (textord_all_prop || (pb != NULL && !pb->IsText())) { 00991 // Set the decision to definitely proportional. 00992 pitch_sd = textord_words_def_prop * row->fixed_pitch; 00993 row->pitch_decision = PITCH_DEF_PROP; 00994 } else { 00995 pitch_sd = tune_row_pitch (row, &row->projection, row->projection_left, 00996 row->projection_right, 00997 (row->fixed_pitch + non_space * 3) / 4, 00998 row->fixed_pitch, sp_sd, mid_cuts, 00999 &row->char_cells, 01000 block_index == textord_debug_block); 01001 if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch 01002 && ((pitsync_linear_version & 3) < 3 01003 || ((pitsync_linear_version & 3) >= 3 && (row->used_dm_model 01004 || sp_sd > 20 01005 || (pitch_sd == 0 && sp_sd > 10))))) { 01006 if (pitch_sd < textord_words_def_fixed * row->fixed_pitch 01007 && !row->all_caps 01008 && ((pitsync_linear_version & 3) < 3 || sp_sd > 20)) 01009 row->pitch_decision = PITCH_DEF_FIXED; 01010 else 01011 row->pitch_decision = PITCH_MAYBE_FIXED; 01012 } 01013 else if ((pitsync_linear_version & 3) < 3 01014 || sp_sd > 20 01015 || mid_cuts > 0 01016 || pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) { 01017 if (pitch_sd < textord_words_def_prop * row->fixed_pitch) 01018 row->pitch_decision = PITCH_MAYBE_PROP; 01019 else 01020 row->pitch_decision = PITCH_DEF_PROP; 01021 } 01022 else 01023 row->pitch_decision = PITCH_DUNNO; 01024 } 01025 01026 if (textord_debug_pitch_metric) { 01027 res_string = "??"; 01028 switch (row->pitch_decision) { 01029 case PITCH_DEF_PROP: 01030 res_string = "DP"; 01031 break; 01032 case PITCH_MAYBE_PROP: 01033 res_string = "MP"; 01034 break; 01035 case PITCH_DEF_FIXED: 01036 res_string = "DF"; 01037 break; 01038 case PITCH_MAYBE_FIXED: 01039 res_string = "MF"; 01040 default: 01041 res_string = "??"; 01042 } 01043 tprintf (":sd/p=%g:occ=%g:init_res=%s\n", 01044 pitch_sd / row->fixed_pitch, sp_sd, res_string); 01045 } 01046 return TRUE; 01047 } 01048 01049 01050 /********************************************************************** 01051 * count_pitch_stats 01052 * 01053 * Count up the gap and pitch stats on the block to see if it is fixed pitch. 01054 * Blobs with gaps smaller than the lower threshold are assumed to be one. 01055 * The larger threshold is the word gap threshold. 01056 * The return value indicates whether there were any decent values to use. 01057 **********************************************************************/ 01058 01059 BOOL8 count_pitch_stats( //find lines 01060 TO_ROW *row, //row to do 01061 STATS *gap_stats, //blob gaps 01062 STATS *pitch_stats, //centre-centre stats 01063 float initial_pitch, //guess at pitch 01064 float min_space, //estimate space size 01065 BOOL8 ignore_outsize, //discard big objects 01066 BOOL8 split_outsize, //split big objects 01067 inT32 dm_gap //ignorable gaps 01068 ) { 01069 BOOL8 prev_valid; //not word broken 01070 BLOBNBOX *blob; //current blob 01071 //blobs 01072 BLOBNBOX_IT blob_it = row->blob_list (); 01073 inT32 prev_right; //end of prev blob 01074 inT32 prev_centre; //centre of previous blob 01075 inT32 x_centre; //centre of this blob 01076 inT32 blob_width; //width of blob 01077 inT32 width_units; //no of widths in blob 01078 float width; //blob width 01079 TBOX blob_box; //bounding box 01080 TBOX joined_box; //of super blob 01081 01082 gap_stats->clear (); 01083 pitch_stats->clear (); 01084 if (blob_it.empty ()) 01085 return FALSE; 01086 prev_valid = FALSE; 01087 prev_centre = 0; 01088 prev_right = 0; //stop complier warning 01089 joined_box = blob_it.data ()->bounding_box (); 01090 do { 01091 blob_it.forward (); 01092 blob = blob_it.data (); 01093 if (!blob->joined_to_prev ()) { 01094 blob_box = blob->bounding_box (); 01095 if ((blob_box.left () - joined_box.right () < dm_gap 01096 && !blob_it.at_first ()) 01097 || blob->cblob() == NULL) 01098 joined_box += blob_box; //merge blobs 01099 else { 01100 blob_width = joined_box.width (); 01101 if (split_outsize) { 01102 width_units = 01103 (inT32) floor ((float) blob_width / initial_pitch + 0.5); 01104 if (width_units < 1) 01105 width_units = 1; 01106 width_units--; 01107 } 01108 else if (ignore_outsize) { 01109 width = (float) blob_width / initial_pitch; 01110 width_units = width < 1 + words_default_fixed_limit 01111 && width > 1 - words_default_fixed_limit ? 0 : -1; 01112 } 01113 else 01114 width_units = 0; //everything in 01115 x_centre = (inT32) (joined_box.left () 01116 + (blob_width - 01117 width_units * initial_pitch) / 2); 01118 if (prev_valid && width_units >= 0) { 01119 // if (width_units>0) 01120 // { 01121 // tprintf("wu=%d, width=%d, xc=%d, adding %d\n", 01122 // width_units,blob_width,x_centre,x_centre-prev_centre); 01123 // } 01124 gap_stats->add (joined_box.left () - prev_right, 1); 01125 pitch_stats->add (x_centre - prev_centre, 1); 01126 } 01127 prev_centre = (inT32) (x_centre + width_units * initial_pitch); 01128 prev_right = joined_box.right (); 01129 prev_valid = blob_box.left () - joined_box.right () < min_space; 01130 prev_valid = prev_valid && width_units >= 0; 01131 joined_box = blob_box; 01132 } 01133 } 01134 } 01135 while (!blob_it.at_first ()); 01136 return gap_stats->get_total () >= 3; 01137 } 01138 01139 01140 /********************************************************************** 01141 * tune_row_pitch 01142 * 01143 * Use a dp algorithm to fit the character cells and return the sd of 01144 * the cell size over the row. 01145 **********************************************************************/ 01146 01147 float tune_row_pitch( //find fp cells 01148 TO_ROW *row, //row to do 01149 STATS *projection, //vertical projection 01150 inT16 projection_left, //edge of projection 01151 inT16 projection_right, //edge of projection 01152 float space_size, //size of blank 01153 float &initial_pitch, //guess at pitch 01154 float &best_sp_sd, //space sd 01155 inT16 &best_mid_cuts, //no of cheap cuts 01156 ICOORDELT_LIST *best_cells, //row cells 01157 BOOL8 testing_on //inidividual words 01158 ) { 01159 int pitch_delta; //offset pitch 01160 inT16 mid_cuts; //cheap cuts 01161 float pitch_sd; //current sd 01162 float best_sd; //best result 01163 float best_pitch; //pitch for best result 01164 float initial_sd; //starting error 01165 float sp_sd; //space sd 01166 ICOORDELT_LIST test_cells; //row cells 01167 ICOORDELT_IT best_it; //start of best list 01168 01169 if (textord_fast_pitch_test) 01170 return tune_row_pitch2 (row, projection, projection_left, 01171 projection_right, space_size, initial_pitch, 01172 best_sp_sd, 01173 //space sd 01174 best_mid_cuts, best_cells, testing_on); 01175 if (textord_disable_pitch_test) { 01176 best_sp_sd = initial_pitch; 01177 return initial_pitch; 01178 } 01179 initial_sd = 01180 compute_pitch_sd(row, 01181 projection, 01182 projection_left, 01183 projection_right, 01184 space_size, 01185 initial_pitch, 01186 best_sp_sd, 01187 best_mid_cuts, 01188 best_cells, 01189 testing_on); 01190 best_sd = initial_sd; 01191 best_pitch = initial_pitch; 01192 if (testing_on) 01193 tprintf ("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd); 01194 for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) { 01195 pitch_sd = 01196 compute_pitch_sd (row, projection, projection_left, projection_right, 01197 space_size, initial_pitch + pitch_delta, sp_sd, 01198 mid_cuts, &test_cells, testing_on); 01199 if (testing_on) 01200 tprintf ("testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta, 01201 pitch_sd); 01202 if (pitch_sd < best_sd) { 01203 best_sd = pitch_sd; 01204 best_mid_cuts = mid_cuts; 01205 best_sp_sd = sp_sd; 01206 best_pitch = initial_pitch + pitch_delta; 01207 best_cells->clear (); 01208 best_it.set_to_list (best_cells); 01209 best_it.add_list_after (&test_cells); 01210 } 01211 else 01212 test_cells.clear (); 01213 if (pitch_sd > initial_sd) 01214 break; //getting worse 01215 } 01216 for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) { 01217 pitch_sd = 01218 compute_pitch_sd (row, projection, projection_left, projection_right, 01219 space_size, initial_pitch - pitch_delta, sp_sd, 01220 mid_cuts, &test_cells, testing_on); 01221 if (testing_on) 01222 tprintf ("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta, 01223 pitch_sd); 01224 if (pitch_sd < best_sd) { 01225 best_sd = pitch_sd; 01226 best_mid_cuts = mid_cuts; 01227 best_sp_sd = sp_sd; 01228 best_pitch = initial_pitch - pitch_delta; 01229 best_cells->clear (); 01230 best_it.set_to_list (best_cells); 01231 best_it.add_list_after (&test_cells); 01232 } 01233 else 01234 test_cells.clear (); 01235 if (pitch_sd > initial_sd) 01236 break; 01237 } 01238 initial_pitch = best_pitch; 01239 01240 if (textord_debug_pitch_metric) 01241 print_pitch_sd(row, 01242 projection, 01243 projection_left, 01244 projection_right, 01245 space_size, 01246 best_pitch); 01247 01248 return best_sd; 01249 } 01250 01251 01252 /********************************************************************** 01253 * tune_row_pitch 01254 * 01255 * Use a dp algorithm to fit the character cells and return the sd of 01256 * the cell size over the row. 01257 **********************************************************************/ 01258 01259 float tune_row_pitch2( //find fp cells 01260 TO_ROW *row, //row to do 01261 STATS *projection, //vertical projection 01262 inT16 projection_left, //edge of projection 01263 inT16 projection_right, //edge of projection 01264 float space_size, //size of blank 01265 float &initial_pitch, //guess at pitch 01266 float &best_sp_sd, //space sd 01267 inT16 &best_mid_cuts, //no of cheap cuts 01268 ICOORDELT_LIST *best_cells, //row cells 01269 BOOL8 testing_on //inidividual words 01270 ) { 01271 int pitch_delta; //offset pitch 01272 inT16 pixel; //pixel coord 01273 inT16 best_pixel; //pixel coord 01274 inT16 best_delta; //best pitch 01275 inT16 best_pitch; //best pitch 01276 inT16 start; //of good range 01277 inT16 end; //of good range 01278 inT32 best_count; //lowest sum 01279 float best_sd; //best result 01280 STATS *sum_proj; //summed projection 01281 01282 best_sp_sd = initial_pitch; 01283 01284 if (textord_disable_pitch_test) { 01285 return initial_pitch; 01286 } 01287 sum_proj = new STATS[textord_pitch_range * 2 + 1]; 01288 if (sum_proj == NULL) 01289 return initial_pitch; 01290 best_pitch = (inT32) initial_pitch; 01291 01292 for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; 01293 pitch_delta++) 01294 sum_proj[textord_pitch_range + pitch_delta].set_range (0, 01295 best_pitch + 01296 pitch_delta + 1); 01297 for (pixel = projection_left; pixel <= projection_right; pixel++) { 01298 for (pitch_delta = -textord_pitch_range; 01299 pitch_delta <= textord_pitch_range; pitch_delta++) 01300 sum_proj[textord_pitch_range + 01301 pitch_delta].add ((pixel - projection_left) % (best_pitch + 01302 pitch_delta), 01303 projection->pile_count (pixel)); 01304 } 01305 best_count = sum_proj[textord_pitch_range].pile_count (0); 01306 best_delta = 0; 01307 best_pixel = 0; 01308 for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; 01309 pitch_delta++) { 01310 for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) { 01311 if (sum_proj[textord_pitch_range + pitch_delta].pile_count (pixel) 01312 < best_count) { 01313 best_count = 01314 sum_proj[textord_pitch_range + 01315 pitch_delta].pile_count (pixel); 01316 best_delta = pitch_delta; 01317 best_pixel = pixel; 01318 } 01319 } 01320 } 01321 if (testing_on) 01322 tprintf ("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n", 01323 initial_pitch, best_delta, best_count); 01324 best_pitch += best_delta; 01325 initial_pitch = best_pitch; 01326 best_count++; 01327 best_count += best_count; 01328 for (start = best_pixel - 2; start > best_pixel - best_pitch 01329 && sum_proj[textord_pitch_range + 01330 best_delta].pile_count (start % best_pitch) <= best_count; 01331 start--); 01332 for (end = best_pixel + 2; 01333 end < best_pixel + best_pitch 01334 && sum_proj[textord_pitch_range + 01335 best_delta].pile_count (end % best_pitch) <= best_count; 01336 end++); 01337 01338 best_sd = 01339 compute_pitch_sd(row, 01340 projection, 01341 projection_left, 01342 projection_right, 01343 space_size, 01344 initial_pitch, 01345 best_sp_sd, 01346 best_mid_cuts, 01347 best_cells, 01348 testing_on, 01349 start, 01350 end); 01351 if (testing_on) 01352 tprintf ("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch, 01353 best_sd); 01354 01355 if (textord_debug_pitch_metric) 01356 print_pitch_sd(row, 01357 projection, 01358 projection_left, 01359 projection_right, 01360 space_size, 01361 initial_pitch); 01362 01363 delete[]sum_proj; 01364 01365 return best_sd; 01366 } 01367 01368 01369 /********************************************************************** 01370 * compute_pitch_sd 01371 * 01372 * Use a dp algorithm to fit the character cells and return the sd of 01373 * the cell size over the row. 01374 **********************************************************************/ 01375 01376 float compute_pitch_sd( //find fp cells 01377 TO_ROW *row, //row to do 01378 STATS *projection, //vertical projection 01379 inT16 projection_left, //edge 01380 inT16 projection_right, //edge 01381 float space_size, //size of blank 01382 float initial_pitch, //guess at pitch 01383 float &sp_sd, //space sd 01384 inT16 &mid_cuts, //no of free cuts 01385 ICOORDELT_LIST *row_cells, //list of chop pts 01386 BOOL8 testing_on, //inidividual words 01387 inT16 start, //start of good range 01388 inT16 end //end of good range 01389 ) { 01390 inT16 occupation; //no of cells in word. 01391 //blobs 01392 BLOBNBOX_IT blob_it = row->blob_list (); 01393 BLOBNBOX_IT start_it; //start of word 01394 BLOBNBOX_IT plot_it; //for plotting 01395 inT16 blob_count; //no of blobs 01396 TBOX blob_box; //bounding box 01397 TBOX prev_box; //of super blob 01398 inT32 prev_right; //of word sync 01399 int scale_factor; //on scores for big words 01400 inT32 sp_count; //spaces 01401 FPSEGPT_LIST seg_list; //char cells 01402 FPSEGPT_IT seg_it; //iterator 01403 inT16 segpos; //position of segment 01404 inT16 cellpos; //previous cell boundary 01405 //iterator 01406 ICOORDELT_IT cell_it = row_cells; 01407 ICOORDELT *cell; //new cell 01408 double sqsum; //sum of squares 01409 double spsum; //of spaces 01410 double sp_var; //space error 01411 double word_sync; //result for word 01412 inT32 total_count; //total blobs 01413 01414 if ((pitsync_linear_version & 3) > 1) { 01415 word_sync = compute_pitch_sd2 (row, projection, projection_left, 01416 projection_right, initial_pitch, 01417 occupation, mid_cuts, row_cells, 01418 testing_on, start, end); 01419 sp_sd = occupation; 01420 return word_sync; 01421 } 01422 mid_cuts = 0; 01423 cellpos = 0; 01424 total_count = 0; 01425 sqsum = 0; 01426 sp_count = 0; 01427 spsum = 0; 01428 prev_right = -1; 01429 if (blob_it.empty ()) 01430 return space_size * 10; 01431 #ifndef GRAPHICS_DISABLED 01432 if (testing_on && to_win > 0) { 01433 blob_box = blob_it.data ()->bounding_box (); 01434 projection->plot (to_win, projection_left, 01435 row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); 01436 } 01437 #endif 01438 start_it = blob_it; 01439 blob_count = 0; 01440 blob_box = box_next (&blob_it);//first blob 01441 blob_it.mark_cycle_pt (); 01442 do { 01443 for (; blob_count > 0; blob_count--) 01444 box_next(&start_it); 01445 do { 01446 prev_box = blob_box; 01447 blob_count++; 01448 blob_box = box_next (&blob_it); 01449 } 01450 while (!blob_it.cycled_list () 01451 && blob_box.left () - prev_box.right () < space_size); 01452 plot_it = start_it; 01453 if (pitsync_linear_version & 3) 01454 word_sync = 01455 check_pitch_sync2 (&start_it, blob_count, (inT16) initial_pitch, 2, 01456 projection, projection_left, projection_right, 01457 row->xheight * textord_projection_scale, 01458 occupation, &seg_list, start, end); 01459 else 01460 word_sync = 01461 check_pitch_sync (&start_it, blob_count, (inT16) initial_pitch, 2, 01462 projection, &seg_list); 01463 if (testing_on) { 01464 tprintf ("Word ending at (%d,%d), len=%d, sync rating=%g, ", 01465 prev_box.right (), prev_box.top (), 01466 seg_list.length () - 1, word_sync); 01467 seg_it.set_to_list (&seg_list); 01468 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); 01469 seg_it.forward ()) { 01470 if (seg_it.data ()->faked) 01471 tprintf ("(F)"); 01472 tprintf ("%d, ", seg_it.data ()->position ()); 01473 // tprintf("C=%g, s=%g, sq=%g\n", 01474 // seg_it.data()->cost_function(), 01475 // seg_it.data()->sum(), 01476 // seg_it.data()->squares()); 01477 } 01478 tprintf ("\n"); 01479 } 01480 #ifndef GRAPHICS_DISABLED 01481 if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0) 01482 plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); 01483 #endif 01484 seg_it.set_to_list (&seg_list); 01485 if (prev_right >= 0) { 01486 sp_var = seg_it.data ()->position () - prev_right; 01487 sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch; 01488 sp_var *= sp_var; 01489 spsum += sp_var; 01490 sp_count++; 01491 } 01492 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) { 01493 segpos = seg_it.data ()->position (); 01494 if (cell_it.empty () || segpos > cellpos + initial_pitch / 2) { 01495 //big gap 01496 while (!cell_it.empty () && segpos > cellpos + initial_pitch * 3 / 2) { 01497 cell = new ICOORDELT (cellpos + (inT16) initial_pitch, 0); 01498 cell_it.add_after_then_move (cell); 01499 cellpos += (inT16) initial_pitch; 01500 } 01501 //make new one 01502 cell = new ICOORDELT (segpos, 0); 01503 cell_it.add_after_then_move (cell); 01504 cellpos = segpos; 01505 } 01506 else if (segpos > cellpos - initial_pitch / 2) { 01507 cell = cell_it.data (); 01508 //average positions 01509 cell->set_x ((cellpos + segpos) / 2); 01510 cellpos = cell->x (); 01511 } 01512 } 01513 seg_it.move_to_last (); 01514 prev_right = seg_it.data ()->position (); 01515 if (textord_pitch_scalebigwords) { 01516 scale_factor = (seg_list.length () - 2) / 2; 01517 if (scale_factor < 1) 01518 scale_factor = 1; 01519 } 01520 else 01521 scale_factor = 1; 01522 sqsum += word_sync * scale_factor; 01523 total_count += (seg_list.length () - 1) * scale_factor; 01524 seg_list.clear (); 01525 } 01526 while (!blob_it.cycled_list ()); 01527 sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0; 01528 return total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10; 01529 } 01530 01531 01532 /********************************************************************** 01533 * compute_pitch_sd2 01534 * 01535 * Use a dp algorithm to fit the character cells and return the sd of 01536 * the cell size over the row. 01537 **********************************************************************/ 01538 01539 float compute_pitch_sd2( //find fp cells 01540 TO_ROW *row, //row to do 01541 STATS *projection, //vertical projection 01542 inT16 projection_left, //edge 01543 inT16 projection_right, //edge 01544 float initial_pitch, //guess at pitch 01545 inT16 &occupation, //no of occupied cells 01546 inT16 &mid_cuts, //no of free cuts 01547 ICOORDELT_LIST *row_cells, //list of chop pts 01548 BOOL8 testing_on, //inidividual words 01549 inT16 start, //start of good range 01550 inT16 end //end of good range 01551 ) { 01552 //blobs 01553 BLOBNBOX_IT blob_it = row->blob_list (); 01554 BLOBNBOX_IT plot_it; 01555 inT16 blob_count; //no of blobs 01556 TBOX blob_box; //bounding box 01557 FPSEGPT_LIST seg_list; //char cells 01558 FPSEGPT_IT seg_it; //iterator 01559 inT16 segpos; //position of segment 01560 //iterator 01561 ICOORDELT_IT cell_it = row_cells; 01562 ICOORDELT *cell; //new cell 01563 double word_sync; //result for word 01564 01565 mid_cuts = 0; 01566 if (blob_it.empty ()) { 01567 occupation = 0; 01568 return initial_pitch * 10; 01569 } 01570 #ifndef GRAPHICS_DISABLED 01571 if (testing_on && to_win > 0) { 01572 projection->plot (to_win, projection_left, 01573 row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); 01574 } 01575 #endif 01576 blob_count = 0; 01577 blob_it.mark_cycle_pt (); 01578 do { 01579 //first blob 01580 blob_box = box_next (&blob_it); 01581 blob_count++; 01582 } 01583 while (!blob_it.cycled_list ()); 01584 plot_it = blob_it; 01585 word_sync = check_pitch_sync2 (&blob_it, blob_count, (inT16) initial_pitch, 01586 2, projection, projection_left, 01587 projection_right, 01588 row->xheight * textord_projection_scale, 01589 occupation, &seg_list, start, end); 01590 if (testing_on) { 01591 tprintf ("Row ending at (%d,%d), len=%d, sync rating=%g, ", 01592 blob_box.right (), blob_box.top (), 01593 seg_list.length () - 1, word_sync); 01594 seg_it.set_to_list (&seg_list); 01595 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) { 01596 if (seg_it.data ()->faked) 01597 tprintf ("(F)"); 01598 tprintf ("%d, ", seg_it.data ()->position ()); 01599 // tprintf("C=%g, s=%g, sq=%g\n", 01600 // seg_it.data()->cost_function(), 01601 // seg_it.data()->sum(), 01602 // seg_it.data()->squares()); 01603 } 01604 tprintf ("\n"); 01605 } 01606 #ifndef GRAPHICS_DISABLED 01607 if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0) 01608 plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); 01609 #endif 01610 seg_it.set_to_list (&seg_list); 01611 for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) { 01612 segpos = seg_it.data ()->position (); 01613 //make new one 01614 cell = new ICOORDELT (segpos, 0); 01615 cell_it.add_after_then_move (cell); 01616 if (seg_it.at_last ()) 01617 mid_cuts = seg_it.data ()->cheap_cuts (); 01618 } 01619 seg_list.clear (); 01620 return occupation > 0 ? sqrt (word_sync / occupation) : initial_pitch * 10; 01621 } 01622 01623 01624 /********************************************************************** 01625 * print_pitch_sd 01626 * 01627 * Use a dp algorithm to fit the character cells and return the sd of 01628 * the cell size over the row. 01629 **********************************************************************/ 01630 01631 void print_pitch_sd( //find fp cells 01632 TO_ROW *row, //row to do 01633 STATS *projection, //vertical projection 01634 inT16 projection_left, //edges //size of blank 01635 inT16 projection_right, 01636 float space_size, 01637 float initial_pitch //guess at pitch 01638 ) { 01639 const char *res2; //pitch result 01640 inT16 occupation; //used cells 01641 float sp_sd; //space sd 01642 //blobs 01643 BLOBNBOX_IT blob_it = row->blob_list (); 01644 BLOBNBOX_IT start_it; //start of word 01645 BLOBNBOX_IT row_start; //start of row 01646 inT16 blob_count; //no of blobs 01647 inT16 total_blob_count; //total blobs in line 01648 TBOX blob_box; //bounding box 01649 TBOX prev_box; //of super blob 01650 inT32 prev_right; //of word sync 01651 int scale_factor; //on scores for big words 01652 inT32 sp_count; //spaces 01653 FPSEGPT_LIST seg_list; //char cells 01654 FPSEGPT_IT seg_it; //iterator 01655 double sqsum; //sum of squares 01656 double spsum; //of spaces 01657 double sp_var; //space error 01658 double word_sync; //result for word 01659 double total_count; //total cuts 01660 01661 if (blob_it.empty ()) 01662 return; 01663 row_start = blob_it; 01664 total_blob_count = 0; 01665 01666 total_count = 0; 01667 sqsum = 0; 01668 sp_count = 0; 01669 spsum = 0; 01670 prev_right = -1; 01671 blob_it = row_start; 01672 start_it = blob_it; 01673 blob_count = 0; 01674 blob_box = box_next (&blob_it);//first blob 01675 blob_it.mark_cycle_pt (); 01676 do { 01677 for (; blob_count > 0; blob_count--) 01678 box_next(&start_it); 01679 do { 01680 prev_box = blob_box; 01681 blob_count++; 01682 blob_box = box_next (&blob_it); 01683 } 01684 while (!blob_it.cycled_list () 01685 && blob_box.left () - prev_box.right () < space_size); 01686 word_sync = 01687 check_pitch_sync2 (&start_it, blob_count, (inT16) initial_pitch, 2, 01688 projection, projection_left, projection_right, 01689 row->xheight * textord_projection_scale, 01690 occupation, &seg_list, 0, 0); 01691 total_blob_count += blob_count; 01692 seg_it.set_to_list (&seg_list); 01693 if (prev_right >= 0) { 01694 sp_var = seg_it.data ()->position () - prev_right; 01695 sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch; 01696 sp_var *= sp_var; 01697 spsum += sp_var; 01698 sp_count++; 01699 } 01700 seg_it.move_to_last (); 01701 prev_right = seg_it.data ()->position (); 01702 if (textord_pitch_scalebigwords) { 01703 scale_factor = (seg_list.length () - 2) / 2; 01704 if (scale_factor < 1) 01705 scale_factor = 1; 01706 } 01707 else 01708 scale_factor = 1; 01709 sqsum += word_sync * scale_factor; 01710 total_count += (seg_list.length () - 1) * scale_factor; 01711 seg_list.clear (); 01712 } 01713 while (!blob_it.cycled_list ()); 01714 sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0; 01715 word_sync = total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10; 01716 tprintf ("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:", 01717 word_sync, word_sync / initial_pitch, sp_sd, 01718 word_sync < textord_words_pitchsd_threshold * initial_pitch 01719 ? 'F' : 'P'); 01720 01721 start_it = row_start; 01722 blob_it = row_start; 01723 word_sync = 01724 check_pitch_sync2 (&blob_it, total_blob_count, (inT16) initial_pitch, 2, 01725 projection, projection_left, projection_right, 01726 row->xheight * textord_projection_scale, occupation, 01727 &seg_list, 0, 0); 01728 if (occupation > 1) 01729 word_sync /= occupation; 01730 word_sync = sqrt (word_sync); 01731 01732 #ifndef GRAPHICS_DISABLED 01733 if (textord_show_row_cuts && to_win != NULL) 01734 plot_fp_cells2(to_win, ScrollView::CORAL, row, &seg_list); 01735 #endif 01736 seg_list.clear (); 01737 if (word_sync < textord_words_pitchsd_threshold * initial_pitch) { 01738 if (word_sync < textord_words_def_fixed * initial_pitch 01739 && !row->all_caps) 01740 res2 = "DF"; 01741 else 01742 res2 = "MF"; 01743 } 01744 else 01745 res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP"; 01746 tprintf 01747 ("row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, all_caps=%d\n", 01748 word_sync, word_sync / initial_pitch, 01749 word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P', 01750 occupation, res2, initial_pitch, row->fixed_pitch, row->all_caps); 01751 } 01752 01753 /********************************************************************** 01754 * find_repeated_chars 01755 * 01756 * Extract marked leader blobs and put them 01757 * into words in advance of fixed pitch checking and word generation. 01758 **********************************************************************/ 01759 void find_repeated_chars(TO_BLOCK *block, // Block to search. 01760 BOOL8 testing_on) { // Debug mode. 01761 POLY_BLOCK* pb = block->block->poly_block(); 01762 if (pb != NULL && !pb->IsText()) 01763 return; // Don't find repeated chars in non-text blocks. 01764 01765 TO_ROW *row; 01766 BLOBNBOX_IT box_it; 01767 BLOBNBOX_IT search_it; // forward search 01768 WERD_IT word_it; // new words 01769 WERD *word; // new word 01770 TBOX word_box; // for plotting 01771 int blobcount, repeated_set; 01772 01773 TO_ROW_IT row_it = block->get_rows(); 01774 if (row_it.empty()) return; // empty block 01775 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { 01776 row = row_it.data(); 01777 box_it.set_to_list(row->blob_list()); 01778 if (box_it.empty()) continue; // no blobs in this row 01779 if (!row->rep_chars_marked()) { 01780 mark_repeated_chars(row); 01781 } 01782 if (row->num_repeated_sets() == 0) continue; // nothing to do for this row 01783 word_it.set_to_list(&row->rep_words); 01784 do { 01785 if (box_it.data()->repeated_set() != 0 && 01786 !box_it.data()->joined_to_prev()) { 01787 blobcount = 1; 01788 repeated_set = box_it.data()->repeated_set(); 01789 search_it = box_it; 01790 search_it.forward(); 01791 while (!search_it.at_first() && 01792 search_it.data()->repeated_set() == repeated_set) { 01793 blobcount++; 01794 search_it.forward(); 01795 } 01796 // After the call to make_real_word() all the blobs from this 01797 // repeated set will be removed from the blob list. box_it will be 01798 // set to point to the blob after the end of the extracted sequence. 01799 word = make_real_word(&box_it, blobcount, box_it.at_first(), 1); 01800 if (!box_it.empty() && box_it.data()->joined_to_prev()) { 01801 tprintf("Bad box joined to prev at"); 01802 box_it.data()->bounding_box().print(); 01803 tprintf("After repeated word:"); 01804 word->bounding_box().print(); 01805 } 01806 ASSERT_HOST(box_it.empty() || !box_it.data()->joined_to_prev()); 01807 word->set_flag(W_REP_CHAR, true); 01808 word->set_flag(W_DONT_CHOP, true); 01809 word_it.add_after_then_move(word); 01810 } else { 01811 box_it.forward(); 01812 } 01813 } while (!box_it.at_first()); 01814 } 01815 } 01816 01817 01818 /********************************************************************** 01819 * plot_fp_word 01820 * 01821 * Plot a block of words as if fixed pitch. 01822 **********************************************************************/ 01823 01824 #ifndef GRAPHICS_DISABLED 01825 void plot_fp_word( //draw block of words 01826 TO_BLOCK *block, //block to draw 01827 float pitch, //pitch to draw with 01828 float nonspace //for space threshold 01829 ) { 01830 TO_ROW *row; //current row 01831 TO_ROW_IT row_it = block->get_rows (); 01832 01833 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 01834 row = row_it.data (); 01835 row->min_space = (inT32) ((pitch + nonspace) / 2); 01836 row->max_nonspace = row->min_space; 01837 row->space_threshold = row->min_space; 01838 plot_word_decisions (to_win, (inT16) pitch, row); 01839 } 01840 } 01841 #endif