tesseract
3.03
|
00001 /********************************************************************** 00002 * File: edgblob.c (Formerly edgeloop.c) 00003 * Description: Functions to clean up an outline before approximation. 00004 * Author: Ray Smith 00005 * Created: Tue Mar 26 16:56:25 GMT 1991 00006 * 00007 *(C) Copyright 1991, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0(the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include "scanedg.h" 00021 #include "drawedg.h" 00022 #include "edgloop.h" 00023 #include "edgblob.h" 00024 00025 // Include automatically generated configuration file if running autoconf. 00026 #ifdef HAVE_CONFIG_H 00027 #include "config_auto.h" 00028 #endif 00029 00030 #define EXTERN 00031 00032 // Control parameters used in outline_complexity(), which rejects an outline 00033 // if any one of the 3 conditions is satisfied: 00034 // - number of children exceeds edges_max_children_per_outline 00035 // - number of nested layers exceeds edges_max_children_layers 00036 // - joint complexity exceeds edges_children_count_limit(as in child_count()) 00037 EXTERN BOOL_VAR(edges_use_new_outline_complexity, FALSE, 00038 "Use the new outline complexity module"); 00039 EXTERN INT_VAR(edges_max_children_per_outline, 10, 00040 "Max number of children inside a character outline"); 00041 EXTERN INT_VAR(edges_max_children_layers, 5, 00042 "Max layers of nested children inside a character outline"); 00043 EXTERN BOOL_VAR(edges_debug, FALSE, 00044 "turn on debugging for this module"); 00045 00046 00047 EXTERN INT_VAR(edges_children_per_grandchild, 10, 00048 "Importance ratio for chucking outlines"); 00049 EXTERN INT_VAR(edges_children_count_limit, 45, 00050 "Max holes allowed in blob"); 00051 EXTERN BOOL_VAR(edges_children_fix, FALSE, 00052 "Remove boxy parents of char-like children"); 00053 EXTERN INT_VAR(edges_min_nonhole, 12, 00054 "Min pixels for potential char in box"); 00055 EXTERN INT_VAR(edges_patharea_ratio, 40, 00056 "Max lensq/area for acceptable child outline"); 00057 EXTERN double_VAR(edges_childarea, 0.5, 00058 "Min area fraction of child outline"); 00059 EXTERN double_VAR(edges_boxarea, 0.875, 00060 "Min area fraction of grandchild for box"); 00061 00068 OL_BUCKETS::OL_BUCKETS( 00069 ICOORD bleft, // corners 00070 ICOORD tright): bl(bleft), tr(tright) { 00071 bxdim =(tright.x() - bleft.x()) / BUCKETSIZE + 1; 00072 bydim =(tright.y() - bleft.y()) / BUCKETSIZE + 1; 00073 // make array 00074 buckets = new C_OUTLINE_LIST[bxdim * bydim]; 00075 index = 0; 00076 } 00077 00078 00086 C_OUTLINE_LIST * 00087 OL_BUCKETS::operator()( // array access 00088 inT16 x, // image coords 00089 inT16 y) { 00090 return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE]; 00091 } 00092 00093 00114 inT32 OL_BUCKETS::outline_complexity( 00115 C_OUTLINE *outline, // parent outline 00116 inT32 max_count, // max output 00117 inT16 depth // recurion depth 00118 ) { 00119 inT16 xmin, xmax; // coord limits 00120 inT16 ymin, ymax; 00121 inT16 xindex, yindex; // current bucket 00122 C_OUTLINE *child; // current child 00123 inT32 child_count; // no of children 00124 inT32 grandchild_count; // no of grandchildren 00125 C_OUTLINE_IT child_it; // search iterator 00126 00127 TBOX olbox = outline->bounding_box(); 00128 xmin =(olbox.left() - bl.x()) / BUCKETSIZE; 00129 xmax =(olbox.right() - bl.x()) / BUCKETSIZE; 00130 ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE; 00131 ymax =(olbox.top() - bl.y()) / BUCKETSIZE; 00132 child_count = 0; 00133 grandchild_count = 0; 00134 if (++depth > edges_max_children_layers) // nested loops are too deep 00135 return max_count + depth; 00136 00137 for (yindex = ymin; yindex <= ymax; yindex++) { 00138 for (xindex = xmin; xindex <= xmax; xindex++) { 00139 child_it.set_to_list(&buckets[yindex * bxdim + xindex]); 00140 if (child_it.empty()) 00141 continue; 00142 for (child_it.mark_cycle_pt(); !child_it.cycled_list(); 00143 child_it.forward()) { 00144 child = child_it.data(); 00145 if (child == outline || !(*child < *outline)) 00146 continue; 00147 child_count++; 00148 00149 if (child_count > edges_max_children_per_outline) { // too fragmented 00150 if (edges_debug) 00151 tprintf("Discard outline on child_count=%d > " 00152 "max_children_per_outline=%d\n", 00153 child_count, 00154 static_cast<inT32>(edges_max_children_per_outline)); 00155 return max_count + child_count; 00156 } 00157 00158 // Compute the "complexity" of each child recursively 00159 inT32 remaining_count = max_count - child_count - grandchild_count; 00160 if (remaining_count > 0) 00161 grandchild_count += edges_children_per_grandchild * 00162 outline_complexity(child, remaining_count, depth); 00163 if (child_count + grandchild_count > max_count) { // too complex 00164 if (edges_debug) 00165 tprintf("Disgard outline on child_count=%d + grandchild_count=%d " 00166 "> max_count=%d\n", 00167 child_count, grandchild_count, max_count); 00168 return child_count + grandchild_count; 00169 } 00170 } 00171 } 00172 } 00173 return child_count + grandchild_count; 00174 } 00175 00176 00182 // TODO(rays) Merge with outline_complexity. 00183 inT32 OL_BUCKETS::count_children( // recursive count 00184 C_OUTLINE *outline, // parent outline 00185 inT32 max_count // max output 00186 ) { 00187 BOOL8 parent_box; // could it be boxy 00188 inT16 xmin, xmax; // coord limits 00189 inT16 ymin, ymax; 00190 inT16 xindex, yindex; // current bucket 00191 C_OUTLINE *child; // current child 00192 inT32 child_count; // no of children 00193 inT32 grandchild_count; // no of grandchildren 00194 inT32 parent_area; // potential box 00195 FLOAT32 max_parent_area; // potential box 00196 inT32 child_area; // current child 00197 inT32 child_length; // current child 00198 TBOX olbox; 00199 C_OUTLINE_IT child_it; // search iterator 00200 00201 olbox = outline->bounding_box(); 00202 xmin =(olbox.left() - bl.x()) / BUCKETSIZE; 00203 xmax =(olbox.right() - bl.x()) / BUCKETSIZE; 00204 ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE; 00205 ymax =(olbox.top() - bl.y()) / BUCKETSIZE; 00206 child_count = 0; 00207 grandchild_count = 0; 00208 parent_area = 0; 00209 max_parent_area = 0; 00210 parent_box = TRUE; 00211 for (yindex = ymin; yindex <= ymax; yindex++) { 00212 for (xindex = xmin; xindex <= xmax; xindex++) { 00213 child_it.set_to_list(&buckets[yindex * bxdim + xindex]); 00214 if (child_it.empty()) 00215 continue; 00216 for (child_it.mark_cycle_pt(); !child_it.cycled_list(); 00217 child_it.forward()) { 00218 child = child_it.data(); 00219 if (child != outline && *child < *outline) { 00220 child_count++; 00221 if (child_count <= max_count) { 00222 int max_grand =(max_count - child_count) / 00223 edges_children_per_grandchild; 00224 if (max_grand > 0) 00225 grandchild_count += count_children(child, max_grand) * 00226 edges_children_per_grandchild; 00227 else 00228 grandchild_count += count_children(child, 1); 00229 } 00230 if (child_count + grandchild_count > max_count) { 00231 if (edges_debug) 00232 tprintf("Discarding parent with child count=%d, gc=%d\n", 00233 child_count,grandchild_count); 00234 return child_count + grandchild_count; 00235 } 00236 if (parent_area == 0) { 00237 parent_area = outline->outer_area(); 00238 if (parent_area < 0) 00239 parent_area = -parent_area; 00240 max_parent_area = outline->bounding_box().area() * edges_boxarea; 00241 if (parent_area < max_parent_area) 00242 parent_box = FALSE; 00243 } 00244 if (parent_box && 00245 (!edges_children_fix || 00246 child->bounding_box().height() > edges_min_nonhole)) { 00247 child_area = child->outer_area(); 00248 if (child_area < 0) 00249 child_area = -child_area; 00250 if (edges_children_fix) { 00251 if (parent_area - child_area < max_parent_area) { 00252 parent_box = FALSE; 00253 continue; 00254 } 00255 if (grandchild_count > 0) { 00256 if (edges_debug) 00257 tprintf("Discarding parent of area %d, child area=%d, max%g " 00258 "with gc=%d\n", 00259 parent_area, child_area, max_parent_area, 00260 grandchild_count); 00261 return max_count + 1; 00262 } 00263 child_length = child->pathlength(); 00264 if (child_length * child_length > 00265 child_area * edges_patharea_ratio) { 00266 if (edges_debug) 00267 tprintf("Discarding parent of area %d, child area=%d, max%g " 00268 "with child length=%d\n", 00269 parent_area, child_area, max_parent_area, 00270 child_length); 00271 return max_count + 1; 00272 } 00273 } 00274 if (child_area < child->bounding_box().area() * edges_childarea) { 00275 if (edges_debug) 00276 tprintf("Discarding parent of area %d, child area=%d, max%g " 00277 "with child rect=%d\n", 00278 parent_area, child_area, max_parent_area, 00279 child->bounding_box().area()); 00280 return max_count + 1; 00281 } 00282 } 00283 } 00284 } 00285 } 00286 } 00287 return child_count + grandchild_count; 00288 } 00289 00290 00291 00292 00299 void OL_BUCKETS::extract_children( // recursive count 00300 C_OUTLINE *outline, // parent outline 00301 C_OUTLINE_IT *it // destination iterator 00302 ) { 00303 inT16 xmin, xmax; // coord limits 00304 inT16 ymin, ymax; 00305 inT16 xindex, yindex; // current bucket 00306 TBOX olbox; 00307 C_OUTLINE_IT child_it; // search iterator 00308 00309 olbox = outline->bounding_box(); 00310 xmin =(olbox.left() - bl.x()) / BUCKETSIZE; 00311 xmax =(olbox.right() - bl.x()) / BUCKETSIZE; 00312 ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE; 00313 ymax =(olbox.top() - bl.y()) / BUCKETSIZE; 00314 for (yindex = ymin; yindex <= ymax; yindex++) { 00315 for (xindex = xmin; xindex <= xmax; xindex++) { 00316 child_it.set_to_list(&buckets[yindex * bxdim + xindex]); 00317 for (child_it.mark_cycle_pt(); !child_it.cycled_list(); 00318 child_it.forward()) { 00319 if (*child_it.data() < *outline) { 00320 it->add_after_then_move(child_it.extract()); 00321 } 00322 } 00323 } 00324 } 00325 } 00326 00327 00334 void extract_edges(Pix* pix, // thresholded image 00335 BLOCK *block) { // block to scan 00336 C_OUTLINE_LIST outlines; // outlines in block 00337 C_OUTLINE_IT out_it = &outlines; 00338 00339 block_edges(pix, block, &out_it); 00340 ICOORD bleft; // block box 00341 ICOORD tright; 00342 block->bounding_box(bleft, tright); 00343 // make blobs 00344 outlines_to_blobs(block, bleft, tright, &outlines); 00345 } 00346 00347 00354 void outlines_to_blobs( // find blobs 00355 BLOCK *block, // block to scan 00356 ICOORD bleft, 00357 ICOORD tright, 00358 C_OUTLINE_LIST *outlines) { 00359 // make buckets 00360 OL_BUCKETS buckets(bleft, tright); 00361 00362 fill_buckets(outlines, &buckets); 00363 empty_buckets(block, &buckets); 00364 } 00365 00366 00373 void fill_buckets( // find blobs 00374 C_OUTLINE_LIST *outlines, // outlines in block 00375 OL_BUCKETS *buckets // output buckets 00376 ) { 00377 TBOX ol_box; // outline box 00378 C_OUTLINE_IT out_it = outlines; // iterator 00379 C_OUTLINE_IT bucket_it; // iterator in bucket 00380 C_OUTLINE *outline; // current outline 00381 00382 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { 00383 outline = out_it.extract(); // take off list 00384 // get box 00385 ol_box = outline->bounding_box(); 00386 bucket_it.set_to_list((*buckets) (ol_box.left(), ol_box.bottom())); 00387 bucket_it.add_to_end(outline); 00388 } 00389 } 00390 00391 00398 void empty_buckets( // find blobs 00399 BLOCK *block, // block to scan 00400 OL_BUCKETS *buckets // output buckets 00401 ) { 00402 BOOL8 good_blob; // healthy blob 00403 C_OUTLINE_LIST outlines; // outlines in block 00404 // iterator 00405 C_OUTLINE_IT out_it = &outlines; 00406 C_OUTLINE_IT bucket_it = buckets->start_scan(); 00407 C_OUTLINE_IT parent_it; // parent outline 00408 C_BLOB_IT good_blobs = block->blob_list(); 00409 C_BLOB_IT junk_blobs = block->reject_blobs(); 00410 00411 while (!bucket_it.empty()) { 00412 out_it.set_to_list(&outlines); 00413 do { 00414 parent_it = bucket_it; // find outermost 00415 do { 00416 bucket_it.forward(); 00417 } while (!bucket_it.at_first() && 00418 !(*parent_it.data() < *bucket_it.data())); 00419 } while (!bucket_it.at_first()); 00420 00421 // move to new list 00422 out_it.add_after_then_move(parent_it.extract()); 00423 good_blob = capture_children(buckets, &junk_blobs, &out_it); 00424 C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs, 00425 &junk_blobs); 00426 00427 bucket_it.set_to_list(buckets->scan_next()); 00428 } 00429 } 00430 00431 00440 BOOL8 capture_children( // find children 00441 OL_BUCKETS *buckets, // bucket sort clanss 00442 C_BLOB_IT *reject_it, // dead grandchildren 00443 C_OUTLINE_IT *blob_it // output outlines 00444 ) { 00445 C_OUTLINE *outline; // master outline 00446 inT32 child_count; // no of children 00447 00448 outline = blob_it->data(); 00449 if (edges_use_new_outline_complexity) 00450 child_count = buckets->outline_complexity(outline, 00451 edges_children_count_limit, 00452 0); 00453 else 00454 child_count = buckets->count_children(outline, 00455 edges_children_count_limit); 00456 if (child_count > edges_children_count_limit) 00457 return FALSE; 00458 00459 if (child_count > 0) 00460 buckets->extract_children(outline, blob_it); 00461 return TRUE; 00462 }