tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/textord/edgblob.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        edgblob.cpp (Formerly edgeloop.c)
00003  * Description: Functions to clean up an outline before approximation.
00004  * Author:              Ray Smith
00005  * Created:             Tue Mar 26 16:56:25 GMT 1991
00006  *
00007  *(C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0(the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include "scanedg.h"
00021 #include "drawedg.h"
00022 #include "edgloop.h"
00023 #include "edgblob.h"
00024 
00025 // Include automatically generated configuration file if running autoconf.
00026 #ifdef HAVE_CONFIG_H
00027 #include "config_auto.h"
00028 #endif
00029 
00030 #define EXTERN
00031 
00032 // Control parameters used in outline_complexity(), which rejects an outline
00033 // if any one of the 3 conditions is satisfied:
00034 //  - number of children exceeds edges_max_children_per_outline
00035 //  - number of nested layers exceeds edges_max_children_layers
00036 //  - joint complexity exceeds edges_children_count_limit(as in child_count())
00037 EXTERN BOOL_VAR(edges_use_new_outline_complexity, FALSE,
00038                 "Use the new outline complexity module");
00039 EXTERN INT_VAR(edges_max_children_per_outline, 10,
00040                "Max number of children inside a character outline");
00041 EXTERN INT_VAR(edges_max_children_layers, 5,
00042                "Max layers of nested children inside a character outline");
00043 EXTERN BOOL_VAR(edges_debug, FALSE,
00044                 "turn on debugging for this module");
00045 
00046 
00047 EXTERN INT_VAR(edges_children_per_grandchild, 10,
00048                "Importance ratio for chucking outlines");
00049 EXTERN INT_VAR(edges_children_count_limit, 45,
00050                "Max holes allowed in blob");
00051 EXTERN BOOL_VAR(edges_children_fix, FALSE,
00052                 "Remove boxy parents of char-like children");
00053 EXTERN INT_VAR(edges_min_nonhole, 12,
00054                "Min pixels for potential char in box");
00055 EXTERN INT_VAR(edges_patharea_ratio, 40,
00056                "Max lensq/area for acceptable child outline");
00057 EXTERN double_VAR(edges_childarea, 0.5,
00058                   "Min area fraction of child outline");
00059 EXTERN double_VAR(edges_boxarea, 0.875,
00060                   "Min area fraction of grandchild for box");
00061 
00068 OL_BUCKETS::OL_BUCKETS(
00069 ICOORD bleft,                    // corners
00070 ICOORD tright):         bl(bleft), tr(tright) {
00071   bxdim =(tright.x() - bleft.x()) / BUCKETSIZE + 1;
00072   bydim =(tright.y() - bleft.y()) / BUCKETSIZE + 1;
00073                                  // make array
00074   buckets = new C_OUTLINE_LIST[bxdim * bydim];
00075   index = 0;
00076 }
00077 
00078 
00086 C_OUTLINE_LIST *
00087 OL_BUCKETS::operator()(       // array access
00088 inT16 x,                      // image coords
00089 inT16 y) {
00090   return &buckets[(y-bl.y()) / BUCKETSIZE * bxdim + (x-bl.x()) / BUCKETSIZE];
00091 }
00092 
00093 
00114 inT32 OL_BUCKETS::outline_complexity(
00115                                      C_OUTLINE *outline,   // parent outline
00116                                      inT32 max_count,      // max output
00117                                      inT16 depth           // recurion depth
00118                                     ) {
00119   inT16 xmin, xmax;              // coord limits
00120   inT16 ymin, ymax;
00121   inT16 xindex, yindex;          // current bucket
00122   C_OUTLINE *child;              // current child
00123   inT32 child_count;             // no of children
00124   inT32 grandchild_count;        // no of grandchildren
00125   C_OUTLINE_IT child_it;         // search iterator
00126 
00127   TBOX olbox = outline->bounding_box();
00128   xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
00129   xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
00130   ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
00131   ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
00132   child_count = 0;
00133   grandchild_count = 0;
00134   if (++depth > edges_max_children_layers)  // nested loops are too deep
00135     return max_count + depth;
00136 
00137   for (yindex = ymin; yindex <= ymax; yindex++) {
00138     for (xindex = xmin; xindex <= xmax; xindex++) {
00139       child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
00140       if (child_it.empty())
00141         continue;
00142       for (child_it.mark_cycle_pt(); !child_it.cycled_list();
00143            child_it.forward()) {
00144         child = child_it.data();
00145         if (child == outline || !(*child < *outline))
00146           continue;
00147         child_count++;
00148 
00149         if (child_count > edges_max_children_per_outline) {   // too fragmented
00150           if (edges_debug)
00151             tprintf("Discard outline on child_count=%d > "
00152                     "max_children_per_outline=%d\n",
00153                     child_count,
00154                     static_cast<inT32>(edges_max_children_per_outline));
00155           return max_count + child_count;
00156         }
00157 
00158         // Compute the "complexity" of each child recursively
00159         inT32 remaining_count = max_count - child_count - grandchild_count;
00160         if (remaining_count > 0)
00161           grandchild_count += edges_children_per_grandchild *
00162                               outline_complexity(child, remaining_count, depth);
00163         if (child_count + grandchild_count > max_count) {  // too complex
00164           if (edges_debug)
00165             tprintf("Disgard outline on child_count=%d + grandchild_count=%d "
00166                     "> max_count=%d\n",
00167                     child_count, grandchild_count, max_count);
00168           return child_count + grandchild_count;
00169         }
00170       }
00171     }
00172   }
00173   return child_count + grandchild_count;
00174 }
00175 
00176 
00182 // TODO(rays) Merge with outline_complexity.
00183 inT32 OL_BUCKETS::count_children(                     // recursive count
00184                                  C_OUTLINE *outline,  // parent outline
00185                                  inT32 max_count      // max output
00186                                 ) {
00187   BOOL8 parent_box;              // could it be boxy
00188   inT16 xmin, xmax;              // coord limits
00189   inT16 ymin, ymax;
00190   inT16 xindex, yindex;          // current bucket
00191   C_OUTLINE *child;              // current child
00192   inT32 child_count;             // no of children
00193   inT32 grandchild_count;        // no of grandchildren
00194   inT32 parent_area;             // potential box
00195   FLOAT32 max_parent_area;       // potential box
00196   inT32 child_area;              // current child
00197   inT32 child_length;            // current child
00198   TBOX olbox;
00199   C_OUTLINE_IT child_it;         // search iterator
00200 
00201   olbox = outline->bounding_box();
00202   xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
00203   xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
00204   ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
00205   ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
00206   child_count = 0;
00207   grandchild_count = 0;
00208   parent_area = 0;
00209   max_parent_area = 0;
00210   parent_box = TRUE;
00211   for (yindex = ymin; yindex <= ymax; yindex++) {
00212     for (xindex = xmin; xindex <= xmax; xindex++) {
00213       child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
00214       if (child_it.empty())
00215         continue;
00216       for (child_it.mark_cycle_pt(); !child_it.cycled_list();
00217            child_it.forward()) {
00218         child = child_it.data();
00219         if (child != outline && *child < *outline) {
00220           child_count++;
00221           if (child_count <= max_count) {
00222             int max_grand =(max_count - child_count) /
00223                             edges_children_per_grandchild;
00224             if (max_grand > 0)
00225               grandchild_count += count_children(child, max_grand) *
00226                                   edges_children_per_grandchild;
00227             else
00228               grandchild_count += count_children(child, 1);
00229           }
00230           if (child_count + grandchild_count > max_count) {
00231             if (edges_debug)
00232               tprintf("Discarding parent with child count=%d, gc=%d\n",
00233                       child_count,grandchild_count);
00234             return child_count + grandchild_count;
00235           }
00236           if (parent_area == 0) {
00237             parent_area = outline->outer_area();
00238             if (parent_area < 0)
00239               parent_area = -parent_area;
00240             max_parent_area = outline->bounding_box().area() * edges_boxarea;
00241             if (parent_area < max_parent_area)
00242               parent_box = FALSE;
00243           }
00244           if (parent_box &&
00245               (!edges_children_fix ||
00246                child->bounding_box().height() > edges_min_nonhole)) {
00247             child_area = child->outer_area();
00248             if (child_area < 0)
00249               child_area = -child_area;
00250             if (edges_children_fix) {
00251               if (parent_area - child_area < max_parent_area) {
00252                 parent_box = FALSE;
00253                 continue;
00254               }
00255               if (grandchild_count > 0) {
00256                 if (edges_debug)
00257                   tprintf("Discarding parent of area %d, child area=%d, max%g "
00258                           "with gc=%d\n",
00259                           parent_area, child_area, max_parent_area,
00260                           grandchild_count);
00261                 return max_count + 1;
00262               }
00263               child_length = child->pathlength();
00264               if (child_length * child_length >
00265                   child_area * edges_patharea_ratio) {
00266                 if (edges_debug)
00267                   tprintf("Discarding parent of area %d, child area=%d, max%g "
00268                           "with child length=%d\n",
00269                           parent_area, child_area, max_parent_area,
00270                           child_length);
00271                 return max_count + 1;
00272               }
00273             }
00274             if (child_area < child->bounding_box().area() * edges_childarea) {
00275               if (edges_debug)
00276                 tprintf("Discarding parent of area %d, child area=%d, max%g "
00277                         "with child rect=%d\n",
00278                         parent_area, child_area, max_parent_area,
00279                         child->bounding_box().area());
00280               return max_count + 1;
00281             }
00282           }
00283         }
00284       }
00285     }
00286   }
00287   return child_count + grandchild_count;
00288 }
00289 
00290 
00291 
00292 
00299 void OL_BUCKETS::extract_children(                     // recursive count
00300                                   C_OUTLINE *outline,  // parent outline
00301                                   C_OUTLINE_IT *it     // destination iterator
00302                                  ) {
00303   inT16 xmin, xmax;              // coord limits
00304   inT16 ymin, ymax;
00305   inT16 xindex, yindex;          // current bucket
00306   TBOX olbox;
00307   C_OUTLINE_IT child_it;         // search iterator
00308 
00309   olbox = outline->bounding_box();
00310   xmin =(olbox.left() - bl.x()) / BUCKETSIZE;
00311   xmax =(olbox.right() - bl.x()) / BUCKETSIZE;
00312   ymin =(olbox.bottom() - bl.y()) / BUCKETSIZE;
00313   ymax =(olbox.top() - bl.y()) / BUCKETSIZE;
00314   for (yindex = ymin; yindex <= ymax; yindex++) {
00315     for (xindex = xmin; xindex <= xmax; xindex++) {
00316       child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
00317       for (child_it.mark_cycle_pt(); !child_it.cycled_list();
00318            child_it.forward()) {
00319         if (*child_it.data() < *outline) {
00320           it->add_after_then_move(child_it.extract());
00321         }
00322       }
00323     }
00324   }
00325 }
00326 
00327 
00334 void extract_edges(Pix* pix,  // thresholded image
00335                    BLOCK *block) {  // block to scan
00336   C_OUTLINE_LIST outlines;       // outlines in block
00337   C_OUTLINE_IT out_it = &outlines;
00338 
00339   block_edges(pix, block, &out_it);
00340   ICOORD bleft;                  // block box
00341   ICOORD tright;
00342   block->bounding_box(bleft, tright);
00343                                  // make blobs
00344   outlines_to_blobs(block, bleft, tright, &outlines);
00345 }
00346 
00347 
00354 void outlines_to_blobs(               // find blobs
00355                        BLOCK *block,  // block to scan
00356                        ICOORD bleft,
00357                        ICOORD tright,
00358                        C_OUTLINE_LIST *outlines) {
00359                                  // make buckets
00360   OL_BUCKETS buckets(bleft, tright);
00361 
00362   fill_buckets(outlines, &buckets);
00363   empty_buckets(block, &buckets);
00364 }
00365 
00366 
00373 void fill_buckets(                           // find blobs
00374                   C_OUTLINE_LIST *outlines,  // outlines in block
00375                   OL_BUCKETS *buckets        // output buckets
00376                  ) {
00377   TBOX ol_box;                     // outline box
00378   C_OUTLINE_IT out_it = outlines;  // iterator
00379   C_OUTLINE_IT bucket_it;          // iterator in bucket
00380   C_OUTLINE *outline;              // current outline
00381 
00382   for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
00383     outline = out_it.extract();  // take off list
00384                                  // get box
00385     ol_box = outline->bounding_box();
00386     bucket_it.set_to_list((*buckets) (ol_box.left(), ol_box.bottom()));
00387     bucket_it.add_to_end(outline);
00388   }
00389 }
00390 
00391 
00398 void empty_buckets(                     // find blobs
00399                    BLOCK *block,        // block to scan
00400                    OL_BUCKETS *buckets  // output buckets
00401                   ) {
00402   BOOL8 good_blob;               // healthy blob
00403   C_OUTLINE_LIST outlines;       // outlines in block
00404                                  // iterator
00405   C_OUTLINE_IT out_it = &outlines;
00406   C_OUTLINE_IT bucket_it = buckets->start_scan();
00407   C_OUTLINE_IT parent_it;        // parent outline
00408   C_BLOB_IT good_blobs = block->blob_list();
00409   C_BLOB_IT junk_blobs = block->reject_blobs();
00410 
00411   while (!bucket_it.empty()) {
00412     out_it.set_to_list(&outlines);
00413     do {
00414       parent_it = bucket_it;     // find outermost
00415       do {
00416         bucket_it.forward();
00417       } while (!bucket_it.at_first() &&
00418                !(*parent_it.data() < *bucket_it.data()));
00419     } while (!bucket_it.at_first());
00420 
00421                                  // move to new list
00422     out_it.add_after_then_move(parent_it.extract());
00423     good_blob = capture_children(buckets, &junk_blobs, &out_it);
00424     C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs,
00425                                        &junk_blobs);
00426 
00427     bucket_it.set_to_list(buckets->scan_next());
00428   }
00429 }
00430 
00431 
00440 BOOL8 capture_children(                       // find children
00441                        OL_BUCKETS *buckets,   // bucket sort clanss
00442                        C_BLOB_IT *reject_it,  // dead grandchildren
00443                        C_OUTLINE_IT *blob_it  // output outlines
00444                       ) {
00445   C_OUTLINE *outline;            // master outline
00446   inT32 child_count;             // no of children
00447 
00448   outline = blob_it->data();
00449   if (edges_use_new_outline_complexity)
00450     child_count = buckets->outline_complexity(outline,
00451                                                edges_children_count_limit,
00452                                                0);
00453   else
00454     child_count = buckets->count_children(outline,
00455                                            edges_children_count_limit);
00456   if (child_count > edges_children_count_limit)
00457     return FALSE;
00458 
00459   if (child_count > 0)
00460     buckets->extract_children(outline, blob_it);
00461   return TRUE;
00462 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines