tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccstruct/werd.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        werd.cpp  (Formerly word.c)
00003  * Description: Code for the WERD class.
00004  * Author:      Ray Smith
00005  * Created:     Tue Oct 08 14:32:12 BST 1991
00006  *
00007  * (C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include "blckerr.h"
00021 #include "helpers.h"
00022 #include "linlsq.h"
00023 #include "werd.h"
00024 
00025 // Include automatically generated configuration file if running autoconf.
00026 #ifdef HAVE_CONFIG_H
00027 #include "config_auto.h"
00028 #endif
00029 
00030 #define FIRST_COLOUR    ScrollView::RED         //< first rainbow colour
00031 #define LAST_COLOUR     ScrollView::AQUAMARINE  //< last rainbow colour
00032 #define CHILD_COLOUR    ScrollView::BROWN       //< colour of children
00033 
00034 const ERRCODE CANT_SCALE_EDGESTEPS =
00035     "Attempted to scale an edgestep format word";
00036 
00037 ELIST2IZE(WERD)
00038 
00039 
00048 WERD::WERD(C_BLOB_LIST *blob_list, uinT8 blank_count, const char *text)
00049   : blanks(blank_count),
00050     flags(0),
00051     script_id_(0),
00052     correct(text) {
00053   C_BLOB_IT start_it = blob_list;
00054   C_BLOB_IT end_it = blob_list;
00055   C_BLOB_IT rej_cblob_it = &rej_cblobs;
00056   C_OUTLINE_IT c_outline_it;
00057   inT16 inverted_vote = 0;
00058   inT16 non_inverted_vote = 0;
00059 
00060   // Move blob_list's elements into cblobs.
00061   while (!end_it.at_last())
00062     end_it.forward();
00063   cblobs.assign_to_sublist(&start_it, &end_it);
00064 
00065   /*
00066     Set white on black flag for the WERD, moving any duff blobs onto the
00067     rej_cblobs list.
00068     First, walk the cblobs checking the inverse flag for each outline of each
00069     cblob. If a cblob has inconsistent flag settings for its different
00070     outlines, move the blob to the reject list. Otherwise, increment the
00071     appropriate w-on-b or b-on-w vote for the word.
00072 
00073     Now set the inversion flag for the WERD by maximum vote.
00074 
00075     Walk the blobs again, moving any blob whose inversion flag does not agree
00076     with the concencus onto the reject list.
00077   */
00078   start_it.set_to_list(&cblobs);
00079   if (start_it.empty())
00080     return;
00081   for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
00082     BOOL8 reject_blob = FALSE;
00083     BOOL8 blob_inverted;
00084 
00085     c_outline_it.set_to_list(start_it.data()->out_list());
00086     blob_inverted = c_outline_it.data()->flag(COUT_INVERSE);
00087     for (c_outline_it.mark_cycle_pt();
00088          !c_outline_it.cycled_list() && !reject_blob;
00089          c_outline_it.forward()) {
00090       reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted;
00091     }
00092     if (reject_blob) {
00093       rej_cblob_it.add_after_then_move(start_it.extract());
00094     } else {
00095       if (blob_inverted)
00096         inverted_vote++;
00097       else
00098         non_inverted_vote++;
00099     }
00100   }
00101 
00102   flags.set_bit(W_INVERSE, (inverted_vote > non_inverted_vote));
00103 
00104   start_it.set_to_list(&cblobs);
00105   if (start_it.empty())
00106     return;
00107   for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
00108     c_outline_it.set_to_list(start_it.data()->out_list());
00109     if (c_outline_it.data()->flag(COUT_INVERSE) != flags.bit(W_INVERSE))
00110       rej_cblob_it.add_after_then_move(start_it.extract());
00111   }
00112 }
00113 
00114 
00122 WERD::WERD(C_BLOB_LIST * blob_list,         //< In word order
00123            WERD * clone)                    //< Source of flags
00124   : flags(clone->flags),
00125     script_id_(clone->script_id_),
00126     correct(clone->correct) {
00127   C_BLOB_IT start_it = blob_list;  // iterator
00128   C_BLOB_IT end_it = blob_list;    // another
00129 
00130   while (!end_it.at_last ())
00131     end_it.forward ();           //move to last
00132   ((C_BLOB_LIST *) (&cblobs))->assign_to_sublist (&start_it, &end_it);
00133   //move to our list
00134   blanks = clone->blanks;
00135   //      fprintf(stderr,"Wrong constructor!!!!\n");
00136 }
00137 
00138 // Construct a WERD from a single_blob and clone the flags from this.
00139 // W_BOL and W_EOL flags are set according to the given values.
00140 WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) {
00141   C_BLOB_LIST temp_blobs;
00142   C_BLOB_IT temp_it(&temp_blobs);
00143   temp_it.add_after_then_move(blob);
00144   WERD* blob_word = new WERD(&temp_blobs, this);
00145   blob_word->set_flag(W_BOL, bol);
00146   blob_word->set_flag(W_EOL, eol);
00147   return blob_word;
00148 }
00149 
00163 TBOX WERD::bounding_box() {
00164   TBOX box;                       // box being built
00165   C_BLOB_IT rej_cblob_it = &rej_cblobs;  // rejected blobs
00166 
00167   for (rej_cblob_it.mark_cycle_pt(); !rej_cblob_it.cycled_list();
00168        rej_cblob_it.forward()) {
00169     box += rej_cblob_it.data()->bounding_box();
00170   }
00171 
00172   C_BLOB_IT it = &cblobs;    // blobs of WERD
00173   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00174     box += it.data()->bounding_box();
00175   }
00176   return box;
00177 }
00178 
00179 
00187 void WERD::move(const ICOORD vec) {
00188   C_BLOB_IT cblob_it(&cblobs);  // cblob iterator
00189 
00190   for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward())
00191     cblob_it.data()->move(vec);
00192 }
00193 
00200 void WERD::join_on(WERD* other) {
00201   C_BLOB_IT blob_it(&cblobs);
00202   C_BLOB_IT src_it(&other->cblobs);
00203   C_BLOB_IT rej_cblob_it(&rej_cblobs);
00204   C_BLOB_IT src_rej_it(&other->rej_cblobs);
00205 
00206   while (!src_it.empty()) {
00207     blob_it.add_to_end(src_it.extract());
00208     src_it.forward();
00209   }
00210   while (!src_rej_it.empty()) {
00211     rej_cblob_it.add_to_end(src_rej_it.extract());
00212     src_rej_it.forward();
00213   }
00214 }
00215 
00216 
00223 void WERD::copy_on(WERD* other) {
00224   bool reversed = other->bounding_box().left() < bounding_box().left();
00225   C_BLOB_IT c_blob_it(&cblobs);
00226   C_BLOB_LIST c_blobs;
00227 
00228   c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy);
00229   if (reversed) {
00230     c_blob_it.add_list_before(&c_blobs);
00231   } else {
00232     c_blob_it.move_to_last();
00233     c_blob_it.add_list_after(&c_blobs);
00234   }
00235   if (!other->rej_cblobs.empty()) {
00236     C_BLOB_IT rej_c_blob_it(&rej_cblobs);
00237     C_BLOB_LIST new_rej_c_blobs;
00238 
00239     new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy);
00240     if (reversed) {
00241       rej_c_blob_it.add_list_before(&new_rej_c_blobs);
00242     } else {
00243       rej_c_blob_it.move_to_last();
00244       rej_c_blob_it.add_list_after(&new_rej_c_blobs);
00245     }
00246   }
00247 }
00248 
00255 void WERD::print() {
00256   tprintf("Blanks= %d\n", blanks);
00257   bounding_box().print();
00258   tprintf("Flags = %d = 0%o\n", flags.val, flags.val);
00259   tprintf("   W_SEGMENTED = %s\n", flags.bit(W_SEGMENTED) ? "TRUE" : "FALSE ");
00260   tprintf("   W_ITALIC = %s\n", flags.bit(W_ITALIC) ? "TRUE" : "FALSE ");
00261   tprintf("   W_BOL = %s\n", flags.bit(W_BOL) ? "TRUE" : "FALSE ");
00262   tprintf("   W_EOL = %s\n", flags.bit(W_EOL) ? "TRUE" : "FALSE ");
00263   tprintf("   W_NORMALIZED = %s\n",
00264           flags.bit(W_NORMALIZED) ? "TRUE" : "FALSE ");
00265   tprintf("   W_SCRIPT_HAS_XHEIGHT = %s\n",
00266           flags.bit(W_SCRIPT_HAS_XHEIGHT) ? "TRUE" : "FALSE ");
00267   tprintf("   W_SCRIPT_IS_LATIN = %s\n",
00268           flags.bit(W_SCRIPT_IS_LATIN) ? "TRUE" : "FALSE ");
00269   tprintf("   W_DONT_CHOP = %s\n", flags.bit(W_DONT_CHOP) ? "TRUE" : "FALSE ");
00270   tprintf("   W_REP_CHAR = %s\n", flags.bit(W_REP_CHAR) ? "TRUE" : "FALSE ");
00271   tprintf("   W_FUZZY_SP = %s\n", flags.bit(W_FUZZY_SP) ? "TRUE" : "FALSE ");
00272   tprintf("   W_FUZZY_NON = %s\n", flags.bit(W_FUZZY_NON) ? "TRUE" : "FALSE ");
00273   tprintf("Correct= %s\n", correct.string());
00274   tprintf("Rejected cblob count = %d\n", rej_cblobs.length());
00275   tprintf("Script = %d\n", script_id_);
00276 }
00277 
00278 
00285 #ifndef GRAPHICS_DISABLED
00286 void WERD::plot(ScrollView *window, ScrollView::Color colour) {
00287   C_BLOB_IT it = &cblobs;
00288   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00289     it.data()->plot(window, colour, colour);
00290   }
00291   plot_rej_blobs(window);
00292 }
00293 
00294 // Get the next color in the (looping) rainbow.
00295 ScrollView::Color WERD::NextColor(ScrollView::Color colour) {
00296   ScrollView::Color next = static_cast<ScrollView::Color>(colour + 1);
00297   if (next >= LAST_COLOUR || next < FIRST_COLOUR)
00298     next = FIRST_COLOUR;
00299   return next;
00300 }
00301 
00308 void WERD::plot(ScrollView* window) {
00309   ScrollView::Color colour = FIRST_COLOUR;
00310   C_BLOB_IT it = &cblobs;
00311   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00312     it.data()->plot(window, colour, CHILD_COLOUR);
00313     colour = NextColor(colour);
00314   }
00315   plot_rej_blobs(window);
00316 }
00317 
00318 
00326 void WERD::plot_rej_blobs(ScrollView *window) {
00327   C_BLOB_IT it = &rej_cblobs;
00328   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
00329     it.data()->plot(window, ScrollView::GREY, ScrollView::GREY);
00330   }
00331 }
00332 #endif  // GRAPHICS_DISABLED
00333 
00334 
00341 WERD *WERD::shallow_copy() {
00342   WERD *new_word = new WERD;
00343 
00344   new_word->blanks = blanks;
00345   new_word->flags = flags;
00346   new_word->dummy = dummy;
00347   new_word->correct = correct;
00348   return new_word;
00349 }
00350 
00351 
00358 WERD & WERD::operator= (const WERD & source) {
00359   this->ELIST2_LINK::operator= (source);
00360   blanks = source.blanks;
00361   flags = source.flags;
00362   script_id_ = source.script_id_;
00363   dummy = source.dummy;
00364   correct = source.correct;
00365   if (!cblobs.empty())
00366     cblobs.clear();
00367   cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy);
00368 
00369   if (!rej_cblobs.empty())
00370     rej_cblobs.clear();
00371   rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy);
00372   return *this;
00373 }
00374 
00375 
00383 int word_comparator(const void *word1p, const void *word2p) {
00384   WERD *word1 = *(WERD **)word1p;
00385   WERD *word2 = *(WERD **)word2p;
00386   return word1->bounding_box().left() - word2->bounding_box().left();
00387 }
00388 
00401 WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
00402                                       C_BLOB_LIST* orphan_blobs) {
00403   C_BLOB_LIST current_blob_list;
00404   C_BLOB_IT werd_blobs_it(&current_blob_list);
00405   // Add the word's c_blobs.
00406   werd_blobs_it.add_list_after(cblob_list());
00407 
00408   // New blob list. These contain the blobs which will form the new word.
00409   C_BLOB_LIST new_werd_blobs;
00410   C_BLOB_IT new_blobs_it(&new_werd_blobs);
00411 
00412   // not_found_blobs contains the list of current word's blobs for which a
00413   // corresponding blob wasn't found in the input all_blobs list.
00414   C_BLOB_LIST not_found_blobs;
00415   C_BLOB_IT not_found_it(&not_found_blobs);
00416   not_found_it.move_to_last();
00417 
00418   werd_blobs_it.move_to_first();
00419   for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list();
00420        werd_blobs_it.forward()) {
00421     C_BLOB* werd_blob = werd_blobs_it.extract();
00422     TBOX werd_blob_box = werd_blob->bounding_box();
00423     bool found = false;
00424     // Now find the corresponding blob for this blob in the all_blobs
00425     // list. For now, follow the inefficient method of pairwise
00426     // comparisons. Ideally, one can pre-bucket the blobs by row.
00427     C_BLOB_IT all_blobs_it(all_blobs);
00428     for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list();
00429          all_blobs_it.forward()) {
00430       C_BLOB* a_blob = all_blobs_it.data();
00431       // Compute the overlap of the two blobs. If major, a_blob should
00432       // be added to the new blobs list.
00433       TBOX a_blob_box = a_blob->bounding_box();
00434       if (a_blob_box.null_box()) {
00435         tprintf("Bounding box couldn't be ascertained\n");
00436       }
00437       if (werd_blob_box.contains(a_blob_box) ||
00438           werd_blob_box.major_overlap(a_blob_box)) {
00439         // Old blobs are from minimal splits, therefore are expected to be
00440         // bigger. The new small blobs should cover a significant portion.
00441         // This is it.
00442         all_blobs_it.extract();
00443         new_blobs_it.add_after_then_move(a_blob);
00444         found = true;
00445       }
00446     }
00447     if (!found) {
00448       not_found_it.add_after_then_move(werd_blob);
00449     } else {
00450       delete werd_blob;
00451     }
00452   }
00453   // Iterate over all not found blobs. Some of them may be due to
00454   // under-segmentation (which is OK, since the corresponding blob is already
00455   // in the list in that case.
00456   not_found_it.move_to_first();
00457   for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list();
00458        not_found_it.forward()) {
00459     C_BLOB* not_found = not_found_it.data();
00460     TBOX not_found_box = not_found->bounding_box();
00461     C_BLOB_IT existing_blobs_it(new_blobs_it);
00462     for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list();
00463          existing_blobs_it.forward()) {
00464       C_BLOB* a_blob = existing_blobs_it.data();
00465       TBOX a_blob_box = a_blob->bounding_box();
00466       if ((not_found_box.major_overlap(a_blob_box) ||
00467            a_blob_box.major_overlap(not_found_box)) &&
00468            not_found_box.y_overlap_fraction(a_blob_box) > 0.8) {
00469         // Already taken care of.
00470         delete not_found_it.extract();
00471         break;
00472       }
00473     }
00474   }
00475   if (orphan_blobs) {
00476     C_BLOB_IT orphan_blobs_it(orphan_blobs);
00477     orphan_blobs_it.move_to_last();
00478     orphan_blobs_it.add_list_after(&not_found_blobs);
00479   }
00480 
00481   // New blobs are ready. Create a new werd object with these.
00482   WERD* new_werd = NULL;
00483   if (!new_werd_blobs.empty()) {
00484     new_werd = new WERD(&new_werd_blobs, this);
00485   } else {
00486     // Add the blobs back to this word so that it can be reused.
00487     C_BLOB_IT this_list_it(cblob_list());
00488     this_list_it.add_list_after(&not_found_blobs);
00489   }
00490   return new_werd;
00491 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines