tesseract
3.03
|
/**********************************************************************
 * File:        werd.cpp  (Formerly word.c)
 * Description: Code for the WERD class.
 * Author:      Ray Smith
 * Created:     Tue Oct 08 14:32:12 BST 1991
 *
 * (C) Copyright 1991, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "blckerr.h"
#include "helpers.h"
#include "linlsq.h"
#include "werd.h"

// Include automatically generated configuration file if running autoconf.
00026 #ifdef HAVE_CONFIG_H 00027 #include "config_auto.h" 00028 #endif 00029 00030 #define FIRST_COLOUR ScrollView::RED //< first rainbow colour 00031 #define LAST_COLOUR ScrollView::AQUAMARINE //< last rainbow colour 00032 #define CHILD_COLOUR ScrollView::BROWN //< colour of children 00033 00034 const ERRCODE CANT_SCALE_EDGESTEPS = 00035 "Attempted to scale an edgestep format word"; 00036 00037 ELIST2IZE(WERD) 00038 00039 00048 WERD::WERD(C_BLOB_LIST *blob_list, uinT8 blank_count, const char *text) 00049 : blanks(blank_count), 00050 flags(0), 00051 script_id_(0), 00052 correct(text) { 00053 C_BLOB_IT start_it = blob_list; 00054 C_BLOB_IT end_it = blob_list; 00055 C_BLOB_IT rej_cblob_it = &rej_cblobs; 00056 C_OUTLINE_IT c_outline_it; 00057 inT16 inverted_vote = 0; 00058 inT16 non_inverted_vote = 0; 00059 00060 // Move blob_list's elements into cblobs. 00061 while (!end_it.at_last()) 00062 end_it.forward(); 00063 cblobs.assign_to_sublist(&start_it, &end_it); 00064 00065 /* 00066 Set white on black flag for the WERD, moving any duff blobs onto the 00067 rej_cblobs list. 00068 First, walk the cblobs checking the inverse flag for each outline of each 00069 cblob. If a cblob has inconsistent flag settings for its different 00070 outlines, move the blob to the reject list. Otherwise, increment the 00071 appropriate w-on-b or b-on-w vote for the word. 00072 00073 Now set the inversion flag for the WERD by maximum vote. 00074 00075 Walk the blobs again, moving any blob whose inversion flag does not agree 00076 with the concencus onto the reject list. 
00077 */ 00078 start_it.set_to_list(&cblobs); 00079 if (start_it.empty()) 00080 return; 00081 for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) { 00082 BOOL8 reject_blob = FALSE; 00083 BOOL8 blob_inverted; 00084 00085 c_outline_it.set_to_list(start_it.data()->out_list()); 00086 blob_inverted = c_outline_it.data()->flag(COUT_INVERSE); 00087 for (c_outline_it.mark_cycle_pt(); 00088 !c_outline_it.cycled_list() && !reject_blob; 00089 c_outline_it.forward()) { 00090 reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted; 00091 } 00092 if (reject_blob) { 00093 rej_cblob_it.add_after_then_move(start_it.extract()); 00094 } else { 00095 if (blob_inverted) 00096 inverted_vote++; 00097 else 00098 non_inverted_vote++; 00099 } 00100 } 00101 00102 flags.set_bit(W_INVERSE, (inverted_vote > non_inverted_vote)); 00103 00104 start_it.set_to_list(&cblobs); 00105 if (start_it.empty()) 00106 return; 00107 for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) { 00108 c_outline_it.set_to_list(start_it.data()->out_list()); 00109 if (c_outline_it.data()->flag(COUT_INVERSE) != flags.bit(W_INVERSE)) 00110 rej_cblob_it.add_after_then_move(start_it.extract()); 00111 } 00112 } 00113 00114 00122 WERD::WERD(C_BLOB_LIST * blob_list, //< In word order 00123 WERD * clone) //< Source of flags 00124 : flags(clone->flags), 00125 script_id_(clone->script_id_), 00126 correct(clone->correct) { 00127 C_BLOB_IT start_it = blob_list; // iterator 00128 C_BLOB_IT end_it = blob_list; // another 00129 00130 while (!end_it.at_last ()) 00131 end_it.forward (); //move to last 00132 ((C_BLOB_LIST *) (&cblobs))->assign_to_sublist (&start_it, &end_it); 00133 //move to our list 00134 blanks = clone->blanks; 00135 // fprintf(stderr,"Wrong constructor!!!!\n"); 00136 } 00137 00138 // Construct a WERD from a single_blob and clone the flags from this. 00139 // W_BOL and W_EOL flags are set according to the given values. 
00140 WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) { 00141 C_BLOB_LIST temp_blobs; 00142 C_BLOB_IT temp_it(&temp_blobs); 00143 temp_it.add_after_then_move(blob); 00144 WERD* blob_word = new WERD(&temp_blobs, this); 00145 blob_word->set_flag(W_BOL, bol); 00146 blob_word->set_flag(W_EOL, eol); 00147 return blob_word; 00148 } 00149 00163 TBOX WERD::bounding_box() { 00164 TBOX box; // box being built 00165 C_BLOB_IT rej_cblob_it = &rej_cblobs; // rejected blobs 00166 00167 for (rej_cblob_it.mark_cycle_pt(); !rej_cblob_it.cycled_list(); 00168 rej_cblob_it.forward()) { 00169 box += rej_cblob_it.data()->bounding_box(); 00170 } 00171 00172 C_BLOB_IT it = &cblobs; // blobs of WERD 00173 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00174 box += it.data()->bounding_box(); 00175 } 00176 return box; 00177 } 00178 00179 00187 void WERD::move(const ICOORD vec) { 00188 C_BLOB_IT cblob_it(&cblobs); // cblob iterator 00189 00190 for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) 00191 cblob_it.data()->move(vec); 00192 } 00193 00200 void WERD::join_on(WERD* other) { 00201 C_BLOB_IT blob_it(&cblobs); 00202 C_BLOB_IT src_it(&other->cblobs); 00203 C_BLOB_IT rej_cblob_it(&rej_cblobs); 00204 C_BLOB_IT src_rej_it(&other->rej_cblobs); 00205 00206 while (!src_it.empty()) { 00207 blob_it.add_to_end(src_it.extract()); 00208 src_it.forward(); 00209 } 00210 while (!src_rej_it.empty()) { 00211 rej_cblob_it.add_to_end(src_rej_it.extract()); 00212 src_rej_it.forward(); 00213 } 00214 } 00215 00216 00223 void WERD::copy_on(WERD* other) { 00224 bool reversed = other->bounding_box().left() < bounding_box().left(); 00225 C_BLOB_IT c_blob_it(&cblobs); 00226 C_BLOB_LIST c_blobs; 00227 00228 c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy); 00229 if (reversed) { 00230 c_blob_it.add_list_before(&c_blobs); 00231 } else { 00232 c_blob_it.move_to_last(); 00233 c_blob_it.add_list_after(&c_blobs); 00234 } 00235 if (!other->rej_cblobs.empty()) 
{ 00236 C_BLOB_IT rej_c_blob_it(&rej_cblobs); 00237 C_BLOB_LIST new_rej_c_blobs; 00238 00239 new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy); 00240 if (reversed) { 00241 rej_c_blob_it.add_list_before(&new_rej_c_blobs); 00242 } else { 00243 rej_c_blob_it.move_to_last(); 00244 rej_c_blob_it.add_list_after(&new_rej_c_blobs); 00245 } 00246 } 00247 } 00248 00255 void WERD::print() { 00256 tprintf("Blanks= %d\n", blanks); 00257 bounding_box().print(); 00258 tprintf("Flags = %d = 0%o\n", flags.val, flags.val); 00259 tprintf(" W_SEGMENTED = %s\n", flags.bit(W_SEGMENTED) ? "TRUE" : "FALSE "); 00260 tprintf(" W_ITALIC = %s\n", flags.bit(W_ITALIC) ? "TRUE" : "FALSE "); 00261 tprintf(" W_BOL = %s\n", flags.bit(W_BOL) ? "TRUE" : "FALSE "); 00262 tprintf(" W_EOL = %s\n", flags.bit(W_EOL) ? "TRUE" : "FALSE "); 00263 tprintf(" W_NORMALIZED = %s\n", 00264 flags.bit(W_NORMALIZED) ? "TRUE" : "FALSE "); 00265 tprintf(" W_SCRIPT_HAS_XHEIGHT = %s\n", 00266 flags.bit(W_SCRIPT_HAS_XHEIGHT) ? "TRUE" : "FALSE "); 00267 tprintf(" W_SCRIPT_IS_LATIN = %s\n", 00268 flags.bit(W_SCRIPT_IS_LATIN) ? "TRUE" : "FALSE "); 00269 tprintf(" W_DONT_CHOP = %s\n", flags.bit(W_DONT_CHOP) ? "TRUE" : "FALSE "); 00270 tprintf(" W_REP_CHAR = %s\n", flags.bit(W_REP_CHAR) ? "TRUE" : "FALSE "); 00271 tprintf(" W_FUZZY_SP = %s\n", flags.bit(W_FUZZY_SP) ? "TRUE" : "FALSE "); 00272 tprintf(" W_FUZZY_NON = %s\n", flags.bit(W_FUZZY_NON) ? "TRUE" : "FALSE "); 00273 tprintf("Correct= %s\n", correct.string()); 00274 tprintf("Rejected cblob count = %d\n", rej_cblobs.length()); 00275 tprintf("Script = %d\n", script_id_); 00276 } 00277 00278 00285 #ifndef GRAPHICS_DISABLED 00286 void WERD::plot(ScrollView *window, ScrollView::Color colour) { 00287 C_BLOB_IT it = &cblobs; 00288 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00289 it.data()->plot(window, colour, colour); 00290 } 00291 plot_rej_blobs(window); 00292 } 00293 00294 // Get the next color in the (looping) rainbow. 
00295 ScrollView::Color WERD::NextColor(ScrollView::Color colour) { 00296 ScrollView::Color next = static_cast<ScrollView::Color>(colour + 1); 00297 if (next >= LAST_COLOUR || next < FIRST_COLOUR) 00298 next = FIRST_COLOUR; 00299 return next; 00300 } 00301 00308 void WERD::plot(ScrollView* window) { 00309 ScrollView::Color colour = FIRST_COLOUR; 00310 C_BLOB_IT it = &cblobs; 00311 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00312 it.data()->plot(window, colour, CHILD_COLOUR); 00313 colour = NextColor(colour); 00314 } 00315 plot_rej_blobs(window); 00316 } 00317 00318 00326 void WERD::plot_rej_blobs(ScrollView *window) { 00327 C_BLOB_IT it = &rej_cblobs; 00328 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00329 it.data()->plot(window, ScrollView::GREY, ScrollView::GREY); 00330 } 00331 } 00332 #endif // GRAPHICS_DISABLED 00333 00334 00341 WERD *WERD::shallow_copy() { 00342 WERD *new_word = new WERD; 00343 00344 new_word->blanks = blanks; 00345 new_word->flags = flags; 00346 new_word->dummy = dummy; 00347 new_word->correct = correct; 00348 return new_word; 00349 } 00350 00351 00358 WERD & WERD::operator= (const WERD & source) { 00359 this->ELIST2_LINK::operator= (source); 00360 blanks = source.blanks; 00361 flags = source.flags; 00362 script_id_ = source.script_id_; 00363 dummy = source.dummy; 00364 correct = source.correct; 00365 if (!cblobs.empty()) 00366 cblobs.clear(); 00367 cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy); 00368 00369 if (!rej_cblobs.empty()) 00370 rej_cblobs.clear(); 00371 rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy); 00372 return *this; 00373 } 00374 00375 00383 int word_comparator(const void *word1p, const void *word2p) { 00384 WERD *word1 = *(WERD **)word1p; 00385 WERD *word2 = *(WERD **)word2p; 00386 return word1->bounding_box().left() - word2->bounding_box().left(); 00387 } 00388 00401 WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs, 00402 C_BLOB_LIST* orphan_blobs) { 00403 
C_BLOB_LIST current_blob_list; 00404 C_BLOB_IT werd_blobs_it(¤t_blob_list); 00405 // Add the word's c_blobs. 00406 werd_blobs_it.add_list_after(cblob_list()); 00407 00408 // New blob list. These contain the blobs which will form the new word. 00409 C_BLOB_LIST new_werd_blobs; 00410 C_BLOB_IT new_blobs_it(&new_werd_blobs); 00411 00412 // not_found_blobs contains the list of current word's blobs for which a 00413 // corresponding blob wasn't found in the input all_blobs list. 00414 C_BLOB_LIST not_found_blobs; 00415 C_BLOB_IT not_found_it(¬_found_blobs); 00416 not_found_it.move_to_last(); 00417 00418 werd_blobs_it.move_to_first(); 00419 for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list(); 00420 werd_blobs_it.forward()) { 00421 C_BLOB* werd_blob = werd_blobs_it.extract(); 00422 TBOX werd_blob_box = werd_blob->bounding_box(); 00423 bool found = false; 00424 // Now find the corresponding blob for this blob in the all_blobs 00425 // list. For now, follow the inefficient method of pairwise 00426 // comparisons. Ideally, one can pre-bucket the blobs by row. 00427 C_BLOB_IT all_blobs_it(all_blobs); 00428 for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); 00429 all_blobs_it.forward()) { 00430 C_BLOB* a_blob = all_blobs_it.data(); 00431 // Compute the overlap of the two blobs. If major, a_blob should 00432 // be added to the new blobs list. 00433 TBOX a_blob_box = a_blob->bounding_box(); 00434 if (a_blob_box.null_box()) { 00435 tprintf("Bounding box couldn't be ascertained\n"); 00436 } 00437 if (werd_blob_box.contains(a_blob_box) || 00438 werd_blob_box.major_overlap(a_blob_box)) { 00439 // Old blobs are from minimal splits, therefore are expected to be 00440 // bigger. The new small blobs should cover a significant portion. 00441 // This is it. 
00442 all_blobs_it.extract(); 00443 new_blobs_it.add_after_then_move(a_blob); 00444 found = true; 00445 } 00446 } 00447 if (!found) { 00448 not_found_it.add_after_then_move(werd_blob); 00449 } else { 00450 delete werd_blob; 00451 } 00452 } 00453 // Iterate over all not found blobs. Some of them may be due to 00454 // under-segmentation (which is OK, since the corresponding blob is already 00455 // in the list in that case. 00456 not_found_it.move_to_first(); 00457 for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); 00458 not_found_it.forward()) { 00459 C_BLOB* not_found = not_found_it.data(); 00460 TBOX not_found_box = not_found->bounding_box(); 00461 C_BLOB_IT existing_blobs_it(new_blobs_it); 00462 for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list(); 00463 existing_blobs_it.forward()) { 00464 C_BLOB* a_blob = existing_blobs_it.data(); 00465 TBOX a_blob_box = a_blob->bounding_box(); 00466 if ((not_found_box.major_overlap(a_blob_box) || 00467 a_blob_box.major_overlap(not_found_box)) && 00468 not_found_box.y_overlap_fraction(a_blob_box) > 0.8) { 00469 // Already taken care of. 00470 delete not_found_it.extract(); 00471 break; 00472 } 00473 } 00474 } 00475 if (orphan_blobs) { 00476 C_BLOB_IT orphan_blobs_it(orphan_blobs); 00477 orphan_blobs_it.move_to_last(); 00478 orphan_blobs_it.add_list_after(¬_found_blobs); 00479 } 00480 00481 // New blobs are ready. Create a new werd object with these. 00482 WERD* new_werd = NULL; 00483 if (!new_werd_blobs.empty()) { 00484 new_werd = new WERD(&new_werd_blobs, this); 00485 } else { 00486 // Add the blobs back to this word so that it can be reused. 00487 C_BLOB_IT this_list_it(cblob_list()); 00488 this_list_it.add_list_after(¬_found_blobs); 00489 } 00490 return new_werd; 00491 }