tesseract
3.03
|
00001 /********************************************************************** 00002 * File: ratngs.cpp (Formerly ratings.c) 00003 * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes. 00004 * Author: Ray Smith 00005 * Created: Thu Apr 23 13:23:29 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 00021 #ifdef HAVE_CONFIG_H 00022 #include "config_auto.h" 00023 #endif 00024 00025 #include "ratngs.h" 00026 00027 #include "blobs.h" 00028 #include "callcpp.h" 00029 #include "genericvector.h" 00030 #include "matrix.h" 00031 #include "normalis.h" // kBlnBaselineOffset. 00032 #include "unicharset.h" 00033 00034 using tesseract::ScriptPos; 00035 00036 ELISTIZE(BLOB_CHOICE); 00037 ELISTIZE(WERD_CHOICE); 00038 00039 const float WERD_CHOICE::kBadRating = 100000.0; 00040 // Min offset in baseline-normalized coords to make a character a subscript. 00041 const int kMinSubscriptOffset = 20; 00042 // Min offset in baseline-normalized coords to make a character a superscript. 00043 const int kMinSuperscriptOffset = 20; 00044 // Max y of bottom of a drop-cap blob. 00045 const int kMaxDropCapBottom = -128; 00046 // Max fraction of x-height to use as denominator in measuring x-height overlap. 00047 const double kMaxOverlapDenominator = 0.125; 00048 // Min fraction of x-height range that should be in agreement for matching 00049 // x-heights. 00050 const double kMinXHeightMatch = 0.5; 00051 // Max tolerance on baseline position as a fraction of x-height for matching 00052 // baselines. 00053 const double kMaxBaselineDrift = 0.0625; 00054 00055 static const char kPermuterTypeNoPerm[] = "None"; 00056 static const char kPermuterTypePuncPerm[] = "Punctuation"; 00057 static const char kPermuterTypeTopPerm[] = "Top Choice"; 00058 static const char kPermuterTypeLowerPerm[] = "Top Lower Case"; 00059 static const char kPermuterTypeUpperPerm[] = "Top Upper Case"; 00060 static const char kPermuterTypeNgramPerm[] = "Ngram"; 00061 static const char kPermuterTypeNumberPerm[] = "Number"; 00062 static const char kPermuterTypeUserPatPerm[] = "User Pattern"; 00063 static const char kPermuterTypeSysDawgPerm[] = "System Dictionary"; 00064 static const char kPermuterTypeDocDawgPerm[] = "Document Dictionary"; 00065 static const char kPermuterTypeUserDawgPerm[] = "User Dictionary"; 00066 static const char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary"; 00067 static const char kPermuterTypeCompoundPerm[] = "Compound"; 00068 00069 static const char * const kPermuterTypeNames[] = { 00070 kPermuterTypeNoPerm, // 0 00071 kPermuterTypePuncPerm, // 1 00072 kPermuterTypeTopPerm, // 2 00073 kPermuterTypeLowerPerm, // 3 00074 kPermuterTypeUpperPerm, // 4 00075 kPermuterTypeNgramPerm, // 5 00076 kPermuterTypeNumberPerm, // 6 00077 kPermuterTypeUserPatPerm, // 7 00078 kPermuterTypeSysDawgPerm, // 8 00079 kPermuterTypeDocDawgPerm, // 9 00080 kPermuterTypeUserDawgPerm, // 10 00081 kPermuterTypeFreqDawgPerm, // 11 00082 kPermuterTypeCompoundPerm // 12 00083 }; 00084 00090 BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id 00091 float src_rating, // rating 00092 float src_cert, // certainty 00093 inT16 src_fontinfo_id, // font 00094 inT16 src_fontinfo_id2, // 2nd choice font 00095 int src_script_id, // script 00096 float min_xheight, // min xheight allowed 00097 float max_xheight, // max xheight by this char 00098 float yshift, // yshift out of position 00099 BlobChoiceClassifier c) { // adapted match or other 00100 unichar_id_ = src_unichar_id; 00101 rating_ = src_rating; 00102 certainty_ = src_cert; 00103 fontinfo_id_ = src_fontinfo_id; 00104 fontinfo_id2_ = src_fontinfo_id2; 00105 script_id_ = src_script_id; 00106 min_xheight_ = min_xheight; 00107 max_xheight_ = max_xheight; 00108 yshift_ = yshift; 00109 classifier_ = c; 00110 } 00111 00117 BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) { 00118 unichar_id_ = other.unichar_id(); 00119 rating_ = other.rating(); 00120 certainty_ = other.certainty(); 00121 fontinfo_id_ = other.fontinfo_id(); 00122 fontinfo_id2_ = other.fontinfo_id2(); 00123 script_id_ = other.script_id(); 00124 matrix_cell_ = other.matrix_cell_; 00125 min_xheight_ = other.min_xheight_; 00126 max_xheight_ = other.max_xheight_; 00127 yshift_ = other.yshift(); 00128 classifier_ = other.classifier_; 00129 } 00130 00131 // Returns true if *this and other agree on the baseline and x-height 00132 // to within some tolerance based on a given estimate of the x-height. 00133 bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE& other, float x_height, 00134 bool debug) const { 00135 double baseline_diff = fabs(yshift() - other.yshift()); 00136 if (baseline_diff > kMaxBaselineDrift * x_height) { 00137 if (debug) { 00138 tprintf("Baseline diff %g for %d v %d\n", 00139 baseline_diff, unichar_id_, other.unichar_id_); 00140 } 00141 return false; 00142 } 00143 double this_range = max_xheight() - min_xheight(); 00144 double other_range = other.max_xheight() - other.min_xheight(); 00145 double denominator = ClipToRange(MIN(this_range, other_range), 00146 1.0, kMaxOverlapDenominator * x_height); 00147 double overlap = MIN(max_xheight(), other.max_xheight()) - 00148 MAX(min_xheight(), other.min_xheight()); 00149 overlap /= denominator; 00150 if (debug) { 00151 tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n", 00152 unichar_id_, other.unichar_id_, baseline_diff, 00153 this_range, other_range, denominator, overlap); 00154 } 00155 00156 return overlap >= kMinXHeightMatch; 00157 } 00158 00159 // Helper to find the BLOB_CHOICE in the bc_list that matches the given 00160 // unichar_id, or NULL if there is no match. 00161 BLOB_CHOICE* FindMatchingChoice(UNICHAR_ID char_id, 00162 BLOB_CHOICE_LIST* bc_list) { 00163 // Find the corresponding best BLOB_CHOICE. 00164 BLOB_CHOICE_IT choice_it(bc_list); 00165 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); 00166 choice_it.forward()) { 00167 BLOB_CHOICE* choice = choice_it.data(); 00168 if (choice->unichar_id() == char_id) { 00169 return choice; 00170 } 00171 } 00172 return NULL; 00173 } 00174 00175 const char *WERD_CHOICE::permuter_name(uinT8 permuter) { 00176 return kPermuterTypeNames[permuter]; 00177 } 00178 00179 namespace tesseract { 00180 00181 const char *ScriptPosToString(enum ScriptPos script_pos) { 00182 switch (script_pos) { 00183 case SP_NORMAL: return "NORM"; 00184 case SP_SUBSCRIPT: return "SUB"; 00185 case SP_SUPERSCRIPT: return "SUPER"; 00186 case SP_DROPCAP: return "DROPC"; 00187 } 00188 return "SP_UNKNOWN"; 00189 } 00190 00191 } // namespace tesseract. 00192 00199 WERD_CHOICE::WERD_CHOICE(const char *src_string, 00200 const UNICHARSET &unicharset) 00201 : unicharset_(&unicharset){ 00202 GenericVector<UNICHAR_ID> encoding; 00203 GenericVector<char> lengths; 00204 if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) { 00205 lengths.push_back('\0'); 00206 STRING src_lengths = &lengths[0]; 00207 this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM); 00208 } else { // There must have been an invalid unichar in the string. 00209 this->init(8); 00210 this->make_bad(); 00211 } 00212 } 00213 00224 void WERD_CHOICE::init(const char *src_string, 00225 const char *src_lengths, 00226 float src_rating, 00227 float src_certainty, 00228 uinT8 src_permuter) { 00229 int src_string_len = strlen(src_string); 00230 if (src_string_len == 0) { 00231 this->init(8); 00232 } else { 00233 this->init(src_lengths ? strlen(src_lengths): src_string_len); 00234 length_ = reserved_; 00235 int offset = 0; 00236 for (int i = 0; i < length_; ++i) { 00237 int unichar_length = src_lengths ? src_lengths[i] : 1; 00238 unichar_ids_[i] = 00239 unicharset_->unichar_to_id(src_string+offset, unichar_length); 00240 state_[i] = 1; 00241 certainties_[i] = src_certainty; 00242 offset += unichar_length; 00243 } 00244 } 00245 adjust_factor_ = 1.0f; 00246 rating_ = src_rating; 00247 certainty_ = src_certainty; 00248 permuter_ = src_permuter; 00249 dangerous_ambig_found_ = false; 00250 } 00251 00255 WERD_CHOICE::~WERD_CHOICE() { 00256 delete[] unichar_ids_; 00257 delete[] script_pos_; 00258 delete[] state_; 00259 delete[] certainties_; 00260 } 00261 00262 const char *WERD_CHOICE::permuter_name() const { 00263 return kPermuterTypeNames[permuter_]; 00264 } 00265 00266 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word, 00267 // taken from the appropriate cell in the ratings MATRIX. 00268 // Borrowed pointer, so do not delete. 00269 BLOB_CHOICE_LIST* WERD_CHOICE::blob_choices(int index, MATRIX* ratings) const { 00270 MATRIX_COORD coord = MatrixCoord(index); 00271 BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row); 00272 if (result == NULL) { 00273 result = new BLOB_CHOICE_LIST; 00274 ratings->put(coord.col, coord.row, result); 00275 } 00276 return result; 00277 } 00278 00279 // Returns the MATRIX_COORD corresponding to the location in the ratings 00280 // MATRIX for the given index into the word. 00281 MATRIX_COORD WERD_CHOICE::MatrixCoord(int index) const { 00282 int col = 0; 00283 for (int i = 0; i < index; ++i) 00284 col += state_[i]; 00285 int row = col + state_[index] - 1; 00286 return MATRIX_COORD(col, row); 00287 } 00288 00289 // Sets the entries for the given index from the BLOB_CHOICE, assuming 00290 // unit fragment lengths, but setting the state for this index to blob_count. 00291 void WERD_CHOICE::set_blob_choice(int index, int blob_count, 00292 const BLOB_CHOICE* blob_choice) { 00293 unichar_ids_[index] = blob_choice->unichar_id(); 00294 script_pos_[index] = tesseract::SP_NORMAL; 00295 state_[index] = blob_count; 00296 certainties_[index] = blob_choice->certainty(); 00297 } 00298 00299 00305 bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const { 00306 for (int i = 0; i < length_; ++i) { 00307 if (unichar_ids_[i] == unichar_id) { 00308 return true; 00309 } 00310 } 00311 return false; 00312 } 00313 00321 void WERD_CHOICE::remove_unichar_ids(int start, int num) { 00322 ASSERT_HOST(start >= 0 && start + num <= length_); 00323 // Accumulate the states to account for the merged blobs. 00324 for (int i = 0; i < num; ++i) { 00325 if (start > 0) 00326 state_[start - 1] += state_[start + i]; 00327 else if (start + num < length_) 00328 state_[start + num] += state_[start + i]; 00329 } 00330 for (int i = start; i + num < length_; ++i) { 00331 unichar_ids_[i] = unichar_ids_[i + num]; 00332 script_pos_[i] = script_pos_[i + num]; 00333 state_[i] = state_[i + num]; 00334 certainties_[i] = certainties_[i + num]; 00335 } 00336 length_ -= num; 00337 } 00338 00344 void WERD_CHOICE::reverse_and_mirror_unichar_ids() { 00345 for (int i = 0; i < length_ / 2; ++i) { 00346 UNICHAR_ID tmp_id = unichar_ids_[i]; 00347 unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]); 00348 unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id); 00349 } 00350 if (length_ % 2 != 0) { 00351 unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]); 00352 } 00353 } 00354 00362 void WERD_CHOICE::punct_stripped(int *start, int *end) const { 00363 *start = 0; 00364 *end = length() - 1; 00365 while (*start < length() && 00366 unicharset()->get_ispunctuation(unichar_id(*start))) { 00367 (*start)++; 00368 } 00369 while (*end > -1 && 00370 unicharset()->get_ispunctuation(unichar_id(*end))) { 00371 (*end)--; 00372 } 00373 (*end)++; 00374 } 00375 00376 void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const { 00377 int end = length(); 00378 while (end > 0 && 00379 unicharset_->get_isdigit(unichar_ids_[end - 1]) && 00380 BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) { 00381 end--; 00382 } 00383 int start = 0; 00384 while (start < end && 00385 unicharset_->get_isdigit(unichar_ids_[start]) && 00386 BlobPosition(start) == tesseract::SP_SUPERSCRIPT) { 00387 start++; 00388 } 00389 *pstart = start; 00390 *pend = end; 00391 } 00392 00393 WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const { 00394 ASSERT_HOST(start >= 0 && start <= length_); 00395 ASSERT_HOST(end >= 0 && end <= length_); 00396 if (end < start) { end = start; } 00397 WERD_CHOICE retval(unicharset_, end - start); 00398 for (int i = start; i < end; i++) { 00399 retval.append_unichar_id_space_allocated( 00400 unichar_ids_[i], state_[i], 0.0f, certainties_[i]); 00401 } 00402 return retval; 00403 } 00404 00410 bool WERD_CHOICE::has_rtl_unichar_id() const { 00411 int i; 00412 for (i = 0; i < length_; ++i) { 00413 UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]); 00414 if (dir == UNICHARSET::U_RIGHT_TO_LEFT || 00415 dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) { 00416 return true; 00417 } 00418 } 00419 return false; 00420 } 00421 00428 void WERD_CHOICE::string_and_lengths(STRING *word_str, 00429 STRING *word_lengths_str) const { 00430 *word_str = ""; 00431 if (word_lengths_str != NULL) *word_lengths_str = ""; 00432 for (int i = 0; i < length_; ++i) { 00433 const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]); 00434 *word_str += ch; 00435 if (word_lengths_str != NULL) { 00436 *word_lengths_str += strlen(ch); 00437 } 00438 } 00439 } 00440 00447 void WERD_CHOICE::append_unichar_id( 00448 UNICHAR_ID unichar_id, int blob_count, 00449 float rating, float certainty) { 00450 if (length_ == reserved_) { 00451 this->double_the_size(); 00452 } 00453 this->append_unichar_id_space_allocated(unichar_id, blob_count, 00454 rating, certainty); 00455 } 00456 00464 WERD_CHOICE & WERD_CHOICE::operator+= (const WERD_CHOICE & second) { 00465 ASSERT_HOST(unicharset_ == second.unicharset_); 00466 while (reserved_ < length_ + second.length()) { 00467 this->double_the_size(); 00468 } 00469 const UNICHAR_ID *other_unichar_ids = second.unichar_ids(); 00470 for (int i = 0; i < second.length(); ++i) { 00471 unichar_ids_[length_ + i] = other_unichar_ids[i]; 00472 state_[length_ + i] = second.state_[i]; 00473 certainties_[length_ + i] = second.certainties_[i]; 00474 script_pos_[length_ + i] = second.BlobPosition(i); 00475 } 00476 length_ += second.length(); 00477 if (second.adjust_factor_ > adjust_factor_) 00478 adjust_factor_ = second.adjust_factor_; 00479 rating_ += second.rating(); // add ratings 00480 if (second.certainty() < certainty_) // take min 00481 certainty_ = second.certainty(); 00482 if (second.dangerous_ambig_found_) 00483 dangerous_ambig_found_ = true; 00484 if (permuter_ == NO_PERM) { 00485 permuter_ = second.permuter(); 00486 } else if (second.permuter() != NO_PERM && 00487 second.permuter() != permuter_) { 00488 permuter_ = COMPOUND_PERM; 00489 } 00490 return *this; 00491 } 00492 00493 00500 WERD_CHOICE& WERD_CHOICE::operator=(const WERD_CHOICE& source) { 00501 while (reserved_ < source.length()) { 00502 this->double_the_size(); 00503 } 00504 00505 unicharset_ = source.unicharset_; 00506 const UNICHAR_ID *other_unichar_ids = source.unichar_ids(); 00507 for (int i = 0; i < source.length(); ++i) { 00508 unichar_ids_[i] = other_unichar_ids[i]; 00509 state_[i] = source.state_[i]; 00510 certainties_[i] = source.certainties_[i]; 00511 script_pos_[i] = source.BlobPosition(i); 00512 } 00513 length_ = source.length(); 00514 adjust_factor_ = source.adjust_factor_; 00515 rating_ = source.rating(); 00516 certainty_ = source.certainty(); 00517 min_x_height_ = source.min_x_height(); 00518 max_x_height_ = source.max_x_height(); 00519 permuter_ = source.permuter(); 00520 dangerous_ambig_found_ = source.dangerous_ambig_found_; 00521 return *this; 00522 } 00523 00524 // Sets up the script_pos_ member using the blobs_list to get the bln 00525 // bounding boxes, *this to get the unichars, and this->unicharset 00526 // to get the target positions. If small_caps is true, sub/super are not 00527 // considered, but dropcaps are. 00528 // NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.) 00529 void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD* word) { 00530 // Since WERD_CHOICE isn't supposed to depend on a Tesseract, 00531 // we don't have easy access to the flags Tesseract stores. Therefore, debug 00532 // for this module is hard compiled in. 00533 int debug = 0; 00534 00535 // Initialize to normal. 00536 for (int i = 0; i < length_; ++i) 00537 script_pos_[i] = tesseract::SP_NORMAL; 00538 if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) { 00539 return; 00540 } 00541 00542 int position_counts[4]; 00543 for (int i = 0; i < 4; i++) { 00544 position_counts[i] = 0; 00545 } 00546 00547 int chunk_index = 0; 00548 for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) { 00549 TBLOB* tblob = word->blobs[chunk_index]; 00550 int uni_id = unichar_id(blob_index); 00551 TBOX blob_box = tblob->bounding_box(); 00552 if (state_ != NULL) { 00553 for (int i = 1; i < state_[blob_index]; ++i) { 00554 ++chunk_index; 00555 tblob = word->blobs[chunk_index]; 00556 blob_box += tblob->bounding_box(); 00557 } 00558 } 00559 script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box, 00560 uni_id); 00561 if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) { 00562 script_pos_[blob_index] = tesseract::SP_NORMAL; 00563 } 00564 position_counts[script_pos_[blob_index]]++; 00565 } 00566 // If almost everything looks like a superscript or subscript, 00567 // we most likely just got the baseline wrong. 00568 if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ || 00569 position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) { 00570 if (debug >= 2) { 00571 tprintf("Most characters of %s are subscript or superscript.\n" 00572 "That seems wrong, so I'll assume we got the baseline wrong\n", 00573 unichar_string().string()); 00574 } 00575 for (int i = 0; i < length_; i++) { 00576 ScriptPos sp = script_pos_[i]; 00577 if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) { 00578 position_counts[sp]--; 00579 position_counts[tesseract::SP_NORMAL]++; 00580 script_pos_[i] = tesseract::SP_NORMAL; 00581 } 00582 } 00583 } 00584 00585 if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) || 00586 debug >= 2) { 00587 tprintf("SetScriptPosition on %s\n", unichar_string().string()); 00588 int chunk_index = 0; 00589 for (int blob_index = 0; blob_index < length_; ++blob_index) { 00590 if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) { 00591 TBLOB* tblob = word->blobs[chunk_index]; 00592 ScriptPositionOf(true, *unicharset_, tblob->bounding_box(), 00593 unichar_id(blob_index)); 00594 } 00595 chunk_index += state_ != NULL ? state_[blob_index] : 1; 00596 } 00597 } 00598 } 00599 // Sets the script_pos_ member from some source positions with a given length. 00600 void WERD_CHOICE::SetScriptPositions(const tesseract::ScriptPos* positions, 00601 int length) { 00602 ASSERT_HOST(length == length_); 00603 if (positions != script_pos_) { 00604 delete [] script_pos_; 00605 script_pos_ = new ScriptPos[length]; 00606 memcpy(script_pos_, positions, sizeof(positions[0]) * length); 00607 } 00608 } 00609 // Sets all the script_pos_ positions to the given position. 00610 void WERD_CHOICE::SetAllScriptPositions(tesseract::ScriptPos position) { 00611 for (int i = 0; i < length_; ++i) 00612 script_pos_[i] = position; 00613 } 00614 00615 /* static */ 00616 ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug, 00617 const UNICHARSET& unicharset, 00618 const TBOX& blob_box, 00619 UNICHAR_ID unichar_id) { 00620 ScriptPos retval = tesseract::SP_NORMAL; 00621 int top = blob_box.top(); 00622 int bottom = blob_box.bottom(); 00623 int min_bottom, max_bottom, min_top, max_top; 00624 unicharset.get_top_bottom(unichar_id, 00625 &min_bottom, &max_bottom, 00626 &min_top, &max_top); 00627 00628 int sub_thresh_top = min_top - kMinSubscriptOffset; 00629 int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset; 00630 int sup_thresh_bot = max_bottom + kMinSuperscriptOffset; 00631 if (bottom <= kMaxDropCapBottom) { 00632 retval = tesseract::SP_DROPCAP; 00633 } else if (top < sub_thresh_top && bottom < sub_thresh_bot) { 00634 retval = tesseract::SP_SUBSCRIPT; 00635 } else if (bottom > sup_thresh_bot) { 00636 retval = tesseract::SP_SUPERSCRIPT; 00637 } 00638 00639 if (print_debug) { 00640 const char *pos = ScriptPosToString(retval); 00641 tprintf("%s Character %s[bot:%d top: %d] " 00642 "bot_range[%d,%d] top_range[%d, %d] " 00643 "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n", 00644 pos, unicharset.id_to_unichar(unichar_id), 00645 bottom, top, 00646 min_bottom, max_bottom, min_top, max_top, 00647 sub_thresh_bot, sub_thresh_top, 00648 sup_thresh_bot); 00649 } 00650 return retval; 00651 } 00652 00653 // Returns the script-id (eg Han) of the dominant script in the word. 00654 int WERD_CHOICE::GetTopScriptID() const { 00655 int max_script = unicharset_->get_script_table_size(); 00656 int *sid = new int[max_script]; 00657 int x; 00658 for (x = 0; x < max_script; x++) sid[x] = 0; 00659 for (x = 0; x < length_; ++x) { 00660 int script_id = unicharset_->get_script(unichar_id(x)); 00661 sid[script_id]++; 00662 } 00663 if (unicharset_->han_sid() != unicharset_->null_sid()) { 00664 // Add the Hiragana & Katakana counts to Han and zero them out. 00665 if (unicharset_->hiragana_sid() != unicharset_->null_sid()) { 00666 sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()]; 00667 sid[unicharset_->hiragana_sid()] = 0; 00668 } 00669 if (unicharset_->katakana_sid() != unicharset_->null_sid()) { 00670 sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()]; 00671 sid[unicharset_->katakana_sid()] = 0; 00672 } 00673 } 00674 // Note that high script ID overrides lower one on a tie, thus biasing 00675 // towards non-Common script (if sorted that way in unicharset file). 00676 int max_sid = 0; 00677 for (x = 1; x < max_script; x++) 00678 if (sid[x] >= sid[max_sid]) max_sid = x; 00679 if (sid[max_sid] < length_ / 2) 00680 max_sid = unicharset_->null_sid(); 00681 delete[] sid; 00682 return max_sid; 00683 } 00684 00685 // Fixes the state_ for a chop at the given blob_posiiton. 00686 void WERD_CHOICE::UpdateStateForSplit(int blob_position) { 00687 int total_chunks = 0; 00688 for (int i = 0; i < length_; ++i) { 00689 total_chunks += state_[i]; 00690 if (total_chunks > blob_position) { 00691 ++state_[i]; 00692 return; 00693 } 00694 } 00695 } 00696 00697 // Returns the sum of all the state elements, being the total number of blobs. 00698 int WERD_CHOICE::TotalOfStates() const { 00699 int total_chunks = 0; 00700 for (int i = 0; i < length_; ++i) { 00701 total_chunks += state_[i]; 00702 } 00703 return total_chunks; 00704 } 00705 00711 void WERD_CHOICE::print(const char *msg) const { 00712 tprintf("%s : ", msg); 00713 for (int i = 0; i < length_; ++i) { 00714 tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i])); 00715 } 00716 tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n", 00717 rating_, certainty_, adjust_factor_, permuter_, 00718 min_x_height_, max_x_height_, dangerous_ambig_found_); 00719 tprintf("pos"); 00720 for (int i = 0; i < length_; ++i) { 00721 tprintf("\t%s", ScriptPosToString(script_pos_[i])); 00722 } 00723 tprintf("\nstr"); 00724 for (int i = 0; i < length_; ++i) { 00725 tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i])); 00726 } 00727 tprintf("\nstate:"); 00728 for (int i = 0; i < length_; ++i) { 00729 tprintf("\t%d ", state_[i]); 00730 } 00731 tprintf("\nC"); 00732 for (int i = 0; i < length_; ++i) { 00733 tprintf("\t%.3f", certainties_[i]); 00734 } 00735 tprintf("\n"); 00736 } 00737 00738 // Prints the segmentation state with an introductory message. 00739 void WERD_CHOICE::print_state(const char *msg) const { 00740 tprintf("%s", msg); 00741 for (int i = 0; i < length_; ++i) 00742 tprintf(" %d", state_[i]); 00743 tprintf("\n"); 00744 } 00745 00746 // Displays the segmentation state of *this (if not the same as the last 00747 // one displayed) and waits for a click in the window. 00748 void WERD_CHOICE::DisplaySegmentation(TWERD* word) { 00749 #ifndef GRAPHICS_DISABLED 00750 // Number of different colors to draw with. 00751 const int kNumColors = 6; 00752 static ScrollView *segm_window = NULL; 00753 // Check the state against the static prev_drawn_state. 00754 static GenericVector<int> prev_drawn_state; 00755 bool already_done = prev_drawn_state.size() == length_; 00756 if (!already_done) prev_drawn_state.init_to_size(length_, 0); 00757 for (int i = 0; i < length_; ++i) { 00758 if (prev_drawn_state[i] != state_[i]) { 00759 already_done = false; 00760 } 00761 prev_drawn_state[i] = state_[i]; 00762 } 00763 if (already_done || word->blobs.empty()) return; 00764 00765 // Create the window if needed. 00766 if (segm_window == NULL) { 00767 segm_window = new ScrollView("Segmentation", 5, 10, 500, 256, 00768 2000.0, 256.0, true); 00769 } else { 00770 segm_window->Clear(); 00771 } 00772 00773 TBOX bbox; 00774 int blob_index = 0; 00775 for (int c = 0; c < length_; ++c) { 00776 ScrollView::Color color = 00777 static_cast<ScrollView::Color>(c % kNumColors + 3); 00778 for (int i = 0; i < state_[c]; ++i, ++blob_index) { 00779 TBLOB* blob = word->blobs[blob_index]; 00780 bbox += blob->bounding_box(); 00781 blob->plot(segm_window, color, color); 00782 } 00783 } 00784 segm_window->ZoomToRectangle(bbox.left(), bbox.top(), 00785 bbox.right(), bbox.bottom()); 00786 segm_window->Update(); 00787 window_wait(segm_window); 00788 #endif 00789 } 00790 00791 00792 bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, 00793 const WERD_CHOICE &word2) { 00794 const UNICHARSET *uchset = word1.unicharset(); 00795 if (word2.unicharset() != uchset) return false; 00796 int w1start, w1end; 00797 word1.punct_stripped(&w1start, &w1end); 00798 int w2start, w2end; 00799 word2.punct_stripped(&w2start, &w2end); 00800 if (w1end - w1start != w2end - w2start) return false; 00801 for (int i = 0; i < w1end - w1start; i++) { 00802 if (uchset->to_lower(word1.unichar_id(w1start + i)) != 00803 uchset->to_lower(word2.unichar_id(w2start + i))) { 00804 return false; 00805 } 00806 } 00807 return true; 00808 } 00809 00820 void print_ratings_list(const char *msg, 00821 BLOB_CHOICE_LIST *ratings, 00822 const UNICHARSET ¤t_unicharset) { 00823 if (ratings->length() == 0) { 00824 tprintf("%s:<none>\n", msg); 00825 return; 00826 } 00827 if (*msg != '\0') { 00828 tprintf("%s\n", msg); 00829 } 00830 BLOB_CHOICE_IT c_it; 00831 c_it.set_to_list(ratings); 00832 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { 00833 c_it.data()->print(¤t_unicharset); 00834 if (!c_it.at_last()) tprintf("\n"); 00835 } 00836 tprintf("\n"); 00837 fflush(stdout); 00838 }