tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccstruct/ratngs.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File: ratngs.cpp  (Formerly ratings.c)
00003  * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes.
00004  * Author: Ray Smith
00005  * Created: Thu Apr 23 13:23:29 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 
00021 #ifdef HAVE_CONFIG_H
00022 #include "config_auto.h"
00023 #endif
00024 
00025 #include "ratngs.h"
00026 
00027 #include "blobs.h"
00028 #include "callcpp.h"
00029 #include "genericvector.h"
00030 #include "matrix.h"
00031 #include "normalis.h"  // kBlnBaselineOffset.
00032 #include "unicharset.h"
00033 
00034 using tesseract::ScriptPos;
00035 
00036 ELISTIZE(BLOB_CHOICE);
00037 ELISTIZE(WERD_CHOICE);
00038 
00039 const float WERD_CHOICE::kBadRating = 100000.0;
00040 // Min offset in baseline-normalized coords to make a character a subscript.
00041 const int kMinSubscriptOffset = 20;
00042 // Min offset in baseline-normalized coords to make a character a superscript.
00043 const int kMinSuperscriptOffset = 20;
00044 // Max y of bottom of a drop-cap blob.
00045 const int kMaxDropCapBottom = -128;
00046 // Max fraction of x-height to use as denominator in measuring x-height overlap.
00047 const double kMaxOverlapDenominator = 0.125;
00048 // Min fraction of x-height range that should be in agreement for matching
00049 // x-heights.
00050 const double kMinXHeightMatch = 0.5;
00051 // Max tolerance on baseline position as a fraction of x-height for matching
00052 // baselines.
00053 const double kMaxBaselineDrift = 0.0625;
00054 
// Printable names of the permuter types. Each string is stored once and is
// referenced from kPermuterTypeNames below.
static const char kPermuterTypeNoPerm[] = "None";
static const char kPermuterTypePuncPerm[] = "Punctuation";
static const char kPermuterTypeTopPerm[] = "Top Choice";
static const char kPermuterTypeLowerPerm[] = "Top Lower Case";
static const char kPermuterTypeUpperPerm[] = "Top Upper Case";
static const char kPermuterTypeNgramPerm[] = "Ngram";
static const char kPermuterTypeNumberPerm[] = "Number";
static const char kPermuterTypeUserPatPerm[] = "User Pattern";
static const char kPermuterTypeSysDawgPerm[] = "System Dictionary";
static const char kPermuterTypeDocDawgPerm[] = "Document Dictionary";
static const char kPermuterTypeUserDawgPerm[] = "User Dictionary";
static const char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary";
static const char kPermuterTypeCompoundPerm[] = "Compound";

// Lookup table used by WERD_CHOICE::permuter_name(): the entry at index i is
// the name for permuter code i (13 entries, codes 0..12).
static const char * const kPermuterTypeNames[] = {
    kPermuterTypeNoPerm,        // 0
    kPermuterTypePuncPerm,      // 1
    kPermuterTypeTopPerm,       // 2
    kPermuterTypeLowerPerm,     // 3
    kPermuterTypeUpperPerm,     // 4
    kPermuterTypeNgramPerm,     // 5
    kPermuterTypeNumberPerm,    // 6
    kPermuterTypeUserPatPerm,   // 7
    kPermuterTypeSysDawgPerm,   // 8
    kPermuterTypeDocDawgPerm,   // 9
    kPermuterTypeUserDawgPerm,  // 10
    kPermuterTypeFreqDawgPerm,  // 11
    kPermuterTypeCompoundPerm   // 12
};
00084 
00090 BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
00091                          float src_rating,         // rating
00092                          float src_cert,           // certainty
00093                          inT16 src_fontinfo_id,     // font
00094                          inT16 src_fontinfo_id2,    // 2nd choice font
00095                          int src_script_id,        // script
00096                          float min_xheight,        // min xheight allowed
00097                          float max_xheight,        // max xheight by this char
00098                          float yshift,             // yshift out of position
00099                          BlobChoiceClassifier c) {  // adapted match or other
00100   unichar_id_ = src_unichar_id;
00101   rating_ = src_rating;
00102   certainty_ = src_cert;
00103   fontinfo_id_ = src_fontinfo_id;
00104   fontinfo_id2_ = src_fontinfo_id2;
00105   script_id_ = src_script_id;
00106   min_xheight_ = min_xheight;
00107   max_xheight_ = max_xheight;
00108   yshift_ = yshift;
00109   classifier_ = c;
00110 }
00111 
00117 BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
00118   unichar_id_ = other.unichar_id();
00119   rating_ = other.rating();
00120   certainty_ = other.certainty();
00121   fontinfo_id_ = other.fontinfo_id();
00122   fontinfo_id2_ = other.fontinfo_id2();
00123   script_id_ = other.script_id();
00124   matrix_cell_ = other.matrix_cell_;
00125   min_xheight_ = other.min_xheight_;
00126   max_xheight_ = other.max_xheight_;
00127   yshift_ = other.yshift();
00128   classifier_ = other.classifier_;
00129 }
00130 
00131 // Returns true if *this and other agree on the baseline and x-height
00132 // to within some tolerance based on a given estimate of the x-height.
00133 bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE& other, float x_height,
00134                                   bool debug) const {
00135   double baseline_diff = fabs(yshift() - other.yshift());
00136   if (baseline_diff > kMaxBaselineDrift * x_height) {
00137     if (debug) {
00138       tprintf("Baseline diff %g for %d v %d\n",
00139               baseline_diff, unichar_id_, other.unichar_id_);
00140     }
00141     return false;
00142   }
00143   double this_range = max_xheight() - min_xheight();
00144   double other_range = other.max_xheight() - other.min_xheight();
00145   double denominator = ClipToRange(MIN(this_range, other_range),
00146                                    1.0, kMaxOverlapDenominator * x_height);
00147   double overlap = MIN(max_xheight(), other.max_xheight()) -
00148                    MAX(min_xheight(), other.min_xheight());
00149   overlap /= denominator;
00150   if (debug) {
00151     tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n",
00152             unichar_id_, other.unichar_id_, baseline_diff,
00153             this_range, other_range, denominator, overlap);
00154   }
00155 
00156   return overlap >= kMinXHeightMatch;
00157 }
00158 
00159 // Helper to find the BLOB_CHOICE in the bc_list that matches the given
00160 // unichar_id, or NULL if there is no match.
00161 BLOB_CHOICE* FindMatchingChoice(UNICHAR_ID char_id,
00162                                 BLOB_CHOICE_LIST* bc_list) {
00163   // Find the corresponding best BLOB_CHOICE.
00164   BLOB_CHOICE_IT choice_it(bc_list);
00165   for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
00166        choice_it.forward()) {
00167     BLOB_CHOICE* choice = choice_it.data();
00168     if (choice->unichar_id() == char_id) {
00169       return choice;
00170     }
00171   }
00172   return NULL;
00173 }
00174 
00175 const char *WERD_CHOICE::permuter_name(uinT8 permuter) {
00176   return kPermuterTypeNames[permuter];
00177 }
00178 
00179 namespace tesseract {
00180 
00181 const char *ScriptPosToString(enum ScriptPos script_pos) {
00182   switch (script_pos) {
00183     case SP_NORMAL: return "NORM";
00184     case SP_SUBSCRIPT: return "SUB";
00185     case SP_SUPERSCRIPT: return "SUPER";
00186     case SP_DROPCAP: return "DROPC";
00187   }
00188   return "SP_UNKNOWN";
00189 }
00190 
00191 }  // namespace tesseract.
00192 
00199 WERD_CHOICE::WERD_CHOICE(const char *src_string,
00200                          const UNICHARSET &unicharset)
00201     : unicharset_(&unicharset){
00202   GenericVector<UNICHAR_ID> encoding;
00203   GenericVector<char> lengths;
00204   if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) {
00205     lengths.push_back('\0');
00206     STRING src_lengths = &lengths[0];
00207     this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
00208   } else {  // There must have been an invalid unichar in the string.
00209     this->init(8);
00210     this->make_bad();
00211   }
00212 }
00213 
00224 void WERD_CHOICE::init(const char *src_string,
00225                        const char *src_lengths,
00226                        float src_rating,
00227                        float src_certainty,
00228                        uinT8 src_permuter) {
00229   int src_string_len = strlen(src_string);
00230   if (src_string_len == 0) {
00231     this->init(8);
00232   } else {
00233     this->init(src_lengths ? strlen(src_lengths): src_string_len);
00234     length_ = reserved_;
00235     int offset = 0;
00236     for (int i = 0; i < length_; ++i) {
00237       int unichar_length = src_lengths ? src_lengths[i] : 1;
00238       unichar_ids_[i] =
00239           unicharset_->unichar_to_id(src_string+offset, unichar_length);
00240       state_[i] = 1;
00241       certainties_[i] = src_certainty;
00242       offset += unichar_length;
00243     }
00244   }
00245   adjust_factor_ = 1.0f;
00246   rating_ = src_rating;
00247   certainty_ = src_certainty;
00248   permuter_ = src_permuter;
00249   dangerous_ambig_found_ = false;
00250 }
00251 
00255 WERD_CHOICE::~WERD_CHOICE() {
00256   delete[] unichar_ids_;
00257   delete[] script_pos_;
00258   delete[] state_;
00259   delete[] certainties_;
00260 }
00261 
00262 const char *WERD_CHOICE::permuter_name() const {
00263   return kPermuterTypeNames[permuter_];
00264 }
00265 
00266 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
00267 // taken from the appropriate cell in the ratings MATRIX.
00268 // Borrowed pointer, so do not delete.
00269 BLOB_CHOICE_LIST* WERD_CHOICE::blob_choices(int index, MATRIX* ratings) const {
00270   MATRIX_COORD coord = MatrixCoord(index);
00271   BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row);
00272   if (result == NULL) {
00273     result = new BLOB_CHOICE_LIST;
00274     ratings->put(coord.col, coord.row, result);
00275   }
00276   return result;
00277 }
00278 
00279 // Returns the MATRIX_COORD corresponding to the location in the ratings
00280 // MATRIX for the given index into the word.
00281 MATRIX_COORD WERD_CHOICE::MatrixCoord(int index) const {
00282   int col = 0;
00283   for (int i = 0; i < index; ++i)
00284     col += state_[i];
00285   int row = col + state_[index] - 1;
00286   return MATRIX_COORD(col, row);
00287 }
00288 
00289 // Sets the entries for the given index from the BLOB_CHOICE, assuming
00290 // unit fragment lengths, but setting the state for this index to blob_count.
00291 void WERD_CHOICE::set_blob_choice(int index, int blob_count,
00292                                   const BLOB_CHOICE* blob_choice) {
00293   unichar_ids_[index] = blob_choice->unichar_id();
00294   script_pos_[index] = tesseract::SP_NORMAL;
00295   state_[index] = blob_count;
00296   certainties_[index] = blob_choice->certainty();
00297 }
00298 
00299 
00305 bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const {
00306   for (int i = 0; i < length_; ++i) {
00307     if (unichar_ids_[i] == unichar_id) {
00308       return true;
00309     }
00310   }
00311   return false;
00312 }
00313 
00321 void WERD_CHOICE::remove_unichar_ids(int start, int num) {
00322   ASSERT_HOST(start >= 0 && start + num <= length_);
00323   // Accumulate the states to account for the merged blobs.
00324   for (int i = 0; i < num; ++i) {
00325     if (start > 0)
00326       state_[start - 1] += state_[start + i];
00327     else if (start + num < length_)
00328       state_[start + num] += state_[start + i];
00329   }
00330   for (int i = start; i + num < length_; ++i) {
00331     unichar_ids_[i] = unichar_ids_[i + num];
00332     script_pos_[i] = script_pos_[i + num];
00333     state_[i] = state_[i + num];
00334     certainties_[i] = certainties_[i + num];
00335   }
00336   length_ -= num;
00337 }
00338 
00344 void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
00345   for (int i = 0; i < length_ / 2; ++i) {
00346     UNICHAR_ID tmp_id = unichar_ids_[i];
00347     unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
00348     unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
00349   }
00350   if (length_ % 2 != 0) {
00351     unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]);
00352   }
00353 }
00354 
00362 void WERD_CHOICE::punct_stripped(int *start, int *end) const {
00363   *start = 0;
00364   *end = length() - 1;
00365   while (*start < length() &&
00366          unicharset()->get_ispunctuation(unichar_id(*start))) {
00367     (*start)++;
00368   }
00369   while (*end > -1 &&
00370          unicharset()->get_ispunctuation(unichar_id(*end))) {
00371     (*end)--;
00372   }
00373   (*end)++;
00374 }
00375 
00376 void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {
00377   int end = length();
00378   while (end > 0 &&
00379          unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
00380          BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {
00381     end--;
00382   }
00383   int start = 0;
00384   while (start < end &&
00385          unicharset_->get_isdigit(unichar_ids_[start]) &&
00386          BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {
00387     start++;
00388   }
00389   *pstart = start;
00390   *pend = end;
00391 }
00392 
00393 WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
00394   ASSERT_HOST(start >= 0 && start <= length_);
00395   ASSERT_HOST(end >= 0 && end <= length_);
00396   if (end < start) { end = start; }
00397   WERD_CHOICE retval(unicharset_, end - start);
00398   for (int i = start; i < end; i++) {
00399     retval.append_unichar_id_space_allocated(
00400         unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
00401   }
00402   return retval;
00403 }
00404 
00410 bool WERD_CHOICE::has_rtl_unichar_id() const {
00411   int i;
00412   for (i = 0; i < length_; ++i) {
00413     UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
00414     if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
00415         dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {
00416       return true;
00417     }
00418   }
00419   return false;
00420 }
00421 
00428 void WERD_CHOICE::string_and_lengths(STRING *word_str,
00429                                      STRING *word_lengths_str) const {
00430   *word_str = "";
00431   if (word_lengths_str != NULL) *word_lengths_str = "";
00432   for (int i = 0; i < length_; ++i) {
00433     const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
00434     *word_str += ch;
00435     if (word_lengths_str != NULL) {
00436       *word_lengths_str += strlen(ch);
00437     }
00438   }
00439 }
00440 
00447 void WERD_CHOICE::append_unichar_id(
00448     UNICHAR_ID unichar_id, int blob_count,
00449     float rating, float certainty) {
00450   if (length_ == reserved_) {
00451     this->double_the_size();
00452   }
00453   this->append_unichar_id_space_allocated(unichar_id, blob_count,
00454                                           rating, certainty);
00455 }
00456 
00464 WERD_CHOICE & WERD_CHOICE::operator+= (const WERD_CHOICE & second) {
00465   ASSERT_HOST(unicharset_ == second.unicharset_);
00466   while (reserved_ < length_ + second.length()) {
00467     this->double_the_size();
00468   }
00469   const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
00470   for (int i = 0; i < second.length(); ++i) {
00471     unichar_ids_[length_ + i] = other_unichar_ids[i];
00472     state_[length_ + i] = second.state_[i];
00473     certainties_[length_ + i] = second.certainties_[i];
00474     script_pos_[length_ + i] = second.BlobPosition(i);
00475   }
00476   length_ += second.length();
00477   if (second.adjust_factor_ > adjust_factor_)
00478     adjust_factor_ = second.adjust_factor_;
00479   rating_ += second.rating();  // add ratings
00480   if (second.certainty() < certainty_) // take min
00481     certainty_ = second.certainty();
00482   if (second.dangerous_ambig_found_)
00483     dangerous_ambig_found_ = true;
00484   if (permuter_ == NO_PERM) {
00485     permuter_ = second.permuter();
00486   } else if (second.permuter() != NO_PERM &&
00487              second.permuter() != permuter_) {
00488     permuter_ = COMPOUND_PERM;
00489   }
00490   return *this;
00491 }
00492 
00493 
00500 WERD_CHOICE& WERD_CHOICE::operator=(const WERD_CHOICE& source) {
00501   while (reserved_ < source.length()) {
00502     this->double_the_size();
00503   }
00504 
00505   unicharset_ = source.unicharset_;
00506   const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
00507   for (int i = 0; i < source.length(); ++i) {
00508     unichar_ids_[i] = other_unichar_ids[i];
00509     state_[i] = source.state_[i];
00510     certainties_[i] = source.certainties_[i];
00511     script_pos_[i] = source.BlobPosition(i);
00512   }
00513   length_ = source.length();
00514   adjust_factor_ = source.adjust_factor_;
00515   rating_ = source.rating();
00516   certainty_ = source.certainty();
00517   min_x_height_ = source.min_x_height();
00518   max_x_height_ = source.max_x_height();
00519   permuter_ = source.permuter();
00520   dangerous_ambig_found_ = source.dangerous_ambig_found_;
00521   return *this;
00522 }
00523 
00524 // Sets up the script_pos_ member using the blobs_list to get the bln
00525 // bounding boxes, *this to get the unichars, and this->unicharset
00526 // to get the target positions. If small_caps is true, sub/super are not
00527 // considered, but dropcaps are.
00528 // NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.)
00529 void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD* word) {
00530   // Since WERD_CHOICE isn't supposed to depend on a Tesseract,
00531   // we don't have easy access to the flags Tesseract stores.  Therefore, debug
00532   // for this module is hard compiled in.
00533   int debug = 0;
00534 
00535   // Initialize to normal.
00536   for (int i = 0; i < length_; ++i)
00537     script_pos_[i] = tesseract::SP_NORMAL;
00538   if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
00539     return;
00540   }
00541 
00542   int position_counts[4];
00543   for (int i = 0; i < 4; i++) {
00544     position_counts[i] = 0;
00545   }
00546 
00547   int chunk_index = 0;
00548   for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
00549     TBLOB* tblob = word->blobs[chunk_index];
00550     int uni_id = unichar_id(blob_index);
00551     TBOX blob_box = tblob->bounding_box();
00552     if (state_ != NULL) {
00553       for (int i = 1; i <  state_[blob_index]; ++i) {
00554         ++chunk_index;
00555         tblob = word->blobs[chunk_index];
00556         blob_box += tblob->bounding_box();
00557       }
00558     }
00559     script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box,
00560                                                uni_id);
00561     if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
00562       script_pos_[blob_index] = tesseract::SP_NORMAL;
00563     }
00564     position_counts[script_pos_[blob_index]]++;
00565   }
00566   // If almost everything looks like a superscript or subscript,
00567   // we most likely just got the baseline wrong.
00568   if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
00569       position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
00570     if (debug >= 2) {
00571       tprintf("Most characters of %s are subscript or superscript.\n"
00572               "That seems wrong, so I'll assume we got the baseline wrong\n",
00573               unichar_string().string());
00574     }
00575     for (int i = 0; i < length_; i++) {
00576       ScriptPos sp = script_pos_[i];
00577       if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {
00578         position_counts[sp]--;
00579         position_counts[tesseract::SP_NORMAL]++;
00580         script_pos_[i] = tesseract::SP_NORMAL;
00581       }
00582     }
00583   }
00584 
00585   if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) ||
00586       debug >= 2) {
00587     tprintf("SetScriptPosition on %s\n", unichar_string().string());
00588     int chunk_index = 0;
00589     for (int blob_index = 0; blob_index < length_; ++blob_index) {
00590       if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
00591         TBLOB* tblob = word->blobs[chunk_index];
00592         ScriptPositionOf(true, *unicharset_, tblob->bounding_box(),
00593                          unichar_id(blob_index));
00594       }
00595       chunk_index += state_ != NULL ? state_[blob_index] : 1;
00596     }
00597   }
00598 }
00599 // Sets the script_pos_ member from some source positions with a given length.
00600 void WERD_CHOICE::SetScriptPositions(const tesseract::ScriptPos* positions,
00601                                      int length) {
00602   ASSERT_HOST(length == length_);
00603   if (positions != script_pos_) {
00604     delete [] script_pos_;
00605     script_pos_ = new ScriptPos[length];
00606     memcpy(script_pos_, positions, sizeof(positions[0]) * length);
00607   }
00608 }
00609 // Sets all the script_pos_ positions to the given position.
00610 void WERD_CHOICE::SetAllScriptPositions(tesseract::ScriptPos position) {
00611   for (int i = 0; i < length_; ++i)
00612     script_pos_[i] = position;
00613 }
00614 
00615 /* static */
00616 ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug,
00617                                         const UNICHARSET& unicharset,
00618                                         const TBOX& blob_box,
00619                                         UNICHAR_ID unichar_id) {
00620   ScriptPos retval = tesseract::SP_NORMAL;
00621   int top = blob_box.top();
00622   int bottom = blob_box.bottom();
00623   int min_bottom, max_bottom, min_top, max_top;
00624   unicharset.get_top_bottom(unichar_id,
00625                             &min_bottom, &max_bottom,
00626                             &min_top, &max_top);
00627 
00628   int sub_thresh_top = min_top - kMinSubscriptOffset;
00629   int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
00630   int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
00631   if (bottom <= kMaxDropCapBottom) {
00632     retval = tesseract::SP_DROPCAP;
00633   } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
00634     retval = tesseract::SP_SUBSCRIPT;
00635   } else if (bottom > sup_thresh_bot) {
00636     retval = tesseract::SP_SUPERSCRIPT;
00637   }
00638 
00639   if (print_debug) {
00640     const char *pos = ScriptPosToString(retval);
00641     tprintf("%s Character %s[bot:%d top: %d]  "
00642             "bot_range[%d,%d]  top_range[%d, %d] "
00643             "sub_thresh[bot:%d top:%d]  sup_thresh_bot %d\n",
00644             pos, unicharset.id_to_unichar(unichar_id),
00645             bottom, top,
00646             min_bottom, max_bottom, min_top, max_top,
00647             sub_thresh_bot, sub_thresh_top,
00648             sup_thresh_bot);
00649   }
00650   return retval;
00651 }
00652 
00653 // Returns the script-id (eg Han) of the dominant script in the word.
00654 int WERD_CHOICE::GetTopScriptID() const {
00655   int max_script = unicharset_->get_script_table_size();
00656   int *sid = new int[max_script];
00657   int x;
00658   for (x = 0; x < max_script; x++) sid[x] = 0;
00659   for (x = 0; x < length_; ++x) {
00660     int script_id = unicharset_->get_script(unichar_id(x));
00661     sid[script_id]++;
00662   }
00663   if (unicharset_->han_sid() != unicharset_->null_sid()) {
00664     // Add the Hiragana & Katakana counts to Han and zero them out.
00665     if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
00666       sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
00667       sid[unicharset_->hiragana_sid()] = 0;
00668     }
00669     if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
00670       sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
00671       sid[unicharset_->katakana_sid()] = 0;
00672     }
00673   }
00674   // Note that high script ID overrides lower one on a tie, thus biasing
00675   // towards non-Common script (if sorted that way in unicharset file).
00676   int max_sid = 0;
00677   for (x = 1; x < max_script; x++)
00678     if (sid[x] >= sid[max_sid]) max_sid = x;
00679   if (sid[max_sid] < length_ / 2)
00680     max_sid = unicharset_->null_sid();
00681   delete[] sid;
00682   return max_sid;
00683 }
00684 
00685 // Fixes the state_ for a chop at the given blob_posiiton.
00686 void WERD_CHOICE::UpdateStateForSplit(int blob_position) {
00687   int total_chunks = 0;
00688   for (int i = 0; i < length_; ++i) {
00689     total_chunks += state_[i];
00690     if (total_chunks > blob_position) {
00691       ++state_[i];
00692       return;
00693     }
00694   }
00695 }
00696 
00697 // Returns the sum of all the state elements, being the total number of blobs.
00698 int WERD_CHOICE::TotalOfStates() const {
00699   int total_chunks = 0;
00700   for (int i = 0; i < length_; ++i) {
00701     total_chunks += state_[i];
00702   }
00703   return total_chunks;
00704 }
00705 
00711 void WERD_CHOICE::print(const char *msg) const {
00712   tprintf("%s : ", msg);
00713   for (int i = 0; i < length_; ++i) {
00714     tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
00715   }
00716   tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
00717           rating_, certainty_, adjust_factor_, permuter_,
00718           min_x_height_, max_x_height_, dangerous_ambig_found_);
00719   tprintf("pos");
00720   for (int i = 0; i < length_; ++i) {
00721     tprintf("\t%s", ScriptPosToString(script_pos_[i]));
00722   }
00723   tprintf("\nstr");
00724   for (int i = 0; i < length_; ++i) {
00725     tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
00726   }
00727   tprintf("\nstate:");
00728   for (int i = 0; i < length_; ++i) {
00729     tprintf("\t%d ", state_[i]);
00730   }
00731   tprintf("\nC");
00732   for (int i = 0; i < length_; ++i) {
00733     tprintf("\t%.3f", certainties_[i]);
00734   }
00735   tprintf("\n");
00736 }
00737 
00738 // Prints the segmentation state with an introductory message.
00739 void WERD_CHOICE::print_state(const char *msg) const {
00740   tprintf("%s", msg);
00741   for (int i = 0; i < length_; ++i)
00742     tprintf(" %d", state_[i]);
00743   tprintf("\n");
00744 }
00745 
00746 // Displays the segmentation state of *this (if not the same as the last
00747 // one displayed) and waits for a click in the window.
00748 void WERD_CHOICE::DisplaySegmentation(TWERD* word) {
00749 #ifndef GRAPHICS_DISABLED
00750   // Number of different colors to draw with.
00751   const int kNumColors = 6;
00752   static ScrollView *segm_window = NULL;
00753   // Check the state against the static prev_drawn_state.
00754   static GenericVector<int> prev_drawn_state;
00755   bool already_done = prev_drawn_state.size() == length_;
00756   if (!already_done) prev_drawn_state.init_to_size(length_, 0);
00757   for (int i = 0; i < length_; ++i) {
00758     if (prev_drawn_state[i] != state_[i]) {
00759       already_done = false;
00760     }
00761     prev_drawn_state[i] = state_[i];
00762   }
00763   if (already_done || word->blobs.empty()) return;
00764 
00765   // Create the window if needed.
00766   if (segm_window == NULL) {
00767     segm_window = new ScrollView("Segmentation", 5, 10, 500, 256,
00768                                  2000.0, 256.0, true);
00769   } else {
00770     segm_window->Clear();
00771   }
00772 
00773   TBOX bbox;
00774   int blob_index = 0;
00775   for (int c = 0; c < length_; ++c) {
00776     ScrollView::Color color =
00777         static_cast<ScrollView::Color>(c % kNumColors + 3);
00778     for (int i = 0; i < state_[c]; ++i, ++blob_index) {
00779       TBLOB* blob = word->blobs[blob_index];
00780       bbox += blob->bounding_box();
00781       blob->plot(segm_window, color, color);
00782     }
00783   }
00784   segm_window->ZoomToRectangle(bbox.left(), bbox.top(),
00785                                bbox.right(), bbox.bottom());
00786   segm_window->Update();
00787   window_wait(segm_window);
00788 #endif
00789 }
00790 
00791 
00792 bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
00793                                        const WERD_CHOICE &word2) {
00794   const UNICHARSET *uchset = word1.unicharset();
00795   if (word2.unicharset() != uchset) return false;
00796   int w1start, w1end;
00797   word1.punct_stripped(&w1start, &w1end);
00798   int w2start, w2end;
00799   word2.punct_stripped(&w2start, &w2end);
00800   if (w1end - w1start != w2end - w2start) return false;
00801   for (int i = 0; i < w1end - w1start; i++) {
00802     if (uchset->to_lower(word1.unichar_id(w1start + i)) !=
00803         uchset->to_lower(word2.unichar_id(w2start + i))) {
00804         return false;
00805     }
00806   }
00807   return true;
00808 }
00809 
00820 void print_ratings_list(const char *msg,
00821                         BLOB_CHOICE_LIST *ratings,
00822                         const UNICHARSET &current_unicharset) {
00823   if (ratings->length() == 0) {
00824     tprintf("%s:<none>\n", msg);
00825     return;
00826   }
00827   if (*msg != '\0') {
00828     tprintf("%s\n", msg);
00829   }
00830   BLOB_CHOICE_IT c_it;
00831   c_it.set_to_list(ratings);
00832   for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
00833     c_it.data()->print(&current_unicharset);
00834     if (!c_it.at_last()) tprintf("\n");
00835   }
00836   tprintf("\n");
00837   fflush(stdout);
00838 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines