// tesseract 3.03
00001 /********************************************************************** 00002 * File: tfacepp.cpp (Formerly tface++.c) 00003 * Description: C++ side of the C/C++ Tess/Editor interface. 00004 * Author: Ray Smith 00005 * Created: Thu Apr 23 15:39:23 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #pragma warning(disable:4305) // int/float warnings 00023 #pragma warning(disable:4800) // int/bool warnings 00024 #endif 00025 00026 #include <math.h> 00027 00028 #include "blamer.h" 00029 #include "errcode.h" 00030 #include "ratngs.h" 00031 #include "reject.h" 00032 #include "tesseractclass.h" 00033 #include "werd.h" 00034 00035 #define MAX_UNDIVIDED_LENGTH 24 00036 00037 00038 00039 /********************************************************************** 00040 * recog_word 00041 * 00042 * Convert the word to tess form and pass it to the tess segmenter. 00043 * Convert the output back to editor form. 
00044 **********************************************************************/ 00045 namespace tesseract { 00046 void Tesseract::recog_word(WERD_RES *word) { 00047 if (wordrec_skip_no_truth_words && (word->blamer_bundle == NULL || 00048 word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) { 00049 if (classify_debug_level) tprintf("No truth for word - skipping\n"); 00050 word->tess_failed = true; 00051 return; 00052 } 00053 ASSERT_HOST(!word->chopped_word->blobs.empty()); 00054 recog_word_recursive(word); 00055 word->SetupBoxWord(); 00056 if (word->best_choice->length() != word->box_word->length()) { 00057 tprintf("recog_word ASSERT FAIL String:\"%s\"; " 00058 "Strlen=%d; #Blobs=%d\n", 00059 word->best_choice->debug_string().string(), 00060 word->best_choice->length(), word->box_word->length()); 00061 } 00062 ASSERT_HOST(word->best_choice->length() == word->box_word->length()); 00063 // Check that the ratings matrix size matches the sum of all the 00064 // segmentation states. 00065 if (!word->StatesAllValid()) { 00066 tprintf("Not all words have valid states relative to ratings matrix!!"); 00067 word->DebugWordChoices(true, NULL); 00068 ASSERT_HOST(word->StatesAllValid()); 00069 } 00070 if (tessedit_override_permuter) { 00071 /* Override the permuter type if a straight dictionary check disagrees. 
*/ 00072 uinT8 perm_type = word->best_choice->permuter(); 00073 if ((perm_type != SYSTEM_DAWG_PERM) && 00074 (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) { 00075 uinT8 real_dict_perm_type = dict_word(*word->best_choice); 00076 if (((real_dict_perm_type == SYSTEM_DAWG_PERM) || 00077 (real_dict_perm_type == FREQ_DAWG_PERM) || 00078 (real_dict_perm_type == USER_DAWG_PERM)) && 00079 (alpha_count(word->best_choice->unichar_string().string(), 00080 word->best_choice->unichar_lengths().string()) > 0)) { 00081 word->best_choice->set_permuter(real_dict_perm_type); // use dict perm 00082 } 00083 } 00084 if (tessedit_rejection_debug && 00085 perm_type != word->best_choice->permuter()) { 00086 tprintf("Permuter Type Flipped from %d to %d\n", 00087 perm_type, word->best_choice->permuter()); 00088 } 00089 } 00090 // Factored out from control.cpp 00091 ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL)); 00092 if (word->best_choice == NULL || word->best_choice->length() == 0 || 00093 static_cast<int>(strspn(word->best_choice->unichar_string().string(), 00094 " ")) == word->best_choice->length()) { 00095 word->tess_failed = true; 00096 word->reject_map.initialise(word->box_word->length()); 00097 word->reject_map.rej_word_tess_failure(); 00098 } else { 00099 word->tess_failed = false; 00100 } 00101 } 00102 00103 00104 /********************************************************************** 00105 * recog_word_recursive 00106 * 00107 * Convert the word to tess form and pass it to the tess segmenter. 00108 * Convert the output back to editor form. 00109 **********************************************************************/ 00110 void Tesseract::recog_word_recursive(WERD_RES *word) { 00111 int word_length = word->chopped_word->NumBlobs(); // no of blobs 00112 if (word_length > MAX_UNDIVIDED_LENGTH) { 00113 return split_and_recog_word(word); 00114 } 00115 cc_recog(word); 00116 word_length = word->rebuild_word->NumBlobs(); // No of blobs in output. 
00117 00118 // Do sanity checks and minor fixes on best_choice. 00119 if (word->best_choice->length() > word_length) { 00120 word->best_choice->make_bad(); // should never happen 00121 tprintf("recog_word: Discarded long string \"%s\"" 00122 " (%d characters vs %d blobs)\n", 00123 word->best_choice->unichar_string().string(), 00124 word->best_choice->length(), word_length); 00125 tprintf("Word is at:"); 00126 word->word->bounding_box().print(); 00127 } 00128 if (word->best_choice->length() < word_length) { 00129 UNICHAR_ID space_id = unicharset.unichar_to_id(" "); 00130 while (word->best_choice->length() < word_length) { 00131 word->best_choice->append_unichar_id(space_id, 1, 0.0, 00132 word->best_choice->certainty()); 00133 } 00134 } 00135 } 00136 00137 00138 /********************************************************************** 00139 * split_and_recog_word 00140 * 00141 * Split the word into 2 smaller pieces at the largest gap. 00142 * Recognize the pieces and stick the results back together. 00143 **********************************************************************/ 00144 void Tesseract::split_and_recog_word(WERD_RES *word) { 00145 // Find the biggest blob gap in the chopped_word. 00146 int bestgap = -MAX_INT32; 00147 int split_index = 0; 00148 for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) { 00149 TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box(); 00150 TBOX blob_box = word->chopped_word->blobs[b]->bounding_box(); 00151 int gap = blob_box.left() - prev_box.right(); 00152 if (gap > bestgap) { 00153 bestgap = gap; 00154 split_index = b; 00155 } 00156 } 00157 ASSERT_HOST(split_index > 0); 00158 00159 WERD_RES *word2 = NULL; 00160 BlamerBundle *orig_bb = NULL; 00161 split_word(word, split_index, &word2, &orig_bb); 00162 00163 // Recognize the first part of the word. 00164 recog_word_recursive(word); 00165 // Recognize the second part of the word. 
00166 recog_word_recursive(word2); 00167 00168 join_words(word, word2, orig_bb); 00169 } 00170 00171 00172 /********************************************************************** 00173 * split_word 00174 * 00175 * Split a given WERD_RES in place into two smaller words for recognition. 00176 * split_pt is the index of the first blob to go in the second word. 00177 * The underlying word is left alone, only the TWERD (and subsequent data) 00178 * are split up. orig_blamer_bundle is set to the original blamer bundle, 00179 * and will now be owned by the caller. New blamer bundles are forged for the 00180 * two pieces. 00181 **********************************************************************/ 00182 void Tesseract::split_word(WERD_RES *word, 00183 int split_pt, 00184 WERD_RES **right_piece, 00185 BlamerBundle **orig_blamer_bundle) const { 00186 ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs()); 00187 00188 // Save a copy of the blamer bundle so we can try to reconstruct it below. 00189 BlamerBundle *orig_bb = 00190 word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : NULL; 00191 00192 WERD_RES *word2 = new WERD_RES(*word); 00193 00194 // blow away the copied chopped_word, as we want to work with 00195 // the blobs from the input chopped_word so seam_arrays can be merged. 
00196 TWERD *chopped = word->chopped_word; 00197 TWERD *chopped2 = new TWERD; 00198 chopped2->blobs.reserve(chopped->NumBlobs() - split_pt); 00199 for (int i = split_pt; i < chopped->NumBlobs(); ++i) { 00200 chopped2->blobs.push_back(chopped->blobs[i]); 00201 } 00202 chopped->blobs.truncate(split_pt); 00203 word->chopped_word = NULL; 00204 delete word2->chopped_word; 00205 word2->chopped_word = NULL; 00206 00207 const UNICHARSET &unicharset = *word->uch_set; 00208 word->ClearResults(); 00209 word2->ClearResults(); 00210 word->chopped_word = chopped; 00211 word2->chopped_word = chopped2; 00212 word->SetupBasicsFromChoppedWord(unicharset); 00213 word2->SetupBasicsFromChoppedWord(unicharset); 00214 00215 // Try to adjust the blamer bundle. 00216 if (orig_bb != NULL) { 00217 // TODO(rays) Looks like a leak to me. 00218 // orig_bb should take, rather than copy. 00219 word->blamer_bundle = new BlamerBundle(); 00220 word2->blamer_bundle = new BlamerBundle(); 00221 orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(), 00222 word2->chopped_word->blobs[0]->bounding_box().left(), 00223 wordrec_debug_blamer, 00224 word->blamer_bundle, word2->blamer_bundle); 00225 } 00226 00227 *right_piece = word2; 00228 *orig_blamer_bundle = orig_bb; 00229 } 00230 00231 00232 /********************************************************************** 00233 * join_words 00234 * 00235 * The opposite of split_word(): 00236 * join word2 (including any recognized data / seam array / etc) 00237 * onto the right of word and then delete word2. 00238 * Also, if orig_bb is provided, stitch it back into word. 
00239 **********************************************************************/ 00240 void Tesseract::join_words(WERD_RES *word, 00241 WERD_RES *word2, 00242 BlamerBundle *orig_bb) const { 00243 TBOX prev_box = word->chopped_word->blobs.back()->bounding_box(); 00244 TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box(); 00245 // Tack the word2 outputs onto the end of the word outputs. 00246 word->chopped_word->blobs += word2->chopped_word->blobs; 00247 word->rebuild_word->blobs += word2->rebuild_word->blobs; 00248 word2->chopped_word->blobs.clear(); 00249 word2->rebuild_word->blobs.clear(); 00250 TPOINT split_pt; 00251 split_pt.x = (prev_box.right() + blob_box.left()) / 2; 00252 split_pt.y = (prev_box.top() + prev_box.bottom() + 00253 blob_box.top() + blob_box.bottom()) / 4; 00254 // Move the word2 seams onto the end of the word1 seam_array. 00255 // Since the seam list is one element short, an empty seam marking the 00256 // end of the last blob in the first word is needed first. 00257 word->seam_array.push_back(new SEAM(0.0f, split_pt, NULL, NULL, NULL)); 00258 word->seam_array += word2->seam_array; 00259 word2->seam_array.truncate(0); 00260 // Fix widths and gaps. 00261 word->blob_widths += word2->blob_widths; 00262 word->blob_gaps += word2->blob_gaps; 00263 // Fix the ratings matrix. 00264 int rat1 = word->ratings->dimension(); 00265 int rat2 = word2->ratings->dimension(); 00266 word->ratings->AttachOnCorner(word2->ratings); 00267 ASSERT_HOST(word->ratings->dimension() == rat1 + rat2); 00268 word->best_state += word2->best_state; 00269 // Append the word choices. 00270 *word->raw_choice += *word2->raw_choice; 00271 00272 // How many alt choices from each should we try to get? 00273 const int kAltsPerPiece = 2; 00274 // When do we start throwing away extra alt choices? 00275 const int kTooManyAltChoices = 100; 00276 00277 // Construct the cartesian product of the best_choices of word(1) and word2. 
00278 WERD_CHOICE_LIST joined_choices; 00279 WERD_CHOICE_IT jc_it(&joined_choices); 00280 WERD_CHOICE_IT bc1_it(&word->best_choices); 00281 WERD_CHOICE_IT bc2_it(&word2->best_choices); 00282 int num_word1_choices = word->best_choices.length(); 00283 int total_joined_choices = num_word1_choices; 00284 // Nota Bene: For the main loop here, we operate only on the 2nd and greater 00285 // word2 choices, and put them in the joined_choices list. The 1st word2 00286 // choice gets added to the original word1 choices in-place after we have 00287 // finished with them. 00288 int bc2_index = 1; 00289 for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) { 00290 if (total_joined_choices >= kTooManyAltChoices && 00291 bc2_index > kAltsPerPiece) 00292 break; 00293 int bc1_index = 0; 00294 for (bc1_it.move_to_first(); bc1_index < num_word1_choices; 00295 ++bc1_index, bc1_it.forward()) { 00296 if (total_joined_choices >= kTooManyAltChoices && 00297 bc1_index > kAltsPerPiece) 00298 break; 00299 WERD_CHOICE *wc = new WERD_CHOICE(*bc1_it.data()); 00300 *wc += *bc2_it.data(); 00301 jc_it.add_after_then_move(wc); 00302 ++total_joined_choices; 00303 } 00304 } 00305 // Now that we've filled in as many alternates as we want, paste the best 00306 // choice for word2 onto the original word alt_choices. 00307 bc1_it.move_to_first(); 00308 bc2_it.move_to_first(); 00309 for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) { 00310 *bc1_it.data() += *bc2_it.data(); 00311 } 00312 bc1_it.move_to_last(); 00313 bc1_it.add_list_after(&joined_choices); 00314 00315 // Restore the pointer to original blamer bundle and combine blamer 00316 // information recorded in the splits. 
00317 if (orig_bb != NULL) { 00318 orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle, 00319 wordrec_debug_blamer); 00320 delete word->blamer_bundle; 00321 word->blamer_bundle = orig_bb; 00322 } 00323 word->SetupBoxWord(); 00324 word->reject_map.initialise(word->box_word->length()); 00325 delete word2; 00326 } 00327 00328 00329 } // namespace tesseract