tesseract
3.03
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: pieces.c (Formerly pieces.c) 00005 * Description: 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Mon May 20 12:12:35 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 /*---------------------------------------------------------------------- 00026 I n c l u d e s 00027 ----------------------------------------------------------------------*/ 00028 00029 #include "blobs.h" 00030 #include "freelist.h" 00031 #include "helpers.h" 00032 #include "matrix.h" 00033 #include "ndminx.h" 00034 #include "ratngs.h" 00035 #include "seam.h" 00036 #include "wordrec.h" 00037 00038 // Include automatically generated configuration file if running autoconf. 00039 #ifdef HAVE_CONFIG_H 00040 #include "config_auto.h" 00041 #endif 00042 00043 /*---------------------------------------------------------------------- 00044 F u n c t i o n s 00045 ----------------------------------------------------------------------*/ 00046 00047 /********************************************************************** 00048 * classify_piece 00049 * 00050 * Create a larger piece from a collection of smaller ones. Classify 00051 * it and return the results. Take the large piece apart to leave 00052 * the collection of small pieces un modified. 00053 **********************************************************************/ 00054 namespace tesseract { 00055 BLOB_CHOICE_LIST *Wordrec::classify_piece(const GenericVector<SEAM*>& seams, 00056 inT16 start, 00057 inT16 end, 00058 const char* description, 00059 TWERD *word, 00060 BlamerBundle *blamer_bundle) { 00061 if (end > start) join_pieces(seams, start, end, word); 00062 BLOB_CHOICE_LIST *choices = classify_blob(word->blobs[start], description, 00063 White, blamer_bundle); 00064 // Set the matrix_cell_ entries in all the BLOB_CHOICES. 00065 BLOB_CHOICE_IT bc_it(choices); 00066 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) { 00067 bc_it.data()->set_matrix_cell(start, end); 00068 } 00069 00070 if (end > start) break_pieces(seams, start, end, word); 00071 00072 return (choices); 00073 } 00074 00075 template<class BLOB_CHOICE> 00076 int SortByUnicharID(const void *void1, const void *void2) { 00077 const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1); 00078 const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2); 00079 00080 return p1->unichar_id() - p2->unichar_id(); 00081 } 00082 00083 template<class BLOB_CHOICE> 00084 int SortByRating(const void *void1, const void *void2) { 00085 const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1); 00086 const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2); 00087 00088 if (p1->rating() < p2->rating()) 00089 return 1; 00090 return -1; 00091 } 00092 00093 00094 /********************************************************************** 00095 * fill_filtered_fragment_list 00096 * 00097 * Filter the fragment list so that the filtered_choices only contain 00098 * fragments that are in the correct position. choices is the list 00099 * that we are going to filter. fragment_pos is the position in the 00100 * fragment that we are looking for and num_frag_parts is the the 00101 * total number of pieces. The result will be appended to 00102 * filtered_choices. 00103 **********************************************************************/ 00104 void Wordrec::fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, 00105 int fragment_pos, 00106 int num_frag_parts, 00107 BLOB_CHOICE_LIST *filtered_choices) { 00108 BLOB_CHOICE_IT filtered_choices_it(filtered_choices); 00109 BLOB_CHOICE_IT choices_it(choices); 00110 00111 for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); 00112 choices_it.forward()) { 00113 UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id(); 00114 const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id); 00115 00116 if (frag != NULL && frag->get_pos() == fragment_pos && 00117 frag->get_total() == num_frag_parts) { 00118 // Recover the unichar_id of the unichar that this fragment is 00119 // a part of 00120 BLOB_CHOICE *b = new BLOB_CHOICE(*choices_it.data()); 00121 int original_unichar = unicharset.unichar_to_id(frag->get_unichar()); 00122 b->set_unichar_id(original_unichar); 00123 filtered_choices_it.add_to_end(b); 00124 } 00125 } 00126 00127 filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>); 00128 } 00129 00130 00131 /********************************************************************** 00132 * merge_and_put_fragment_lists 00133 * 00134 * Merge the fragment lists in choice_lists and append it to the 00135 * ratings matrix. 00136 **********************************************************************/ 00137 void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column, 00138 inT16 num_frag_parts, 00139 BLOB_CHOICE_LIST *choice_lists, 00140 MATRIX *ratings) { 00141 BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts]; 00142 00143 for (int i = 0; i < num_frag_parts; i++) { 00144 choice_lists_it[i].set_to_list(&choice_lists[i]); 00145 choice_lists_it[i].mark_cycle_pt(); 00146 } 00147 00148 BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column); 00149 if (merged_choice == NULL) 00150 merged_choice = new BLOB_CHOICE_LIST; 00151 00152 bool end_of_list = false; 00153 BLOB_CHOICE_IT merged_choice_it(merged_choice); 00154 while (!end_of_list) { 00155 // Find the maximum unichar_id of the current entry the iterators 00156 // are pointing at 00157 UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id(); 00158 for (int i = 0; i < num_frag_parts; i++) { 00159 UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); 00160 if (max_unichar_id < unichar_id) { 00161 max_unichar_id = unichar_id; 00162 } 00163 } 00164 00165 // Move the each iterators until it gets to an entry that has a 00166 // value greater than or equal to max_unichar_id 00167 for (int i = 0; i < num_frag_parts; i++) { 00168 UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); 00169 while (!choice_lists_it[i].cycled_list() && 00170 unichar_id < max_unichar_id) { 00171 choice_lists_it[i].forward(); 00172 unichar_id = choice_lists_it[i].data()->unichar_id(); 00173 } 00174 if (choice_lists_it[i].cycled_list()) { 00175 end_of_list = true; 00176 break; 00177 } 00178 } 00179 00180 if (end_of_list) 00181 break; 00182 00183 // Checks if the fragments are parts of the same character 00184 UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id(); 00185 bool same_unichar = true; 00186 for (int i = 1; i < num_frag_parts; i++) { 00187 UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); 00188 if (unichar_id != first_unichar_id) { 00189 same_unichar = false; 00190 break; 00191 } 00192 } 00193 00194 if (same_unichar) { 00195 // Add the merged character to the result 00196 UNICHAR_ID merged_unichar_id = first_unichar_id; 00197 inT16 merged_fontinfo_id = choice_lists_it[0].data()->fontinfo_id(); 00198 inT16 merged_fontinfo_id2 = choice_lists_it[0].data()->fontinfo_id2(); 00199 float merged_min_xheight = choice_lists_it[0].data()->min_xheight(); 00200 float merged_max_xheight = choice_lists_it[0].data()->max_xheight(); 00201 float positive_yshift = 0, negative_yshift = 0; 00202 int merged_script_id = choice_lists_it[0].data()->script_id(); 00203 BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier(); 00204 00205 float merged_rating = 0, merged_certainty = 0; 00206 for (int i = 0; i < num_frag_parts; i++) { 00207 float rating = choice_lists_it[i].data()->rating(); 00208 float certainty = choice_lists_it[i].data()->certainty(); 00209 00210 if (i == 0 || certainty < merged_certainty) 00211 merged_certainty = certainty; 00212 merged_rating += rating; 00213 00214 choice_lists_it[i].forward(); 00215 if (choice_lists_it[i].cycled_list()) 00216 end_of_list = true; 00217 IntersectRange(choice_lists_it[i].data()->min_xheight(), 00218 choice_lists_it[i].data()->max_xheight(), 00219 &merged_min_xheight, &merged_max_xheight); 00220 float yshift = choice_lists_it[i].data()->yshift(); 00221 if (yshift > positive_yshift) positive_yshift = yshift; 00222 if (yshift < negative_yshift) negative_yshift = yshift; 00223 } 00224 00225 float merged_yshift = positive_yshift != 0 00226 ? (negative_yshift != 0 ? 0 : positive_yshift) 00227 : negative_yshift; 00228 merged_choice_it.add_to_end(new BLOB_CHOICE(merged_unichar_id, 00229 merged_rating, 00230 merged_certainty, 00231 merged_fontinfo_id, 00232 merged_fontinfo_id2, 00233 merged_script_id, 00234 merged_min_xheight, 00235 merged_max_xheight, 00236 merged_yshift, 00237 classifier)); 00238 } 00239 } 00240 00241 if (classify_debug_level) 00242 print_ratings_list("Merged Fragments", merged_choice, 00243 unicharset); 00244 00245 if (merged_choice->empty()) 00246 delete merged_choice; 00247 else 00248 ratings->put(row, column, merged_choice); 00249 00250 delete [] choice_lists_it; 00251 } 00252 00253 00254 /********************************************************************** 00255 * get_fragment_lists 00256 * 00257 * Recursively go through the ratings matrix to find lists of fragments 00258 * to be merged in the function merge_and_put_fragment_lists. 00259 * current_frag is the postion of the piece we are looking for. 00260 * current_row is the row in the rating matrix we are currently at. 00261 * start is the row we started initially, so that we can know where 00262 * to append the results to the matrix. num_frag_parts is the total 00263 * number of pieces we are looking for and num_blobs is the size of the 00264 * ratings matrix. 00265 **********************************************************************/ 00266 void Wordrec::get_fragment_lists(inT16 current_frag, inT16 current_row, 00267 inT16 start, inT16 num_frag_parts, 00268 inT16 num_blobs, MATRIX *ratings, 00269 BLOB_CHOICE_LIST *choice_lists) { 00270 if (current_frag == num_frag_parts) { 00271 merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts, 00272 choice_lists, ratings); 00273 return; 00274 } 00275 00276 for (inT16 x = current_row; x < num_blobs; x++) { 00277 BLOB_CHOICE_LIST *choices = ratings->get(current_row, x); 00278 if (choices == NULL) 00279 continue; 00280 00281 fill_filtered_fragment_list(choices, current_frag, num_frag_parts, 00282 &choice_lists[current_frag]); 00283 if (!choice_lists[current_frag].empty()) { 00284 get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts, 00285 num_blobs, ratings, choice_lists); 00286 choice_lists[current_frag].clear(); 00287 } 00288 } 00289 } 00290 00291 00292 /********************************************************************** 00293 * merge_fragments 00294 * 00295 * Try to merge fragments in the ratings matrix and put the result in 00296 * the corresponding row and column 00297 **********************************************************************/ 00298 void Wordrec::merge_fragments(MATRIX *ratings, inT16 num_blobs) { 00299 BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks]; 00300 for (inT16 start = 0; start < num_blobs; start++) { 00301 for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks; 00302 frag_parts++) { 00303 get_fragment_lists(0, start, start, frag_parts, num_blobs, 00304 ratings, choice_lists); 00305 } 00306 } 00307 00308 // Delete fragments from the rating matrix 00309 for (inT16 x = 0; x < num_blobs; x++) { 00310 for (inT16 y = x; y < num_blobs; y++) { 00311 BLOB_CHOICE_LIST *choices = ratings->get(x, y); 00312 if (choices != NULL) { 00313 BLOB_CHOICE_IT choices_it(choices); 00314 for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); 00315 choices_it.forward()) { 00316 UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id(); 00317 const CHAR_FRAGMENT *frag = 00318 unicharset.get_fragment(choice_unichar_id); 00319 if (frag != NULL) 00320 delete choices_it.extract(); 00321 } 00322 } 00323 } 00324 } 00325 } 00326 00327 00328 } // namespace tesseract