tesseract
3.03
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: trie.c (Formerly trie.c) 00005 * Description: Functions to build a trie data structure. 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Fri Jul 26 12:18:10 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 /*---------------------------------------------------------------------- 00026 I n c l u d e s 00027 ----------------------------------------------------------------------*/ 00028 #ifdef _MSC_VER 00029 #pragma warning(disable:4244) // Conversion warnings 00030 #pragma warning(disable:4800) // int/bool warnings 00031 #endif 00032 #include "trie.h" 00033 00034 #include "callcpp.h" 00035 #include "dawg.h" 00036 #include "dict.h" 00037 #include "freelist.h" 00038 #include "genericvector.h" 00039 #include "helpers.h" 00040 #include "kdpair.h" 00041 00042 namespace tesseract { 00043 00044 const char kDoNotReverse[] = "RRP_DO_NO_REVERSE"; 00045 const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL"; 00046 const char kForceReverse[] = "RRP_FORCE_REVERSE"; 00047 00048 const char * const RTLReversePolicyNames[] = { 00049 kDoNotReverse, 00050 kReverseIfHasRTL, 00051 kForceReverse 00052 }; 00053 00054 const char Trie::kAlphaPatternUnicode[] = "\u2000"; 00055 const char Trie::kDigitPatternUnicode[] = "\u2001"; 00056 const char Trie::kAlphanumPatternUnicode[] = "\u2002"; 00057 const char Trie::kPuncPatternUnicode[] = "\u2003"; 00058 const char Trie::kLowerPatternUnicode[] = "\u2004"; 00059 const char Trie::kUpperPatternUnicode[] = "\u2005"; 00060 00061 const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) { 00062 return RTLReversePolicyNames[reverse_policy]; 00063 } 00064 00065 // Reset the Trie to empty. 00066 void Trie::clear() { 00067 nodes_.delete_data_pointers(); 00068 nodes_.clear(); 00069 root_back_freelist_.clear(); 00070 num_edges_ = 0; 00071 new_dawg_node(); // Need to allocate node 0. 00072 } 00073 00074 bool Trie::edge_char_of(NODE_REF node_ref, NODE_REF next_node, 00075 int direction, bool word_end, UNICHAR_ID unichar_id, 00076 EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const { 00077 if (debug_level_ == 3) { 00078 tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT 00079 " direction %d word_end %d unichar_id %d, exploring node:\n", 00080 node_ref, next_node, direction, word_end, unichar_id); 00081 if (node_ref != NO_EDGE) { 00082 print_node(node_ref, nodes_[node_ref]->forward_edges.size()); 00083 } 00084 } 00085 if (node_ref == NO_EDGE) return false; 00086 assert(node_ref < nodes_.size()); 00087 EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ? 00088 nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges; 00089 int vec_size = vec.size(); 00090 if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search 00091 EDGE_INDEX start = 0; 00092 EDGE_INDEX end = vec_size - 1; 00093 EDGE_INDEX k; 00094 int compare; 00095 while (start <= end) { 00096 k = (start + end) >> 1; // (start + end) / 2 00097 compare = given_greater_than_edge_rec(next_node, word_end, 00098 unichar_id, vec[k]); 00099 if (compare == 0) { // given == vec[k] 00100 *edge_ptr = &(vec[k]); 00101 *edge_index = k; 00102 return true; 00103 } else if (compare == 1) { // given > vec[k] 00104 start = k + 1; 00105 } else { // given < vec[k] 00106 end = k - 1; 00107 } 00108 } 00109 } else { // linear search 00110 for (int i = 0; i < vec_size; ++i) { 00111 EDGE_RECORD &edge_rec = vec[i]; 00112 if (edge_rec_match(next_node, word_end, unichar_id, 00113 next_node_from_edge_rec(edge_rec), 00114 end_of_word_from_edge_rec(edge_rec), 00115 unichar_id_from_edge_rec(edge_rec))) { 00116 *edge_ptr = &(edge_rec); 00117 *edge_index = i; 00118 return true; 00119 } 00120 } 00121 } 00122 return false; // not found 00123 } 00124 00125 bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, 00126 int direction, bool word_end, 00127 UNICHAR_ID unichar_id) { 00128 if (num_edges_ == max_num_edges_) return false; 00129 EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? 00130 &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges); 00131 int search_index; 00132 if (node1 == 0 && direction == FORWARD_EDGE) { 00133 search_index = 0; // find the index to make the add sorted 00134 while (search_index < vec->size() && 00135 given_greater_than_edge_rec(node2, word_end, unichar_id, 00136 (*vec)[search_index]) == 1) { 00137 search_index++; 00138 } 00139 } else { 00140 search_index = vec->size(); // add is unsorted, so index does not matter 00141 } 00142 EDGE_RECORD edge_rec; 00143 link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id); 00144 if (node1 == 0 && direction == BACKWARD_EDGE && 00145 !root_back_freelist_.empty()) { 00146 EDGE_INDEX edge_index = root_back_freelist_.pop_back(); 00147 (*vec)[edge_index] = edge_rec; 00148 } else if (search_index < vec->size()) { 00149 vec->insert(edge_rec, search_index); 00150 } else { 00151 vec->push_back(edge_rec); 00152 } 00153 if (debug_level_ > 1) { 00154 tprintf("new edge in nodes_[" REFFORMAT "]: ", node1); 00155 print_edge_rec(edge_rec); 00156 tprintf("\n"); 00157 } 00158 num_edges_++; 00159 return true; 00160 } 00161 00162 void Trie::add_word_ending(EDGE_RECORD *edge_ptr, 00163 NODE_REF the_next_node, 00164 bool marker_flag, 00165 UNICHAR_ID unichar_id) { 00166 EDGE_RECORD *back_edge_ptr; 00167 EDGE_INDEX back_edge_index; 00168 ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false, 00169 unichar_id, &back_edge_ptr, &back_edge_index)); 00170 if (marker_flag) { 00171 *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_); 00172 *edge_ptr |= (MARKER_FLAG << flag_start_bit_); 00173 } 00174 // Mark both directions as end of word. 00175 *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_); 00176 *edge_ptr |= (WERD_END_FLAG << flag_start_bit_); 00177 } 00178 00179 bool Trie::add_word_to_dawg(const WERD_CHOICE &word, 00180 const GenericVector<bool> *repetitions) { 00181 if (word.length() <= 0) return false; // can't add empty words 00182 if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length()); 00183 // Make sure the word does not contain invalid unchar ids. 00184 for (int i = 0; i < word.length(); ++i) { 00185 if (word.unichar_id(i) < 0 || 00186 word.unichar_id(i) >= unicharset_size_) return false; 00187 } 00188 00189 EDGE_RECORD *edge_ptr; 00190 NODE_REF last_node = 0; 00191 NODE_REF the_next_node; 00192 bool marker_flag = false; 00193 EDGE_INDEX edge_index; 00194 int i; 00195 inT32 still_finding_chars = true; 00196 inT32 word_end = false; 00197 bool add_failed = false; 00198 bool found; 00199 00200 if (debug_level_ > 1) word.print("\nAdding word: "); 00201 00202 UNICHAR_ID unichar_id; 00203 for (i = 0; i < word.length() - 1; ++i) { 00204 unichar_id = word.unichar_id(i); 00205 marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false; 00206 if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id); 00207 if (still_finding_chars) { 00208 found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end, 00209 unichar_id, &edge_ptr, &edge_index); 00210 if (found && debug_level_ > 1) { 00211 tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n", 00212 edge_index, last_node); 00213 } 00214 if (!found) { 00215 still_finding_chars = false; 00216 } else if (next_node_from_edge_rec(*edge_ptr) == 0) { 00217 // We hit the end of an existing word, but the new word is longer. 00218 // In this case we have to disconnect the existing word from the 00219 // backwards root node, mark the current position as end-of-word 00220 // and add new nodes for the increased length. Disconnecting the 00221 // existing word from the backwards root node requires a linear 00222 // search, so it is much faster to add the longest words first, 00223 // to avoid having to come here. 00224 word_end = true; 00225 still_finding_chars = false; 00226 remove_edge(last_node, 0, word_end, unichar_id); 00227 } else { 00228 // We have to add a new branch here for the new word. 00229 if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr); 00230 last_node = next_node_from_edge_rec(*edge_ptr); 00231 } 00232 } 00233 if (!still_finding_chars) { 00234 the_next_node = new_dawg_node(); 00235 if (debug_level_ > 1) 00236 tprintf("adding node " REFFORMAT "\n", the_next_node); 00237 if (the_next_node == 0) { 00238 add_failed = true; 00239 break; 00240 } 00241 if (!add_new_edge(last_node, the_next_node, 00242 marker_flag, word_end, unichar_id)) { 00243 add_failed = true; 00244 break; 00245 } 00246 word_end = false; 00247 last_node = the_next_node; 00248 } 00249 } 00250 the_next_node = 0; 00251 unichar_id = word.unichar_id(i); 00252 marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false; 00253 if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id); 00254 if (still_finding_chars && 00255 edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false, 00256 unichar_id, &edge_ptr, &edge_index)) { 00257 // An extension of this word already exists in the trie, so we 00258 // only have to add the ending flags in both directions. 00259 add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr), 00260 marker_flag, unichar_id); 00261 } else { 00262 // Add a link to node 0. All leaves connect to node 0 so the back links can 00263 // be used in reduction to a dawg. This root backward node has one edge 00264 // entry for every word, (except prefixes of longer words) so it is huge. 00265 if (!add_failed && 00266 !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id)) 00267 add_failed = true; 00268 } 00269 if (add_failed) { 00270 tprintf("Re-initializing document dictionary...\n"); 00271 clear(); 00272 return false; 00273 } else { 00274 return true; 00275 } 00276 } 00277 00278 NODE_REF Trie::new_dawg_node() { 00279 TRIE_NODE_RECORD *node = new TRIE_NODE_RECORD(); 00280 if (node == NULL) return 0; // failed to create new node 00281 nodes_.push_back(node); 00282 return nodes_.length() - 1; 00283 } 00284 00285 // Sort function to sort words by decreasing order of length. 00286 static int sort_strings_by_dec_length(const void* v1, const void* v2) { 00287 const STRING* s1 = reinterpret_cast<const STRING*>(v1); 00288 const STRING* s2 = reinterpret_cast<const STRING*>(v2); 00289 return s2->length() - s1->length(); 00290 } 00291 00292 bool Trie::read_and_add_word_list(const char *filename, 00293 const UNICHARSET &unicharset, 00294 Trie::RTLReversePolicy reverse_policy) { 00295 GenericVector<STRING> word_list; 00296 if (!read_word_list(filename, unicharset, reverse_policy, &word_list)) 00297 return false; 00298 word_list.sort(sort_strings_by_dec_length); 00299 return add_word_list(word_list, unicharset); 00300 } 00301 00302 bool Trie::read_word_list(const char *filename, 00303 const UNICHARSET &unicharset, 00304 Trie::RTLReversePolicy reverse_policy, 00305 GenericVector<STRING>* words) { 00306 FILE *word_file; 00307 char string[CHARS_PER_LINE]; 00308 int word_count = 0; 00309 00310 word_file = fopen(filename, "rb"); 00311 if (word_file == NULL) return false; 00312 00313 while (fgets(string, CHARS_PER_LINE, word_file) != NULL) { 00314 chomp_string(string); // remove newline 00315 WERD_CHOICE word(string, unicharset); 00316 if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL && 00317 word.has_rtl_unichar_id()) || 00318 reverse_policy == RRP_FORCE_REVERSE) { 00319 word.reverse_and_mirror_unichar_ids(); 00320 } 00321 ++word_count; 00322 if (debug_level_ && word_count % 10000 == 0) 00323 tprintf("Read %d words so far\n", word_count); 00324 if (word.length() != 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) { 00325 words->push_back(word.unichar_string()); 00326 } else if (debug_level_) { 00327 tprintf("Skipping invalid word %s\n", string); 00328 if (debug_level_ >= 3) word.print(); 00329 } 00330 } 00331 if (debug_level_) 00332 tprintf("Read %d words total.\n", word_count); 00333 fclose(word_file); 00334 return true; 00335 } 00336 00337 bool Trie::add_word_list(const GenericVector<STRING>& words, 00338 const UNICHARSET &unicharset) { 00339 for (int i = 0; i < words.size(); ++i) { 00340 WERD_CHOICE word(words[i].string(), unicharset); 00341 if (!word_in_dawg(word)) { 00342 add_word_to_dawg(word); 00343 if (!word_in_dawg(word)) { 00344 tprintf("Error: word '%s' not in DAWG after adding it\n", 00345 words[i].string()); 00346 return false; 00347 } 00348 } 00349 } 00350 return true; 00351 } 00352 00353 void Trie::initialize_patterns(UNICHARSET *unicharset) { 00354 unicharset->unichar_insert(kAlphaPatternUnicode); 00355 alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode); 00356 unicharset->unichar_insert(kDigitPatternUnicode); 00357 digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode); 00358 unicharset->unichar_insert(kAlphanumPatternUnicode); 00359 alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode); 00360 unicharset->unichar_insert(kPuncPatternUnicode); 00361 punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode); 00362 unicharset->unichar_insert(kLowerPatternUnicode); 00363 lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode); 00364 unicharset->unichar_insert(kUpperPatternUnicode); 00365 upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode); 00366 initialized_patterns_ = true; 00367 unicharset_size_ = unicharset->size(); 00368 } 00369 00370 void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, 00371 const UNICHARSET &unicharset, 00372 GenericVector<UNICHAR_ID> *vec) const { 00373 bool is_alpha = unicharset.get_isalpha(unichar_id); 00374 if (is_alpha) { 00375 vec->push_back(alpha_pattern_); 00376 vec->push_back(alphanum_pattern_); 00377 if (unicharset.get_islower(unichar_id)) { 00378 vec->push_back(lower_pattern_); 00379 } else if (unicharset.get_isupper(unichar_id)) { 00380 vec->push_back(upper_pattern_); 00381 } 00382 } 00383 if (unicharset.get_isdigit(unichar_id)) { 00384 vec->push_back(digit_pattern_); 00385 if (!is_alpha) vec->push_back(alphanum_pattern_); 00386 } 00387 if (unicharset.get_ispunctuation(unichar_id)) { 00388 vec->push_back(punc_pattern_); 00389 } 00390 } 00391 00392 UNICHAR_ID Trie::character_class_to_pattern(char ch) { 00393 if (ch == 'c') { 00394 return alpha_pattern_; 00395 } else if (ch == 'd') { 00396 return digit_pattern_; 00397 } else if (ch == 'n') { 00398 return alphanum_pattern_; 00399 } else if (ch == 'p') { 00400 return punc_pattern_; 00401 } else if (ch == 'a') { 00402 return lower_pattern_; 00403 } else if (ch == 'A') { 00404 return upper_pattern_; 00405 } else { 00406 return INVALID_UNICHAR_ID; 00407 } 00408 } 00409 00410 bool Trie::read_pattern_list(const char *filename, 00411 const UNICHARSET &unicharset) { 00412 if (!initialized_patterns_) { 00413 tprintf("please call initialize_patterns() before read_pattern_list()\n"); 00414 return false; 00415 } 00416 00417 FILE *pattern_file = fopen(filename, "rb"); 00418 if (pattern_file == NULL) { 00419 tprintf("Error opening pattern file %s\n", filename); 00420 return false; 00421 } 00422 00423 int pattern_count = 0; 00424 char string[CHARS_PER_LINE]; 00425 while (fgets(string, CHARS_PER_LINE, pattern_file) != NULL) { 00426 chomp_string(string); // remove newline 00427 // Parse the pattern and construct a unichar id vector. 00428 // Record the number of repetitions of each unichar in the parallel vector. 00429 WERD_CHOICE word(&unicharset); 00430 GenericVector<bool> repetitions_vec; 00431 const char *str_ptr = string; 00432 int step = unicharset.step(str_ptr); 00433 bool failed = false; 00434 while (step > 0) { 00435 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID; 00436 if (step == 1 && *str_ptr == '\\') { 00437 ++str_ptr; 00438 if (*str_ptr == '\\') { // regular '\' unichar that was escaped 00439 curr_unichar_id = unicharset.unichar_to_id(str_ptr, step); 00440 } else { 00441 if (word.length() < kSaneNumConcreteChars) { 00442 tprintf("Please provide at least %d concrete characters at the" 00443 " beginning of the pattern\n", kSaneNumConcreteChars); 00444 failed = true; 00445 break; 00446 } 00447 // Parse character class from expression. 00448 curr_unichar_id = character_class_to_pattern(*str_ptr); 00449 } 00450 } else { 00451 curr_unichar_id = unicharset.unichar_to_id(str_ptr, step); 00452 } 00453 if (curr_unichar_id == INVALID_UNICHAR_ID) { 00454 failed = true; 00455 break; // failed to parse this pattern 00456 } 00457 word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0); 00458 repetitions_vec.push_back(false); 00459 str_ptr += step; 00460 step = unicharset.step(str_ptr); 00461 // Check if there is a repetition pattern specified after this unichar. 00462 if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') { 00463 repetitions_vec[repetitions_vec.size()-1] = true; 00464 str_ptr += 2; 00465 step = unicharset.step(str_ptr); 00466 } 00467 } 00468 if (failed) { 00469 tprintf("Invalid user pattern %s\n", string); 00470 continue; 00471 } 00472 // Insert the pattern into the trie. 00473 if (debug_level_ > 2) { 00474 tprintf("Inserting expanded user pattern %s\n", 00475 word.debug_string().string()); 00476 } 00477 if (!this->word_in_dawg(word)) { 00478 this->add_word_to_dawg(word, &repetitions_vec); 00479 if (!this->word_in_dawg(word)) { 00480 tprintf("Error: failed to insert pattern '%s'\n", string); 00481 } 00482 } 00483 ++pattern_count; 00484 } 00485 if (debug_level_) { 00486 tprintf("Read %d valid patterns from %s\n", pattern_count, filename); 00487 } 00488 fclose(pattern_file); 00489 return true; 00490 } 00491 00492 void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, 00493 bool word_end, UNICHAR_ID unichar_id) { 00494 EDGE_RECORD *edge_ptr = NULL; 00495 EDGE_INDEX edge_index = 0; 00496 ASSERT_HOST(edge_char_of(node1, node2, direction, word_end, 00497 unichar_id, &edge_ptr, &edge_index)); 00498 if (debug_level_ > 1) { 00499 tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1); 00500 print_edge_rec(*edge_ptr); 00501 tprintf("\n"); 00502 } 00503 if (direction == FORWARD_EDGE) { 00504 nodes_[node1]->forward_edges.remove(edge_index); 00505 } else if (node1 == 0) { 00506 KillEdge(&nodes_[node1]->backward_edges[edge_index]); 00507 root_back_freelist_.push_back(edge_index); 00508 } else { 00509 nodes_[node1]->backward_edges.remove(edge_index); 00510 } 00511 --num_edges_; 00512 } 00513 00514 // Some optimizations employed in add_word_to_dawg and trie_to_dawg: 00515 // 1 Avoid insertion sorting or bubble sorting the tail root node 00516 // (back links on node 0, a list of all the leaves.). The node is 00517 // huge, and sorting it with n^2 time is terrible. 00518 // 2 Avoid using GenericVector::remove on the tail root node. 00519 // (a) During add of words to the trie, zero-out the unichars and 00520 // keep a freelist of spaces to re-use. 00521 // (b) During reduction, just zero-out the unichars of deleted back 00522 // links, skipping zero entries while searching. 00523 // 3 Avoid linear search of the tail root node. This has to be done when 00524 // a suffix is added to an existing word. Adding words by decreasing 00525 // length avoids this problem entirely. Words can still be added in 00526 // any order, but it is faster to add the longest first. 00527 SquishedDawg *Trie::trie_to_dawg() { 00528 root_back_freelist_.clear(); // Will be invalided by trie_to_dawg. 00529 if (debug_level_ > 2) { 00530 print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY); 00531 } 00532 NODE_MARKER reduced_nodes = new bool[nodes_.size()]; 00533 for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = 0; 00534 this->reduce_node_input(0, reduced_nodes); 00535 delete[] reduced_nodes; 00536 00537 if (debug_level_ > 2) { 00538 print_all("After reduction:", MAX_NODE_EDGES_DISPLAY); 00539 } 00540 // Build a translation map from node indices in nodes_ vector to 00541 // their target indices in EDGE_ARRAY. 00542 NODE_REF *node_ref_map = new NODE_REF[nodes_.size() + 1]; 00543 int i, j; 00544 node_ref_map[0] = 0; 00545 for (i = 0; i < nodes_.size(); ++i) { 00546 node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size(); 00547 } 00548 int num_forward_edges = node_ref_map[i]; 00549 00550 // Convert nodes_ vector into EDGE_ARRAY translating the next node references 00551 // in edges using node_ref_map. Empty nodes and backward edges are dropped. 00552 EDGE_ARRAY edge_array = 00553 (EDGE_ARRAY)memalloc(num_forward_edges * sizeof(EDGE_RECORD)); 00554 EDGE_ARRAY edge_array_ptr = edge_array; 00555 for (i = 0; i < nodes_.size(); ++i) { 00556 TRIE_NODE_RECORD *node_ptr = nodes_[i]; 00557 int end = node_ptr->forward_edges.size(); 00558 for (j = 0; j < end; ++j) { 00559 EDGE_RECORD &edge_rec = node_ptr->forward_edges[j]; 00560 NODE_REF node_ref = next_node_from_edge_rec(edge_rec); 00561 ASSERT_HOST(node_ref < nodes_.size()); 00562 UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec); 00563 link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE, 00564 end_of_word_from_edge_rec(edge_rec), unichar_id); 00565 if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr); 00566 ++edge_array_ptr; 00567 } 00568 } 00569 delete[] node_ref_map; 00570 00571 return new SquishedDawg(edge_array, num_forward_edges, type_, lang_, 00572 perm_, unicharset_size_, debug_level_); 00573 } 00574 00575 bool Trie::eliminate_redundant_edges(NODE_REF node, 00576 const EDGE_RECORD &edge1, 00577 const EDGE_RECORD &edge2) { 00578 if (debug_level_ > 1) { 00579 tprintf("\nCollapsing node %d:\n", node); 00580 print_node(node, MAX_NODE_EDGES_DISPLAY); 00581 tprintf("Candidate edges: "); 00582 print_edge_rec(edge1); 00583 tprintf(", "); 00584 print_edge_rec(edge2); 00585 tprintf("\n\n"); 00586 } 00587 NODE_REF next_node1 = next_node_from_edge_rec(edge1); 00588 NODE_REF next_node2 = next_node_from_edge_rec(edge2); 00589 TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2]; 00590 // Translate all edges going to/from next_node2 to go to/from next_node1. 00591 EDGE_RECORD *edge_ptr = NULL; 00592 EDGE_INDEX edge_index; 00593 int i; 00594 // The backward link in node to next_node2 will be zeroed out by the caller. 00595 // Copy all the backward links in next_node2 to node next_node1 00596 for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) { 00597 const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i]; 00598 NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge); 00599 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge); 00600 int curr_word_end = end_of_word_from_edge_rec(bkw_edge); 00601 bool marker_flag = marker_flag_from_edge_rec(bkw_edge); 00602 add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE, 00603 curr_word_end, curr_unichar_id); 00604 // Relocate the corresponding forward edge in curr_next_node 00605 ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE, 00606 curr_word_end, curr_unichar_id, 00607 &edge_ptr, &edge_index)); 00608 set_next_node_in_edge_rec(edge_ptr, next_node1); 00609 } 00610 int next_node2_num_edges = (next_node2_ptr->forward_edges.size() + 00611 next_node2_ptr->backward_edges.size()); 00612 if (debug_level_ > 1) { 00613 tprintf("removed %d edges from node " REFFORMAT "\n", 00614 next_node2_num_edges, next_node2); 00615 } 00616 next_node2_ptr->forward_edges.clear(); 00617 next_node2_ptr->backward_edges.clear(); 00618 num_edges_ -= next_node2_num_edges; 00619 return true; 00620 } 00621 00622 bool Trie::reduce_lettered_edges(EDGE_INDEX edge_index, 00623 UNICHAR_ID unichar_id, 00624 NODE_REF node, 00625 EDGE_VECTOR* backward_edges, 00626 NODE_MARKER reduced_nodes) { 00627 if (debug_level_ > 1) 00628 tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index); 00629 // Compare each of the edge pairs with the given unichar_id. 00630 bool did_something = false; 00631 for (int i = edge_index; i < backward_edges->size() - 1; ++i) { 00632 // Find the first edge that can be eliminated. 00633 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID; 00634 while (i < backward_edges->size()) { 00635 curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]); 00636 if (curr_unichar_id != 0) { 00637 if (curr_unichar_id != unichar_id) return did_something; 00638 if (can_be_eliminated((*backward_edges)[i])) break; 00639 } 00640 ++i; 00641 } 00642 if (i == backward_edges->size()) break; 00643 const EDGE_RECORD &edge_rec = (*backward_edges)[i]; 00644 // Compare it to the rest of the edges with the given unichar_id. 00645 for (int j = i + 1; j < backward_edges->size(); ++j) { 00646 const EDGE_RECORD &next_edge_rec = (*backward_edges)[j]; 00647 UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec); 00648 if (next_id == 0) continue; 00649 if (next_id != unichar_id) break; 00650 if (end_of_word_from_edge_rec(next_edge_rec) == 00651 end_of_word_from_edge_rec(edge_rec) && 00652 can_be_eliminated(next_edge_rec) && 00653 eliminate_redundant_edges(node, edge_rec, next_edge_rec)) { 00654 reduced_nodes[next_node_from_edge_rec(edge_rec)] = 0; 00655 did_something = true; 00656 KillEdge(&(*backward_edges)[j]); 00657 } 00658 } 00659 } 00660 return did_something; 00661 } 00662 00663 void Trie::sort_edges(EDGE_VECTOR *edges) { 00664 int num_edges = edges->size(); 00665 if (num_edges <= 1) return; 00666 GenericVector<KDPairInc<UNICHAR_ID, EDGE_RECORD> > sort_vec; 00667 sort_vec.reserve(num_edges); 00668 for (int i = 0; i < num_edges; ++i) { 00669 sort_vec.push_back(KDPairInc<UNICHAR_ID, EDGE_RECORD>( 00670 unichar_id_from_edge_rec((*edges)[i]), (*edges)[i])); 00671 } 00672 sort_vec.sort(); 00673 for (int i = 0; i < num_edges; ++i) 00674 (*edges)[i] = sort_vec[i].data; 00675 } 00676 00677 void Trie::reduce_node_input(NODE_REF node, 00678 NODE_MARKER reduced_nodes) { 00679 if (debug_level_ > 1) { 00680 tprintf("reduce_node_input(node=" REFFORMAT ")\n", node); 00681 print_node(node, MAX_NODE_EDGES_DISPLAY); 00682 } 00683 00684 EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges; 00685 sort_edges(&backward_edges); 00686 EDGE_INDEX edge_index = 0; 00687 while (edge_index < backward_edges.size()) { 00688 UNICHAR_ID unichar_id = 00689 unichar_id_from_edge_rec(backward_edges[edge_index]); 00690 while (reduce_lettered_edges(edge_index, unichar_id, node, 00691 &backward_edges, reduced_nodes)); 00692 while (++edge_index < backward_edges.size()) { 00693 UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]); 00694 if (id != 0 && id != unichar_id) break; 00695 } 00696 } 00697 reduced_nodes[node] = true; // mark as reduced 00698 00699 if (debug_level_ > 1) { 00700 tprintf("Node " REFFORMAT " after reduction:\n", node); 00701 print_node(node, MAX_NODE_EDGES_DISPLAY); 00702 } 00703 00704 for (int i = 0; i < backward_edges.size(); ++i) { 00705 NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]); 00706 if (next_node != 0 && !reduced_nodes[next_node]) { 00707 reduce_node_input(next_node, reduced_nodes); 00708 } 00709 } 00710 } 00711 00712 void Trie::print_node(NODE_REF node, int max_num_edges) const { 00713 if (node == NO_EDGE) return; // nothing to print 00714 TRIE_NODE_RECORD *node_ptr = nodes_[node]; 00715 int num_fwd = node_ptr->forward_edges.size(); 00716 int num_bkw = node_ptr->backward_edges.size(); 00717 EDGE_VECTOR *vec; 00718 for (int dir = 0; dir < 2; ++dir) { 00719 if (dir == 0) { 00720 vec = &(node_ptr->forward_edges); 00721 tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw); 00722 } else { 00723 vec = &(node_ptr->backward_edges); 00724 tprintf("\t"); 00725 } 00726 int i; 00727 for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) && 00728 i < max_num_edges; ++i) { 00729 print_edge_rec((*vec)[i]); 00730 tprintf(" "); 00731 } 00732 if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("..."); 00733 tprintf("\n"); 00734 } 00735 } 00736 00737 } // namespace tesseract