tesseract
3.03
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: dawg.c (Formerly dawg.c) 00005 * Description: Use a Directed Accyclic Word Graph 00006 * Author: Mark Seaman, OCR Technology 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Wed Jul 24 16:59:16 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 /*---------------------------------------------------------------------- 00026 I n c l u d e s 00027 ----------------------------------------------------------------------*/ 00028 00029 #ifdef _MSC_VER 00030 #pragma warning(disable:4244) // Conversion warnings 00031 #pragma warning(disable:4800) // int/bool warnings 00032 #endif 00033 #include "dawg.h" 00034 00035 #include "cutil.h" 00036 #include "dict.h" 00037 #include "emalloc.h" 00038 #include "freelist.h" 00039 #include "helpers.h" 00040 #include "strngs.h" 00041 #include "tesscallback.h" 00042 #include "tprintf.h" 00043 00044 /*---------------------------------------------------------------------- 00045 F u n c t i o n s f o r D a w g 00046 ----------------------------------------------------------------------*/ 00047 namespace tesseract { 00048 00049 bool Dawg::prefix_in_dawg(const WERD_CHOICE &word, 00050 bool requires_complete) const { 00051 if (word.length() == 0) return !requires_complete; 00052 NODE_REF node = 0; 00053 int end_index = word.length() - 1; 00054 for (int i = 0; i < end_index; i++) { 00055 EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false); 00056 if (edge == NO_EDGE) { 00057 return false; 00058 } 00059 if ((node = next_node(edge)) == 0) { 00060 // This only happens if all words following this edge terminate -- 00061 // there are no larger words. See Trie::add_word_to_dawg() 00062 return false; 00063 } 00064 } 00065 // Now check the last character. 00066 return edge_char_of(node, word.unichar_id(end_index), requires_complete) != 00067 NO_EDGE; 00068 } 00069 00070 bool Dawg::word_in_dawg(const WERD_CHOICE &word) const { 00071 return prefix_in_dawg(word, true); 00072 } 00073 00074 int Dawg::check_for_words(const char *filename, 00075 const UNICHARSET &unicharset, 00076 bool enable_wildcard) const { 00077 if (filename == NULL) return 0; 00078 00079 FILE *word_file; 00080 char string [CHARS_PER_LINE]; 00081 int misses = 0; 00082 UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard); 00083 00084 word_file = open_file (filename, "r"); 00085 00086 while (fgets (string, CHARS_PER_LINE, word_file) != NULL) { 00087 chomp_string(string); // remove newline 00088 WERD_CHOICE word(string, unicharset); 00089 if (word.length() > 0 && 00090 !word.contains_unichar_id(INVALID_UNICHAR_ID)) { 00091 if (!match_words(&word, 0, 0, 00092 enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) { 00093 tprintf("Missing word: %s\n", string); 00094 ++misses; 00095 } 00096 } else { 00097 tprintf("Failed to create a valid word from %s\n", string); 00098 } 00099 } 00100 fclose (word_file); 00101 // Make sure the user sees this with fprintf instead of tprintf. 00102 if (debug_level_) tprintf("Number of lost words=%d\n", misses); 00103 return misses; 00104 } 00105 00106 void Dawg::iterate_words(const UNICHARSET &unicharset, 00107 TessCallback1<const WERD_CHOICE *> *cb) const { 00108 WERD_CHOICE word(&unicharset); 00109 iterate_words_rec(word, 0, cb); 00110 } 00111 00112 void CallWithUTF8(TessCallback1<const char *> *cb, const WERD_CHOICE *wc) { 00113 STRING s; 00114 wc->string_and_lengths(&s, NULL); 00115 cb->Run(s.string()); 00116 } 00117 00118 void Dawg::iterate_words(const UNICHARSET &unicharset, 00119 TessCallback1<const char *> *cb) const { 00120 TessCallback1<const WERD_CHOICE *> *shim = 00121 NewPermanentTessCallback(CallWithUTF8, cb); 00122 WERD_CHOICE word(&unicharset); 00123 iterate_words_rec(word, 0, shim); 00124 delete shim; 00125 } 00126 00127 void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far, 00128 NODE_REF to_explore, 00129 TessCallback1<const WERD_CHOICE *> *cb) const { 00130 NodeChildVector children; 00131 this->unichar_ids_of(to_explore, &children, false); 00132 for (int i = 0; i < children.size(); i++) { 00133 WERD_CHOICE next_word(word_so_far); 00134 next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0); 00135 if (this->end_of_word(children[i].edge_ref)) { 00136 cb->Run(&next_word); 00137 } 00138 NODE_REF next = next_node(children[i].edge_ref); 00139 if (next != 0) { 00140 iterate_words_rec(next_word, next, cb); 00141 } 00142 } 00143 } 00144 00145 bool Dawg::match_words(WERD_CHOICE *word, inT32 index, 00146 NODE_REF node, UNICHAR_ID wildcard) const { 00147 EDGE_REF edge; 00148 inT32 word_end; 00149 00150 if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) { 00151 bool any_matched = false; 00152 NodeChildVector vec; 00153 this->unichar_ids_of(node, &vec, false); 00154 for (int i = 0; i < vec.size(); ++i) { 00155 word->set_unichar_id(vec[i].unichar_id, index); 00156 if (match_words(word, index, node, wildcard)) 00157 any_matched = true; 00158 } 00159 word->set_unichar_id(wildcard, index); 00160 return any_matched; 00161 } else { 00162 word_end = index == word->length() - 1; 00163 edge = edge_char_of(node, word->unichar_id(index), word_end); 00164 if (edge != NO_EDGE) { // normal edge in DAWG 00165 node = next_node(edge); 00166 if (word_end) { 00167 if (debug_level_ > 1) word->print("match_words() found: "); 00168 return true; 00169 } else if (node != 0) { 00170 return match_words(word, index+1, node, wildcard); 00171 } 00172 } 00173 } 00174 return false; 00175 } 00176 00177 void Dawg::init(DawgType type, const STRING &lang, 00178 PermuterType perm, int unicharset_size, int debug_level) { 00179 type_ = type; 00180 lang_ = lang; 00181 perm_ = perm; 00182 ASSERT_HOST(unicharset_size > 0); 00183 unicharset_size_ = unicharset_size; 00184 // Set bit masks. 00185 flag_start_bit_ = ceil(log(static_cast<double>(unicharset_size_)) / log(2.0)); 00186 next_node_start_bit_ = flag_start_bit_ + NUM_FLAG_BITS; 00187 letter_mask_ = ~(~0 << flag_start_bit_); 00188 next_node_mask_ = ~0 << (flag_start_bit_ + NUM_FLAG_BITS); 00189 flags_mask_ = ~(letter_mask_ | next_node_mask_); 00190 00191 debug_level_ = debug_level; 00192 } 00193 00194 00195 /*---------------------------------------------------------------------- 00196 F u n c t i o n s f o r S q u i s h e d D a w g 00197 ----------------------------------------------------------------------*/ 00198 00199 SquishedDawg::~SquishedDawg() { memfree(edges_); } 00200 00201 EDGE_REF SquishedDawg::edge_char_of(NODE_REF node, 00202 UNICHAR_ID unichar_id, 00203 bool word_end) const { 00204 EDGE_REF edge = node; 00205 if (node == 0) { // binary search 00206 EDGE_REF start = 0; 00207 EDGE_REF end = num_forward_edges_in_node0 - 1; 00208 int compare; 00209 while (start <= end) { 00210 edge = (start + end) >> 1; // (start + end) / 2 00211 compare = given_greater_than_edge_rec(NO_EDGE, word_end, 00212 unichar_id, edges_[edge]); 00213 if (compare == 0) { // given == vec[k] 00214 return edge; 00215 } else if (compare == 1) { // given > vec[k] 00216 start = edge + 1; 00217 } else { // given < vec[k] 00218 end = edge - 1; 00219 } 00220 } 00221 } else { // linear search 00222 if (edge != NO_EDGE && edge_occupied(edge)) { 00223 do { 00224 if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) && 00225 (!word_end || end_of_word_from_edge_rec(edges_[edge]))) 00226 return (edge); 00227 } while (!last_edge(edge++)); 00228 } 00229 } 00230 return (NO_EDGE); // not found 00231 } 00232 00233 inT32 SquishedDawg::num_forward_edges(NODE_REF node) const { 00234 EDGE_REF edge = node; 00235 inT32 num = 0; 00236 00237 if (forward_edge (edge)) { 00238 do { 00239 num++; 00240 } while (!last_edge(edge++)); 00241 } 00242 00243 return (num); 00244 } 00245 00246 void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const { 00247 if (node == NO_EDGE) return; // nothing to print 00248 00249 EDGE_REF edge = node; 00250 const char *forward_string = "FORWARD"; 00251 const char *backward_string = " "; 00252 00253 const char *last_string = "LAST"; 00254 const char *not_last_string = " "; 00255 00256 const char *eow_string = "EOW"; 00257 const char *not_eow_string = " "; 00258 00259 const char *direction; 00260 const char *is_last; 00261 const char *eow; 00262 00263 UNICHAR_ID unichar_id; 00264 00265 if (edge_occupied(edge)) { 00266 do { 00267 direction = 00268 forward_edge(edge) ? forward_string : backward_string; 00269 is_last = last_edge(edge) ? last_string : not_last_string; 00270 eow = end_of_word(edge) ? eow_string : not_eow_string; 00271 00272 unichar_id = edge_letter(edge); 00273 tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n", 00274 edge, next_node(edge), unichar_id, 00275 direction, is_last, eow); 00276 00277 if (edge - node > max_num_edges) return; 00278 } while (!last_edge(edge++)); 00279 00280 if (edge < num_edges_ && 00281 edge_occupied(edge) && backward_edge(edge)) { 00282 do { 00283 direction = 00284 forward_edge(edge) ? forward_string : backward_string; 00285 is_last = last_edge(edge) ? last_string : not_last_string; 00286 eow = end_of_word(edge) ? eow_string : not_eow_string; 00287 00288 unichar_id = edge_letter(edge); 00289 tprintf(REFFORMAT " : next = " REFFORMAT 00290 ", unichar_id = %d, %s %s %s\n", 00291 edge, next_node(edge), unichar_id, 00292 direction, is_last, eow); 00293 00294 if (edge - node > MAX_NODE_EDGES_DISPLAY) return; 00295 } while (!last_edge(edge++)); 00296 } 00297 } 00298 else { 00299 tprintf(REFFORMAT " : no edges in this node\n", node); 00300 } 00301 tprintf("\n"); 00302 } 00303 00304 void SquishedDawg::print_edge(EDGE_REF edge) const { 00305 if (edge == NO_EDGE) { 00306 tprintf("NO_EDGE\n"); 00307 } else { 00308 tprintf(REFFORMAT " : next = " REFFORMAT 00309 ", unichar_id = '%d', %s %s %s\n", edge, 00310 next_node(edge), edge_letter(edge), 00311 (forward_edge(edge) ? "FORWARD" : " "), 00312 (last_edge(edge) ? "LAST" : " "), 00313 (end_of_word(edge) ? "EOW" : "")); 00314 } 00315 } 00316 00317 void SquishedDawg::read_squished_dawg(FILE *file, 00318 DawgType type, 00319 const STRING &lang, 00320 PermuterType perm, 00321 int debug_level) { 00322 if (debug_level) tprintf("Reading squished dawg\n"); 00323 00324 // Read the magic number and if it does not match kDawgMagicNumber 00325 // set swap to true to indicate that we need to switch endianness. 00326 inT16 magic; 00327 fread(&magic, sizeof(inT16), 1, file); 00328 bool swap = (magic != kDawgMagicNumber); 00329 00330 int unicharset_size; 00331 fread(&unicharset_size, sizeof(inT32), 1, file); 00332 fread(&num_edges_, sizeof(inT32), 1, file); 00333 00334 if (swap) { 00335 ReverseN(&unicharset_size, sizeof(unicharset_size)); 00336 ReverseN(&num_edges_, sizeof(num_edges_)); 00337 } 00338 ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty 00339 Dawg::init(type, lang, perm, unicharset_size, debug_level); 00340 00341 edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_); 00342 fread(&edges_[0], sizeof(EDGE_RECORD), num_edges_, file); 00343 EDGE_REF edge; 00344 if (swap) { 00345 for (edge = 0; edge < num_edges_; ++edge) { 00346 ReverseN(&edges_[edge], sizeof(edges_[edge])); 00347 } 00348 } 00349 if (debug_level > 2) { 00350 tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n", 00351 type_, lang_.string(), perm_, unicharset_size_, num_edges_); 00352 for (edge = 0; edge < num_edges_; ++edge) 00353 print_edge(edge); 00354 } 00355 } 00356 00357 NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const { 00358 EDGE_REF edge; 00359 NODE_MAP node_map; 00360 inT32 node_counter; 00361 inT32 num_edges; 00362 00363 node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_); 00364 00365 for (edge = 0; edge < num_edges_; edge++) // init all slots 00366 node_map [edge] = -1; 00367 00368 node_counter = num_forward_edges(0); 00369 00370 *num_nodes = 0; 00371 for (edge = 0; edge < num_edges_; edge++) { // search all slots 00372 00373 if (forward_edge(edge)) { 00374 (*num_nodes)++; // count nodes links 00375 node_map[edge] = (edge ? node_counter : 0); 00376 num_edges = num_forward_edges(edge); 00377 if (edge != 0) node_counter += num_edges; 00378 edge += num_edges; 00379 if (edge >= num_edges_) break; 00380 if (backward_edge(edge)) while (!last_edge(edge++)); 00381 edge--; 00382 } 00383 } 00384 return (node_map); 00385 } 00386 00387 void SquishedDawg::write_squished_dawg(FILE *file) { 00388 EDGE_REF edge; 00389 inT32 num_edges; 00390 inT32 node_count = 0; 00391 NODE_MAP node_map; 00392 EDGE_REF old_index; 00393 EDGE_RECORD temp_record; 00394 00395 if (debug_level_) tprintf("write_squished_dawg\n"); 00396 00397 node_map = build_node_map(&node_count); 00398 00399 // Write the magic number to help detecting a change in endianness. 00400 inT16 magic = kDawgMagicNumber; 00401 fwrite(&magic, sizeof(inT16), 1, file); 00402 fwrite(&unicharset_size_, sizeof(inT32), 1, file); 00403 00404 // Count the number of edges in this Dawg. 00405 num_edges = 0; 00406 for (edge=0; edge < num_edges_; edge++) 00407 if (forward_edge(edge)) 00408 num_edges++; 00409 00410 fwrite(&num_edges, sizeof(inT32), 1, file); // write edge count to file 00411 00412 if (debug_level_) { 00413 tprintf("%d nodes in DAWG\n", node_count); 00414 tprintf("%d edges in DAWG\n", num_edges); 00415 } 00416 00417 for (edge = 0; edge < num_edges_; edge++) { 00418 if (forward_edge(edge)) { // write forward edges 00419 do { 00420 old_index = next_node_from_edge_rec(edges_[edge]); 00421 set_next_node(edge, node_map[old_index]); 00422 temp_record = edges_[edge]; 00423 fwrite(&(temp_record), sizeof(EDGE_RECORD), 1, file); 00424 set_next_node(edge, old_index); 00425 } while (!last_edge(edge++)); 00426 00427 if (edge >= num_edges_) break; 00428 if (backward_edge(edge)) // skip back links 00429 while (!last_edge(edge++)); 00430 00431 edge--; 00432 } 00433 } 00434 free(node_map); 00435 } 00436 00437 } // namespace tesseract