// tesseract 3.03
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: dawg.h (Formerly dawg.h) 00005 * Description: Definition of a class that represents Directed Accyclic Word 00006 * Graph (DAWG), functions to build and manipulate the DAWG. 00007 * Author: Mark Seaman, SW Productivity 00008 * Created: Fri Oct 16 14:37:00 1987 00009 * Modified: Wed Jun 19 16:50:24 1991 (Mark Seaman) marks@hpgrlt 00010 * Language: C 00011 * Package: N/A 00012 * Status: Reusable Software Component 00013 * 00014 * (c) Copyright 1987, Hewlett-Packard Company. 00015 ** Licensed under the Apache License, Version 2.0 (the "License"); 00016 ** you may not use this file except in compliance with the License. 00017 ** You may obtain a copy of the License at 00018 ** http://www.apache.org/licenses/LICENSE-2.0 00019 ** Unless required by applicable law or agreed to in writing, software 00020 ** distributed under the License is distributed on an "AS IS" BASIS, 00021 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00022 ** See the License for the specific language governing permissions and 00023 ** limitations under the License. 
00024 * 00025 *********************************************************************************/ 00026 00027 #ifndef DICT_DAWG_H_ 00028 #define DICT_DAWG_H_ 00029 00030 /*---------------------------------------------------------------------- 00031 I n c l u d e s 00032 ----------------------------------------------------------------------*/ 00033 00034 #include "elst.h" 00035 #include "ratngs.h" 00036 #include "params.h" 00037 #include "tesscallback.h" 00038 00039 #ifndef __GNUC__ 00040 #ifdef _WIN32 00041 #define NO_EDGE (inT64) 0xffffffffffffffffi64 00042 #endif /*_WIN32*/ 00043 #else 00044 #define NO_EDGE (inT64) 0xffffffffffffffffll 00045 #endif /*__GNUC__*/ 00046 00047 /*---------------------------------------------------------------------- 00048 T y p e s 00049 ----------------------------------------------------------------------*/ 00050 class UNICHARSET; 00051 00052 typedef uinT64 EDGE_RECORD; 00053 typedef EDGE_RECORD *EDGE_ARRAY; 00054 typedef inT64 EDGE_REF; 00055 typedef inT64 NODE_REF; 00056 typedef EDGE_REF *NODE_MAP; 00057 00058 namespace tesseract { 00059 00060 struct NodeChild { 00061 UNICHAR_ID unichar_id; 00062 EDGE_REF edge_ref; 00063 NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {} 00064 NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {} 00065 }; 00066 00067 typedef GenericVector<NodeChild> NodeChildVector; 00068 typedef GenericVector<int> SuccessorList; 00069 typedef GenericVector<SuccessorList *> SuccessorListsVector; 00070 00071 enum DawgType { 00072 DAWG_TYPE_PUNCTUATION, 00073 DAWG_TYPE_WORD, 00074 DAWG_TYPE_NUMBER, 00075 DAWG_TYPE_PATTERN, 00076 00077 DAWG_TYPE_COUNT // number of enum entries 00078 }; 00079 00080 /*---------------------------------------------------------------------- 00081 C o n s t a n t s 00082 ----------------------------------------------------------------------*/ 00083 00084 #define FORWARD_EDGE (inT32) 0 00085 #define BACKWARD_EDGE (inT32) 1 00086 #define 
MAX_NODE_EDGES_DISPLAY (inT64) 100 00087 #define MARKER_FLAG (inT64) 1 00088 #define DIRECTION_FLAG (inT64) 2 00089 #define WERD_END_FLAG (inT64) 4 00090 #define LETTER_START_BIT 0 00091 #define NUM_FLAG_BITS 3 00092 #define REFFORMAT "%lld" 00093 00094 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = { 00095 { 0, 1, 1, 0 }, // for DAWG_TYPE_PUNCTUATION 00096 { 1, 0, 0, 0 }, // for DAWG_TYPE_WORD 00097 { 1, 0, 0, 0 }, // for DAWG_TYPE_NUMBER 00098 { 0, 0, 0, 0 }, // for DAWG_TYPE_PATTERN 00099 }; 00100 00101 static const char kWildcard[] = "*"; 00102 00103 00104 /*---------------------------------------------------------------------- 00105 C l a s s e s a n d S t r u c t s 00106 ----------------------------------------------------------------------*/ 00107 // 00117 // 00118 class Dawg { 00119 public: 00121 static const inT16 kDawgMagicNumber = 42; 00125 static const UNICHAR_ID kPatternUnicharID = 0; 00126 00127 inline DawgType type() const { return type_; } 00128 inline const STRING &lang() const { return lang_; } 00129 inline PermuterType permuter() const { return perm_; } 00130 00131 virtual ~Dawg() {}; 00132 00134 bool word_in_dawg(const WERD_CHOICE &word) const; 00135 00136 // Returns true if the given word prefix is not contraindicated by the dawg. 00137 // If requires_complete is true, then the exact complete word must be present. 00138 bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const; 00139 00142 int check_for_words(const char *filename, 00143 const UNICHARSET &unicharset, 00144 bool enable_wildcard) const; 00145 00146 // For each word in the Dawg, call the given (permanent) callback with the 00147 // text (UTF-8) version of the word. 00148 void iterate_words(const UNICHARSET &unicharset, 00149 TessCallback1<const WERD_CHOICE *> *cb) const; 00150 00151 // For each word in the Dawg, call the given (permanent) callback with the 00152 // text (UTF-8) version of the word. 
00153 void iterate_words(const UNICHARSET &unicharset, 00154 TessCallback1<const char *> *cb) const; 00155 00156 // Pure virtual function that should be implemented by the derived classes. 00157 00159 virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, 00160 bool word_end) const = 0; 00161 00164 virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, 00165 bool word_end) const = 0; 00166 00169 virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0; 00170 00173 virtual bool end_of_word(EDGE_REF edge_ref) const = 0; 00174 00176 virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0; 00177 00180 virtual void print_node(NODE_REF node, int max_num_edges) const = 0; 00181 00184 virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, 00185 const UNICHARSET &unicharset, 00186 GenericVector<UNICHAR_ID> *vec) const {}; 00187 00191 virtual EDGE_REF pattern_loop_edge( 00192 EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const { 00193 return false; 00194 } 00195 00196 protected: 00197 Dawg() {} 00198 00200 inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const { 00201 return ((edge_rec & next_node_mask_) >> next_node_start_bit_); 00202 } 00204 inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const { 00205 return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0; 00206 } 00208 inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const { 00209 return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ? 
00210 BACKWARD_EDGE : FORWARD_EDGE; 00211 } 00213 inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const { 00214 return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0; 00215 } 00217 inline UNICHAR_ID unichar_id_from_edge_rec( 00218 const EDGE_RECORD &edge_rec) const { 00219 return ((edge_rec & letter_mask_) >> LETTER_START_BIT); 00220 } 00222 inline void set_next_node_in_edge_rec( 00223 EDGE_RECORD *edge_rec, EDGE_REF value) { 00224 *edge_rec &= (~next_node_mask_); 00225 *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_); 00226 } 00228 inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) { 00229 *edge_rec |= (MARKER_FLAG << flag_start_bit_); 00230 } 00238 inline int given_greater_than_edge_rec(NODE_REF next_node, 00239 bool word_end, 00240 UNICHAR_ID unichar_id, 00241 const EDGE_RECORD &edge_rec) const { 00242 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec); 00243 NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec); 00244 bool curr_word_end = end_of_word_from_edge_rec(edge_rec); 00245 if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node, 00246 curr_word_end, curr_unichar_id)) return 0; 00247 if (unichar_id > curr_unichar_id) return 1; 00248 if (unichar_id == curr_unichar_id) { 00249 if (next_node > curr_next_node) return 1; 00250 if (next_node == curr_next_node) { 00251 if (word_end > curr_word_end) return 1; 00252 } 00253 } 00254 return -1; 00255 } 00259 inline bool edge_rec_match(NODE_REF next_node, 00260 bool word_end, 00261 UNICHAR_ID unichar_id, 00262 NODE_REF other_next_node, 00263 bool other_word_end, 00264 UNICHAR_ID other_unichar_id) const { 00265 return ((unichar_id == other_unichar_id) && 00266 (next_node == NO_EDGE || next_node == other_next_node) && 00267 (!word_end || (word_end == other_word_end))); 00268 } 00269 00272 void init(DawgType type, const STRING &lang, 00273 PermuterType perm, int unicharset_size, int debug_level); 00274 00280 bool match_words(WERD_CHOICE 
*word, inT32 index, 00281 NODE_REF node, UNICHAR_ID wildcard) const; 00282 00283 // Recursively iterate over all words in a dawg (see public iterate_words). 00284 void iterate_words_rec(const WERD_CHOICE &word_so_far, 00285 NODE_REF to_explore, 00286 TessCallback1<const WERD_CHOICE *> *cb) const; 00287 00288 // Member Variables. 00289 DawgType type_; 00290 STRING lang_; 00292 PermuterType perm_; 00293 // Variables to construct various edge masks. Formerly: 00294 // #define NEXT_EDGE_MASK (inT64) 0xfffffff800000000i64 00295 // #define FLAGS_MASK (inT64) 0x0000000700000000i64 00296 // #define LETTER_MASK (inT64) 0x00000000ffffffffi64 00297 int unicharset_size_; 00298 int flag_start_bit_; 00299 int next_node_start_bit_; 00300 uinT64 next_node_mask_; 00301 uinT64 flags_mask_; 00302 uinT64 letter_mask_; 00303 // Level of debug statements to print to stdout. 00304 int debug_level_; 00305 }; 00306 00307 // 00308 // DawgPosition keeps track of where we are in the primary dawg we're searching 00309 // as well as where we may be in the "punctuation dawg" which may provide 00310 // surrounding context. 00311 // 00312 // Example: 00313 // punctuation dawg -- space is the "pattern character" 00314 // " " // no punctuation 00315 // "' '" // leading and trailing apostrophes 00316 // " '" // trailing apostrophe 00317 // word dawg: 00318 // "cat" 00319 // "cab" 00320 // "cat's" 00321 // 00322 // DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp) 00323 // 00324 // DawgPosition(-1, NO_EDGE, p, pe, false) 00325 // We're in the punctuation dawg, no other dawg has been started. 00326 // (1) If there's a pattern edge as a punc dawg child of us, 00327 // for each punc-following dawg starting with ch, produce: 00328 // Result: DawgPosition(k, w, p', false) 00329 // (2) If there's a valid continuation in the punc dawg, produce: 00330 // Result: DawgPosition(-k, NO_EDGE, p', false) 00331 // 00332 // DawgPosition(k, w, -1, NO_EDGE, false) 00333 // We're in dawg k. 
Going back to punctuation dawg is not an option. 00334 // Follow ch in dawg k. 00335 // 00336 // DawgPosition(k, w, p, pe, false) 00337 // We're in dawg k. Continue in dawg k and/or go back to the punc dawg. 00338 // If ending, check that the punctuation dawg is also ok to end here. 00339 // 00340 // DawgPosition(k, w, p, pe true) 00341 // We're back in the punctuation dawg. Continuing there is the only option. 00342 struct DawgPosition { 00343 DawgPosition() 00344 : dawg_index(-1), dawg_ref(NO_EDGE), punc_ref(NO_EDGE), 00345 back_to_punc(false) {} 00346 DawgPosition(int dawg_idx, EDGE_REF dawgref, 00347 int punc_idx, EDGE_REF puncref, 00348 bool backtopunc) 00349 : dawg_index(dawg_idx), dawg_ref(dawgref), 00350 punc_index(punc_idx), punc_ref(puncref), 00351 back_to_punc(backtopunc) { 00352 } 00353 bool operator==(const DawgPosition &other) { 00354 return dawg_index == other.dawg_index && 00355 dawg_ref == other.dawg_ref && 00356 punc_index == other.punc_index && 00357 punc_ref == other.punc_ref && 00358 back_to_punc == other.back_to_punc; 00359 } 00360 00361 inT8 dawg_index; 00362 EDGE_REF dawg_ref; 00363 inT8 punc_index; 00364 EDGE_REF punc_ref; 00365 // Have we returned to the punc dawg at the end of the word? 00366 bool back_to_punc; 00367 }; 00368 00369 class DawgPositionVector : public GenericVector<DawgPosition> { 00370 public: 00372 ~DawgPositionVector() { 00373 if (size_reserved_ > 0) { 00374 delete[] data_; 00375 size_used_ = 0; 00376 size_reserved_ = 0; 00377 } 00378 } 00381 void clear() { size_used_ = 0; } 00385 inline bool add_unique(const DawgPosition &new_pos, 00386 bool debug, 00387 const char *debug_msg) { 00388 for (int i = 0; i < size_used_; ++i) { 00389 if (data_[i] == new_pos) return false; 00390 } 00391 push_back(new_pos); 00392 if (debug) { 00393 tprintf("%s[%d, " REFFORMAT "] [punc: " REFFORMAT "%s]\n", 00394 debug_msg, new_pos.dawg_index, new_pos.dawg_ref, 00395 new_pos.punc_ref, new_pos.back_to_punc ? 
" returned" : ""); 00396 } 00397 return true; 00398 } 00399 }; 00400 00401 // 00408 // 00409 class SquishedDawg : public Dawg { 00410 public: 00411 SquishedDawg(FILE *file, DawgType type, const STRING &lang, 00412 PermuterType perm, int debug_level) { 00413 read_squished_dawg(file, type, lang, perm, debug_level); 00414 num_forward_edges_in_node0 = num_forward_edges(0); 00415 } 00416 SquishedDawg(const char* filename, DawgType type, 00417 const STRING &lang, PermuterType perm, int debug_level) { 00418 FILE *file = fopen(filename, "rb"); 00419 if (file == NULL) { 00420 tprintf("Failed to open dawg file %s\n", filename); 00421 exit(1); 00422 } 00423 read_squished_dawg(file, type, lang, perm, debug_level); 00424 num_forward_edges_in_node0 = num_forward_edges(0); 00425 fclose(file); 00426 } 00427 SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, 00428 const STRING &lang, PermuterType perm, 00429 int unicharset_size, int debug_level) : 00430 edges_(edges), num_edges_(num_edges) { 00431 init(type, lang, perm, unicharset_size, debug_level); 00432 num_forward_edges_in_node0 = num_forward_edges(0); 00433 if (debug_level > 3) print_all("SquishedDawg:"); 00434 } 00435 ~SquishedDawg(); 00436 00437 int NumEdges() { return num_edges_; } 00438 00440 EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, 00441 bool word_end) const; 00442 00445 void unichar_ids_of(NODE_REF node, NodeChildVector *vec, 00446 bool word_end) const { 00447 EDGE_REF edge = node; 00448 if (!edge_occupied(edge) || edge == NO_EDGE) return; 00449 assert(forward_edge(edge)); // we don't expect any backward edges to 00450 do { // be present when this funciton is called 00451 if (!word_end || end_of_word_from_edge_rec(edges_[edge])) { 00452 vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge)); 00453 } 00454 } while (!last_edge(edge++)); 00455 } 00456 00459 NODE_REF next_node(EDGE_REF edge) const { 00460 return next_node_from_edge_rec((edges_[edge])); 00461 } 00462 00465 bool 
end_of_word(EDGE_REF edge_ref) const { 00466 return end_of_word_from_edge_rec((edges_[edge_ref])); 00467 } 00468 00470 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const { 00471 return unichar_id_from_edge_rec((edges_[edge_ref])); 00472 } 00473 00476 void print_node(NODE_REF node, int max_num_edges) const; 00477 00479 void write_squished_dawg(FILE *file); 00480 00483 void write_squished_dawg(const char *filename) { 00484 FILE *file = fopen(filename, "wb"); 00485 if (file == NULL) { 00486 tprintf("Error opening %s\n", filename); 00487 exit(1); 00488 } 00489 this->write_squished_dawg(file); 00490 fclose(file); 00491 } 00492 00493 private: 00495 inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) { 00496 set_next_node_in_edge_rec(&(edges_[edge_ref]), value); 00497 } 00499 inline void set_empty_edge(EDGE_REF edge_ref) { 00500 (edges_[edge_ref] = next_node_mask_); 00501 } 00503 inline void clear_all_edges() { 00504 for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge); 00505 } 00507 inline void clear_marker_flag(EDGE_REF edge_ref) { 00508 (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_)); 00509 } 00511 inline bool forward_edge(EDGE_REF edge_ref) const { 00512 return (edge_occupied(edge_ref) && 00513 (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); 00514 } 00516 inline bool backward_edge(EDGE_REF edge_ref) const { 00517 return (edge_occupied(edge_ref) && 00518 (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); 00519 } 00521 inline bool edge_occupied(EDGE_REF edge_ref) const { 00522 return (edges_[edge_ref] != next_node_mask_); 00523 } 00525 inline bool last_edge(EDGE_REF edge_ref) const { 00526 return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0; 00527 } 00528 00530 inT32 num_forward_edges(NODE_REF node) const; 00531 00533 void read_squished_dawg(FILE *file, DawgType type, const STRING &lang, 00534 PermuterType perm, int debug_level); 00535 00537 void print_edge(EDGE_REF edge) const; 00538 00540 void 
print_all(const char* msg) { 00541 tprintf("\n__________________________\n%s\n", msg); 00542 for (int i = 0; i < num_edges_; ++i) print_edge(i); 00543 tprintf("__________________________\n"); 00544 } 00546 NODE_MAP build_node_map(inT32 *num_nodes) const; 00547 00548 00549 // Member variables. 00550 EDGE_ARRAY edges_; 00551 int num_edges_; 00552 int num_forward_edges_in_node0; 00553 }; 00554 00555 } // namespace tesseract 00556 00557 #endif // DICT_DAWG_H_