tesseract
3.03
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: trie.h (Formerly trie.h) 00005 * Description: Functions to build a trie data structure. 00006 * Author: Mark Seaman, SW Productivity 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Fri Jul 26 11:26:34 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 #ifndef TRIE_H 00026 #define TRIE_H 00027 00028 #include "dawg.h" 00029 #include "cutil.h" 00030 #include "genericvector.h" 00031 00032 class UNICHARSET; 00033 00034 // Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed 00035 // max int32, we will need to change GenericVector to use int64 for size 00036 // and address indices. This does not seem to be needed immediately, 00037 // since currently the largest number of edges limit used by tesseract 00038 // (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32. 00039 // There are also int casts below to satisfy the WIN32 compiler that would 00040 // need to be changed. 00041 // It might be cleanest to change the types of most of the Trie/Dawg related 00042 // typedefs to int and restrict the casts to extracting these values from 00043 // the 64 bit EDGE_RECORD. 00044 typedef inT64 EDGE_INDEX; // index of an edge in a given node 00045 typedef bool *NODE_MARKER; 00046 typedef GenericVector<EDGE_RECORD> EDGE_VECTOR; 00047 00048 struct TRIE_NODE_RECORD { 00049 EDGE_VECTOR forward_edges; 00050 EDGE_VECTOR backward_edges; 00051 }; 00052 typedef GenericVector<TRIE_NODE_RECORD *> TRIE_NODES; 00053 00054 namespace tesseract { 00055 00062 class Trie : public Dawg { 00063 public: 00064 enum RTLReversePolicy { 00065 RRP_DO_NO_REVERSE, 00066 RRP_REVERSE_IF_HAS_RTL, 00067 RRP_FORCE_REVERSE, 00068 }; 00069 00070 // Minimum number of concrete characters at the beginning of user patterns. 00071 static const int kSaneNumConcreteChars = 0; 00072 // Various unicode whitespace characters are used to denote unichar patterns, 00073 // (character classifier would never produce these whitespace characters as a 00074 // valid classification). 00075 static const char kAlphaPatternUnicode[]; 00076 static const char kDigitPatternUnicode[]; 00077 static const char kAlphanumPatternUnicode[]; 00078 static const char kPuncPatternUnicode[]; 00079 static const char kLowerPatternUnicode[]; 00080 static const char kUpperPatternUnicode[]; 00081 00082 static const char *get_reverse_policy_name( 00083 RTLReversePolicy reverse_policy); 00084 00085 // max_num_edges argument allows limiting the amount of memory this 00086 // Trie can consume (if a new word insert would cause the Trie to 00087 // contain more edges than max_num_edges, all the edges are cleared 00088 // so that new inserts can proceed). 00089 Trie(DawgType type, const STRING &lang, PermuterType perm, 00090 uinT64 max_num_edges, int unicharset_size, int debug_level) { 00091 init(type, lang, perm, unicharset_size, debug_level); 00092 num_edges_ = 0; 00093 max_num_edges_ = max_num_edges; 00094 deref_node_index_mask_ = ~letter_mask_; 00095 new_dawg_node(); // need to allocate node 0 00096 initialized_patterns_ = false; 00097 } 00098 virtual ~Trie() { nodes_.delete_data_pointers(); } 00099 00100 // Reset the Trie to empty. 00101 void clear(); 00102 00104 EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, 00105 bool word_end) const { 00106 EDGE_RECORD *edge_ptr; 00107 EDGE_INDEX edge_index; 00108 if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, 00109 &edge_ptr, &edge_index)) return NO_EDGE; 00110 return make_edge_ref(node_ref, edge_index); 00111 } 00112 00117 void unichar_ids_of(NODE_REF node, NodeChildVector *vec, 00118 bool word_end) const { 00119 const EDGE_VECTOR &forward_edges = 00120 nodes_[static_cast<int>(node)]->forward_edges; 00121 for (int i = 0; i < forward_edges.size(); ++i) { 00122 if (!word_end || end_of_word_from_edge_rec(forward_edges[i])) { 00123 vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]), 00124 make_edge_ref(node, i))); 00125 } 00126 } 00127 } 00128 00133 NODE_REF next_node(EDGE_REF edge_ref) const { 00134 if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE; 00135 return next_node_from_edge_rec(*deref_edge_ref(edge_ref)); 00136 } 00137 00142 bool end_of_word(EDGE_REF edge_ref) const { 00143 if (edge_ref == NO_EDGE || num_edges_ == 0) return false; 00144 return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref)); 00145 } 00146 00148 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const { 00149 if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID; 00150 return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref)); 00151 } 00152 // Sets the UNICHAR_ID in the given edge_rec to 0, marking the edge dead. 00153 void KillEdge(EDGE_RECORD* edge_rec) const { 00154 *edge_rec &= ~letter_mask_; 00155 } 00156 00157 // Prints the contents of the node indicated by the given NODE_REF. 00158 // At most max_num_edges will be printed. 00159 void print_node(NODE_REF node, int max_num_edges) const; 00160 00161 // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg. 00162 // Eliminates redundant edges and returns the pointer to the SquishedDawg. 00163 // Note: the caller is responsible for deallocating memory associated 00164 // with the returned SquishedDawg pointer. 00165 SquishedDawg *trie_to_dawg(); 00166 00167 // Reads a list of words from the given file and adds into the Trie. 00168 // Calls WERD_CHOICE::reverse_unichar_ids_if_rtl() according to the reverse 00169 // policy and information in the unicharset. 00170 // Returns false on error. 00171 bool read_and_add_word_list(const char *filename, 00172 const UNICHARSET &unicharset, 00173 Trie::RTLReversePolicy reverse); 00174 00175 // Reads a list of words from the given file, applying the reverse_policy, 00176 // according to information in the unicharset. 00177 // Returns false on error. 00178 bool read_word_list(const char *filename, 00179 const UNICHARSET &unicharset, 00180 Trie::RTLReversePolicy reverse_policy, 00181 GenericVector<STRING>* words); 00182 // Adds a list of words previously read using read_word_list to the trie 00183 // using the given unicharset to convert to unichar-ids. 00184 // Returns false on error. 00185 bool add_word_list(const GenericVector<STRING>& words, 00186 const UNICHARSET &unicharset); 00187 00188 // Inserts the list of patterns from the given file into the Trie. 00189 // The pattern list file should contain one pattern per line in UTF-8 format. 00190 // 00191 // Each pattern can contain any non-whitespace characters, however only the 00192 // patterns that contain characters from the unicharset of the corresponding 00193 // language will be useful. 00194 // The only meta character is '\'. To be used in a pattern as an ordinary 00195 // string it should be escaped with '\' (e.g. string "C:\Documents" should 00196 // be written in the patterns file as "C:\\Documents"). 00197 // This function supports a very limited regular expression syntax. One can 00198 // express a character, a certain character class and a number of times the 00199 // entity should be repeated in the pattern. 00200 // 00201 // To denote a character class use one of: 00202 // \c - unichar for which UNICHARSET::get_isalpha() is true (character) 00203 // \d - unichar for which UNICHARSET::get_isdigit() is true 00204 // \n - unichar for which UNICHARSET::get_isdigit() and 00205 // UNICHARSET::isalpha() are true 00206 // \p - unichar for which UNICHARSET::get_ispunct() is true 00207 // \a - unichar for which UNICHARSET::get_islower() is true 00208 // \A - unichar for which UNICHARSET::get_isupper() is true 00209 // 00210 // \* could be specified after each character or pattern to indicate that 00211 // the character/pattern can be repeated any number of times before the next 00212 // character/pattern occurs. 00213 // 00214 // Examples: 00215 // 1-8\d\d-GOOG-411 will be expanded to strings: 00216 // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411. 00217 // 00218 // http://www.\n\*.com will be expanded to strings like: 00219 // http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com 00220 // 00221 // Note: In choosing which patterns to include please be aware of the fact 00222 // providing very generic patterns will make tesseract run slower. 00223 // For example \n\* at the beginning of the pattern will make Tesseract 00224 // consider all the combinations of proposed character choices for each 00225 // of the segmentations, which will be unacceptably slow. 00226 // Because of potential problems with speed that could be difficult to 00227 // identify, each user pattern has to have at least kSaneNumConcreteChars 00228 // concrete characters from the unicharset at the beginning. 00229 bool read_pattern_list(const char *filename, const UNICHARSET &unicharset); 00230 00231 // Initializes the values of *_pattern_ unichar ids. 00232 // This function should be called before calling read_pattern_list(). 00233 void initialize_patterns(UNICHARSET *unicharset); 00234 00235 // Fills in the given unichar id vector with the unichar ids that represent 00236 // the patterns of the character classes of the given unichar_id. 00237 void unichar_id_to_patterns(UNICHAR_ID unichar_id, 00238 const UNICHARSET &unicharset, 00239 GenericVector<UNICHAR_ID> *vec) const; 00240 00241 // Returns the given EDGE_REF if the EDGE_RECORD that it points to has 00242 // a self loop and the given unichar_id matches the unichar_id stored in the 00243 // EDGE_RECORD, returns NO_EDGE otherwise. 00244 virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, 00245 UNICHAR_ID unichar_id, 00246 bool word_end) const { 00247 if (edge_ref == NO_EDGE) return NO_EDGE; 00248 EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref); 00249 return (marker_flag_from_edge_rec(*edge_rec) && 00250 unichar_id == unichar_id_from_edge_rec(*edge_rec) && 00251 word_end == end_of_word_from_edge_rec(*edge_rec)) ? 00252 edge_ref : NO_EDGE; 00253 } 00254 00255 // Adds a word to the Trie (creates the necessary nodes and edges). 00256 // 00257 // If repetitions vector is not NULL, each entry in the vector indicates 00258 // whether the unichar id with the corresponding index in the word is allowed 00259 // to repeat an unlimited number of times. For each entry that is true, MARKER 00260 // flag of the corresponding edge created for this unichar id is set to true). 00261 // 00262 // Return true if add succeeded, false otherwise (e.g. when a word contained 00263 // an invalid unichar id or the trie was getting too large and was cleared). 00264 bool add_word_to_dawg(const WERD_CHOICE &word, 00265 const GenericVector<bool> *repetitions); 00266 bool add_word_to_dawg(const WERD_CHOICE &word) { 00267 return add_word_to_dawg(word, NULL); 00268 } 00269 00270 protected: 00271 // The structure of an EDGE_REF for Trie edges is as follows: 00272 // [LETTER_START_BIT, flag_start_bit_): 00273 // edge index in *_edges in a TRIE_NODE_RECORD 00274 // [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector) 00275 // 00276 // With this arrangement there are enough bits to represent edge indices 00277 // (each node can have at most unicharset_size_ forward edges and 00278 // the position of flag_start_bit is set to be log2(unicharset_size_)). 00279 // It is also possible to accommodate a maximum number of nodes that is at 00280 // least as large as that of the SquishedDawg representation (in SquishedDawg 00281 // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent 00282 // the next node index). 00283 // 00284 00285 // Returns the pointer to EDGE_RECORD after decoding the location 00286 // of the edge from the information in the given EDGE_REF. 00287 // This function assumes that EDGE_REF holds valid node/edge indices. 00288 inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const { 00289 int edge_index = static_cast<int>( 00290 (edge_ref & letter_mask_) >> LETTER_START_BIT); 00291 int node_index = static_cast<int>( 00292 (edge_ref & deref_node_index_mask_) >> flag_start_bit_); 00293 TRIE_NODE_RECORD *node_rec = nodes_[node_index]; 00294 return &(node_rec->forward_edges[edge_index]); 00295 } 00297 inline EDGE_REF make_edge_ref(NODE_REF node_index, 00298 EDGE_INDEX edge_index) const { 00299 return ((node_index << flag_start_bit_) | 00300 (edge_index << LETTER_START_BIT)); 00301 } 00303 inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, 00304 int direction, bool word_end, UNICHAR_ID unichar_id) { 00305 EDGE_RECORD flags = 0; 00306 if (repeats) flags |= MARKER_FLAG; 00307 if (word_end) flags |= WERD_END_FLAG; 00308 if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG; 00309 *edge = ((nxt << next_node_start_bit_) | 00310 (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) | 00311 (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT)); 00312 } 00314 inline void print_edge_rec(const EDGE_RECORD &edge_rec) const { 00315 tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec), 00316 marker_flag_from_edge_rec(edge_rec) ? "R," : "", 00317 (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B", 00318 end_of_word_from_edge_rec(edge_rec) ? ",E" : "", 00319 unichar_id_from_edge_rec(edge_rec)); 00320 } 00321 // Returns true if the next node in recorded the given EDGE_RECORD 00322 // has exactly one forward edge. 00323 inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) { 00324 NODE_REF node_ref = next_node_from_edge_rec(edge_rec); 00325 return (node_ref != NO_EDGE && 00326 nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1); 00327 } 00328 00329 // Prints the contents of the Trie. 00330 // At most max_num_edges will be printed for each node. 00331 void print_all(const char* msg, int max_num_edges) { 00332 tprintf("\n__________________________\n%s\n", msg); 00333 for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges); 00334 tprintf("__________________________\n"); 00335 } 00336 00337 // Finds the edge with the given direction, word_end and unichar_id 00338 // in the node indicated by node_ref. Fills in the pointer to the 00339 // EDGE_RECORD and the index of the edge with the the values 00340 // corresponding to the edge found. Returns true if an edge was found. 00341 bool edge_char_of(NODE_REF node_ref, NODE_REF next_node, 00342 int direction, bool word_end, UNICHAR_ID unichar_id, 00343 EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const; 00344 00345 // Adds an single edge linkage between node1 and node2 in the direction 00346 // indicated by direction argument. 00347 bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, 00348 int direction, bool word_end, 00349 UNICHAR_ID unichar_id); 00350 00351 // Adds forward edge linkage from node1 to node2 and the corresponding 00352 // backward edge linkage in the other direction. 00353 bool add_new_edge(NODE_REF node1, NODE_REF node2, 00354 bool repeats, bool word_end, UNICHAR_ID unichar_id) { 00355 return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE, 00356 word_end, unichar_id) && 00357 add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE, 00358 word_end, unichar_id)); 00359 } 00360 00361 // Sets the word ending flags in an already existing edge pair. 00362 // Returns true on success. 00363 void add_word_ending(EDGE_RECORD *edge, 00364 NODE_REF the_next_node, 00365 bool repeats, 00366 UNICHAR_ID unichar_id); 00367 00368 // Allocates space for a new node in the Trie. 00369 NODE_REF new_dawg_node(); 00370 00371 // Removes a single edge linkage to between node1 and node2 in the 00372 // direction indicated by direction argument. 00373 void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, 00374 bool word_end, UNICHAR_ID unichar_id); 00375 00376 // Removes forward edge linkage from node1 to node2 and the corresponding 00377 // backward edge linkage in the other direction. 00378 void remove_edge(NODE_REF node1, NODE_REF node2, 00379 bool word_end, UNICHAR_ID unichar_id) { 00380 remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id); 00381 remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id); 00382 } 00383 00384 // Compares edge1 and edge2 in the given node to see if they point to two 00385 // next nodes that could be collapsed. If they do, performs the reduction 00386 // and returns true. 00387 bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, 00388 const EDGE_RECORD &edge2); 00389 00390 // Assuming that edge_index indicates the first edge in a group of edges 00391 // in this node with a particular letter value, looks through these edges 00392 // to see if any of them can be collapsed. If so does it. Returns to the 00393 // caller when all edges with this letter have been reduced. 00394 // Returns true if further reduction is possible with this same letter. 00395 bool reduce_lettered_edges(EDGE_INDEX edge_index, 00396 UNICHAR_ID unichar_id, 00397 NODE_REF node, 00398 EDGE_VECTOR* backward_edges, 00399 NODE_MARKER reduced_nodes); 00400 00407 void sort_edges(EDGE_VECTOR *edges); 00408 00410 void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes); 00411 00412 // Returns the pattern unichar id for the given character class code. 00413 UNICHAR_ID character_class_to_pattern(char ch); 00414 00415 // Member variables 00416 TRIE_NODES nodes_; // vector of nodes in the Trie 00417 uinT64 num_edges_; // sum of all edges (forward and backward) 00418 uinT64 max_num_edges_; // maximum number of edges allowed 00419 uinT64 deref_direction_mask_; // mask for EDGE_REF to extract direction 00420 uinT64 deref_node_index_mask_; // mask for EDGE_REF to extract node index 00421 // Freelist of edges in the root backwards node that were previously zeroed. 00422 GenericVector<EDGE_INDEX> root_back_freelist_; 00423 // Variables for translating character class codes denoted in user patterns 00424 // file to the unichar ids used to represent them in a Trie. 00425 bool initialized_patterns_; 00426 UNICHAR_ID alpha_pattern_; 00427 UNICHAR_ID digit_pattern_; 00428 UNICHAR_ID alphanum_pattern_; 00429 UNICHAR_ID punc_pattern_; 00430 UNICHAR_ID lower_pattern_; 00431 UNICHAR_ID upper_pattern_; 00432 }; 00433 } // namespace tesseract 00434 00435 #endif