tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/dict/dawg.h
Go to the documentation of this file.
00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:         dawg.h  (Formerly dawg.h)
00005  * Description:  Definition of a class that represents Directed Acyclic Word
00006  *               Graph (DAWG), functions to build and manipulate the DAWG.
00007  * Author:       Mark Seaman, SW Productivity
00008  * Created:      Fri Oct 16 14:37:00 1987
00009  * Modified:     Wed Jun 19 16:50:24 1991 (Mark Seaman) marks@hpgrlt
00010  * Language:     C
00011  * Package:      N/A
00012  * Status:       Reusable Software Component
00013  *
00014  * (c) Copyright 1987, Hewlett-Packard Company.
00015  ** Licensed under the Apache License, Version 2.0 (the "License");
00016  ** you may not use this file except in compliance with the License.
00017  ** You may obtain a copy of the License at
00018  ** http://www.apache.org/licenses/LICENSE-2.0
00019  ** Unless required by applicable law or agreed to in writing, software
00020  ** distributed under the License is distributed on an "AS IS" BASIS,
00021  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00022  ** See the License for the specific language governing permissions and
00023  ** limitations under the License.
00024  *
00025  *********************************************************************************/
00026 
00027 #ifndef DICT_DAWG_H_
00028 #define DICT_DAWG_H_
00029 
00030 /*----------------------------------------------------------------------
00031               I n c l u d e s
00032 ----------------------------------------------------------------------*/
00033 
00034 #include "elst.h"
00035 #include "ratngs.h"
00036 #include "params.h"
00037 #include "tesscallback.h"
00038 
// Sentinel edge/node reference meaning "no such edge": a 64-bit literal with
// every bit set, converted to inT64 (-1 on two's-complement targets).
// Non-GNU compilers on Windows use the i64 literal suffix; every other
// compiler uses the standard ll suffix, so NO_EDGE is defined on all
// toolchains.
#if defined(_WIN32) && !defined(__GNUC__)
#define NO_EDGE                (inT64) 0xffffffffffffffffi64
#else
#define NO_EDGE                (inT64) 0xffffffffffffffffll
#endif  /* _WIN32 && !__GNUC__ */
00046 
00047 /*----------------------------------------------------------------------
00048               T y p e s
00049 ----------------------------------------------------------------------*/
// Forward declaration: this header only uses UNICHARSET by reference.
class UNICHARSET;

// One packed edge record; the Dawg::*_from_edge_rec() accessors extract the
// unichar id, the flags and the next-node reference from it with bit masks.
typedef uinT64 EDGE_RECORD;
// Pointer to an array of edge records.
typedef EDGE_RECORD *EDGE_ARRAY;
// Reference to an edge; NO_EDGE stands for "no edge".
typedef inT64 EDGE_REF;
// Reference to a node.
typedef inT64 NODE_REF;
// Pointer to an array of EDGE_REFs (return type of
// SquishedDawg::build_node_map()).
typedef EDGE_REF *NODE_MAP;
00057 
00058 namespace tesseract {
00059 
00060 struct NodeChild {
00061   UNICHAR_ID unichar_id;
00062   EDGE_REF edge_ref;
00063   NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {}
00064   NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
00065 };
00066 
// Vector of node children (the output type of Dawg::unichar_ids_of()).
typedef GenericVector<NodeChild> NodeChildVector;
// Vector of ints.
// NOTE(review): presumably indices of successor dawgs - confirm at use sites.
typedef GenericVector<int> SuccessorList;
// Vector of pointers to SuccessorLists.
typedef GenericVector<SuccessorList *> SuccessorListsVector;
00070 
// Kind of a dawg, as returned by Dawg::type(). The enumerator values are
// also used as the row and column indices of kDawgSuccessors.
enum DawgType {
  DAWG_TYPE_PUNCTUATION,
  DAWG_TYPE_WORD,
  DAWG_TYPE_NUMBER,
  DAWG_TYPE_PATTERN,

  DAWG_TYPE_COUNT  // number of enum entries
};
00079 
00080 /*----------------------------------------------------------------------
00081               C o n s t a n t s
00082 ----------------------------------------------------------------------*/
00083 
// Edge directions, as returned by Dawg::direction_from_edge_rec().
#define FORWARD_EDGE           (inT32) 0
#define BACKWARD_EDGE          (inT32) 1
// NOTE(review): presumably caps how many edges print_node() shows - confirm
// in dawg.cpp.
#define MAX_NODE_EDGES_DISPLAY (inT64) 100
// Flag bits of an edge record, before being shifted left by flag_start_bit_.
// MARKER_FLAG is the bit tested by SquishedDawg::last_edge().
#define MARKER_FLAG            (inT64) 1
// Set for a BACKWARD_EDGE, clear for a FORWARD_EDGE.
#define DIRECTION_FLAG         (inT64) 2
// Set when the edge ends a word (see Dawg::end_of_word_from_edge_rec()).
#define WERD_END_FLAG          (inT64) 4
// Bit position at which the unichar id starts inside an edge record.
#define LETTER_START_BIT       0
// Number of flag bits (one for each of the three flags above).
#define NUM_FLAG_BITS          3
// printf-style conversion used to print EDGE_REF / NODE_REF values.
#define REFFORMAT "%lld"
00093 
00094 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
00095   { 0, 1, 1, 0 },  // for DAWG_TYPE_PUNCTUATION
00096   { 1, 0, 0, 0 },  // for DAWG_TYPE_WORD
00097   { 1, 0, 0, 0 },  // for DAWG_TYPE_NUMBER
00098   { 0, 0, 0, 0 },  // for DAWG_TYPE_PATTERN
00099 };
00100 
// NOTE(review): presumably the wildcard string honoured by
// Dawg::check_for_words() when enable_wildcard is true - confirm in dawg.cpp.
static const char kWildcard[] = "*";
00102 
00103 
00104 /*----------------------------------------------------------------------
00105               C l a s s e s   a n d   S t r u c t s
00106 ----------------------------------------------------------------------*/
00107 //
00117 //
00118 class Dawg {
00119  public:
00121   static const inT16 kDawgMagicNumber = 42;
00125   static const UNICHAR_ID kPatternUnicharID = 0;
00126 
00127   inline DawgType type() const { return type_; }
00128   inline const STRING &lang() const { return lang_; }
00129   inline PermuterType permuter() const { return perm_; }
00130 
00131   virtual ~Dawg() {};
00132 
00134   bool word_in_dawg(const WERD_CHOICE &word) const;
00135 
00136   // Returns true if the given word prefix is not contraindicated by the dawg.
00137   // If requires_complete is true, then the exact complete word must be present.
00138   bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const;
00139 
00142   int check_for_words(const char *filename,
00143                       const UNICHARSET &unicharset,
00144                       bool enable_wildcard) const;
00145 
00146   // For each word in the Dawg, call the given (permanent) callback with the
00147   // text (UTF-8) version of the word.
00148   void iterate_words(const UNICHARSET &unicharset,
00149                      TessCallback1<const WERD_CHOICE *> *cb) const;
00150 
00151   // For each word in the Dawg, call the given (permanent) callback with the
00152   // text (UTF-8) version of the word.
00153   void iterate_words(const UNICHARSET &unicharset,
00154                      TessCallback1<const char *> *cb) const;
00155 
00156   // Pure virtual function that should be implemented by the derived classes.
00157 
00159   virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
00160                                 bool word_end) const = 0;
00161 
00164   virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
00165                               bool word_end) const = 0;
00166 
00169   virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;
00170 
00173   virtual bool end_of_word(EDGE_REF edge_ref) const = 0;
00174 
00176   virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;
00177 
00180   virtual void print_node(NODE_REF node, int max_num_edges) const = 0;
00181 
00184   virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id,
00185                                       const UNICHARSET &unicharset,
00186                                       GenericVector<UNICHAR_ID> *vec) const {};
00187 
00191   virtual EDGE_REF pattern_loop_edge(
00192       EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const {
00193     return false;
00194   }
00195 
00196  protected:
00197   Dawg() {}
00198 
00200   inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00201     return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
00202   }
00204   inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00205     return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0;
00206   }
00208   inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00209     return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ?
00210       BACKWARD_EDGE : FORWARD_EDGE;
00211   }
00213   inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00214     return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
00215   }
00217   inline UNICHAR_ID unichar_id_from_edge_rec(
00218       const EDGE_RECORD &edge_rec) const {
00219     return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
00220   }
00222   inline void set_next_node_in_edge_rec(
00223       EDGE_RECORD *edge_rec, EDGE_REF value) {
00224     *edge_rec &= (~next_node_mask_);
00225     *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
00226   }
00228   inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) {
00229     *edge_rec |= (MARKER_FLAG << flag_start_bit_);
00230   }
00238   inline int given_greater_than_edge_rec(NODE_REF next_node,
00239                                          bool word_end,
00240                                          UNICHAR_ID unichar_id,
00241                                          const EDGE_RECORD &edge_rec) const {
00242     UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
00243     NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
00244     bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
00245     if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
00246                        curr_word_end, curr_unichar_id)) return 0;
00247     if (unichar_id > curr_unichar_id) return 1;
00248     if (unichar_id == curr_unichar_id) {
00249       if (next_node > curr_next_node) return 1;
00250       if (next_node == curr_next_node) {
00251         if (word_end > curr_word_end) return 1;
00252       }
00253     }
00254     return -1;
00255   }
00259   inline bool edge_rec_match(NODE_REF next_node,
00260                              bool word_end,
00261                              UNICHAR_ID unichar_id,
00262                              NODE_REF other_next_node,
00263                              bool other_word_end,
00264                              UNICHAR_ID other_unichar_id) const {
00265     return ((unichar_id == other_unichar_id) &&
00266             (next_node == NO_EDGE || next_node == other_next_node) &&
00267             (!word_end || (word_end == other_word_end)));
00268   }
00269 
00272   void init(DawgType type, const STRING &lang,
00273             PermuterType perm, int unicharset_size, int debug_level);
00274 
00280   bool match_words(WERD_CHOICE *word, inT32 index,
00281                    NODE_REF node, UNICHAR_ID wildcard) const;
00282 
00283   // Recursively iterate over all words in a dawg (see public iterate_words).
00284   void iterate_words_rec(const WERD_CHOICE &word_so_far,
00285                          NODE_REF to_explore,
00286                          TessCallback1<const WERD_CHOICE *> *cb) const;
00287 
00288   // Member Variables.
00289   DawgType type_;
00290   STRING lang_;
00292   PermuterType perm_;
00293   // Variables to construct various edge masks. Formerly:
00294   // #define NEXT_EDGE_MASK (inT64) 0xfffffff800000000i64
00295   // #define FLAGS_MASK     (inT64) 0x0000000700000000i64
00296   // #define LETTER_MASK    (inT64) 0x00000000ffffffffi64
00297   int unicharset_size_;
00298   int flag_start_bit_;
00299   int next_node_start_bit_;
00300   uinT64 next_node_mask_;
00301   uinT64 flags_mask_;
00302   uinT64 letter_mask_;
00303   // Level of debug statements to print to stdout.
00304   int debug_level_;
00305 };
00306 
00307 //
00308 // DawgPosition keeps track of where we are in the primary dawg we're searching
00309 // as well as where we may be in the "punctuation dawg" which may provide
00310 // surrounding context.
00311 //
00312 // Example:
00313 //   punctuation dawg  -- space is the "pattern character"
00314 //     " "     // no punctuation
00315 //     "' '"   // leading and trailing apostrophes
00316 //     " '"    // trailing apostrophe
00317 //   word dawg:
00318 //     "cat"
00319 //     "cab"
00320 //     "cat's"
00321 //
00322 //  DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp)
00323 //
00324 //  DawgPosition(-1, NO_EDGE, p, pe, false)
00325 //    We're in the punctuation dawg, no other dawg has been started.
00326 //    (1) If there's a pattern edge as a punc dawg child of us,
00327 //        for each punc-following dawg starting with ch, produce:
00328 //        Result: DawgPosition(k, w, p', false)
00329 //    (2) If there's a valid continuation in the punc dawg, produce:
00330 //        Result: DawgPosition(-k, NO_EDGE, p', false)
00331 //
00332 //  DawgPosition(k, w, -1, NO_EDGE, false)
00333 //    We're in dawg k.  Going back to punctuation dawg is not an option.
00334 //    Follow ch in dawg k.
00335 //
00336 //  DawgPosition(k, w, p, pe, false)
00337 //    We're in dawg k.  Continue in dawg k and/or go back to the punc dawg.
00338 //    If ending, check that the punctuation dawg is also ok to end here.
00339 //
00340 //  DawgPosition(k, w, p, pe, true)
00341 //    We're back in the punctuation dawg.  Continuing there is the only option.
00342 struct DawgPosition {
00343   DawgPosition()
00344       : dawg_index(-1), dawg_ref(NO_EDGE), punc_ref(NO_EDGE),
00345         back_to_punc(false) {}
00346   DawgPosition(int dawg_idx, EDGE_REF dawgref,
00347                int punc_idx, EDGE_REF puncref,
00348                bool backtopunc)
00349       : dawg_index(dawg_idx), dawg_ref(dawgref),
00350         punc_index(punc_idx), punc_ref(puncref),
00351         back_to_punc(backtopunc) {
00352   }
00353   bool operator==(const DawgPosition &other) {
00354     return dawg_index == other.dawg_index &&
00355         dawg_ref == other.dawg_ref &&
00356         punc_index == other.punc_index &&
00357         punc_ref == other.punc_ref &&
00358         back_to_punc == other.back_to_punc;
00359   }
00360 
00361   inT8 dawg_index;
00362   EDGE_REF dawg_ref;
00363   inT8 punc_index;
00364   EDGE_REF punc_ref;
00365   // Have we returned to the punc dawg at the end of the word?
00366   bool back_to_punc;
00367 };
00368 
00369 class DawgPositionVector : public GenericVector<DawgPosition> {
00370  public:
00372   ~DawgPositionVector() {
00373     if (size_reserved_ > 0) {
00374       delete[] data_;
00375       size_used_ = 0;
00376       size_reserved_ = 0;
00377     }
00378   }
00381   void clear() { size_used_ = 0; }
00385   inline bool add_unique(const DawgPosition &new_pos,
00386                          bool debug,
00387                          const char *debug_msg) {
00388     for (int i = 0; i < size_used_; ++i) {
00389       if (data_[i] == new_pos) return false;
00390     }
00391     push_back(new_pos);
00392     if (debug) {
00393       tprintf("%s[%d, " REFFORMAT "] [punc: " REFFORMAT "%s]\n",
00394               debug_msg, new_pos.dawg_index, new_pos.dawg_ref,
00395               new_pos.punc_ref, new_pos.back_to_punc ? " returned" : "");
00396     }
00397     return true;
00398   }
00399 };
00400 
00401 //
00408 //
00409 class SquishedDawg : public Dawg {
00410  public:
00411   SquishedDawg(FILE *file, DawgType type, const STRING &lang,
00412                PermuterType perm, int debug_level) {
00413     read_squished_dawg(file, type, lang, perm, debug_level);
00414     num_forward_edges_in_node0 = num_forward_edges(0);
00415   }
00416   SquishedDawg(const char* filename, DawgType type,
00417                const STRING &lang, PermuterType perm, int debug_level) {
00418     FILE *file = fopen(filename, "rb");
00419     if (file == NULL) {
00420       tprintf("Failed to open dawg file %s\n", filename);
00421       exit(1);
00422     }
00423     read_squished_dawg(file, type, lang, perm, debug_level);
00424     num_forward_edges_in_node0 = num_forward_edges(0);
00425     fclose(file);
00426   }
00427   SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
00428                const STRING &lang, PermuterType perm,
00429                int unicharset_size, int debug_level) :
00430     edges_(edges), num_edges_(num_edges) {
00431     init(type, lang, perm, unicharset_size, debug_level);
00432     num_forward_edges_in_node0 = num_forward_edges(0);
00433     if (debug_level > 3) print_all("SquishedDawg:");
00434   }
00435   ~SquishedDawg();
00436 
00437   int NumEdges() { return num_edges_; }
00438 
00440   EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
00441                         bool word_end) const;
00442 
00445   void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
00446                       bool word_end) const {
00447     EDGE_REF edge = node;
00448     if (!edge_occupied(edge) || edge == NO_EDGE) return;
00449     assert(forward_edge(edge));  // we don't expect any backward edges to
00450     do {                         // be present when this funciton is called
00451       if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
00452         vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
00453       }
00454     } while (!last_edge(edge++));
00455   }
00456 
00459   NODE_REF next_node(EDGE_REF edge) const {
00460     return next_node_from_edge_rec((edges_[edge]));
00461   }
00462 
00465   bool end_of_word(EDGE_REF edge_ref) const {
00466     return end_of_word_from_edge_rec((edges_[edge_ref]));
00467   }
00468 
00470   UNICHAR_ID edge_letter(EDGE_REF edge_ref) const {
00471     return unichar_id_from_edge_rec((edges_[edge_ref]));
00472   }
00473 
00476   void print_node(NODE_REF node, int max_num_edges) const;
00477 
00479   void write_squished_dawg(FILE *file);
00480 
00483   void write_squished_dawg(const char *filename) {
00484     FILE *file = fopen(filename, "wb");
00485     if (file == NULL) {
00486       tprintf("Error opening %s\n", filename);
00487       exit(1);
00488     }
00489     this->write_squished_dawg(file);
00490     fclose(file);
00491   }
00492 
00493  private:
00495   inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {
00496     set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
00497   }
00499   inline void set_empty_edge(EDGE_REF edge_ref) {
00500     (edges_[edge_ref] = next_node_mask_);
00501   }
00503   inline void clear_all_edges() {
00504     for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
00505   }
00507   inline void clear_marker_flag(EDGE_REF edge_ref) {
00508      (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_));
00509   }
00511   inline bool forward_edge(EDGE_REF edge_ref) const {
00512     return (edge_occupied(edge_ref) &&
00513             (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
00514   }
00516   inline bool backward_edge(EDGE_REF edge_ref) const {
00517     return (edge_occupied(edge_ref) &&
00518             (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
00519   }
00521   inline bool edge_occupied(EDGE_REF edge_ref) const {
00522     return (edges_[edge_ref] != next_node_mask_);
00523   }
00525   inline bool last_edge(EDGE_REF edge_ref) const {
00526     return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0;
00527   }
00528 
00530   inT32 num_forward_edges(NODE_REF node) const;
00531 
00533   void read_squished_dawg(FILE *file, DawgType type, const STRING &lang,
00534                           PermuterType perm, int debug_level);
00535 
00537   void print_edge(EDGE_REF edge) const;
00538 
00540   void print_all(const char* msg) {
00541     tprintf("\n__________________________\n%s\n", msg);
00542     for (int i = 0; i < num_edges_; ++i) print_edge(i);
00543     tprintf("__________________________\n");
00544   }
00546   NODE_MAP build_node_map(inT32 *num_nodes) const;
00547 
00548 
00549   // Member variables.
00550   EDGE_ARRAY edges_;
00551   int num_edges_;
00552   int num_forward_edges_in_node0;
00553 };
00554 
00555 }  // namespace tesseract
00556 
00557 #endif  // DICT_DAWG_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines