tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccutil/ambigs.h
Go to the documentation of this file.
00001 
00002 // File:        ambigs.h
00003 // Description: Constants, flags, functions for dealing with
00004 //              ambiguities (training and recognition).
00005 // Author:      Daria Antonova
00006 // Created:     Mon Aug 23 11:26:43 PDT 2008
00007 //
00008 // (C) Copyright 2008, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #ifndef TESSERACT_CCUTIL_AMBIGS_H_
00022 #define TESSERACT_CCUTIL_AMBIGS_H_
00023 
00024 #include "elst.h"
00025 #include "tprintf.h"
00026 #include "unichar.h"
00027 #include "unicharset.h"
00028 #include "genericvector.h"
00029 
// Upper bound on the number of unichar ids in one ambiguity ngram.
// AmbigSpec sizes its wrong_ngram and correct_fragments arrays as
// MAX_AMBIG_SIZE + 1 (presumably leaving one slot for the
// INVALID_UNICHAR_ID terminator -- confirm against the loader).
#define MAX_AMBIG_SIZE    10
00031 
00032 namespace tesseract {
00033 
00034 typedef GenericVector<UNICHAR_ID> UnicharIdVector;
00035 
00036 static const int kUnigramAmbigsBufferSize = 1000;
00037 static const char kAmbigNgramSeparator[] = { ' ', '\0' };
00038 static const char kAmbigDelimiters[] = "\t ";
00039 static const char kIllegalMsg[] =
00040   "Illegal ambiguity specification on line %d\n";
00041 static const char kIllegalUnicharMsg[] =
00042   "Illegal unichar %s in ambiguity specification\n";
00043 
// Classification of an (OCRed ngram, correct ngram) pair.
enum AmbigType {
  // The ngram pair is not ambiguous.
  NOT_AMBIG = 0,
  // The OCRed ngram should always be substituted with the correct one.
  REPLACE_AMBIG,
  // Add the correct ngram to the classifier results (1-1).
  DEFINITE_AMBIG,
  // Use the pairwise classifier for the OCRed/correct pair (1-1).
  SIMILAR_AMBIG,
  // This is a case ambiguity (1-1).
  CASE_AMBIG,

  // Number of enum entries above; must stay last.
  AMBIG_TYPE_COUNT
};
00053 
00054 // A collection of utility functions for arrays of UNICHAR_IDs that are
00055 // terminated by INVALID_UNICHAR_ID.
00056 class UnicharIdArrayUtils {
00057  public:
00058   // Compares two arrays of unichar ids. Returns -1 if the length of array1 is
00059   // less than length of array2, if any array1[i] is less than array2[i].
00060   // Returns 0 if the arrays are equal, 1 otherwise.
00061   // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
00062   static inline int compare(const UNICHAR_ID array1[],
00063                             const UNICHAR_ID array2[]) {
00064     const UNICHAR_ID *ptr1 = array1;
00065     const UNICHAR_ID *ptr2 = array2;
00066     while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) {
00067       if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1;
00068       ++ptr1;
00069       ++ptr2;
00070     }
00071     if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0;
00072     return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1;
00073   }
00074 
00075   // Look uid in the vector of uids.  If found, the index of the matched
00076   // element is returned.  Otherwise, it returns -1.
00077   static inline int find_in(const UnicharIdVector& uid_vec,
00078                             const UNICHAR_ID uid) {
00079     for (int i = 0; i < uid_vec.size(); ++i)
00080       if (uid_vec[i] == uid) return i;
00081     return -1;
00082   }
00083 
00084   // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
00085   // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
00086   // and that dst has enough space for all the elements from src.
00087   static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
00088     int i = 0;
00089     do {
00090       dst[i] = src[i];
00091     } while (dst[i++] != INVALID_UNICHAR_ID);
00092     return i - 1;
00093   }
00094 
00095   // Prints unichars corresponding to the unichar_ids in the given array.
00096   // The function assumes that array is terminated by INVALID_UNICHAR_ID.
00097   static inline void print(const UNICHAR_ID array[],
00098                            const UNICHARSET &unicharset) {
00099     const UNICHAR_ID *ptr = array;
00100     if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]");
00101     while (*ptr != INVALID_UNICHAR_ID) {
00102       tprintf("%s ", unicharset.id_to_unichar(*ptr++));
00103     }
00104     tprintf("( ");
00105     ptr = array;
00106     while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++);
00107     tprintf(")\n");
00108   }
00109 };
00110 
00111 // AMBIG_SPEC_LIST stores a list of dangerous ambigs that
00112 // start with the same unichar (e.g. r->t rn->m rr1->m).
00113 class AmbigSpec : public ELIST_LINK {
00114  public:
00115   AmbigSpec();
00116   ~AmbigSpec() {}
00117 
00118   // Comparator function for sorting AmbigSpec_LISTs. The lists will
00119   // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors
00120   // in a a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1].
00121   static int compare_ambig_specs(const void *spec1, const void *spec2) {
00122     const AmbigSpec *s1 =
00123       *reinterpret_cast<const AmbigSpec * const *>(spec1);
00124     const AmbigSpec *s2 =
00125       *reinterpret_cast<const AmbigSpec * const *>(spec2);
00126     int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
00127     if (result != 0) return result;
00128     return UnicharIdArrayUtils::compare(s1->correct_fragments,
00129                                         s2->correct_fragments);
00130   }
00131 
00132   UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
00133   UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
00134   UNICHAR_ID correct_ngram_id;
00135   AmbigType type;
00136   int wrong_ngram_size;
00137 };
00138 ELISTIZEH(AmbigSpec);
00139 
00140 // AMBIG_TABLE[i] stores a set of ambiguities whose
00141 // wrong ngram starts with unichar id i.
00142 typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector;
00143 
00144 class UnicharAmbigs {
00145  public:
00146   UnicharAmbigs() {}
00147   ~UnicharAmbigs() {
00148     replace_ambigs_.delete_data_pointers();
00149     dang_ambigs_.delete_data_pointers();
00150     one_to_one_definite_ambigs_.delete_data_pointers();
00151   }
00152 
00153   const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
00154   const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }
00155 
00156   // Initializes the ambigs by adding a NULL pointer to each table.
00157   void InitUnicharAmbigs(const UNICHARSET& unicharset,
00158                          bool use_ambigs_for_adaption);
00159 
00160   // Loads the universal ambigs that are useful for any language.
00161   void LoadUniversal(const UNICHARSET& encoder_set, UNICHARSET* unicharset);
00162 
00163   // Fills in two ambiguity tables (replaceable and dangerous) with information
00164   // read from the ambigs file. An ambiguity table is an array of lists.
00165   // The array is indexed by a class id. Each entry in the table provides
00166   // a list of potential ambiguities which can start with the corresponding
00167   // character. For example the ambiguity "rn -> m", would be located in the
00168   // table at index of unicharset.unichar_to_id('r').
00169   // In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
00170   // one_to_one_definite_ambigs_. This vector is also indexed by the class id
00171   // of the wrong part of the ambiguity and each entry contains a vector of
00172   // unichar ids that are ambiguous to it.
00173   // encoder_set is used to encode the ambiguity strings, undisturbed by new
00174   // unichar_ids that may be created by adding the ambigs.
00175   void LoadUnicharAmbigs(const UNICHARSET& encoder_set,
00176                          TFile *ambigs_file, int debug_level,
00177                          bool use_ambigs_for_adaption, UNICHARSET *unicharset);
00178 
00179   // Returns definite 1-1 ambigs for the given unichar id.
00180   inline const UnicharIdVector *OneToOneDefiniteAmbigs(
00181       UNICHAR_ID unichar_id) const {
00182     if (one_to_one_definite_ambigs_.empty()) return NULL;
00183     return one_to_one_definite_ambigs_[unichar_id];
00184   }
00185 
00186   // Returns a pointer to the vector with all unichar ids that appear in the
00187   // 'correct' part of the ambiguity pair when the given unichar id appears
00188   // in the 'wrong' part of the ambiguity. E.g. if DangAmbigs file consist of
00189   // m->rn,rn->m,m->iii, UnicharAmbigsForAdaption() called with unichar id of
00190   // m will return a pointer to a vector with unichar ids of r,n,i.
00191   inline const UnicharIdVector *AmbigsForAdaption(
00192       UNICHAR_ID unichar_id) const {
00193     if (ambigs_for_adaption_.empty()) return NULL;
00194     return ambigs_for_adaption_[unichar_id];
00195   }
00196 
00197   // Similar to the above, but return the vector of unichar ids for which
00198   // the given unichar_id is an ambiguity (appears in the 'wrong' part of
00199   // some ambiguity pair).
00200   inline const UnicharIdVector *ReverseAmbigsForAdaption(
00201       UNICHAR_ID unichar_id) const {
00202     if (reverse_ambigs_for_adaption_.empty()) return NULL;
00203     return reverse_ambigs_for_adaption_[unichar_id];
00204   }
00205 
00206  private:
00207   bool ParseAmbiguityLine(int line_num, int version, int debug_level,
00208                           const UNICHARSET &unicharset, char *buffer,
00209                           int *test_ambig_part_size,
00210                           UNICHAR_ID *test_unichar_ids,
00211                           int *replacement_ambig_part_size,
00212                           char *replacement_string, int *type);
00213   bool InsertIntoTable(UnicharAmbigsVector &table,
00214                        int test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
00215                        int replacement_ambig_part_size,
00216                        const char *replacement_string, int type,
00217                        AmbigSpec *ambig_spec, UNICHARSET *unicharset);
00218 
00219   UnicharAmbigsVector dang_ambigs_;
00220   UnicharAmbigsVector replace_ambigs_;
00221   GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
00222   GenericVector<UnicharIdVector *> ambigs_for_adaption_;
00223   GenericVector<UnicharIdVector *> reverse_ambigs_for_adaption_;
00224 };
00225 
00226 }  // namespace tesseract
00227 
00228 #endif  // TESSERACT_CCUTIL_AMBIGS_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines