tesseract
3.03
|
00001 // Copyright 2010 Google Inc. All Rights Reserved. 00002 // Author: rays@google.com (Ray Smith) 00004 // File: shapetable.h 00005 // Description: Class to map a classifier shape index to unicharset 00006 // indices and font indices. 00007 // Author: Ray Smith 00008 // Created: Thu Oct 28 17:46:32 PDT 2010 00009 // 00010 // (C) Copyright 2010, Google Inc. 00011 // Licensed under the Apache License, Version 2.0 (the "License"); 00012 // you may not use this file except in compliance with the License. 00013 // You may obtain a copy of the License at 00014 // http://www.apache.org/licenses/LICENSE-2.0 00015 // Unless required by applicable law or agreed to in writing, software 00016 // distributed under the License is distributed on an "AS IS" BASIS, 00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00018 // See the License for the specific language governing permissions and 00019 // limitations under the License. 00020 // 00022 00023 #ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_ 00024 #define TESSERACT_CLASSIFY_SHAPETABLE_H_ 00025 00026 #include "bitvector.h" 00027 #include "genericheap.h" 00028 #include "genericvector.h" 00029 #include "intmatcher.h" 00030 00031 class STRING; 00032 class UNICHARSET; 00033 00034 namespace tesseract { 00035 00036 struct FontInfo; 00037 class FontInfoTable; 00038 class ShapeTable; 00039 00040 // Simple struct to hold a single classifier unichar selection, a corresponding 00041 // rating, and a list of appropriate fonts. 00042 struct UnicharRating { 00043 UnicharRating() : unichar_id(0), rating(0.0f) {} 00044 UnicharRating(int u, float r) 00045 : unichar_id(u), rating(r) {} 00046 00047 // Sort function to sort ratings appropriately by descending rating. 00048 static int SortDescendingRating(const void* t1, const void* t2) { 00049 const UnicharRating* a = reinterpret_cast<const UnicharRating *>(t1); 00050 const UnicharRating* b = reinterpret_cast<const UnicharRating *>(t2); 00051 if (a->rating > b->rating) { 00052 return -1; 00053 } else if (a->rating < b->rating) { 00054 return 1; 00055 } else { 00056 return a->unichar_id - b->unichar_id; 00057 } 00058 } 00059 // Helper function to get the index of the first result with the required 00060 // unichar_id. If the results are sorted by rating, this will also be the 00061 // best result with the required unichar_id. 00062 // Returns -1 if the unichar_id is not found 00063 static int FirstResultWithUnichar(const GenericVector<UnicharRating>& results, 00064 UNICHAR_ID unichar_id); 00065 00066 // Index into some UNICHARSET table indicates the class of the answer. 00067 UNICHAR_ID unichar_id; 00068 // Rating from classifier with 1.0 perfect and 0.0 impossible. 00069 // Call it a probability if you must. 00070 float rating; 00071 // Set of fonts for this shape in order of decreasing preference. 00072 // (There is no mechanism for storing scores for fonts as yet.) 00073 GenericVector<int> fonts; 00074 }; 00075 00076 // Classifier result from a low-level classification is an index into some 00077 // ShapeTable and a rating. 00078 struct ShapeRating { 00079 ShapeRating() 00080 : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f), 00081 joined(false), broken(false) {} 00082 ShapeRating(int s, float r) 00083 : shape_id(s), rating(r), raw(1.0f), font(0.0f), 00084 joined(false), broken(false) {} 00085 00086 // Sort function to sort ratings appropriately by descending rating. 00087 static int SortDescendingRating(const void* t1, const void* t2) { 00088 const ShapeRating* a = reinterpret_cast<const ShapeRating *>(t1); 00089 const ShapeRating* b = reinterpret_cast<const ShapeRating *>(t2); 00090 if (a->rating > b->rating) { 00091 return -1; 00092 } else if (a->rating < b->rating) { 00093 return 1; 00094 } else { 00095 return a->shape_id - b->shape_id; 00096 } 00097 } 00098 // Helper function to get the index of the first result with the required 00099 // unichar_id. If the results are sorted by rating, this will also be the 00100 // best result with the required unichar_id. 00101 // Returns -1 if the unichar_id is not found 00102 static int FirstResultWithUnichar(const GenericVector<ShapeRating>& results, 00103 const ShapeTable& shape_table, 00104 UNICHAR_ID unichar_id); 00105 00106 // Index into some shape table indicates the class of the answer. 00107 int shape_id; 00108 // Rating from classifier with 1.0 perfect and 0.0 impossible. 00109 // Call it a probability if you must. 00110 float rating; 00111 // Subsidiary rating that a classifier may use internally. 00112 float raw; 00113 // Subsidiary rating that a classifier may use internally. 00114 float font; 00115 // Flag indicating that the input may be joined. 00116 bool joined; 00117 // Flag indicating that the input may be broken (a fragment). 00118 bool broken; 00119 }; 00120 00121 // Simple struct to hold an entry for a heap-based priority queue of 00122 // ShapeRating. 00123 struct ShapeQueueEntry { 00124 ShapeQueueEntry() : result(ShapeRating(0, 0.0f)), level(0) {} 00125 ShapeQueueEntry(const ShapeRating& rating, int level0) 00126 : result(rating), level(level0) {} 00127 00128 // Sort by decreasing rating and decreasing level for equal rating. 00129 bool operator<(const ShapeQueueEntry& other) const { 00130 if (result.rating > other.result.rating) return true; 00131 if (result.rating == other.result.rating) 00132 return level > other.level; 00133 return false; 00134 } 00135 00136 // Output from classifier. 00137 ShapeRating result; 00138 // Which level in the tree did this come from? 00139 int level; 00140 }; 00141 typedef GenericHeap<ShapeQueueEntry> ShapeQueue; 00142 00143 // Simple struct to hold a set of fonts associated with a single unichar-id. 00144 // A vector of UnicharAndFonts makes a shape. 00145 struct UnicharAndFonts { 00146 UnicharAndFonts() : unichar_id(0) { 00147 } 00148 UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) { 00149 font_ids.push_back(font_id); 00150 } 00151 00152 // Writes to the given file. Returns false in case of error. 00153 bool Serialize(FILE* fp) const; 00154 // Reads from the given file. Returns false in case of error. 00155 // If swap is true, assumes a big/little-endian swap is needed. 00156 bool DeSerialize(bool swap, FILE* fp); 00157 00158 // Sort function to sort a pair of UnicharAndFonts by unichar_id. 00159 static int SortByUnicharId(const void* v1, const void* v2); 00160 00161 GenericVector<inT32> font_ids; 00162 inT32 unichar_id; 00163 }; 00164 00165 // A Shape is a collection of unichar-ids and a list of fonts associated with 00166 // each, organized as a vector of UnicharAndFonts. Conceptually a Shape is 00167 // a classifiable unit, and represents a group of characters or parts of 00168 // characters that have a similar or identical shape. Shapes/ShapeTables may 00169 // be organized hierarchically from identical shapes at the leaves to vaguely 00170 // similar shapes near the root. 00171 class Shape { 00172 public: 00173 Shape() : destination_index_(-1) {} 00174 00175 // Writes to the given file. Returns false in case of error. 00176 bool Serialize(FILE* fp) const; 00177 // Reads from the given file. Returns false in case of error. 00178 // If swap is true, assumes a big/little-endian swap is needed. 00179 bool DeSerialize(bool swap, FILE* fp); 00180 00181 int destination_index() const { 00182 return destination_index_; 00183 } 00184 void set_destination_index(int index) { 00185 destination_index_ = index; 00186 } 00187 int size() const { 00188 return unichars_.size(); 00189 } 00190 // Returns a UnicharAndFonts entry for the given index, which must be 00191 // in the range [0, size()). 00192 const UnicharAndFonts& operator[](int index) const { 00193 return unichars_[index]; 00194 } 00195 // Sets the unichar_id of the given index to the new unichar_id. 00196 void SetUnicharId(int index, int unichar_id) { 00197 unichars_[index].unichar_id = unichar_id; 00198 } 00199 // Adds a font_id for the given unichar_id. If the unichar_id is not 00200 // in the shape, it is added. 00201 void AddToShape(int unichar_id, int font_id); 00202 // Adds everything in other to this. 00203 void AddShape(const Shape& other); 00204 // Returns true if the shape contains the given unichar_id, font_id pair. 00205 bool ContainsUnicharAndFont(int unichar_id, int font_id) const; 00206 // Returns true if the shape contains the given unichar_id, ignoring font. 00207 bool ContainsUnichar(int unichar_id) const; 00208 // Returns true if the shape contains the given font, ignoring unichar_id. 00209 bool ContainsFont(int font_id) const; 00210 // Returns true if the shape contains the given font properties, ignoring 00211 // unichar_id. 00212 bool ContainsFontProperties(const FontInfoTable& font_table, 00213 uinT32 properties) const; 00214 // Returns true if the shape contains multiple different font properties, 00215 // ignoring unichar_id. 00216 bool ContainsMultipleFontProperties(const FontInfoTable& font_table) const; 00217 // Returns true if this shape is equal to other (ignoring order of unichars 00218 // and fonts). 00219 bool operator==(const Shape& other) const; 00220 // Returns true if this is a subset (including equal) of other. 00221 bool IsSubsetOf(const Shape& other) const; 00222 // Returns true if the lists of unichar ids are the same in this and other, 00223 // ignoring fonts. 00224 // NOT const, as it will sort the unichars on demand. 00225 bool IsEqualUnichars(Shape* other); 00226 00227 private: 00228 // Sorts the unichars_ vector by unichar. 00229 void SortUnichars(); 00230 00231 // Flag indicates that the unichars are sorted, allowing faster set 00232 // operations with another shape. 00233 bool unichars_sorted_; 00234 // If this Shape is part of a ShapeTable the destiation_index_ is the index 00235 // of some other shape in the ShapeTable with which this shape is merged. 00236 int destination_index_; 00237 // Array of unichars, each with a set of fonts. Each unichar has at most 00238 // one entry in the vector. 00239 GenericVector<UnicharAndFonts> unichars_; 00240 }; 00241 00242 // ShapeTable is a class to encapsulate the triple indirection that is 00243 // used here. 00244 // ShapeTable is a vector of shapes. 00245 // Each shape is a vector of UnicharAndFonts representing the set of unichars 00246 // that the shape represents. 00247 // Each UnicharAndFonts also lists the fonts of the unichar_id that were 00248 // mapped to the shape during training. 00249 class ShapeTable { 00250 public: 00251 ShapeTable(); 00252 // The UNICHARSET reference supplied here, or in set_unicharset below must 00253 // exist for the entire life of the ShapeTable. It is used only by DebugStr. 00254 explicit ShapeTable(const UNICHARSET& unicharset); 00255 00256 // Writes to the given file. Returns false in case of error. 00257 bool Serialize(FILE* fp) const; 00258 // Reads from the given file. Returns false in case of error. 00259 // If swap is true, assumes a big/little-endian swap is needed. 00260 bool DeSerialize(bool swap, FILE* fp); 00261 00262 // Accessors. 00263 int NumShapes() const { 00264 return shape_table_.size(); 00265 } 00266 const UNICHARSET& unicharset() const { 00267 return *unicharset_; 00268 } 00269 // Returns the number of fonts used in this ShapeTable, computing it if 00270 // necessary. 00271 int NumFonts() const; 00272 // Shapetable takes a pointer to the UNICHARSET, so it must persist for the 00273 // entire life of the ShapeTable. 00274 void set_unicharset(const UNICHARSET& unicharset) { 00275 unicharset_ = &unicharset; 00276 } 00277 // Re-indexes the class_ids in the shapetable according to the given map. 00278 // Useful in conjunction with set_unicharset. 00279 void ReMapClassIds(const GenericVector<int>& unicharset_map); 00280 // Returns a string listing the classes/fonts in a shape. 00281 STRING DebugStr(int shape_id) const; 00282 // Returns a debug string summarizing the table. 00283 STRING SummaryStr() const; 00284 00285 // Adds a new shape starting with the given unichar_id and font_id. 00286 // Returns the assigned index. 00287 int AddShape(int unichar_id, int font_id); 00288 // Adds a copy of the given shape unless it is already present. 00289 // Returns the assigned index or index of existing shape if already present. 00290 int AddShape(const Shape& other); 00291 // Removes the shape given by the shape index. All indices above are changed! 00292 void DeleteShape(int shape_id); 00293 // Adds a font_id to the given existing shape index for the given 00294 // unichar_id. If the unichar_id is not in the shape, it is added. 00295 void AddToShape(int shape_id, int unichar_id, int font_id); 00296 // Adds the given shape to the existing shape with the given index. 00297 void AddShapeToShape(int shape_id, const Shape& other); 00298 // Returns the id of the shape that contains the given unichar and font. 00299 // If not found, returns -1. 00300 // If font_id < 0, the font_id is ignored and the first shape that matches 00301 // the unichar_id is returned. 00302 int FindShape(int unichar_id, int font_id) const; 00303 // Returns the first unichar_id and font_id in the given shape. 00304 void GetFirstUnicharAndFont(int shape_id, 00305 int* unichar_id, int* font_id) const; 00306 00307 // Accessors for the Shape with the given shape_id. 00308 const Shape& GetShape(int shape_id) const { 00309 return *shape_table_[shape_id]; 00310 } 00311 Shape* MutableShape(int shape_id) { 00312 return shape_table_[shape_id]; 00313 } 00314 00315 // Expands all the classes/fonts in the shape individually to build 00316 // a ShapeTable. 00317 int BuildFromShape(const Shape& shape, const ShapeTable& master_shapes); 00318 00319 // Returns true if the shapes are already merged. 00320 bool AlreadyMerged(int shape_id1, int shape_id2) const; 00321 // Returns true if any shape contains multiple unichars. 00322 bool AnyMultipleUnichars() const; 00323 // Returns the maximum number of unichars over all shapes. 00324 int MaxNumUnichars() const; 00325 // Merges shapes with a common unichar over the [start, end) interval. 00326 // Assumes single unichar per shape. 00327 void ForceFontMerges(int start, int end); 00328 // Returns the number of unichars in the master shape. 00329 int MasterUnicharCount(int shape_id) const; 00330 // Returns the sum of the font counts in the master shape. 00331 int MasterFontCount(int shape_id) const; 00332 // Returns the number of unichars that would result from merging the shapes. 00333 int MergedUnicharCount(int shape_id1, int shape_id2) const; 00334 // Merges two shape_ids, leaving shape_id2 marked as merged. 00335 void MergeShapes(int shape_id1, int shape_id2); 00336 // Swaps two shape_ids. 00337 void SwapShapes(int shape_id1, int shape_id2); 00338 // Appends the master shapes from other to this. 00339 // Used to create a clean ShapeTable from a merged one, or to create a 00340 // copy of a ShapeTable. 00341 // If not NULL, shape_map is set to map other shape_ids to this's shape_ids. 00342 void AppendMasterShapes(const ShapeTable& other, 00343 GenericVector<int>* shape_map); 00344 // Returns the number of master shapes remaining after merging. 00345 int NumMasterShapes() const; 00346 // Returns the destination of this shape, (if merged), taking into account 00347 // the fact that the destination may itself have been merged. 00348 // For a non-merged shape, returns the input shape_id. 00349 int MasterDestinationIndex(int shape_id) const; 00350 00351 // Returns false if the unichars in neither shape is a subset of the other.. 00352 bool SubsetUnichar(int shape_id1, int shape_id2) const; 00353 // Returns false if the unichars in neither shape is a subset of the other.. 00354 bool MergeSubsetUnichar(int merge_id1, int merge_id2, int shape_id) const; 00355 // Returns true if the unichar sets are equal between the shapes. 00356 bool EqualUnichars(int shape_id1, int shape_id2) const; 00357 bool MergeEqualUnichars(int merge_id1, int merge_id2, int shape_id) const; 00358 // Returns true if there is a common unichar between the shapes. 00359 bool CommonUnichars(int shape_id1, int shape_id2) const; 00360 // Returns true if there is a common font id between the shapes. 00361 bool CommonFont(int shape_id1, int shape_id2) const; 00362 00363 // Adds the unichars of the given shape_id to the vector of results. Any 00364 // unichar_id that is already present just has the fonts added to the 00365 // font set for that result without adding a new entry in the vector. 00366 // NOTE: it is assumed that the results are given to this function in order 00367 // of decreasing rating. 00368 // The unichar_map vector indicates the index of the results entry containing 00369 // each unichar, or -1 if the unichar is not yet included in results. 00370 void AddShapeToResults(const ShapeRating& shape_rating, 00371 GenericVector<int>* unichar_map, 00372 GenericVector<UnicharRating>* results) const; 00373 00374 private: 00375 // Adds the given unichar_id to the results if needed, updating unichar_map 00376 // and returning the index of unichar in results. 00377 int AddUnicharToResults(int unichar_id, float rating, 00378 GenericVector<int>* unichar_map, 00379 GenericVector<UnicharRating>* results) const; 00380 00381 // Pointer to a provided unicharset used only by the Debugstr member. 00382 const UNICHARSET* unicharset_; 00383 // Vector of pointers to the Shapes in this ShapeTable. 00384 PointerVector<Shape> shape_table_; 00385 00386 // Cached data calculated on demand. 00387 mutable int num_fonts_; 00388 }; 00389 00390 } // namespace tesseract. 00391 00392 #endif // TESSERACT_CLASSIFY_SHAPETABLE_H_