// tesseract 3.03
00001 // Copyright 2010 Google Inc. All Rights Reserved. 00002 // Author: rays@google.com (Ray Smith) 00003 // 00004 // Licensed under the Apache License, Version 2.0 (the "License"); 00005 // you may not use this file except in compliance with the License. 00006 // You may obtain a copy of the License at 00007 // http://www.apache.org/licenses/LICENSE-2.0 00008 // Unless required by applicable law or agreed to in writing, software 00009 // distributed under the License is distributed on an "AS IS" BASIS, 00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00011 // See the License for the specific language governing permissions and 00012 // limitations under the License. 00013 // 00015 00016 #ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H__ 00017 #define TESSERACT_TRAINING_TRAININGSAMPLESET_H__ 00018 00019 #include "bitvector.h" 00020 #include "genericvector.h" 00021 #include "indexmapbidi.h" 00022 #include "matrix.h" 00023 #include "shapetable.h" 00024 #include "trainingsample.h" 00025 00026 class UNICHARSET; 00027 00028 namespace tesseract { 00029 00030 struct FontInfo; 00031 class FontInfoTable; 00032 class IntFeatureMap; 00033 class IntFeatureSpace; 00034 class TrainingSample; 00035 struct UnicharAndFonts; 00036 00037 // Collection of TrainingSample used for training or testing a classifier. 00038 // Provides several useful methods to operate on the collection as a whole, 00039 // including outlier detection and deletion, providing access by font and 00040 // class, finding the canonical sample, finding the "cloud" features (OR of 00041 // all features in all samples), replication of samples, caching of distance 00042 // metrics. 00043 class TrainingSampleSet { 00044 public: 00045 explicit TrainingSampleSet(const FontInfoTable& fontinfo_table); 00046 ~TrainingSampleSet(); 00047 00048 // Writes to the given file. Returns false in case of error. 00049 bool Serialize(FILE* fp) const; 00050 // Reads from the given file. 
Returns false in case of error. 00051 // If swap is true, assumes a big/little-endian swap is needed. 00052 bool DeSerialize(bool swap, FILE* fp); 00053 00054 // Accessors 00055 int num_samples() const { 00056 return samples_.size(); 00057 } 00058 int num_raw_samples() const { 00059 return num_raw_samples_; 00060 } 00061 int NumFonts() const { 00062 return font_id_map_.SparseSize(); 00063 } 00064 const UNICHARSET& unicharset() const { 00065 return unicharset_; 00066 } 00067 int charsetsize() const { 00068 return unicharset_size_; 00069 } 00070 const FontInfoTable& fontinfo_table() const { 00071 return fontinfo_table_; 00072 } 00073 00074 // Loads an initial unicharset, or sets one up if the file cannot be read. 00075 void LoadUnicharset(const char* filename); 00076 00077 // Adds a character sample to this sample set. 00078 // If the unichar is not already in the local unicharset, it is added. 00079 // Returns the unichar_id of the added sample, from the local unicharset. 00080 int AddSample(const char* unichar, TrainingSample* sample); 00081 // Adds a character sample to this sample set with the given unichar_id, 00082 // which must correspond to the local unicharset (in this). 00083 void AddSample(int unichar_id, TrainingSample* sample); 00084 00085 // Returns the number of samples for the given font,class pair. 00086 // If randomize is true, returns the number of samples accessible 00087 // with randomizing on. (Increases the number of samples if small.) 00088 // OrganizeByFontAndClass must have been already called. 00089 int NumClassSamples(int font_id, int class_id, bool randomize) const; 00090 00091 // Gets a sample by its index. 00092 const TrainingSample* GetSample(int index) const; 00093 00094 // Gets a sample by its font, class, index. 00095 // OrganizeByFontAndClass must have been already called. 00096 const TrainingSample* GetSample(int font_id, int class_id, int index) const; 00097 00098 // Get a sample by its font, class, index. Does not randomize. 
00099 // OrganizeByFontAndClass must have been already called. 00100 TrainingSample* MutableSample(int font_id, int class_id, int index); 00101 00102 // Returns a string debug representation of the given sample: 00103 // font, unichar_str, bounding box, page. 00104 STRING SampleToString(const TrainingSample& sample) const; 00105 00106 // Gets the combined set of features used by all the samples of the given 00107 // font/class combination. 00108 const BitVector& GetCloudFeatures(int font_id, int class_id) const; 00109 // Gets the indexed features of the canonical sample of the given 00110 // font/class combination. 00111 const GenericVector<int>& GetCanonicalFeatures(int font_id, 00112 int class_id) const; 00113 00114 // Returns the distance between the given UniCharAndFonts pair. 00115 // If matched_fonts, only matching fonts, are considered, unless that yields 00116 // the empty set. 00117 // OrganizeByFontAndClass must have been already called. 00118 float UnicharDistance(const UnicharAndFonts& uf1, const UnicharAndFonts& uf2, 00119 bool matched_fonts, const IntFeatureMap& feature_map); 00120 00121 // Returns the distance between the given pair of font/class pairs. 00122 // Finds in cache or computes and caches. 00123 // OrganizeByFontAndClass must have been already called. 00124 float ClusterDistance(int font_id1, int class_id1, 00125 int font_id2, int class_id2, 00126 const IntFeatureMap& feature_map); 00127 00128 // Computes the distance between the given pair of font/class pairs. 00129 float ComputeClusterDistance(int font_id1, int class_id1, 00130 int font_id2, int class_id2, 00131 const IntFeatureMap& feature_map) const; 00132 00133 // Returns the number of canonical features of font/class 2 for which 00134 // neither the feature nor any of its near neighbors occurs in the cloud 00135 // of font/class 1. 
Each such feature is a reliable separation between 00136 // the classes, ASSUMING that the canonical sample is sufficiently 00137 // representative that every sample has a feature near that particular 00138 // feature. To check that this is so on the fly would be prohibitively 00139 // expensive, but it might be possible to pre-qualify the canonical features 00140 // to include only those for which this assumption is true. 00141 // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called 00142 // first, or the results will be nonsense. 00143 int ReliablySeparable(int font_id1, int class_id1, 00144 int font_id2, int class_id2, 00145 const IntFeatureMap& feature_map, 00146 bool thorough) const; 00147 00148 00149 // Returns the total index of the requested sample. 00150 // OrganizeByFontAndClass must have been already called. 00151 int GlobalSampleIndex(int font_id, int class_id, int index) const; 00152 00153 // Gets the canonical sample for the given font, class pair. 00154 // ComputeCanonicalSamples must have been called first. 00155 const TrainingSample* GetCanonicalSample(int font_id, int class_id) const; 00156 // Gets the max distance for the given canonical sample. 00157 // ComputeCanonicalSamples must have been called first. 00158 float GetCanonicalDist(int font_id, int class_id) const; 00159 00160 // Returns a mutable pointer to the sample with the given index. 00161 TrainingSample* mutable_sample(int index) { 00162 return samples_[index]; 00163 } 00164 // Gets ownership of the sample with the given index, removing it from this. 00165 TrainingSample* extract_sample(int index) { 00166 TrainingSample* sample = samples_[index]; 00167 samples_[index] = NULL; 00168 return sample; 00169 } 00170 00171 // Generates indexed features for all samples with the supplied feature_space. 00172 void IndexFeatures(const IntFeatureSpace& feature_space); 00173 00174 // Delete outlier samples with few features that are shared with others. 
00175 // IndexFeatures must have been called already. 00176 void DeleteOutliers(const IntFeatureSpace& feature_space, bool debug); 00177 00178 // Marks the given sample for deletion. 00179 // Deletion is actually completed by DeleteDeadSamples. 00180 void KillSample(TrainingSample* sample); 00181 00182 // Deletes all samples with a negative sample index marked by KillSample. 00183 // Must be called before OrganizeByFontAndClass, and OrganizeByFontAndClass 00184 // must be called after as the samples have been renumbered. 00185 void DeleteDeadSamples(); 00186 00187 // Callback function returns true if the given sample is to be deleted, due 00188 // to having a negative classid. 00189 bool DeleteableSample(const TrainingSample* sample); 00190 00191 // Construct an array to access the samples by font,class pair. 00192 void OrganizeByFontAndClass(); 00193 00194 // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact 00195 // index for the font_class_array_. 00196 void SetupFontIdMap(); 00197 00198 // Finds the sample for each font, class pair that has least maximum 00199 // distance to all the other samples of the same font, class. 00200 // OrganizeByFontAndClass must have been already called. 00201 void ComputeCanonicalSamples(const IntFeatureMap& map, bool debug); 00202 00203 // Replicates the samples to a minimum frequency defined by 00204 // 2 * kSampleRandomSize, or for larger counts duplicates all samples. 00205 // After replication, the replicated samples are perturbed slightly, but 00206 // in a predictable and repeatable way. 00207 // Use after OrganizeByFontAndClass(). 00208 void ReplicateAndRandomizeSamples(); 00209 00210 // Caches the indexed features of the canonical samples. 00211 // ComputeCanonicalSamples must have been already called. 00212 void ComputeCanonicalFeatures(); 00213 // Computes the combined set of features used by all the samples of each 00214 // font/class combination. Use after ReplicateAndRandomizeSamples. 
00215 void ComputeCloudFeatures(int feature_space_size); 00216 00217 // Adds all fonts of the given class to the shape. 00218 void AddAllFontsForClass(int class_id, Shape* shape) const; 00219 00220 // Display the samples with the given indexed feature that also match 00221 // the given shape. 00222 void DisplaySamplesWithFeature(int f_index, const Shape& shape, 00223 const IntFeatureSpace& feature_space, 00224 ScrollView::Color color, 00225 ScrollView* window) const; 00226 00227 private: 00228 // Struct to store a triplet of unichar, font, distance in the distance cache. 00229 struct FontClassDistance { 00230 int unichar_id; 00231 int font_id; // Real font id. 00232 float distance; 00233 }; 00234 // Simple struct to store information related to each font/class combination. 00235 struct FontClassInfo { 00236 FontClassInfo(); 00237 00238 // Writes to the given file. Returns false in case of error. 00239 bool Serialize(FILE* fp) const; 00240 // Reads from the given file. Returns false in case of error. 00241 // If swap is true, assumes a big/little-endian swap is needed. 00242 bool DeSerialize(bool swap, FILE* fp); 00243 00244 // Number of raw samples. 00245 inT32 num_raw_samples; 00246 // Index of the canonical sample. 00247 inT32 canonical_sample; 00248 // Max distance of the canonical sample from any other. 00249 float canonical_dist; 00250 // Sample indices for the samples, including replicated. 00251 GenericVector<inT32> samples; 00252 00253 // Non-serialized cache data. 00254 // Indexed features of the canonical sample. 00255 GenericVector<int> canonical_features; 00256 // The mapped features of all the samples. 00257 BitVector cloud_features; 00258 00259 // Caches for ClusterDistance. 00260 // Caches for other fonts but matching this unichar. -1 indicates not set. 00261 // Indexed by compact font index from font_id_map_. 00262 GenericVector<float> font_distance_cache; 00263 // Caches for other unichars but matching this font. -1 indicates not set. 
00264 GenericVector<float> unichar_distance_cache; 00265 // Cache for the rest (non matching font and unichar.) 00266 // A cache of distances computed by ReliablySeparable. 00267 GenericVector<FontClassDistance> distance_cache; 00268 }; 00269 00270 PointerVector<TrainingSample> samples_; 00271 // Number of samples before replication/randomization. 00272 int num_raw_samples_; 00273 // Character set we are training for. 00274 UNICHARSET unicharset_; 00275 // Character set size to which the 2-d arrays below refer. 00276 int unicharset_size_; 00277 // Map to allow the font_class_array_ below to be compact. 00278 // The sparse space is the real font_id, used in samples_ . 00279 // The compact space is an index to font_class_array_ 00280 IndexMapBiDi font_id_map_; 00281 // A 2-d array of FontClassInfo holding information related to each 00282 // (font_id, class_id) pair. 00283 GENERIC_2D_ARRAY<FontClassInfo>* font_class_array_; 00284 00285 // Reference to the fontinfo_table_ in MasterTrainer. Provides names 00286 // for font_ids in the samples. Not serialized! 00287 const FontInfoTable& fontinfo_table_; 00288 }; 00289 00290 } // namespace tesseract. 00291 00292 00293 #endif // TRAININGSAMPLESETSET_H_