tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/classify/trainingsampleset.h
Go to the documentation of this file.
00001 // Copyright 2010 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00003 //
00004 // Licensed under the Apache License, Version 2.0 (the "License");
00005 // you may not use this file except in compliance with the License.
00006 // You may obtain a copy of the License at
00007 // http://www.apache.org/licenses/LICENSE-2.0
00008 // Unless required by applicable law or agreed to in writing, software
00009 // distributed under the License is distributed on an "AS IS" BASIS,
00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00011 // See the License for the specific language governing permissions and
00012 // limitations under the License.
00013 //
00015 
00016 #ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H__
00017 #define TESSERACT_TRAINING_TRAININGSAMPLESET_H__
00018 
00019 #include "bitvector.h"
00020 #include "genericvector.h"
00021 #include "indexmapbidi.h"
00022 #include "matrix.h"
00023 #include "shapetable.h"
00024 #include "trainingsample.h"
00025 
00026 class UNICHARSET;
00027 
00028 namespace tesseract {
00029 
00030 struct FontInfo;
00031 class FontInfoTable;
00032 class IntFeatureMap;
00033 class IntFeatureSpace;
00034 class TrainingSample;
00035 struct UnicharAndFonts;
00036 
00037 // Collection of TrainingSample used for training or testing a classifier.
00038 // Provides several useful methods to operate on the collection as a whole,
00039 // including outlier detection and deletion, providing access by font and
00040 // class, finding the canonical sample, finding the "cloud" features (OR of
00041 // all features in all samples), replication of samples, caching of distance
00042 // metrics.
00043 class TrainingSampleSet {
00044  public:
        // Constructs a sample set that refers to the given fontinfo_table.
        // NOTE(review): fontinfo_table_ is a reference member, so the table
        // presumably has to outlive this object - confirm against the constructor.
00045   explicit TrainingSampleSet(const FontInfoTable& fontinfo_table);
00046   ~TrainingSampleSet();
00047 
00048   // Writes to the given file. Returns false in case of error.
00049   bool Serialize(FILE* fp) const;
00050   // Reads from the given file. Returns false in case of error.
00051   // If swap is true, assumes a big/little-endian swap is needed.
00052   bool DeSerialize(bool swap, FILE* fp);
00053 
00054   // Accessors
        // Number of entries currently held in samples_.
00055   int num_samples() const {
00056     return samples_.size();
00057   }
        // Number of samples before replication/randomization.
00058   int num_raw_samples() const {
00059     return num_raw_samples_;
00060   }
        // Size of the sparse side of font_id_map_, i.e. the space of real font_ids.
00061   int NumFonts() const {
00062     return font_id_map_.SparseSize();
00063   }
        // The character set being trained for.
00064   const UNICHARSET& unicharset() const {
00065     return unicharset_;
00066   }
        // Character set size to which the font/class array refers. This is the
        // stored unicharset_size_, not a live query of unicharset_.
00067   int charsetsize() const {
00068     return unicharset_size_;
00069   }
        // The font table reference supplied at construction.
00070   const FontInfoTable& fontinfo_table() const {
00071     return fontinfo_table_;
00072   }
00073 
00074   // Loads an initial unicharset, or sets one up if the file cannot be read.
00075   void LoadUnicharset(const char* filename);
00076 
00077   // Adds a character sample to this sample set.
00078   // If the unichar is not already in the local unicharset, it is added.
00079   // Returns the unichar_id of the added sample, from the local unicharset.
00080   int AddSample(const char* unichar, TrainingSample* sample);
00081   // Adds a character sample to this sample set with the given unichar_id,
00082   // which must correspond to the local unicharset (in this).
00083   void AddSample(int unichar_id, TrainingSample* sample);
00084 
00085   // Returns the number of samples for the given font,class pair.
00086   // If randomize is true, returns the number of samples accessible
00087   // with randomizing on. (Increases the number of samples if small.)
00088   // OrganizeByFontAndClass must have been already called.
00089   int NumClassSamples(int font_id, int class_id, bool randomize) const;
00090 
00091   // Gets a sample by its index.
00092   const TrainingSample* GetSample(int index) const;
00093 
00094   // Gets a sample by its font, class, index.
00095   // OrganizeByFontAndClass must have been already called.
00096   const TrainingSample* GetSample(int font_id, int class_id, int index) const;
00097 
00098   // Gets a mutable sample by its font, class, index. Does not randomize.
00099   // OrganizeByFontAndClass must have been already called.
00100   TrainingSample* MutableSample(int font_id, int class_id, int index);
00101 
00102   // Returns a string debug representation of the given sample:
00103   // font, unichar_str, bounding box, page.
00104   STRING SampleToString(const TrainingSample& sample) const;
00105 
00106   // Gets the combined set of features used by all the samples of the given
00107   // font/class combination.
00108   const BitVector& GetCloudFeatures(int font_id, int class_id) const;
00109   // Gets the indexed features of the canonical sample of the given
00110   // font/class combination.
00111   const GenericVector<int>& GetCanonicalFeatures(int font_id,
00112                                                  int class_id) const;
00113 
00114   // Returns the distance between the given UnicharAndFonts pair.
00115   // If matched_fonts is true, only matching fonts are considered, unless that
00116   // yields the empty set.
00117   // OrganizeByFontAndClass must have been already called.
00118   float UnicharDistance(const UnicharAndFonts& uf1, const UnicharAndFonts& uf2,
00119                         bool matched_fonts, const IntFeatureMap& feature_map);
00120 
00121   // Returns the distance between the given pair of font/class pairs.
00122   // Finds in cache or computes and caches.
00123   // OrganizeByFontAndClass must have been already called.
00124   float ClusterDistance(int font_id1, int class_id1,
00125                         int font_id2, int class_id2,
00126                         const IntFeatureMap& feature_map);
00127 
00128   // Computes the distance between the given pair of font/class pairs.
00129   float ComputeClusterDistance(int font_id1, int class_id1,
00130                                int font_id2, int class_id2,
00131                                const IntFeatureMap& feature_map) const;
00132 
00133   // Returns the number of canonical features of font/class 2 for which
00134   // neither the feature nor any of its near neighbors occurs in the cloud
00135   // of font/class 1. Each such feature is a reliable separation between
00136   // the classes, ASSUMING that the canonical sample is sufficiently
00137   // representative that every sample has a feature near that particular
00138   // feature. To check that this is so on the fly would be prohibitively
00139   // expensive, but it might be possible to pre-qualify the canonical features
00140   // to include only those for which this assumption is true.
00141   // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
00142   // first, or the results will be nonsense.
00143   int ReliablySeparable(int font_id1, int class_id1,
00144                         int font_id2, int class_id2,
00145                         const IntFeatureMap& feature_map,
00146                         bool thorough) const;
00147 
00148 
00149   // Returns the total index of the requested sample.
00150   // OrganizeByFontAndClass must have been already called.
00151   int GlobalSampleIndex(int font_id, int class_id, int index) const;
00152 
00153   // Gets the canonical sample for the given font, class pair.
00154   // ComputeCanonicalSamples must have been called first.
00155   const TrainingSample* GetCanonicalSample(int font_id, int class_id) const;
00156   // Gets the max distance for the given canonical sample.
00157   // ComputeCanonicalSamples must have been called first.
00158   float GetCanonicalDist(int font_id, int class_id) const;
00159 
00160   // Returns a mutable pointer to the sample with the given index.
00161   TrainingSample* mutable_sample(int index) {
00162     return samples_[index];
00163   }
00164   // Gets ownership of the sample with the given index, removing it from this.
        // The slot in samples_ is set to NULL rather than erased, so num_samples()
        // and the indices of the other samples are unchanged by this call.
00165   TrainingSample* extract_sample(int index) {
00166     TrainingSample* sample = samples_[index];
00167     samples_[index] = NULL;
00168     return sample;
00169   }
00170 
00171   // Generates indexed features for all samples with the supplied feature_space.
00172   void IndexFeatures(const IntFeatureSpace& feature_space);
00173 
00174   // Delete outlier samples with few features that are shared with others.
00175   // IndexFeatures must have been called already.
00176   void DeleteOutliers(const IntFeatureSpace& feature_space, bool debug);
00177 
00178   // Marks the given sample for deletion.
00179   // Deletion is actually completed by DeleteDeadSamples.
00180   void KillSample(TrainingSample* sample);
00181 
00182   // Deletes all samples with a negative sample index marked by KillSample.
00183   // Must be called before OrganizeByFontAndClass, and OrganizeByFontAndClass
00184   // must be called after as the samples have been renumbered.
00185   void DeleteDeadSamples();
00186 
00187   // Callback function returns true if the given sample is to be deleted, due
00188   // to having a negative classid.
        // NOTE(review): DeleteDeadSamples above speaks of a negative sample index,
        // this comment of a negative classid - confirm which field marks a dead
        // sample.
00189   bool DeleteableSample(const TrainingSample* sample);
00190 
00191   // Construct an array to access the samples by font,class pair.
00192   void OrganizeByFontAndClass();
00193 
00194   // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
00195   // index for the font_class_array_.
00196   void SetupFontIdMap();
00197 
00198   // Finds the sample for each font, class pair that has the least maximum
00199   // distance to all the other samples of the same font, class.
00200   // OrganizeByFontAndClass must have been already called.
00201   void ComputeCanonicalSamples(const IntFeatureMap& map, bool debug);
00202 
00203   // Replicates the samples to a minimum frequency defined by
00204   // 2 * kSampleRandomSize, or for larger counts duplicates all samples.
00205   // After replication, the replicated samples are perturbed slightly, but
00206   // in a predictable and repeatable way.
00207   // Use after OrganizeByFontAndClass().
00208   void ReplicateAndRandomizeSamples();
00209 
00210   // Caches the indexed features of the canonical samples.
00211   // ComputeCanonicalSamples must have been already called.
00212   void ComputeCanonicalFeatures();
00213   // Computes the combined set of features used by all the samples of each
00214   // font/class combination. Use after ReplicateAndRandomizeSamples.
00215   void ComputeCloudFeatures(int feature_space_size);
00216 
00217   // Adds all fonts of the given class to the shape.
00218   void AddAllFontsForClass(int class_id, Shape* shape) const;
00219 
00220   // Display the samples with the given indexed feature that also match
00221   // the given shape.
00222   void DisplaySamplesWithFeature(int f_index, const Shape& shape,
00223                                  const IntFeatureSpace& feature_space,
00224                                  ScrollView::Color color,
00225                                  ScrollView* window) const;
00226 
00227  private:
00228   // Struct to store a triplet of unichar, font, distance in the distance cache.
00229   struct FontClassDistance {
00230     int unichar_id;
00231     int font_id;  // Real font id.
00232     float distance;
00233   };
00234   // Simple struct to store information related to each font/class combination.
00235   struct FontClassInfo {
00236     FontClassInfo();
00237 
00238     // Writes to the given file. Returns false in case of error.
00239     bool Serialize(FILE* fp) const;
00240     // Reads from the given file. Returns false in case of error.
00241     // If swap is true, assumes a big/little-endian swap is needed.
00242     bool DeSerialize(bool swap, FILE* fp);
00243 
00244     // Number of raw samples.
00245     inT32 num_raw_samples;
00246     // Index of the canonical sample.
00247     inT32 canonical_sample;
00248     // Max distance of the canonical sample from any other.
00249     float canonical_dist;
00250     // Sample indices for the samples, including replicated.
00251     GenericVector<inT32> samples;
00252 
00253     // Non-serialized cache data.
00254     // Indexed features of the canonical sample.
00255     GenericVector<int> canonical_features;
00256     // The mapped features of all the samples.
00257     BitVector cloud_features;
00258 
00259     // Caches for ClusterDistance.
00260     // Caches for other fonts but matching this unichar. -1 indicates not set.
00261     // Indexed by compact font index from font_id_map_.
00262     GenericVector<float> font_distance_cache;
00263     // Caches for other unichars but matching this font. -1 indicates not set.
00264     GenericVector<float> unichar_distance_cache;
00265     // Cache for the rest (non matching font and unichar.)
00266     // A cache of distances computed by ReliablySeparable.
          // NOTE(review): ReliablySeparable is const and returns an int count, while
          // this cache stores float distances under "Caches for ClusterDistance";
          // the line above looks stale - presumably ClusterDistance fills it. Confirm.
00267     GenericVector<FontClassDistance> distance_cache;
00268   };
00269 
        // All samples in the set, addressed by global sample index. Entries can be
        // NULL after extract_sample has been called on them.
00270   PointerVector<TrainingSample> samples_;
00271   // Number of samples before replication/randomization.
00272   int num_raw_samples_;
00273   // Character set we are training for.
00274   UNICHARSET unicharset_;
00275   // Character set size to which the 2-d arrays below refer.
00276   int unicharset_size_;
00277   // Map to allow the font_class_array_ below to be compact.
00278   // The sparse space is the real font_id, used in samples_ .
00279   // The compact space is an index to font_class_array_
00280   IndexMapBiDi font_id_map_;
00281   // A 2-d array of FontClassInfo holding information related to each
00282   // (font_id, class_id) pair.
00283   GENERIC_2D_ARRAY<FontClassInfo>* font_class_array_;
00284 
00285   // Reference to the fontinfo_table_ in MasterTrainer. Provides names
00286   // for font_ids in the samples. Not serialized!
00287   const FontInfoTable& fontinfo_table_;
00288 };
00289 
00290 }  // namespace tesseract.
00291 
00292 
00293 #endif  // TESSERACT_TRAINING_TRAININGSAMPLESET_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines