tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/classify/trainingsampleset.cpp
Go to the documentation of this file.
00001 // Copyright 2010 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00003 //
00004 // Licensed under the Apache License, Version 2.0 (the "License");
00005 // you may not use this file except in compliance with the License.
00006 // You may obtain a copy of the License at
00007 // http://www.apache.org/licenses/LICENSE-2.0
00008 // Unless required by applicable law or agreed to in writing, software
00009 // distributed under the License is distributed on an "AS IS" BASIS,
00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00011 // See the License for the specific language governing permissions and
00012 // limitations under the License.
00013 //
00015 
00016 #include "trainingsampleset.h"
00017 #include "allheaders.h"
00018 #include "boxread.h"
00019 #include "fontinfo.h"
00020 #include "indexmapbidi.h"
00021 #include "intfeaturedist.h"
00022 #include "intfeaturemap.h"
00023 #include "intfeaturespace.h"
00024 #include "shapetable.h"
00025 #include "trainingsample.h"
00026 #include "unicity_table.h"
00027 
00028 namespace tesseract {
00029 
// When non-negative, canonical sample computation is restricted to this
// single class id (e.g. 37); a negative value disables the restriction.
static const int kTestChar = -1;
// Upper bound on num_fonts1 * num_fonts2 for which UnicharDistance evaluates
// every font pair instead of subsampling.
static const int kSquareLimit = 25;
// Prime step sizes used when subsampling font pairs.
static const int kPrime1 = 17;
static const int kPrime2 = 13;
// Font/class pairs with fewer samples than this are never searched for
// outliers.
static const int kMinOutlierSamples = 5;
00038 
00039 TrainingSampleSet::FontClassInfo::FontClassInfo()
00040   : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {
00041 }
00042 
00043 // Writes to the given file. Returns false in case of error.
00044 bool TrainingSampleSet::FontClassInfo::Serialize(FILE* fp) const {
00045   if (fwrite(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
00046     return false;
00047   if (fwrite(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
00048     return false;
00049   if (fwrite(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
00050   if (!samples.Serialize(fp)) return false;
00051   return true;
00052 }
00053 // Reads from the given file. Returns false in case of error.
00054 // If swap is true, assumes a big/little-endian swap is needed.
00055 bool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE* fp) {
00056   if (fread(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
00057     return false;
00058   if (fread(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
00059     return false;
00060   if (fread(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
00061   if (!samples.DeSerialize(swap, fp)) return false;
00062   if (swap) {
00063     ReverseN(&num_raw_samples, sizeof(num_raw_samples));
00064     ReverseN(&canonical_sample, sizeof(canonical_sample));
00065     ReverseN(&canonical_dist, sizeof(canonical_dist));
00066   }
00067   return true;
00068 }
00069 
00070 TrainingSampleSet::TrainingSampleSet(const FontInfoTable& font_table)
00071   : num_raw_samples_(0), unicharset_size_(0),
00072     font_class_array_(NULL), fontinfo_table_(font_table) {
00073 }
00074 
00075 TrainingSampleSet::~TrainingSampleSet() {
00076   delete font_class_array_;
00077 }
00078 
00079 // Writes to the given file. Returns false in case of error.
00080 bool TrainingSampleSet::Serialize(FILE* fp) const {
00081   if (!samples_.Serialize(fp)) return false;
00082   if (!unicharset_.save_to_file(fp)) return false;
00083   if (!font_id_map_.Serialize(fp)) return false;
00084   inT8 not_null = font_class_array_ != NULL;
00085   if (fwrite(&not_null, sizeof(not_null), 1, fp) != 1) return false;
00086   if (not_null) {
00087     if (!font_class_array_->SerializeClasses(fp)) return false;
00088   }
00089   return true;
00090 }
00091 
00092 // Reads from the given file. Returns false in case of error.
00093 // If swap is true, assumes a big/little-endian swap is needed.
00094 bool TrainingSampleSet::DeSerialize(bool swap, FILE* fp) {
00095   if (!samples_.DeSerialize(swap, fp)) return false;
00096   num_raw_samples_ = samples_.size();
00097   if (!unicharset_.load_from_file(fp)) return false;
00098   if (!font_id_map_.DeSerialize(swap, fp)) return false;
00099   if (font_class_array_ != NULL) {
00100     delete font_class_array_;
00101     font_class_array_ = NULL;
00102   }
00103   inT8 not_null;
00104   if (fread(&not_null, sizeof(not_null), 1, fp) != 1) return false;
00105   if (not_null) {
00106     FontClassInfo empty;
00107     font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo >(1, 1 , empty);
00108     if (!font_class_array_->DeSerializeClasses(swap, fp)) return false;
00109   }
00110   unicharset_size_ = unicharset_.size();
00111   return true;
00112 }
00113 
00114 // Load an initial unicharset, or set one up if the file cannot be read.
00115 void TrainingSampleSet::LoadUnicharset(const char* filename) {
00116   if (!unicharset_.load_from_file(filename)) {
00117     tprintf("Failed to load unicharset from file %s\n"
00118             "Building unicharset from scratch...\n",
00119             filename);
00120     unicharset_.clear();
00121     // Add special characters as they were removed by the clear.
00122     UNICHARSET empty;
00123     unicharset_.AppendOtherUnicharset(empty);
00124   }
00125   unicharset_size_ = unicharset_.size();
00126 }
00127 
00128 // Adds a character sample to this sample set.
00129 // If the unichar is not already in the local unicharset, it is added.
00130 // Returns the unichar_id of the added sample, from the local unicharset.
00131 int TrainingSampleSet::AddSample(const char* unichar, TrainingSample* sample) {
00132   if (!unicharset_.contains_unichar(unichar)) {
00133     unicharset_.unichar_insert(unichar);
00134     if (unicharset_.size() > MAX_NUM_CLASSES) {
00135       tprintf("Error: Size of unicharset in TrainingSampleSet::AddSample is "
00136               "greater than MAX_NUM_CLASSES\n");
00137       return -1;
00138     }
00139   }
00140   UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar);
00141   AddSample(char_id, sample);
00142   return char_id;
00143 }
00144 
00145 // Adds a character sample to this sample set with the given unichar_id,
00146 // which must correspond to the local unicharset (in this).
00147 void TrainingSampleSet::AddSample(int unichar_id, TrainingSample* sample) {
00148   sample->set_class_id(unichar_id);
00149   samples_.push_back(sample);
00150   num_raw_samples_ = samples_.size();
00151   unicharset_size_ = unicharset_.size();
00152 }
00153 
00154 // Returns the number of samples for the given font,class pair.
00155 // If randomize is true, returns the number of samples accessible
00156 // with randomizing on. (Increases the number of samples if small.)
00157 // OrganizeByFontAndClass must have been already called.
00158 int TrainingSampleSet::NumClassSamples(int font_id, int class_id,
00159                                        bool randomize) const {
00160   ASSERT_HOST(font_class_array_ != NULL);
00161   if (font_id < 0 || class_id < 0 ||
00162       font_id >= font_id_map_.SparseSize() || class_id >= unicharset_size_) {
00163     // There are no samples because the font or class doesn't exist.
00164     return 0;
00165   }
00166   int font_index = font_id_map_.SparseToCompact(font_id);
00167   if (font_index < 0)
00168     return 0;  // The font has no samples.
00169   if (randomize)
00170     return (*font_class_array_)(font_index, class_id).samples.size();
00171   else
00172     return (*font_class_array_)(font_index, class_id).num_raw_samples;
00173 }
00174 
00175 // Gets a sample by its index.
00176 const TrainingSample* TrainingSampleSet::GetSample(int index) const {
00177   return samples_[index];
00178 }
00179 
00180 // Gets a sample by its font, class, index.
00181 // OrganizeByFontAndClass must have been already called.
00182 const TrainingSample* TrainingSampleSet::GetSample(int font_id, int class_id,
00183                                                    int index) const {
00184   ASSERT_HOST(font_class_array_ != NULL);
00185   int font_index = font_id_map_.SparseToCompact(font_id);
00186   if (font_index < 0) return NULL;
00187   int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
00188   return samples_[sample_index];
00189 }
00190 
00191 // Get a sample by its font, class, index. Does not randomize.
00192 // OrganizeByFontAndClass must have been already called.
00193 TrainingSample* TrainingSampleSet::MutableSample(int font_id, int class_id,
00194                                                  int index) {
00195   ASSERT_HOST(font_class_array_ != NULL);
00196   int font_index = font_id_map_.SparseToCompact(font_id);
00197   if (font_index < 0) return NULL;
00198   int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
00199   return samples_[sample_index];
00200 }
00201 
00202 // Returns a string debug representation of the given sample:
00203 // font, unichar_str, bounding box, page.
00204 STRING TrainingSampleSet::SampleToString(const TrainingSample& sample) const {
00205   STRING boxfile_str;
00206   MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()),
00207                  sample.bounding_box(), sample.page_num(), &boxfile_str);
00208   return STRING(fontinfo_table_.get(sample.font_id()).name) + " " + boxfile_str;
00209 }
00210 
00211 // Gets the combined set of features used by all the samples of the given
00212 // font/class combination.
00213 const BitVector& TrainingSampleSet::GetCloudFeatures(
00214     int font_id, int class_id) const {
00215   int font_index = font_id_map_.SparseToCompact(font_id);
00216   ASSERT_HOST(font_index >= 0);
00217   return (*font_class_array_)(font_index, class_id).cloud_features;
00218 }
00219 // Gets the indexed features of the canonical sample of the given
00220 // font/class combination.
00221 const GenericVector<int>& TrainingSampleSet::GetCanonicalFeatures(
00222     int font_id, int class_id) const {
00223   int font_index = font_id_map_.SparseToCompact(font_id);
00224   ASSERT_HOST(font_index >= 0);
00225   return (*font_class_array_)(font_index, class_id).canonical_features;
00226 }
00227 
00228 // Returns the distance between the given UniCharAndFonts pair.
00229 // If matched_fonts, only matching fonts, are considered, unless that yields
00230 // the empty set.
00231 // OrganizeByFontAndClass must have been already called.
00232 float TrainingSampleSet::UnicharDistance(const UnicharAndFonts& uf1,
00233                                          const UnicharAndFonts& uf2,
00234                                          bool matched_fonts,
00235                                          const IntFeatureMap& feature_map) {
00236   int num_fonts1 = uf1.font_ids.size();
00237   int c1 = uf1.unichar_id;
00238   int num_fonts2 = uf2.font_ids.size();
00239   int c2 = uf2.unichar_id;
00240   double dist_sum = 0.0;
00241   int dist_count = 0;
00242   bool debug = false;
00243   if (matched_fonts) {
00244     // Compute distances only where fonts match.
00245     for (int i = 0; i < num_fonts1; ++i) {
00246       int f1 = uf1.font_ids[i];
00247       for (int j = 0; j < num_fonts2; ++j) {
00248         int f2 = uf2.font_ids[j];
00249         if (f1 == f2) {
00250           dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
00251           ++dist_count;
00252         }
00253       }
00254     }
00255   } else if (num_fonts1 * num_fonts2 <= kSquareLimit) {
00256     // Small enough sets to compute all the distances.
00257     for (int i = 0; i < num_fonts1; ++i) {
00258       int f1 = uf1.font_ids[i];
00259       for (int j = 0; j < num_fonts2; ++j) {
00260         int f2 = uf2.font_ids[j];
00261         dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
00262         if (debug) {
00263             tprintf("Cluster dist %d %d %d %d = %g\n",
00264                     f1, c1, f2, c2,
00265                     ClusterDistance(f1, c1, f2, c2, feature_map));
00266         }
00267         ++dist_count;
00268       }
00269     }
00270   } else {
00271     // Subsample distances, using the largest set once, and stepping through
00272     // the smaller set so as to ensure that all the pairs are different.
00273     int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2;
00274     int index = 0;
00275     int num_samples = MAX(num_fonts1, num_fonts2);
00276     for (int i = 0; i < num_samples; ++i, index += increment) {
00277       int f1 = uf1.font_ids[i % num_fonts1];
00278       int f2 = uf2.font_ids[index % num_fonts2];
00279       if (debug) {
00280           tprintf("Cluster dist %d %d %d %d = %g\n",
00281                   f1, c1, f2, c2, ClusterDistance(f1, c1, f2, c2, feature_map));
00282       }
00283       dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
00284       ++dist_count;
00285     }
00286   }
00287   if (dist_count == 0) {
00288     if (matched_fonts)
00289       return UnicharDistance(uf1, uf2, false, feature_map);
00290     return 0.0f;
00291   }
00292   return dist_sum / dist_count;
00293 }
00294 
00295 // Returns the distance between the given pair of font/class pairs.
00296 // Finds in cache or computes and caches.
00297 // OrganizeByFontAndClass must have been already called.
00298 float TrainingSampleSet::ClusterDistance(int font_id1, int class_id1,
00299                                          int font_id2, int class_id2,
00300                                          const IntFeatureMap& feature_map) {
00301   ASSERT_HOST(font_class_array_ != NULL);
00302   int font_index1 = font_id_map_.SparseToCompact(font_id1);
00303   int font_index2 = font_id_map_.SparseToCompact(font_id2);
00304   if (font_index1 < 0 || font_index2 < 0)
00305     return 0.0f;
00306   FontClassInfo& fc_info = (*font_class_array_)(font_index1, class_id1);
00307   if (font_id1 == font_id2) {
00308     // Special case cache for speed.
00309     if (fc_info.unichar_distance_cache.size() == 0)
00310       fc_info.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
00311     if (fc_info.unichar_distance_cache[class_id2] < 0) {
00312       // Distance has to be calculated.
00313       float result = ComputeClusterDistance(font_id1, class_id1,
00314                                             font_id2, class_id2,
00315                                             feature_map);
00316       fc_info.unichar_distance_cache[class_id2] = result;
00317       // Copy to the symmetric cache entry.
00318       FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
00319       if (fc_info2.unichar_distance_cache.size() == 0)
00320         fc_info2.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
00321       fc_info2.unichar_distance_cache[class_id1] = result;
00322     }
00323     return fc_info.unichar_distance_cache[class_id2];
00324   } else if (class_id1 == class_id2) {
00325     // Another special-case cache for equal class-id.
00326     if (fc_info.font_distance_cache.size() == 0)
00327       fc_info.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
00328                                                -1.0f);
00329     if (fc_info.font_distance_cache[font_index2] < 0) {
00330       // Distance has to be calculated.
00331       float result = ComputeClusterDistance(font_id1, class_id1,
00332                                             font_id2, class_id2,
00333                                             feature_map);
00334       fc_info.font_distance_cache[font_index2] = result;
00335       // Copy to the symmetric cache entry.
00336       FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
00337       if (fc_info2.font_distance_cache.size() == 0)
00338         fc_info2.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
00339                                                   -1.0f);
00340       fc_info2.font_distance_cache[font_index1] = result;
00341     }
00342     return fc_info.font_distance_cache[font_index2];
00343   }
00344   // Both font and class are different. Linear search for class_id2/font_id2
00345   // in what is a hopefully short list of distances.
00346   int cache_index = 0;
00347   while (cache_index < fc_info.distance_cache.size() &&
00348          (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
00349           fc_info.distance_cache[cache_index].font_id != font_id2))
00350     ++cache_index;
00351   if (cache_index == fc_info.distance_cache.size()) {
00352     // Distance has to be calculated.
00353     float result = ComputeClusterDistance(font_id1, class_id1,
00354                                           font_id2, class_id2,
00355                                           feature_map);
00356     FontClassDistance fc_dist = { class_id2, font_id2, result };
00357     fc_info.distance_cache.push_back(fc_dist);
00358     // Copy to the symmetric cache entry. We know it isn't there already, as
00359     // we always copy to the symmetric entry.
00360     FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
00361     fc_dist.unichar_id = class_id1;
00362     fc_dist.font_id = font_id1;
00363     fc_info2.distance_cache.push_back(fc_dist);
00364   }
00365   return fc_info.distance_cache[cache_index].distance;
00366 }
00367 
00368 // Computes the distance between the given pair of font/class pairs.
00369 float TrainingSampleSet::ComputeClusterDistance(
00370     int font_id1, int class_id1, int font_id2, int class_id2,
00371     const IntFeatureMap& feature_map) const {
00372   int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2,
00373                                feature_map, false);
00374   dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1,
00375                             feature_map, false);
00376   int denominator = GetCanonicalFeatures(font_id1, class_id1).size();
00377   denominator += GetCanonicalFeatures(font_id2, class_id2).size();
00378   return static_cast<float>(dist) / denominator;
00379 }
00380 
00381 // Helper to add a feature and its near neighbors to the good_features.
00382 // levels indicates how many times to compute the offset features of what is
00383 // already there. This is done by iteration rather than recursion.
00384 static void AddNearFeatures(const IntFeatureMap& feature_map, int f, int levels,
00385                             GenericVector<int>* good_features) {
00386   int prev_num_features = 0;
00387   good_features->push_back(f);
00388   int num_features = 1;
00389   for (int level = 0; level < levels; ++level) {
00390     for (int i = prev_num_features; i < num_features; ++i) {
00391       int feature = (*good_features)[i];
00392       for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
00393         if (dir == 0) continue;
00394         int f1 = feature_map.OffsetFeature(feature, dir);
00395         if (f1 >= 0) {
00396           good_features->push_back(f1);
00397         }
00398       }
00399     }
00400     prev_num_features = num_features;
00401     num_features = good_features->size();
00402   }
00403 }
00404 
00405 // Returns the number of canonical features of font/class 2 for which
00406 // neither the feature nor any of its near neighbors occurs in the cloud
00407 // of font/class 1. Each such feature is a reliable separation between
00408 // the classes, ASSUMING that the canonical sample is sufficiently
00409 // representative that every sample has a feature near that particular
00410 // feature. To check that this is so on the fly would be prohibitively
00411 // expensive, but it might be possible to pre-qualify the canonical features
00412 // to include only those for which this assumption is true.
00413 // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
00414 // first, or the results will be nonsense.
00415 int TrainingSampleSet::ReliablySeparable(int font_id1, int class_id1,
00416                                          int font_id2, int class_id2,
00417                                          const IntFeatureMap& feature_map,
00418                                          bool thorough) const {
00419   int result = 0;
00420   const TrainingSample* sample2 = GetCanonicalSample(font_id2, class_id2);
00421   if (sample2 == NULL)
00422     return 0;  // There are no canonical features.
00423   const GenericVector<int>& canonical2 = GetCanonicalFeatures(font_id2,
00424                                                               class_id2);
00425   const BitVector& cloud1 = GetCloudFeatures(font_id1, class_id1);
00426   if (cloud1.size() == 0)
00427     return canonical2.size();  // There are no cloud features.
00428 
00429   // Find a canonical2 feature that is not in cloud1.
00430   for (int f = 0; f < canonical2.size(); ++f) {
00431     int feature = canonical2[f];
00432     if (cloud1[feature])
00433       continue;
00434     // Gather the near neighbours of f.
00435     GenericVector<int> good_features;
00436     AddNearFeatures(feature_map, feature, 1, &good_features);
00437     // Check that none of the good_features are in the cloud.
00438     int i;
00439     for (i = 0; i < good_features.size(); ++i) {
00440       int good_f = good_features[i];
00441       if (cloud1[good_f]) {
00442         break;
00443       }
00444     }
00445     if (i < good_features.size())
00446        continue;  // Found one in the cloud.
00447     ++result;
00448   }
00449   return result;
00450 }
00451 
00452 // Returns the total index of the requested sample.
00453 // OrganizeByFontAndClass must have been already called.
00454 int TrainingSampleSet::GlobalSampleIndex(int font_id, int class_id,
00455                                          int index) const {
00456   ASSERT_HOST(font_class_array_ != NULL);
00457   int font_index = font_id_map_.SparseToCompact(font_id);
00458   if (font_index < 0) return -1;
00459   return (*font_class_array_)(font_index, class_id).samples[index];
00460 }
00461 
00462 // Gets the canonical sample for the given font, class pair.
00463 // ComputeCanonicalSamples must have been called first.
00464 const TrainingSample* TrainingSampleSet::GetCanonicalSample(
00465     int font_id, int class_id) const {
00466   ASSERT_HOST(font_class_array_ != NULL);
00467   int font_index = font_id_map_.SparseToCompact(font_id);
00468   if (font_index < 0) return NULL;
00469   int sample_index = (*font_class_array_)(font_index,
00470                                           class_id).canonical_sample;
00471   return sample_index >= 0 ? samples_[sample_index] : NULL;
00472 }
00473 
00474 // Gets the max distance for the given canonical sample.
00475 // ComputeCanonicalSamples must have been called first.
00476 float TrainingSampleSet::GetCanonicalDist(int font_id, int class_id) const {
00477   ASSERT_HOST(font_class_array_ != NULL);
00478   int font_index = font_id_map_.SparseToCompact(font_id);
00479   if (font_index < 0) return 0.0f;
00480   if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0)
00481     return (*font_class_array_)(font_index, class_id).canonical_dist;
00482   else
00483     return 0.0f;
00484 }
00485 
00486 // Generates indexed features for all samples with the supplied feature_space.
00487 void TrainingSampleSet::IndexFeatures(const IntFeatureSpace& feature_space) {
00488   for (int s = 0; s < samples_.size(); ++s)
00489     samples_[s]->IndexFeatures(feature_space);
00490 }
00491 
00492 // Delete outlier samples with few features that are shared with others.
00493 // IndexFeatures must have been called already.
00494 void TrainingSampleSet::DeleteOutliers(const IntFeatureSpace& feature_space,
00495                                        bool debug) {
00496   if (font_class_array_ == NULL)
00497     OrganizeByFontAndClass();
00498   Pixa* pixa = NULL;
00499   if (debug)
00500     pixa = pixaCreate(0);
00501   GenericVector<int> feature_counts;
00502   int fs_size = feature_space.Size();
00503   int font_size = font_id_map_.CompactSize();
00504   for (int font_index = 0; font_index < font_size; ++font_index) {
00505     for (int c = 0; c < unicharset_size_; ++c) {
00506       // Create a histogram of the features used by all samples of this
00507       // font/class combination.
00508       feature_counts.init_to_size(fs_size, 0);
00509       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
00510       int sample_count = fcinfo.samples.size();
00511       if (sample_count < kMinOutlierSamples)
00512         continue;
00513       for (int i = 0; i < sample_count; ++i) {
00514         int s = fcinfo.samples[i];
00515         const GenericVector<int>& features = samples_[s]->indexed_features();
00516         for (int f = 0; f < features.size(); ++f) {
00517           ++feature_counts[features[f]];
00518         }
00519       }
00520       for (int i = 0; i < sample_count; ++i) {
00521         int s = fcinfo.samples[i];
00522         const TrainingSample& sample = *samples_[s];
00523         const GenericVector<int>& features = sample.indexed_features();
00524         // A feature that has a histogram count of 1 is only used by this
00525         // sample, making it 'bad'. All others are 'good'.
00526         int good_features = 0;
00527         int bad_features = 0;
00528         for (int f = 0; f < features.size(); ++f) {
00529           if (feature_counts[features[f]] > 1)
00530             ++good_features;
00531           else
00532             ++bad_features;
00533         }
00534         // If more than 1/3 features are bad, then this is an outlier.
00535         if (bad_features * 2 > good_features) {
00536           tprintf("Deleting outlier sample of %s, %d good, %d bad\n",
00537                   SampleToString(sample).string(),
00538                   good_features, bad_features);
00539           if (debug) {
00540             pixaAddPix(pixa, sample.RenderToPix(&unicharset_), L_INSERT);
00541             // Add the previous sample as well, so it is easier to see in
00542             // the output what is wrong with this sample.
00543             int t;
00544             if (i == 0)
00545               t = fcinfo.samples[1];
00546             else
00547               t = fcinfo.samples[i - 1];
00548             const TrainingSample &csample = *samples_[t];
00549             pixaAddPix(pixa, csample.RenderToPix(&unicharset_), L_INSERT);
00550           }
00551           // Mark the sample for deletion.
00552           KillSample(samples_[s]);
00553         }
00554       }
00555     }
00556   }
00557   // Truly delete all bad samples and renumber everything.
00558   DeleteDeadSamples();
00559   if (pixa != NULL) {
00560     Pix* pix = pixaDisplayTiledInRows(pixa, 1, 2600, 1.0, 0, 10, 10);
00561     pixaDestroy(&pixa);
00562     pixWrite("outliers.png", pix, IFF_PNG);
00563     pixDestroy(&pix);
00564   }
00565 }
00566 
00567 // Marks the given sample index for deletion.
00568 // Deletion is actually completed by DeleteDeadSamples.
00569 void TrainingSampleSet::KillSample(TrainingSample* sample) {
00570   sample->set_sample_index(-1);
00571 }
00572 
00573 // Deletes all samples with zero features marked by KillSample.
00574 void TrainingSampleSet::DeleteDeadSamples() {
00575   samples_.compact(
00576       NewPermanentTessCallback(this, &TrainingSampleSet::DeleteableSample));
00577   num_raw_samples_ = samples_.size();
00578   // Samples must be re-organized now we have deleted a few.
00579 }
00580 
00581 // Callback function returns true if the given sample is to be deleted, due
00582 // to having a negative classid.
00583 bool TrainingSampleSet::DeleteableSample(const TrainingSample* sample) {
00584   return sample == NULL || sample->class_id() < 0;
00585 }
00586 
00587 static Pix* DebugSample(const UNICHARSET& unicharset,
00588                         TrainingSample* sample) {
00589   tprintf("\nOriginal features:\n");
00590   for (int i = 0; i < sample->num_features(); ++i) {
00591     sample->features()[i].print();
00592   }
00593   if (sample->features_are_mapped()) {
00594     tprintf("\nMapped features:\n");
00595     for (int i = 0; i < sample->mapped_features().size(); ++i) {
00596       tprintf("%d ", sample->mapped_features()[i]);
00597     }
00598     tprintf("\n");
00599   }
00600   return sample->RenderToPix(&unicharset);
00601 }
00602 
00603 // Construct an array to access the samples by font,class pair.
00604 void TrainingSampleSet::OrganizeByFontAndClass() {
00605   // Font indexes are sparse, so we used a map to compact them, so we can
00606   // have an efficient 2-d array of fonts and character classes.
00607   SetupFontIdMap();
00608   int compact_font_size = font_id_map_.CompactSize();
00609   // Get a 2-d array of generic vectors.
00610   if (font_class_array_ != NULL)
00611     delete font_class_array_;
00612   FontClassInfo empty;
00613   font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(
00614       compact_font_size, unicharset_size_, empty);
00615   for (int s = 0; s < samples_.size(); ++s) {
00616     int font_id = samples_[s]->font_id();
00617     int class_id = samples_[s]->class_id();
00618     if (font_id < 0 || font_id >= font_id_map_.SparseSize()) {
00619       tprintf("Font id = %d/%d, class id = %d/%d on sample %d\n",
00620               font_id, font_id_map_.SparseSize(), class_id, unicharset_size_,
00621               s);
00622     }
00623     ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize());
00624     ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
00625     int font_index = font_id_map_.SparseToCompact(font_id);
00626     (*font_class_array_)(font_index, class_id).samples.push_back(s);
00627   }
00628   // Set the num_raw_samples member of the FontClassInfo, to set the boundary
00629   // between the raw samples and the replicated ones.
00630   for (int f = 0; f < compact_font_size; ++f) {
00631     for (int c = 0; c < unicharset_size_; ++c)
00632       (*font_class_array_)(f, c).num_raw_samples =
00633           (*font_class_array_)(f, c).samples.size();
00634   }
00635   // This is the global number of samples and also marks the boundary between
00636   // real and replicated samples.
00637   num_raw_samples_ = samples_.size();
00638 }
00639 
00640 // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
00641 // index for the font_class_array_.
00642 void TrainingSampleSet::SetupFontIdMap() {
00643   // Number of samples for each font_id.
00644   GenericVector<int> font_counts;
00645   for (int s = 0; s < samples_.size(); ++s) {
00646     int font_id = samples_[s]->font_id();
00647     while (font_id >= font_counts.size())
00648       font_counts.push_back(0);
00649     ++font_counts[font_id];
00650   }
00651   font_id_map_.Init(font_counts.size(), false);
00652   for (int f = 0; f < font_counts.size(); ++f) {
00653     font_id_map_.SetMap(f, font_counts[f] > 0);
00654   }
00655   font_id_map_.Setup();
00656 }
00657 
00658 
00659 // Finds the sample for each font, class pair that has least maximum
00660 // distance to all the other samples of the same font, class.
00661 // OrganizeByFontAndClass must have been already called.
00662 void TrainingSampleSet::ComputeCanonicalSamples(const IntFeatureMap& map,
00663                                                 bool debug) {
00664   ASSERT_HOST(font_class_array_ != NULL);
00665   IntFeatureDist f_table;
00666   if (debug) tprintf("feature table size %d\n", map.sparse_size());
00667   f_table.Init(&map);
00668   int worst_s1 = 0;
00669   int worst_s2 = 0;
00670   double global_worst_dist = 0.0;
00671   // Compute distances independently for each font and char index.
00672   int font_size = font_id_map_.CompactSize();
00673   for (int font_index = 0; font_index < font_size; ++font_index) {
00674     int font_id = font_id_map_.CompactToSparse(font_index);
00675     for (int c = 0; c < unicharset_size_; ++c) {
00676       int samples_found = 0;
00677       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
00678       if (fcinfo.samples.size() == 0 ||
00679           (kTestChar >= 0 && c != kTestChar)) {
00680         fcinfo.canonical_sample = -1;
00681         fcinfo.canonical_dist = 0.0f;
00682         if (debug) tprintf("Skipping class %d\n", c);
00683         continue;
00684       }
00685       // The canonical sample will be the one with the min_max_dist, which
00686       // is the sample with the lowest maximum distance to all other samples.
00687       double min_max_dist = 2.0;
00688       // We keep track of the farthest apart pair (max_s1, max_s2) which
00689       // are max_max_dist apart, so we can see how bad the variability is.
00690       double max_max_dist = 0.0;
00691       int max_s1 = 0;
00692       int max_s2 = 0;
00693       fcinfo.canonical_sample = fcinfo.samples[0];
00694       fcinfo.canonical_dist = 0.0f;
00695       for (int i = 0; i < fcinfo.samples.size(); ++i) {
00696         int s1 = fcinfo.samples[i];
00697         const GenericVector<int>& features1 = samples_[s1]->indexed_features();
00698         f_table.Set(features1, features1.size(), true);
00699         double max_dist = 0.0;
00700         // Run the full squared-order search for similar samples. It is still
00701         // reasonably fast because f_table.FeatureDistance is fast, but we
00702         // may have to reconsider if we start playing with too many samples
00703         // of a single char/font.
00704         for (int j = 0; j < fcinfo.samples.size(); ++j) {
00705           int s2 = fcinfo.samples[j];
00706           if (samples_[s2]->class_id() != c  ||
00707               samples_[s2]->font_id() != font_id ||
00708               s2 == s1)
00709             continue;
00710           GenericVector<int> features2 = samples_[s2]->indexed_features();
00711           double dist = f_table.FeatureDistance(features2);
00712           if (dist > max_dist) {
00713             max_dist = dist;
00714             if (dist > max_max_dist) {
00715               max_s1 = s1;
00716               max_s2 = s2;
00717             }
00718           }
00719         }
00720         // Using Set(..., false) is far faster than re initializing, due to
00721         // the sparseness of the feature space.
00722         f_table.Set(features1, features1.size(), false);
00723         samples_[s1]->set_max_dist(max_dist);
00724         ++samples_found;
00725         if (max_dist < min_max_dist) {
00726           fcinfo.canonical_sample = s1;
00727           fcinfo.canonical_dist = max_dist;
00728         }
00729         UpdateRange(max_dist, &min_max_dist, &max_max_dist);
00730       }
00731       if (max_max_dist > global_worst_dist) {
00732         // Keep a record of the worst pair over all characters/fonts too.
00733         global_worst_dist = max_max_dist;
00734         worst_s1 = max_s1;
00735         worst_s2 = max_s2;
00736       }
00737       if (debug) {
00738         tprintf("Found %d samples of class %d=%s, font %d, "
00739                 "dist range [%g, %g], worst pair= %s, %s\n",
00740                 samples_found, c, unicharset_.debug_str(c).string(),
00741                 font_index, min_max_dist, max_max_dist,
00742                 SampleToString(*samples_[max_s1]).string(),
00743                 SampleToString(*samples_[max_s2]).string());
00744       }
00745     }
00746   }
00747   if (debug) {
00748     tprintf("Global worst dist = %g, between sample %d and %d\n",
00749             global_worst_dist, worst_s1, worst_s2);
00750     Pix* pix1 = DebugSample(unicharset_, samples_[worst_s1]);
00751     Pix* pix2 = DebugSample(unicharset_, samples_[worst_s2]);
00752     pixOr(pix1, pix1, pix2);
00753     pixWrite("worstpair.png", pix1, IFF_PNG);
00754     pixDestroy(&pix1);
00755     pixDestroy(&pix2);
00756   }
00757 }
00758 
00759 // Replicates the samples to a minimum frequency defined by
00760 // 2 * kSampleRandomSize, or for larger counts duplicates all samples.
00761 // After replication, the replicated samples are perturbed slightly, but
00762 // in a predictable and repeatable way.
00763 // Use after OrganizeByFontAndClass().
00764 void TrainingSampleSet::ReplicateAndRandomizeSamples() {
00765   ASSERT_HOST(font_class_array_ != NULL);
00766   int font_size = font_id_map_.CompactSize();
00767   for (int font_index = 0; font_index < font_size; ++font_index) {
00768     for (int c = 0; c < unicharset_size_; ++c) {
00769       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
00770       int sample_count = fcinfo.samples.size();
00771       int min_samples = 2 * MAX(kSampleRandomSize, sample_count);
00772       if (sample_count > 0 && sample_count < min_samples) {
00773         int base_count = sample_count;
00774         for (int base_index = 0; sample_count < min_samples; ++sample_count) {
00775           int src_index = fcinfo.samples[base_index++];
00776           if (base_index >= base_count) base_index = 0;
00777           TrainingSample* sample = samples_[src_index]->RandomizedCopy(
00778               sample_count % kSampleRandomSize);
00779           int sample_index = samples_.size();
00780           sample->set_sample_index(sample_index);
00781           samples_.push_back(sample);
00782           fcinfo.samples.push_back(sample_index);
00783         }
00784       }
00785     }
00786   }
00787 }
00788 
00789 // Caches the indexed features of the canonical samples.
00790 // ComputeCanonicalSamples must have been already called.
00791 // TODO(rays) see note on ReliablySeparable and try restricting the
00792 // canonical features to those that truly represent all samples.
00793 void TrainingSampleSet::ComputeCanonicalFeatures() {
00794   ASSERT_HOST(font_class_array_ != NULL);
00795   int font_size = font_id_map_.CompactSize();
00796   for (int font_index = 0; font_index < font_size; ++font_index) {
00797     int font_id = font_id_map_.CompactToSparse(font_index);
00798     for (int c = 0; c < unicharset_size_; ++c) {
00799       int num_samples = NumClassSamples(font_id, c, false);
00800       if (num_samples == 0)
00801         continue;
00802       const TrainingSample* sample = GetCanonicalSample(font_id, c);
00803       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
00804       fcinfo.canonical_features = sample->indexed_features();
00805     }
00806   }
00807 }
00808 
00809 // Computes the combined set of features used by all the samples of each
00810 // font/class combination. Use after ReplicateAndRandomizeSamples.
00811 void TrainingSampleSet::ComputeCloudFeatures(int feature_space_size) {
00812   ASSERT_HOST(font_class_array_ != NULL);
00813   int font_size = font_id_map_.CompactSize();
00814   for (int font_index = 0; font_index < font_size; ++font_index) {
00815     int font_id = font_id_map_.CompactToSparse(font_index);
00816     for (int c = 0; c < unicharset_size_; ++c) {
00817       int num_samples = NumClassSamples(font_id, c, false);
00818       if (num_samples == 0)
00819         continue;
00820       FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
00821       fcinfo.cloud_features.Init(feature_space_size);
00822       for (int s = 0; s < num_samples; ++s) {
00823         const TrainingSample* sample = GetSample(font_id, c, s);
00824         const GenericVector<int>& sample_features = sample->indexed_features();
00825         for (int i = 0; i < sample_features.size(); ++i)
00826           fcinfo.cloud_features.SetBit(sample_features[i]);
00827       }
00828     }
00829   }
00830 }
00831 
00832 // Adds all fonts of the given class to the shape.
00833 void TrainingSampleSet::AddAllFontsForClass(int class_id, Shape* shape) const {
00834   for (int f = 0; f < font_id_map_.CompactSize(); ++f) {
00835     int font_id = font_id_map_.CompactToSparse(f);
00836     shape->AddToShape(class_id, font_id);
00837   }
00838 }
00839 
00840 // Display the samples with the given indexed feature that also match
00841 // the given shape.
00842 void TrainingSampleSet::DisplaySamplesWithFeature(int f_index,
00843                                                   const Shape& shape,
00844                                                   const IntFeatureSpace& space,
00845                                                   ScrollView::Color color,
00846                                                   ScrollView* window) const {
00847   for (int s = 0; s < num_raw_samples(); ++s) {
00848     const TrainingSample* sample = GetSample(s);
00849     if (shape.ContainsUnichar(sample->class_id())) {
00850       GenericVector<int> indexed_features;
00851       space.IndexAndSortFeatures(sample->features(), sample->num_features(),
00852                                  &indexed_features);
00853       for (int f = 0; f < indexed_features.size(); ++f) {
00854         if (indexed_features[f] == f_index) {
00855           sample->DisplayFeatures(color, window);
00856         }
00857       }
00858     }
00859   }
00860 }
00861 
00862 
00863 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines