// tesseract 3.03
00001 // Copyright 2011 Google Inc. All Rights Reserved. 00002 // Author: rays@google.com (Ray Smith) 00003 // 00004 // Licensed under the Apache License, Version 2.0 (the "License"); 00005 // you may not use this file except in compliance with the License. 00006 // You may obtain a copy of the License at 00007 // http://www.apache.org/licenses/LICENSE-2.0 00008 // Unless required by applicable law or agreed to in writing, software 00009 // distributed under the License is distributed on an "AS IS" BASIS, 00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00011 // See the License for the specific language governing permissions and 00012 // limitations under the License. 00013 // 00015 00016 #ifndef THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ 00017 #define THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ 00018 00019 #include "genericvector.h" 00020 #include "matrix.h" 00021 #include "statistc.h" 00022 00023 struct Pix; 00024 template <typename T> class UnicityTable; 00025 00026 namespace tesseract { 00027 00028 struct FontInfo; 00029 class FontInfoTable; 00030 class SampleIterator; 00031 class ShapeClassifier; 00032 class TrainingSample; 00033 struct UnicharRating; 00034 00035 // Enumeration of the different types of error count. 00036 // Error counts work as follows: 00037 // 00038 // Ground truth is a valid unichar-id / font-id pair: 00039 // Number of classifier answers? 00040 // 0 >0 00041 // CT_REJECT unichar-id matches top shape? 00042 // __________ yes! no 00043 // CT_UNICHAR_TOP_OK CT_UNICHAR_TOP1_ERR 00044 // Top shape-id has multiple unichars? 2nd shape unichar id matches? 00045 // yes! no yes! no 00046 // CT_OK_MULTI_UNICHAR | _____ CT_UNICHAR_TOP2_ERR 00047 // Font attributes match? Any unichar-id matches? 00048 // yes! no yes! no 00049 // CT_FONT_ATTR_OK CT_FONT_ATTR_ERR ______ CT_UNICHAR_TOPN_ERR 00050 // | __________________ _________________ 00051 // Top shape-id has multiple font attrs? 00052 // yes! 
no 00053 // CT_OK_MULTI_FONT 00054 // _____________________________ 00055 // 00056 // Note that multiple counts may be activated for a single sample! 00057 // 00058 // Ground truth is for a fragment/n-gram that is NOT in the unicharset. 00059 // This is called junk and is expected to be rejected: 00060 // Number of classifier answers? 00061 // 0 >0 00062 // CT_REJECTED_JUNK CT_ACCEPTED_JUNK 00063 // 00064 // Also, CT_NUM_RESULTS stores the mean number of results, and CT_RANK stores 00065 // the mean rank of the correct result, counting from 0, and with an error 00066 // receiving the number of answers as the correct rank. 00067 // 00068 // Keep in sync with the ReportString function. 00069 enum CountTypes { 00070 CT_UNICHAR_TOP_OK, // Top shape contains correct unichar id. 00071 // The rank of the results in TOP1, TOP2, TOPN is determined by a gap of 00072 // kRatingEpsilon from the first result in each group. The real top choice 00073 // is measured using TOPTOP. 00074 CT_UNICHAR_TOP1_ERR, // Top shape does not contain correct unichar id. 00075 CT_UNICHAR_TOP2_ERR, // Top 2 shapes don't contain correct unichar id. 00076 CT_UNICHAR_TOPN_ERR, // No output shape contains correct unichar id. 00077 CT_UNICHAR_TOPTOP_ERR, // Very top choice not correct. 00078 CT_OK_MULTI_UNICHAR, // Top shape id has correct unichar id, and others. 00079 CT_OK_JOINED, // Top shape id is correct but marked joined. 00080 CT_OK_BROKEN, // Top shape id is correct but marked broken. 00081 CT_REJECT, // Classifier hates this. 00082 CT_FONT_ATTR_ERR, // Top unichar OK, but font attributes incorrect. 00083 CT_OK_MULTI_FONT, // CT_FONT_ATTR_OK but there are multiple font attrs. 00084 CT_NUM_RESULTS, // Number of answers produced. 00085 CT_RANK, // Rank of correct answer. 00086 CT_REJECTED_JUNK, // Junk that was correctly rejected. 00087 CT_ACCEPTED_JUNK, // Junk that was incorrectly classified otherwise. 00088 00089 CT_SIZE // Number of types for array sizing. 
00090 }; 00091 00092 // Class to encapsulate all the functionality and sub-structures required 00093 // to count errors for an isolated character classifier (ShapeClassifier). 00094 class ErrorCounter { 00095 public: 00096 // Computes and returns the unweighted boosting_mode error rate of the given 00097 // classifier. Can be used for testing, or inside an iterative training 00098 // system, including one that uses boosting. 00099 // report_levels: 00100 // 0 = no output. 00101 // 1 = bottom-line error rate. 00102 // 2 = bottom-line error rate + time. 00103 // 3 = font-level error rate + time. 00104 // 4 = list of all errors + short classifier debug output on 16 errors. 00105 // 5 = list of all errors + short classifier debug output on 25 errors. 00106 // * The boosting_mode determines which error type is used for computing the 00107 // scaled_error output, and setting the is_error flag in the samples. 00108 // * The fontinfo_table is used to get string font names for the debug 00109 // output, and also to count font attributes errors. 00110 // * The page_images vector may contain a Pix* (which may be NULL) for each 00111 // page index assigned to the samples. 00112 // * The it provides encapsulated iteration over some sample set. 00113 // * The outputs unichar_error, scaled_error and totals_report are all 00114 // optional. 00115 // * If not NULL, unichar error gets the top1 unichar error rate. 00116 // * Scaled_error gets the error chosen by boosting_mode weighted by the 00117 // weights on the samples. 00118 // * Fonts_report gets a string summarizing the error rates for each font in 00119 // both human-readable form and as a tab-separated list of error counts. 00120 // The human-readable form is all before the first tab. 00121 // * The return value is the un-weighted version of the scaled_error. 
00122 static double ComputeErrorRate(ShapeClassifier* classifier, 00123 int report_level, CountTypes boosting_mode, 00124 const FontInfoTable& fontinfo_table, 00125 const GenericVector<Pix*>& page_images, 00126 SampleIterator* it, 00127 double* unichar_error, 00128 double* scaled_error, 00129 STRING* fonts_report); 00130 // Tests a pair of classifiers, debugging errors of the new against the old. 00131 // See errorcounter.h for description of arguments. 00132 // Iterates over the samples, calling the classifiers in normal/silent mode. 00133 // If the new_classifier makes a boosting_mode error that the old_classifier 00134 // does not, and the appropriate, it will then call the new_classifier again 00135 // with a debug flag and a keep_this argument to find out what is going on. 00136 static void DebugNewErrors(ShapeClassifier* new_classifier, 00137 ShapeClassifier* old_classifier, 00138 CountTypes boosting_mode, 00139 const FontInfoTable& fontinfo_table, 00140 const GenericVector<Pix*>& page_images, 00141 SampleIterator* it); 00142 00143 private: 00144 // Simple struct to hold an array of counts. 00145 struct Counts { 00146 Counts(); 00147 // Adds other into this for computing totals. 00148 void operator+=(const Counts& other); 00149 00150 int n[CT_SIZE]; 00151 }; 00152 00153 // Constructor is private. Only anticipated use of ErrorCounter is via 00154 // the static ComputeErrorRate. 00155 ErrorCounter(const UNICHARSET& unicharset, int fontsize); 00156 ~ErrorCounter(); 00157 00158 // Accumulates the errors from the classifier results on a single sample. 00159 // Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred. 00160 // boosting_mode selects the type of error to be used for boosting and the 00161 // is_error_ member of sample is set according to whether the required type 00162 // of error occurred. 
The font_table provides access to font properties 00163 // for error counting and shape_table is used to understand the relationship 00164 // between unichar_ids and shape_ids in the results 00165 bool AccumulateErrors(bool debug, CountTypes boosting_mode, 00166 const FontInfoTable& font_table, 00167 const GenericVector<UnicharRating>& results, 00168 TrainingSample* sample); 00169 00170 // Accumulates counts for junk. Counts only whether the junk was correctly 00171 // rejected or not. 00172 bool AccumulateJunk(bool debug, const GenericVector<UnicharRating>& results, 00173 TrainingSample* sample); 00174 00175 // Creates a report of the error rate. The report_level controls the detail 00176 // that is reported to stderr via tprintf: 00177 // 0 -> no output. 00178 // >=1 -> bottom-line error rate. 00179 // >=3 -> font-level error rate. 00180 // boosting_mode determines the return value. It selects which (un-weighted) 00181 // error rate to return. 00182 // The fontinfo_table from MasterTrainer provides the names of fonts. 00183 // The it determines the current subset of the training samples. 00184 // If not NULL, the top-choice unichar error rate is saved in unichar_error. 00185 // If not NULL, the report string is saved in fonts_report. 00186 // (Ignoring report_level). 00187 double ReportErrors(int report_level, CountTypes boosting_mode, 00188 const FontInfoTable& fontinfo_table, 00189 const SampleIterator& it, 00190 double* unichar_error, 00191 STRING* fonts_report); 00192 00193 // Sets the report string to a combined human and machine-readable report 00194 // string of the error rates. 00195 // Returns false if there is no data, leaving report unchanged, unless 00196 // even_if_empty is true. 00197 static bool ReportString(bool even_if_empty, const Counts& counts, 00198 STRING* report); 00199 00200 // Computes the error rates and returns in rates which is an array of size 00201 // CT_SIZE. Returns false if there is no data, leaving rates unchanged. 
00202 static bool ComputeRates(const Counts& counts, double rates[CT_SIZE]); 00203 00204 00205 // Total scaled error used by boosting algorithms. 00206 double scaled_error_; 00207 // Difference in result rating to be thought of as an "equal" choice. 00208 double rating_epsilon_; 00209 // Vector indexed by font_id from the samples of error accumulators. 00210 GenericVector<Counts> font_counts_; 00211 // Counts of the results that map each unichar_id (from samples) to an 00212 // incorrect shape_id. 00213 GENERIC_2D_ARRAY<int> unichar_counts_; 00214 // Count of the number of times each shape_id occurs, is correct, and multi- 00215 // unichar. 00216 GenericVector<int> multi_unichar_counts_; 00217 // Histogram of scores (as percent) for correct answers. 00218 STATS ok_score_hist_; 00219 // Histogram of scores (as percent) for incorrect answers. 00220 STATS bad_score_hist_; 00221 // Unicharset for printing character ids in results. 00222 const UNICHARSET& unicharset_; 00223 }; 00224 00225 } // namespace tesseract. 00226 00227 #endif /* THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ */