tesseract
3.03
|
/******************************************************************************
 **  Filename:    mftraining.c
 **  Purpose:     Separates training pages into files for each character.
 **               Strips from files only the features and their parameters of
 **               the feature type mf.
 **  Author:      Dan Johnson
 **  Revisment:   Christy Russon
 **  Environment: HPUX 6.5
 **  Library:     HPUX 6.5
 **  History:     Fri Aug 18 08:53:50 1989, DSJ, Created.
 **               5/25/90, DSJ, Adapted to multiple feature types.
 **               Tuesday, May 17, 1998 Changes made to make feature specific and
 **               simplify structures. First step in simplifying training process.
 **
 **  (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
00025 ******************************************************************************/ 00029 #ifdef HAVE_CONFIG_H 00030 #include "config_auto.h" 00031 #endif 00032 00033 #include <string.h> 00034 #include <stdio.h> 00035 #define _USE_MATH_DEFINES 00036 #include <math.h> 00037 #ifdef _WIN32 00038 #ifndef M_PI 00039 #define M_PI 3.14159265358979323846 00040 #endif 00041 #endif 00042 00043 #include "classify.h" 00044 #include "cluster.h" 00045 #include "clusttool.h" 00046 #include "commontraining.h" 00047 #include "danerror.h" 00048 #include "efio.h" 00049 #include "emalloc.h" 00050 #include "featdefs.h" 00051 #include "fontinfo.h" 00052 #include "genericvector.h" 00053 #include "indexmapbidi.h" 00054 #include "intproto.h" 00055 #include "mastertrainer.h" 00056 #include "mergenf.h" 00057 #include "mf.h" 00058 #include "ndminx.h" 00059 #include "ocrfeatures.h" 00060 #include "oldlist.h" 00061 #include "protos.h" 00062 #include "shapetable.h" 00063 #include "tessopt.h" 00064 #include "tprintf.h" 00065 #include "unicity_table.h" 00066 00067 using tesseract::Classify; 00068 using tesseract::FontInfo; 00069 using tesseract::FontSpacingInfo; 00070 using tesseract::IndexMapBiDi; 00071 using tesseract::MasterTrainer; 00072 using tesseract::Shape; 00073 using tesseract::ShapeTable; 00074 00075 #define PROGRAM_FEATURE_TYPE "mf" 00076 00077 // Max length of a fake shape label. 
00078 const int kMaxShapeLabelLength = 10; 00079 00080 DECLARE_STRING_PARAM_FLAG(test_ch); 00081 00085 int main ( 00086 int argc, 00087 char **argv); 00088 00089 00090 /*---------------------------------------------------------------------------- 00091 Public Code 00092 -----------------------------------------------------------------------------*/ 00093 #ifndef GRAPHICS_DISABLED 00094 static void DisplayProtoList(const char* ch, LIST protolist) { 00095 void* window = c_create_window("Char samples", 50, 200, 00096 520, 520, -130.0, 130.0, -130.0, 130.0); 00097 LIST proto = protolist; 00098 iterate(proto) { 00099 PROTOTYPE* prototype = reinterpret_cast<PROTOTYPE *>(first_node(proto)); 00100 if (prototype->Significant) 00101 c_line_color_index(window, Green); 00102 else if (prototype->NumSamples == 0) 00103 c_line_color_index(window, Blue); 00104 else if (prototype->Merged) 00105 c_line_color_index(window, Magenta); 00106 else 00107 c_line_color_index(window, Red); 00108 float x = CenterX(prototype->Mean); 00109 float y = CenterY(prototype->Mean); 00110 double angle = OrientationOf(prototype->Mean) * 2 * M_PI; 00111 float dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2); 00112 float dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2); 00113 c_move(window, (x - dx) * 256, (y - dy) * 256); 00114 c_draw(window, (x + dx) * 256, (y + dy) * 256); 00115 if (prototype->Significant) 00116 tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n", 00117 x, y, dx, dy, prototype->NumSamples); 00118 else if (prototype->NumSamples > 0 && !prototype->Merged) 00119 tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n", 00120 x, y, dx, dy, prototype->NumSamples); 00121 } 00122 c_make_current(window); 00123 } 00124 #endif // GRAPHICS_DISABLED 00125 00126 // Helper to run clustering on a single config. 00127 // Mostly copied from the old mftraining, but with renamed variables. 
00128 static LIST ClusterOneConfig(int shape_id, const char* class_label, 00129 LIST mf_classes, 00130 const ShapeTable& shape_table, 00131 MasterTrainer* trainer) { 00132 int num_samples; 00133 CLUSTERER *clusterer = trainer->SetupForClustering(shape_table, 00134 feature_defs, 00135 shape_id, 00136 &num_samples); 00137 Config.MagicSamples = num_samples; 00138 LIST proto_list = ClusterSamples(clusterer, &Config); 00139 CleanUpUnusedData(proto_list); 00140 00141 // Merge protos where reasonable to make more of them significant by 00142 // representing almost all samples of the class/font. 00143 MergeInsignificantProtos(proto_list, class_label, clusterer, &Config); 00144 #ifndef GRAPHICS_DISABLED 00145 if (strcmp(FLAGS_test_ch.c_str(), class_label) == 0) 00146 DisplayProtoList(FLAGS_test_ch.c_str(), proto_list); 00147 #endif // GRAPHICS_DISABLED 00148 // Delete the protos that will not be used in the inttemp output file. 00149 proto_list = RemoveInsignificantProtos(proto_list, true, 00150 false, 00151 clusterer->SampleSize); 00152 FreeClusterer(clusterer); 00153 MERGE_CLASS merge_class = FindClass(mf_classes, class_label); 00154 if (merge_class == NULL) { 00155 merge_class = NewLabeledClass(class_label); 00156 mf_classes = push(mf_classes, merge_class); 00157 } 00158 int config_id = AddConfigToClass(merge_class->Class); 00159 merge_class->Class->font_set.push_back(shape_id); 00160 LIST proto_it = proto_list; 00161 iterate(proto_it) { 00162 PROTOTYPE* prototype = reinterpret_cast<PROTOTYPE*>(first_node(proto_it)); 00163 // See if proto can be approximated by existing proto. 00164 int p_id = FindClosestExistingProto(merge_class->Class, 00165 merge_class->NumMerged, prototype); 00166 if (p_id == NO_PROTO) { 00167 // Need to make a new proto, as it doesn't match anything. 
00168 p_id = AddProtoToClass(merge_class->Class); 00169 MakeNewFromOld(ProtoIn(merge_class->Class, p_id), prototype); 00170 merge_class->NumMerged[p_id] = 1; 00171 } else { 00172 PROTO_STRUCT dummy_proto; 00173 MakeNewFromOld(&dummy_proto, prototype); 00174 // Merge with the similar proto. 00175 ComputeMergedProto(ProtoIn(merge_class->Class, p_id), &dummy_proto, 00176 static_cast<FLOAT32>(merge_class->NumMerged[p_id]), 00177 1.0, 00178 ProtoIn(merge_class->Class, p_id)); 00179 merge_class->NumMerged[p_id]++; 00180 } 00181 AddProtoToConfig(p_id, merge_class->Class->Configurations[config_id]); 00182 } 00183 FreeProtoList(&proto_list); 00184 return mf_classes; 00185 } 00186 00187 // Helper to setup the config map. 00188 // Setup an index mapping from the shapes in the shape table to the classes 00189 // that will be trained. In keeping with the original design, each shape 00190 // with the same list of unichars becomes a different class and the configs 00191 // represent the different combinations of fonts. 00192 static void SetupConfigMap(ShapeTable* shape_table, IndexMapBiDi* config_map) { 00193 int num_configs = shape_table->NumShapes(); 00194 config_map->Init(num_configs, true); 00195 config_map->Setup(); 00196 for (int c1 = 0; c1 < num_configs; ++c1) { 00197 // Only process ids that are not already merged. 00198 if (config_map->SparseToCompact(c1) == c1) { 00199 Shape* shape1 = shape_table->MutableShape(c1); 00200 // Find all the subsequent shapes that are equal. 
00201 for (int c2 = c1 + 1; c2 < num_configs; ++c2) { 00202 if (shape_table->MutableShape(c2)->IsEqualUnichars(shape1)) { 00203 config_map->Merge(c1, c2); 00204 } 00205 } 00206 } 00207 } 00208 config_map->CompleteMerges(); 00209 } 00210 00211 /*---------------------------------------------------------------------------*/ 00212 int main (int argc, char **argv) { 00213 /* 00214 ** Parameters: 00215 ** argc number of command line arguments 00216 ** argv array of command line arguments 00217 ** Globals: none 00218 ** Operation: 00219 ** This program reads in a text file consisting of feature 00220 ** samples from a training page in the following format: 00221 ** 00222 ** FontName UTF8-char-str xmin ymin xmax ymax page-number 00223 ** NumberOfFeatureTypes(N) 00224 ** FeatureTypeName1 NumberOfFeatures(M) 00225 ** Feature1 00226 ** ... 00227 ** FeatureM 00228 ** FeatureTypeName2 NumberOfFeatures(M) 00229 ** Feature1 00230 ** ... 00231 ** FeatureM 00232 ** ... 00233 ** FeatureTypeNameN NumberOfFeatures(M) 00234 ** Feature1 00235 ** ... 00236 ** FeatureM 00237 ** FontName CharName ... 00238 ** 00239 ** The result of this program is a binary inttemp file used by 00240 ** the OCR engine. 00241 ** Return: none 00242 ** Exceptions: none 00243 ** History: Fri Aug 18 08:56:17 1989, DSJ, Created. 00244 ** Mon May 18 1998, Christy Russson, Revistion started. 00245 */ 00246 ParseArguments(&argc, &argv); 00247 00248 ShapeTable* shape_table = NULL; 00249 STRING file_prefix; 00250 // Load the training data. 00251 MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv, 00252 false, 00253 &shape_table, 00254 &file_prefix); 00255 if (trainer == NULL) 00256 return 1; // Failed. 00257 00258 // Setup an index mapping from the shapes in the shape table to the classes 00259 // that will be trained. In keeping with the original design, each shape 00260 // with the same list of unichars becomes a different class and the configs 00261 // represent the different combinations of fonts. 
00262 IndexMapBiDi config_map; 00263 SetupConfigMap(shape_table, &config_map); 00264 00265 WriteShapeTable(file_prefix, *shape_table); 00266 // If the shape_table is flat, then either we didn't run shape clustering, or 00267 // it did nothing, so we just output the trainer's unicharset. 00268 // Otherwise shape_set will hold a fake unicharset with an entry for each 00269 // shape in the shape table, and we will output that instead. 00270 UNICHARSET shape_set; 00271 const UNICHARSET* unicharset = &trainer->unicharset(); 00272 // If we ran shapeclustering (and it worked) then at least one shape will 00273 // have multiple unichars, so we have to build a fake unicharset. 00274 if (shape_table->AnyMultipleUnichars()) { 00275 unicharset = &shape_set; 00276 // Now build a fake unicharset for the compact shape space to keep the 00277 // output modules happy that we are doing things correctly. 00278 int num_shapes = config_map.CompactSize(); 00279 for (int s = 0; s < num_shapes; ++s) { 00280 char shape_label[kMaxShapeLabelLength + 1]; 00281 snprintf(shape_label, kMaxShapeLabelLength, "sh%04d", s); 00282 shape_set.unichar_insert(shape_label); 00283 } 00284 } 00285 00286 // Now train each config separately. 00287 int num_configs = shape_table->NumShapes(); 00288 LIST mf_classes = NIL_LIST; 00289 for (int s = 0; s < num_configs; ++s) { 00290 int unichar_id, font_id; 00291 if (unicharset == &shape_set) { 00292 // Using fake unichar_ids from the config_map/shape_set. 00293 unichar_id = config_map.SparseToCompact(s); 00294 } else { 00295 // Get the real unichar_id from the shape table/unicharset. 
00296 shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id); 00297 } 00298 const char* class_label = unicharset->id_to_unichar(unichar_id); 00299 mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table, 00300 trainer); 00301 } 00302 STRING inttemp_file = file_prefix; 00303 inttemp_file += "inttemp"; 00304 STRING pffmtable_file = file_prefix; 00305 pffmtable_file += "pffmtable"; 00306 CLASS_STRUCT* float_classes = SetUpForFloat2Int(*unicharset, mf_classes); 00307 // Now write the inttemp and pffmtable. 00308 trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset, 00309 *shape_table, float_classes, 00310 inttemp_file.string(), 00311 pffmtable_file.string()); 00312 delete [] float_classes; 00313 FreeLabeledClassList(mf_classes); 00314 delete trainer; 00315 delete shape_table; 00316 printf("Done!\n"); 00317 if (!FLAGS_test_ch.empty()) { 00318 // If we are displaying debug window(s), wait for the user to look at them. 00319 printf("Hit return to exit...\n"); 00320 while (getchar() != '\n'); 00321 } 00322 return 0; 00323 } /* main */