// tesseract 3.03
00001 // Copyright 2008 Google Inc. All Rights Reserved. 00002 // Author: scharron@google.com (Samuel Charron) 00003 // 00004 // Licensed under the Apache License, Version 2.0 (the "License"); 00005 // you may not use this file except in compliance with the License. 00006 // You may obtain a copy of the License at 00007 // http://www.apache.org/licenses/LICENSE-2.0 00008 // Unless required by applicable law or agreed to in writing, software 00009 // distributed under the License is distributed on an "AS IS" BASIS, 00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00011 // See the License for the specific language governing permissions and 00012 // limitations under the License. 00013 00014 #ifndef TESSERACT_TRAINING_COMMONTRAINING_H__ 00015 #define TESSERACT_TRAINING_COMMONTRAINING_H__ 00016 00017 #include "cluster.h" 00018 #include "commandlineflags.h" 00019 #include "featdefs.h" 00020 #include "intproto.h" 00021 #include "oldlist.h" 00022 00023 namespace tesseract { 00024 class Classify; 00025 class MasterTrainer; 00026 class ShapeTable; 00027 } 00028 00030 // Globals /////////////////////////////////////////////////////////////////// 00032 00033 extern FEATURE_DEFS_STRUCT feature_defs; 00034 00035 // Must be defined in the file that "implements" commonTraining facilities. 
00036 extern CLUSTERCONFIG Config; 00037 00039 // Structs /////////////////////////////////////////////////////////////////// 00041 typedef struct 00042 { 00043 char *Label; 00044 int SampleCount; 00045 int font_sample_count; 00046 LIST List; 00047 } 00048 LABELEDLISTNODE, *LABELEDLIST; 00049 00050 typedef struct 00051 { 00052 char* Label; 00053 int NumMerged[MAX_NUM_PROTOS]; 00054 CLASS_TYPE Class; 00055 }MERGE_CLASS_NODE; 00056 typedef MERGE_CLASS_NODE* MERGE_CLASS; 00057 00058 00060 // Functions ///////////////////////////////////////////////////////////////// 00062 void ParseArguments(int* argc, char*** argv); 00063 00064 namespace tesseract { 00065 00066 // Helper loads shape table from the given file. 00067 ShapeTable* LoadShapeTable(const STRING& file_prefix); 00068 // Helper to write the shape_table. 00069 void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table); 00070 00071 // Creates a MasterTraininer and loads the training data into it: 00072 // Initializes feature_defs and IntegerFX. 00073 // Loads the shape_table if shape_table != NULL. 00074 // Loads initial unicharset from -U command-line option. 00075 // If FLAGS_input_trainer is set, loads the majority of data from there, else: 00076 // Loads font info from -F option. 00077 // Loads xheights from -X option. 00078 // Loads samples from .tr files in remaining command-line args. 00079 // Deletes outliers and computes canonical samples. 00080 // If FLAGS_output_trainer is set, saves the trainer for future use. 00081 // Computes canonical and cloud features. 00082 // If shape_table is not NULL, but failed to load, make a fake flat one, 00083 // as shape clustering was not run. 00084 MasterTrainer* LoadTrainingData(int argc, const char* const * argv, 00085 bool replication, 00086 ShapeTable** shape_table, 00087 STRING* file_prefix); 00088 } // namespace tesseract. 
00089 00090 const char *GetNextFilename(int argc, const char* const * argv); 00091 00092 LABELEDLIST FindList( 00093 LIST List, 00094 char *Label); 00095 00096 LABELEDLIST NewLabeledList( 00097 const char *Label); 00098 00099 void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs, 00100 const char *feature_name, int max_samples, 00101 UNICHARSET* unicharset, 00102 FILE* file, LIST* training_samples); 00103 00104 void WriteTrainingSamples( 00105 const FEATURE_DEFS_STRUCT &FeatureDefs, 00106 char *Directory, 00107 LIST CharList, 00108 const char *program_feature_type); 00109 00110 void FreeTrainingSamples( 00111 LIST CharList); 00112 00113 void FreeLabeledList( 00114 LABELEDLIST LabeledList); 00115 00116 void FreeLabeledClassList( 00117 LIST ClassListList); 00118 00119 CLUSTERER *SetUpForClustering( 00120 const FEATURE_DEFS_STRUCT &FeatureDefs, 00121 LABELEDLIST CharSample, 00122 const char *program_feature_type); 00123 00124 LIST RemoveInsignificantProtos( 00125 LIST ProtoList, 00126 BOOL8 KeepSigProtos, 00127 BOOL8 KeepInsigProtos, 00128 int N); 00129 00130 void CleanUpUnusedData( 00131 LIST ProtoList); 00132 00133 void MergeInsignificantProtos( 00134 LIST ProtoList, 00135 const char *label, 00136 CLUSTERER *Clusterer, 00137 CLUSTERCONFIG *Config); 00138 00139 MERGE_CLASS FindClass( 00140 LIST List, 00141 const char *Label); 00142 00143 MERGE_CLASS NewLabeledClass( 00144 const char *Label); 00145 00146 void FreeTrainingSamples( 00147 LIST CharList); 00148 00149 CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset, 00150 LIST LabeledClassList); 00151 00152 void Normalize( 00153 float *Values); 00154 00155 void FreeNormProtoList( 00156 LIST CharList); 00157 00158 void AddToNormProtosList( 00159 LIST* NormProtoList, 00160 LIST ProtoList, 00161 char *CharName); 00162 00163 int NumberOfProtos( 00164 LIST ProtoList, 00165 BOOL8 CountSigProtos, 00166 BOOL8 CountInsigProtos); 00167 00168 00169 void allocNormProtos(); 00170 #endif // 
TESSERACT_TRAINING_COMMONTRAINING_H__