tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/training/commontraining.h
Go to the documentation of this file.
00001 // Copyright 2008 Google Inc. All Rights Reserved.
00002 // Author: scharron@google.com (Samuel Charron)
00003 //
00004 // Licensed under the Apache License, Version 2.0 (the "License");
00005 // you may not use this file except in compliance with the License.
00006 // You may obtain a copy of the License at
00007 // http://www.apache.org/licenses/LICENSE-2.0
00008 // Unless required by applicable law or agreed to in writing, software
00009 // distributed under the License is distributed on an "AS IS" BASIS,
00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00011 // See the License for the specific language governing permissions and
00012 // limitations under the License.
00013 
00014 #ifndef TESSERACT_TRAINING_COMMONTRAINING_H__
00015 #define TESSERACT_TRAINING_COMMONTRAINING_H__
00016 
00017 #include "cluster.h"
00018 #include "commandlineflags.h"
00019 #include "featdefs.h"
00020 #include "intproto.h"
00021 #include "oldlist.h"
00022 
// Forward declarations of tesseract classes. Within this header
// MasterTrainer and ShapeTable appear only as pointer/reference types, so
// their full definitions are not needed here. Classify is declared but not
// referenced anywhere else in this header.
00023 namespace tesseract {
00024 class Classify;
00025 class MasterTrainer;
00026 class ShapeTable;
00027 }
00028 
00030 // Globals ///////////////////////////////////////////////////////////////////
00032 
// Global feature-definition table. Only declared here (extern); the
// definition lives in another translation unit. The LoadTrainingData comment
// below states that it initializes feature_defs.
00033 extern FEATURE_DEFS_STRUCT feature_defs;
00034 
// Global clustering configuration. Only declared here (extern).
00035 // Must be defined in the file that "implements" commonTraining facilities.
// MergeInsignificantProtos, declared below, has a parameter that is also
// named Config and therefore hides this global inside that function.
00036 extern CLUSTERCONFIG Config;
00037 
00039 // Structs ///////////////////////////////////////////////////////////////////
// A labelled list node: a text label, two integer counters and a LIST.
// LABELEDLIST is the pointer alias used by FindList, NewLabeledList,
// FreeLabeledList and SetUpForClustering declared further down.
00041 typedef struct
00042 {
00043   char  *Label;  // Text label of this node. NOTE(review): ownership of the string is not visible here — confirm.
00044   int   SampleCount;  // NOTE(review): presumably the number of samples held in List — confirm.
00045   int   font_sample_count;  // NOTE(review): presumably a per-font sample counter — confirm.
00046   LIST  List;  // The list payload; its element type is not visible in this header.
00047 }
00048 LABELEDLISTNODE, *LABELEDLIST;
00049 
// A labelled class node: a text label, a CLASS_TYPE and an integer array
// with one slot per proto (MAX_NUM_PROTOS entries). MERGE_CLASS is the
// pointer alias returned by FindClass and NewLabeledClass declared below.
00050 typedef struct
00051 {
00052   char* Label;  // Text label of this class. NOTE(review): ownership of the string is not visible here — confirm.
00053   int   NumMerged[MAX_NUM_PROTOS];  // NOTE(review): presumably a per-proto count of merged protos — confirm.
00054   CLASS_TYPE Class;  // The class data itself; CLASS_TYPE is declared outside this header.
00055 }MERGE_CLASS_NODE;
00056 typedef MERGE_CLASS_NODE* MERGE_CLASS;
00057 
00058 
00060 // Functions /////////////////////////////////////////////////////////////////
// Command-line parsing entry point. argc and argv are taken by pointer, so
// the callee is able to rewrite the caller's argument count and vector.
// NOTE(review): presumably consumes the flags declared through
// commandlineflags.h and leaves the positional arguments — confirm.
00062 void ParseArguments(int* argc, char*** argv);
00063 
00064 namespace tesseract {
00065 
00066 // Helper loads shape table from the given file.
// NOTE(review): the result on load failure and the ownership of the returned
// ShapeTable are not stated in this header — confirm in the implementation.
00067 ShapeTable* LoadShapeTable(const STRING& file_prefix);
00068 // Helper to write the shape_table.
00069 void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table);
00070 
00071 // Creates a MasterTrainer and loads the training data into it:
00072 // Initializes feature_defs and IntegerFX.
00073 // Loads the shape_table if shape_table != NULL.
00074 // Loads initial unicharset from -U command-line option.
00075 // If FLAGS_input_trainer is set, loads the majority of data from there, else:
00076 //   Loads font info from -F option.
00077 //   Loads xheights from -X option.
00078 //   Loads samples from .tr files in remaining command-line args.
00079 //   Deletes outliers and computes canonical samples.
00080 //   If FLAGS_output_trainer is set, saves the trainer for future use.
00081 // Computes canonical and cloud features.
00082 // If shape_table is not NULL, but failed to load, make a fake flat one,
00083 // as shape clustering was not run.
// NOTE(review): the roles of `replication` and `file_prefix`, and who owns
// the returned MasterTrainer and *shape_table, are not described above —
// confirm in the implementation.
00084 MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
00085                                 bool replication,
00086                                 ShapeTable** shape_table,
00087                                 STRING* file_prefix);
00088 }  // namespace tesseract.
00089 
// NOTE(review): judging by the name, hands back one file name from argv per
// call. No position argument is passed, so the cursor must live outside the
// signature; how the end of the list is reported (NULL?) is not visible
// here — confirm in the implementation.
00090 const char *GetNextFilename(int argc, const char* const * argv);
00091 
// NOTE(review): judging by the name, searches List for the LABELEDLIST
// whose label equals Label; the not-found result (NULL?) is not visible
// here — confirm. Label is a non-const char* although the sibling lookup
// FindClass takes const char*.
00092 LABELEDLIST FindList(
00093     LIST        List,
00094     char        *Label);
00095 
// NOTE(review): presumably creates a new LABELEDLISTNODE carrying Label;
// whether Label is copied and how the node is released (FreeLabeledList?)
// is not visible here — confirm.
00096 LABELEDLIST NewLabeledList(
00097     const char  *Label);
00098 
// NOTE(review): presumably reads samples of feature type feature_name from
// file into *training_samples, with max_samples acting as a cap — confirm
// the exact limit semantics in the implementation.
// The feature_defs parameter has the same name as the global feature_defs
// declared earlier in this header and hides it inside the function.
// unicharset is a non-const pointer, so the callee is able to modify it.
00099 void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs,
00100                          const char *feature_name, int max_samples,
00101                          UNICHARSET* unicharset,
00102                          FILE* file, LIST* training_samples);
00103 
// NOTE(review): presumably writes the samples in CharList under Directory
// for the feature type named by program_feature_type — confirm the file
// naming in the implementation. Directory is a non-const char* even though
// the other string parameter is const.
00104 void WriteTrainingSamples(
00105     const FEATURE_DEFS_STRUCT &FeatureDefs,
00106     char *Directory,
00107     LIST CharList,
00108     const char  *program_feature_type);
00109 
// NOTE(review): presumably releases everything held by CharList — confirm
// which nested objects are freed. This exact prototype is repeated once
// more later in this header; the repeat is redundant but harmless.
00110 void FreeTrainingSamples(
00111     LIST        CharList);
00112 
// NOTE(review): presumably releases one LABELEDLIST node — confirm whether
// its Label and List members are freed along with it.
00113 void FreeLabeledList(
00114     LABELEDLIST LabeledList);
00115 
// NOTE(review): presumably releases a list of labelled classes (MERGE_CLASS
// elements?) — the element type is not visible in this header; confirm.
00116 void FreeLabeledClassList(
00117     LIST        ClassListList);
00118 
// NOTE(review): presumably builds a CLUSTERER from the samples in
// CharSample for the feature type named by program_feature_type — confirm.
// Ownership of the returned CLUSTERER is not stated in this header.
00119 CLUSTERER *SetUpForClustering(
00120     const FEATURE_DEFS_STRUCT &FeatureDefs,
00121     LABELEDLIST CharSample,
00122     const char  *program_feature_type);
00123 
// NOTE(review): presumably filters ProtoList, with the two BOOL8 flags
// selecting whether significant / insignificant protos are kept. The
// meaning of N, and whether ProtoList itself is modified or a new list is
// returned, are not visible here — confirm.
00124 LIST RemoveInsignificantProtos(
00125     LIST        ProtoList,
00126     BOOL8       KeepSigProtos,
00127     BOOL8       KeepInsigProtos,
00128     int         N);
00129 
// NOTE(review): presumably frees data in ProtoList that is no longer
// needed — which data is not visible in this header; confirm.
00130 void CleanUpUnusedData(
00131     LIST        ProtoList);
00132 
// NOTE(review): presumably merges the insignificant protos of ProtoList
// using Clusterer and the supplied clustering configuration — confirm.
// The Config parameter has the same name as the global Config declared
// earlier in this header and hides it inside the function.
00133 void MergeInsignificantProtos(
00134     LIST        ProtoList,
00135     const char  *label,
00136     CLUSTERER   *Clusterer,
00137     CLUSTERCONFIG *Config);
00138 
// NOTE(review): judging by the name, searches List for the MERGE_CLASS
// whose label equals Label; the not-found result (NULL?) is not visible
// here — confirm.
00139 MERGE_CLASS FindClass(
00140     LIST        List,
00141     const char        *Label);
00142 
// NOTE(review): presumably creates a new MERGE_CLASS_NODE carrying Label;
// whether Label is copied is not visible here — confirm.
00143 MERGE_CLASS NewLabeledClass(
00144     const char        *Label);
00145 
// Redundant redeclaration: an identical FreeTrainingSamples prototype
// already appears earlier in this header. Harmless to the compiler, but one
// of the two can be dropped.
00146 void FreeTrainingSamples(
00147     LIST        CharList);
00148 
// NOTE(review): presumably converts the labelled classes in
// LabeledClassList into CLASS_STRUCT form with the help of unicharset. The
// layout and ownership of the returned pointer are not visible here —
// confirm.
00149 CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset,
00150                                 LIST LabeledClassList);
00151 
// NOTE(review): presumably normalizes the floats at Values in place (the
// pointer is non-const). The element count is not part of the signature,
// so the expected array length must be confirmed in the implementation.
00152 void Normalize(
00153     float       *Values);
00154 
// NOTE(review): presumably releases a list built by AddToNormProtosList —
// confirm which nested objects are freed.
00155 void FreeNormProtoList(
00156     LIST        CharList);
00157 
// NormProtoList is passed as LIST*, so the callee is able to update the
// caller's list. NOTE(review): presumably adds ProtoList to *NormProtoList
// under the label CharName — confirm. CharName is a non-const char*.
00158 void AddToNormProtosList(
00159     LIST*       NormProtoList,
00160     LIST        ProtoList,
00161     char        *CharName);
00162 
// NOTE(review): presumably returns how many protos in ProtoList match the
// selection made by the two BOOL8 flags (significant and/or insignificant)
// — confirm in the implementation.
00163 int NumberOfProtos(
00164     LIST        ProtoList,
00165     BOOL8       CountSigProtos,
00166     BOOL8       CountInsigProtos);
00167 
00168 
// Takes no arguments and returns nothing, so any effect is on state outside
// the signature. NOTE(review): which state is allocated is not visible in
// this header — confirm in the implementation.
00169 void allocNormProtos();
00170 #endif  // TESSERACT_TRAINING_COMMONTRAINING_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines