tesseract
3.03
|
00001 /****************************************************************************** 00002 ** Filename: normmatch.c 00003 ** Purpose: Simple matcher based on character normalization features. 00004 ** Author: Dan Johnson 00005 ** History: Wed Dec 19 16:18:06 1990, DSJ, Created. 00006 ** 00007 ** (c) Copyright Hewlett-Packard Company, 1988. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 ******************************************************************************/ 00021 #include "normmatch.h" 00022 00023 #include <stdio.h> 00024 #include <math.h> 00025 00026 #include "classify.h" 00027 #include "clusttool.h" 00028 #include "const.h" 00029 #include "efio.h" 00030 #include "emalloc.h" 00031 #include "globals.h" 00032 #include "helpers.h" 00033 #include "normfeat.h" 00034 #include "scanutils.h" 00035 #include "unicharset.h" 00036 #include "params.h" 00037 00038 struct NORM_PROTOS 00039 { 00040 int NumParams; 00041 PARAM_DESC *ParamDesc; 00042 LIST* Protos; 00043 int NumProtos; 00044 }; 00045 00049 double NormEvidenceOf(register double NormAdj); 00050 00051 void PrintNormMatch(FILE *File, 00052 int NumParams, 00053 PROTOTYPE *Proto, 00054 FEATURE Feature); 00055 00056 NORM_PROTOS *ReadNormProtos(FILE *File); 00057 00062 /* control knobs used to control the normalization adjustment process */ 00063 double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ..."); 00064 double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ..."); 00065 // Weight of width variance against height and vertical position. 00066 const double kWidthErrorWeighting = 0.125; 00067 00071 /*---------------------------------------------------------------------------*/ 00072 namespace tesseract { 00073 FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId, 00074 const FEATURE_STRUCT& feature, 00075 BOOL8 DebugMatch) { 00076 /* 00077 ** Parameters: 00078 ** ClassId id of class to match against 00079 ** Feature character normalization feature 00080 ** DebugMatch controls dump of debug info 00081 ** Globals: 00082 ** NormProtos character normalization prototypes 00083 ** Operation: This routine compares Features against each character 00084 ** normalization proto for ClassId and returns the match 00085 ** rating of the best match. 00086 ** Return: Best match rating for Feature against protos of ClassId. 00087 ** Exceptions: none 00088 ** History: Wed Dec 19 16:56:12 1990, DSJ, Created. 00089 */ 00090 LIST Protos; 00091 FLOAT32 BestMatch; 00092 FLOAT32 Match; 00093 FLOAT32 Delta; 00094 PROTOTYPE *Proto; 00095 int ProtoId; 00096 00097 if (ClassId >= NormProtos->NumProtos) { 00098 ClassId = NO_CLASS; 00099 } 00100 00101 /* handle requests for classification as noise */ 00102 if (ClassId == NO_CLASS) { 00103 /* kludge - clean up constants and make into control knobs later */ 00104 Match = (feature.Params[CharNormLength] * 00105 feature.Params[CharNormLength] * 500.0 + 00106 feature.Params[CharNormRx] * 00107 feature.Params[CharNormRx] * 8000.0 + 00108 feature.Params[CharNormRy] * 00109 feature.Params[CharNormRy] * 8000.0); 00110 return (1.0 - NormEvidenceOf (Match)); 00111 } 00112 00113 BestMatch = MAX_FLOAT32; 00114 Protos = NormProtos->Protos[ClassId]; 00115 00116 if (DebugMatch) { 00117 tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId)); 00118 } 00119 00120 ProtoId = 0; 00121 iterate(Protos) { 00122 Proto = (PROTOTYPE *) first_node (Protos); 00123 Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY]; 00124 Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY]; 00125 if (DebugMatch) { 00126 tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", 00127 Proto->Mean[CharNormY], Delta, 00128 Proto->Weight.Elliptical[CharNormY], Match); 00129 } 00130 Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx]; 00131 Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx]; 00132 if (DebugMatch) { 00133 tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", 00134 Proto->Mean[CharNormRx], Delta, 00135 Proto->Weight.Elliptical[CharNormRx], Match); 00136 } 00137 // Ry is width! See intfx.cpp. 00138 Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy]; 00139 if (DebugMatch) { 00140 tprintf("Width: Proto=%g, Delta=%g, Var=%g\n", 00141 Proto->Mean[CharNormRy], Delta, 00142 Proto->Weight.Elliptical[CharNormRy]); 00143 } 00144 Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy]; 00145 Delta *= kWidthErrorWeighting; 00146 Match += Delta; 00147 if (DebugMatch) { 00148 tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n", 00149 Match, Match / classify_norm_adj_midpoint, 00150 NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match))); 00151 } 00152 00153 if (Match < BestMatch) 00154 BestMatch = Match; 00155 00156 ProtoId++; 00157 } 00158 return 1.0 - NormEvidenceOf(BestMatch); 00159 } /* ComputeNormMatch */ 00160 00161 void Classify::FreeNormProtos() { 00162 if (NormProtos != NULL) { 00163 for (int i = 0; i < NormProtos->NumProtos; i++) 00164 FreeProtoList(&NormProtos->Protos[i]); 00165 Efree(NormProtos->Protos); 00166 Efree(NormProtos->ParamDesc); 00167 Efree(NormProtos); 00168 NormProtos = NULL; 00169 } 00170 } 00171 } // namespace tesseract 00172 00176 /********************************************************************** 00177 * NormEvidenceOf 00178 * 00179 * Return the new type of evidence number corresponding to this 00180 * normalization adjustment. The equation that represents the transform is: 00181 * 1 / (1 + (NormAdj / midpoint) ^ curl) 00182 **********************************************************************/ 00183 double NormEvidenceOf(register double NormAdj) { 00184 NormAdj /= classify_norm_adj_midpoint; 00185 00186 if (classify_norm_adj_curl == 3) 00187 NormAdj = NormAdj * NormAdj * NormAdj; 00188 else if (classify_norm_adj_curl == 2) 00189 NormAdj = NormAdj * NormAdj; 00190 else 00191 NormAdj = pow (NormAdj, classify_norm_adj_curl); 00192 return (1.0 / (1.0 + NormAdj)); 00193 } 00194 00195 00196 /*---------------------------------------------------------------------------*/ 00197 void PrintNormMatch(FILE *File, 00198 int NumParams, 00199 PROTOTYPE *Proto, 00200 FEATURE Feature) { 00201 /* 00202 ** Parameters: 00203 ** File open text file to dump match debug info to 00204 ** NumParams # of parameters in proto and feature 00205 ** Proto[] array of prototype parameters 00206 ** Feature[] array of feature parameters 00207 ** Globals: none 00208 ** Operation: This routine dumps out detailed normalization match info. 00209 ** Return: none 00210 ** Exceptions: none 00211 ** History: Wed Jan 2 09:49:35 1991, DSJ, Created. 00212 */ 00213 int i; 00214 FLOAT32 ParamMatch; 00215 FLOAT32 TotalMatch; 00216 00217 for (i = 0, TotalMatch = 0.0; i < NumParams; i++) { 00218 ParamMatch = (Feature->Params[i] - Mean(Proto, i)) / 00219 StandardDeviation(Proto, i); 00220 00221 fprintf (File, " %6.1f", ParamMatch); 00222 00223 if (i == CharNormY || i == CharNormRx) 00224 TotalMatch += ParamMatch * ParamMatch; 00225 } 00226 fprintf (File, " --> %6.1f (%4.2f)\n", 00227 TotalMatch, NormEvidenceOf (TotalMatch)); 00228 00229 } /* PrintNormMatch */ 00230 00231 00232 /*---------------------------------------------------------------------------*/ 00233 namespace tesseract { 00234 NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) { 00235 /* 00236 ** Parameters: 00237 ** File open text file to read normalization protos from 00238 ** Globals: none 00239 ** Operation: This routine allocates a new data structure to hold 00240 ** a set of character normalization protos. It then fills in 00241 ** the data structure by reading from the specified File. 00242 ** Return: Character normalization protos. 00243 ** Exceptions: none 00244 ** History: Wed Dec 19 16:38:49 1990, DSJ, Created. 00245 */ 00246 NORM_PROTOS *NormProtos; 00247 int i; 00248 char unichar[2 * UNICHAR_LEN + 1]; 00249 UNICHAR_ID unichar_id; 00250 LIST Protos; 00251 int NumProtos; 00252 00253 /* allocate and initialization data structure */ 00254 NormProtos = (NORM_PROTOS *) Emalloc (sizeof (NORM_PROTOS)); 00255 NormProtos->NumProtos = unicharset.size(); 00256 NormProtos->Protos = (LIST *) Emalloc (NormProtos->NumProtos * sizeof(LIST)); 00257 for (i = 0; i < NormProtos->NumProtos; i++) 00258 NormProtos->Protos[i] = NIL_LIST; 00259 00260 /* read file header and save in data structure */ 00261 NormProtos->NumParams = ReadSampleSize (File); 00262 NormProtos->ParamDesc = ReadParamDesc (File, NormProtos->NumParams); 00263 00264 /* read protos for each class into a separate list */ 00265 while ((end_offset < 0 || ftell(File) < end_offset) && 00266 fscanf(File, "%s %d", unichar, &NumProtos) == 2) { 00267 if (unicharset.contains_unichar(unichar)) { 00268 unichar_id = unicharset.unichar_to_id(unichar); 00269 Protos = NormProtos->Protos[unichar_id]; 00270 for (i = 0; i < NumProtos; i++) 00271 Protos = 00272 push_last (Protos, ReadPrototype (File, NormProtos->NumParams)); 00273 NormProtos->Protos[unichar_id] = Protos; 00274 } else { 00275 cprintf("Error: unichar %s in normproto file is not in unichar set.\n", 00276 unichar); 00277 for (i = 0; i < NumProtos; i++) 00278 FreePrototype(ReadPrototype (File, NormProtos->NumParams)); 00279 } 00280 SkipNewline(File); 00281 } 00282 return (NormProtos); 00283 } /* ReadNormProtos */ 00284 } // namespace tesseract