tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/classify/normmatch.cpp
Go to the documentation of this file.
00001 /******************************************************************************
00002  **     Filename:    normmatch.cpp
00003  **     Purpose:     Simple matcher based on character normalization features.
00004  **     Author:      Dan Johnson
00005  **     History:     Wed Dec 19 16:18:06 1990, DSJ, Created.
00006  **
00007  **     (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00021 #include "normmatch.h"
00022 
00023 #include <stdio.h>
00024 #include <math.h>
00025 
00026 #include "classify.h"
00027 #include "clusttool.h"
00028 #include "const.h"
00029 #include "efio.h"
00030 #include "emalloc.h"
00031 #include "globals.h"
00032 #include "helpers.h"
00033 #include "normfeat.h"
00034 #include "scanutils.h"
00035 #include "unicharset.h"
00036 #include "params.h"
00037 
00038 struct NORM_PROTOS
00039 {
00040   int NumParams;
00041   PARAM_DESC *ParamDesc;
00042   LIST* Protos;
00043   int NumProtos;
00044 };
00045 
00049 double NormEvidenceOf(register double NormAdj);
00050 
00051 void PrintNormMatch(FILE *File,
00052                     int NumParams,
00053                     PROTOTYPE *Proto,
00054                     FEATURE Feature);
00055 
00056 NORM_PROTOS *ReadNormProtos(FILE *File);
00057 
00062 /* control knobs for the normalization adjustment: NormEvidenceOf() divides by the midpoint and raises to the curl power */
00063 double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
00064 double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
00065 // Scale applied to the width (CharNormRy) distance term so that it counts less than the height and vertical position terms.
00066 const double kWidthErrorWeighting = 0.125;
00067 
00071 /*---------------------------------------------------------------------------*/
00072 namespace tesseract {
00073 FLOAT32 Classify::ComputeNormMatch(CLASS_ID ClassId,
00074                                    const FEATURE_STRUCT& feature,
00075                                    BOOL8 DebugMatch) {
00076 /*
00077  **     Parameters:
00078  **             ClassId         id of class to match against
00079  **             Feature         character normalization feature
00080  **             DebugMatch      controls dump of debug info
00081  **     Globals:
00082  **             NormProtos      character normalization prototypes
00083  **     Operation: This routine compares Features against each character
00084  **             normalization proto for ClassId and returns the match
00085  **             rating of the best match.
00086  **     Return: Best match rating for Feature against protos of ClassId.
00087  **     Exceptions: none
00088  **     History: Wed Dec 19 16:56:12 1990, DSJ, Created.
00089  */
00090   LIST Protos;
00091   FLOAT32 BestMatch;
00092   FLOAT32 Match;
00093   FLOAT32 Delta;
00094   PROTOTYPE *Proto;
00095   int ProtoId;
00096 
00097   if (ClassId >= NormProtos->NumProtos) {
00098     ClassId = NO_CLASS;
00099   }
00100 
00101   /* handle requests for classification as noise */
00102   if (ClassId == NO_CLASS) {
00103     /* kludge - clean up constants and make into control knobs later */
00104     Match = (feature.Params[CharNormLength] *
00105       feature.Params[CharNormLength] * 500.0 +
00106       feature.Params[CharNormRx] *
00107       feature.Params[CharNormRx] * 8000.0 +
00108       feature.Params[CharNormRy] *
00109       feature.Params[CharNormRy] * 8000.0);
00110     return (1.0 - NormEvidenceOf (Match));
00111   }
00112 
00113   BestMatch = MAX_FLOAT32;
00114   Protos = NormProtos->Protos[ClassId];
00115 
00116   if (DebugMatch) {
00117     tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
00118   }
00119 
00120   ProtoId = 0;
00121   iterate(Protos) {
00122     Proto = (PROTOTYPE *) first_node (Protos);
00123     Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
00124     Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
00125     if (DebugMatch) {
00126       tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
00127               Proto->Mean[CharNormY], Delta,
00128               Proto->Weight.Elliptical[CharNormY], Match);
00129     }
00130     Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
00131     Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
00132     if (DebugMatch) {
00133       tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n",
00134               Proto->Mean[CharNormRx], Delta,
00135               Proto->Weight.Elliptical[CharNormRx], Match);
00136     }
00137     // Ry is width! See intfx.cpp.
00138     Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
00139     if (DebugMatch) {
00140       tprintf("Width: Proto=%g, Delta=%g, Var=%g\n",
00141               Proto->Mean[CharNormRy], Delta,
00142               Proto->Weight.Elliptical[CharNormRy]);
00143     }
00144     Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
00145     Delta *= kWidthErrorWeighting;
00146     Match += Delta;
00147     if (DebugMatch) {
00148       tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n",
00149               Match, Match / classify_norm_adj_midpoint,
00150               NormEvidenceOf(Match), 256 * (1 - NormEvidenceOf(Match)));
00151     }
00152 
00153     if (Match < BestMatch)
00154       BestMatch = Match;
00155 
00156     ProtoId++;
00157   }
00158   return 1.0 - NormEvidenceOf(BestMatch);
00159 }                                /* ComputeNormMatch */
00160 
00161 void Classify::FreeNormProtos() {
00162   if (NormProtos != NULL) {
00163     for (int i = 0; i < NormProtos->NumProtos; i++)
00164       FreeProtoList(&NormProtos->Protos[i]);
00165     Efree(NormProtos->Protos);
00166     Efree(NormProtos->ParamDesc);
00167     Efree(NormProtos);
00168     NormProtos = NULL;
00169   }
00170 }
00171 }  // namespace tesseract
00172 
00176 /**********************************************************************
00177  * NormEvidenceOf
00178  *
00179  * Return the new type of evidence number corresponding to this
00180  * normalization adjustment.  The equation that represents the transform is:
00181  *       1 / (1 + (NormAdj / midpoint) ^ curl)
00182  **********************************************************************/
00183 double NormEvidenceOf(register double NormAdj) {
00184   NormAdj /= classify_norm_adj_midpoint;
00185 
00186   if (classify_norm_adj_curl == 3)
00187     NormAdj = NormAdj * NormAdj * NormAdj;
00188   else if (classify_norm_adj_curl == 2)
00189     NormAdj = NormAdj * NormAdj;
00190   else
00191     NormAdj = pow (NormAdj, classify_norm_adj_curl);
00192   return (1.0 / (1.0 + NormAdj));
00193 }
00194 
00195 
00196 /*---------------------------------------------------------------------------*/
00197 void PrintNormMatch(FILE *File,
00198                     int NumParams,
00199                     PROTOTYPE *Proto,
00200                     FEATURE Feature) {
00201 /*
00202  **     Parameters:
00203  **             File            open text file to dump match debug info to
00204  **             NumParams       # of parameters in proto and feature
00205  **             Proto[]         array of prototype parameters
00206  **             Feature[]       array of feature parameters
00207  **     Globals: none
00208  **     Operation: This routine dumps out detailed normalization match info.
00209  **     Return: none
00210  **     Exceptions: none
00211  **     History: Wed Jan  2 09:49:35 1991, DSJ, Created.
00212  */
00213   int i;
00214   FLOAT32 ParamMatch;
00215   FLOAT32 TotalMatch;
00216 
00217   for (i = 0, TotalMatch = 0.0; i < NumParams; i++) {
00218     ParamMatch = (Feature->Params[i] - Mean(Proto, i)) /
00219       StandardDeviation(Proto, i);
00220 
00221     fprintf (File, " %6.1f", ParamMatch);
00222 
00223     if (i == CharNormY || i == CharNormRx)
00224       TotalMatch += ParamMatch * ParamMatch;
00225   }
00226   fprintf (File, " --> %6.1f (%4.2f)\n",
00227     TotalMatch, NormEvidenceOf (TotalMatch));
00228 
00229 }                                /* PrintNormMatch */
00230 
00231 
00232 /*---------------------------------------------------------------------------*/
00233 namespace tesseract {
00234 NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) {
00235 /*
00236  **     Parameters:
00237  **             File    open text file to read normalization protos from
00238  **     Globals: none
00239  **     Operation: This routine allocates a new data structure to hold
00240  **             a set of character normalization protos.  It then fills in
00241  **             the data structure by reading from the specified File.
00242  **     Return: Character normalization protos.
00243  **     Exceptions: none
00244  **     History: Wed Dec 19 16:38:49 1990, DSJ, Created.
00245  */
00246   NORM_PROTOS *NormProtos;
00247   int i;
00248   char unichar[2 * UNICHAR_LEN + 1];
00249   UNICHAR_ID unichar_id;
00250   LIST Protos;
00251   int NumProtos;
00252 
00253   /* allocate and initialization data structure */
00254   NormProtos = (NORM_PROTOS *) Emalloc (sizeof (NORM_PROTOS));
00255   NormProtos->NumProtos = unicharset.size();
00256   NormProtos->Protos = (LIST *) Emalloc (NormProtos->NumProtos * sizeof(LIST));
00257   for (i = 0; i < NormProtos->NumProtos; i++)
00258     NormProtos->Protos[i] = NIL_LIST;
00259 
00260   /* read file header and save in data structure */
00261   NormProtos->NumParams = ReadSampleSize (File);
00262   NormProtos->ParamDesc = ReadParamDesc (File, NormProtos->NumParams);
00263 
00264   /* read protos for each class into a separate list */
00265   while ((end_offset < 0 || ftell(File) < end_offset) &&
00266          fscanf(File, "%s %d", unichar, &NumProtos) == 2) {
00267     if (unicharset.contains_unichar(unichar)) {
00268       unichar_id = unicharset.unichar_to_id(unichar);
00269       Protos = NormProtos->Protos[unichar_id];
00270       for (i = 0; i < NumProtos; i++)
00271         Protos =
00272             push_last (Protos, ReadPrototype (File, NormProtos->NumParams));
00273       NormProtos->Protos[unichar_id] = Protos;
00274     } else {
00275       cprintf("Error: unichar %s in normproto file is not in unichar set.\n",
00276               unichar);
00277       for (i = 0; i < NumProtos; i++)
00278         FreePrototype(ReadPrototype (File, NormProtos->NumParams));
00279     }
00280     SkipNewline(File);
00281   }
00282   return (NormProtos);
00283 }                                /* ReadNormProtos */
00284 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines