tesseract 3.03
training/commontraining.cpp
00001 // Copyright 2008 Google Inc. All Rights Reserved.
00002 // Author: scharron@google.com (Samuel Charron)
00003 //
00004 // Licensed under the Apache License, Version 2.0 (the "License");
00005 // you may not use this file except in compliance with the License.
00006 // You may obtain a copy of the License at
00007 // http://www.apache.org/licenses/LICENSE-2.0
00008 // Unless required by applicable law or agreed to in writing, software
00009 // distributed under the License is distributed on an "AS IS" BASIS,
00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00011 // See the License for the specific language governing permissions and
00012 // limitations under the License.
00013 
00014 #include "commontraining.h"
00015 
00016 #include "allheaders.h"
00017 #include "ccutil.h"
00018 #include "classify.h"
00019 #include "cluster.h"
00020 #include "clusttool.h"
00021 #include "efio.h"
00022 #include "emalloc.h"
00023 #include "featdefs.h"
00024 #include "fontinfo.h"
00025 #include "freelist.h"
00026 #include "globals.h"
00027 #include "intfeaturespace.h"
00028 #include "mastertrainer.h"
00029 #include "mf.h"
00030 #include "ndminx.h"
00031 #include "oldlist.h"
00032 #include "params.h"
00033 #include "shapetable.h"
00034 #include "tessdatamanager.h"
00035 #include "tessopt.h"
00036 #include "tprintf.h"
00037 #include "unicity_table.h"
00038 
00039 #include <math.h>
00040 
00041 using tesseract::CCUtil;
00042 using tesseract::FontInfo;
00043 using tesseract::IntFeatureSpace;
00044 using tesseract::ParamUtils;
00045 using tesseract::ShapeTable;
00046 
00047 // Global Variables.
00048 
00049 // global variable to hold configuration parameters to control clustering
00050 // -M 0.625   -B 0.05   -I 1.0   -C 1e-6.
00051 CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 };
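// Field order follows CLUSTERCONFIG in cluster.h (assumed):
// { ProtoStyle, MinSamples, MaxIllegal, Independence, Confidence, MagicSamples }.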
00052 FEATURE_DEFS_STRUCT feature_defs;
00053 CCUtil ccutil;
00054 
00055 INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
00056 INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
00057 STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
00058 STRING_PARAM_FLAG(D, "", "Directory to write output files to");
00059 STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
00060 STRING_PARAM_FLAG(X, "", "File listing font xheights");
00061 STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
00062 STRING_PARAM_FLAG(O, "", "File to write unicharset to");
00063 STRING_PARAM_FLAG(T, "", "File to load trainer from");
00064 STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
00065 STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
00066 DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,
00067                   "Min number of samples per proto as % of total");
00068 DOUBLE_PARAM_FLAG(clusterconfig_max_illegal, Config.MaxIllegal,
00069                   "Max percentage of samples in a cluster which have more"
00070                   " than 1 feature in that cluster");
00071 DOUBLE_PARAM_FLAG(clusterconfig_independence, Config.Independence,
00072                   "Desired independence between dimensions");
00073 DOUBLE_PARAM_FLAG(clusterconfig_confidence, Config.Confidence,
00074                   "Desired confidence in prototypes created");
00075 
00076 /*
00077  **     Parameters:
00078  **             argc    number of command line arguments to parse
00079  **             argv    command line arguments
00080  **     Globals:
00081  **             Config  current clustering parameters
00082  **     Operation:
00083  **             This routine parses the command line arguments that were
00084  **             passed to the program and uses them to set relevant
00085  **             training-related global parameters
00086  **     Return: none
00087  **     Exceptions: Illegal options terminate the program.
00088  */
00089 void ParseArguments(int* argc, char ***argv) {
00090   STRING usage;
00091   if (*argc) {
00092     usage += (*argv)[0];
00093   }
00094   usage += " [.tr files ...]";
00095   tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
00096   // Set the index of the first non-flag argument to 1, since we set
00097   // remove_flags to true when parsing the flags.
00098   tessoptind = 1;
00099   // Set some global values based on the flags.
00100   Config.MinSamples =
00101       MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
00102   Config.MaxIllegal =
00103       MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_max_illegal)));
00104   Config.Independence =
00105       MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_independence)));
00106   Config.Confidence =
00107       MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_confidence)));
00108   // Set additional parameters from config file if specified.
00109   if (!FLAGS_configfile.empty()) {
00110     tesseract::ParamUtils::ReadParamsFile(
00111         FLAGS_configfile.c_str(),
00112         tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
00113         ccutil.params());
00114   }
00115 }
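// Example (illustrative sketch only): a training tool built on these helpers
// typically parses its flags first and then walks the remaining .tr arguments:
//
//   int main(int argc, char **argv) {
//     ParseArguments(&argc, &argv);
//     const char* page_name;
//     while ((page_name = GetNextFilename(argc, argv)) != NULL) {
//       // process one .tr file
//     }
//     return 0;
//   }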
00116 
00117 namespace tesseract {
00118 // Helper loads shape table from the given file.
00119 ShapeTable* LoadShapeTable(const STRING& file_prefix) {
00120   ShapeTable* shape_table = NULL;
00121   STRING shape_table_file = file_prefix;
00122   shape_table_file += kShapeTableFileSuffix;
00123   FILE* shape_fp = fopen(shape_table_file.string(), "rb");
00124   if (shape_fp != NULL) {
00125     shape_table = new ShapeTable;
00126     if (!shape_table->DeSerialize(false, shape_fp)) {
00127       delete shape_table;
00128       shape_table = NULL;
00129       tprintf("Error: Failed to read shape table %s\n",
00130               shape_table_file.string());
00131     } else {
00132       int num_shapes = shape_table->NumShapes();
00133       tprintf("Read shape table %s of %d shapes\n",
00134               shape_table_file.string(), num_shapes);
00135     }
00136     fclose(shape_fp);
00137   } else {
00138     tprintf("Warning: No shape table file present: %s\n",
00139             shape_table_file.string());
00140   }
00141   return shape_table;
00142 }
00143 
00144 // Helper to write the shape_table.
00145 void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) {
00146   STRING shape_table_file = file_prefix;
00147   shape_table_file += kShapeTableFileSuffix;
00148   FILE* fp = fopen(shape_table_file.string(), "wb");
00149   if (fp != NULL) {
00150     if (!shape_table.Serialize(fp)) {
00151       fprintf(stderr, "Error writing shape table: %s\n",
00152               shape_table_file.string());
00153     }
00154     fclose(fp);
00155   } else {
00156     fprintf(stderr, "Error creating shape table: %s\n",
00157             shape_table_file.string());
00158   }
00159 }
00160 
00161 // Creates a MasterTrainer and loads the training data into it:
00162 // Initializes feature_defs and IntegerFX.
00163 // Loads the shape_table if shape_table != NULL.
00164 // Loads initial unicharset from -U command-line option.
00165 // If FLAGS_T is set, loads the majority of data from there, else:
00166 //   Loads font info from -F option.
00167 //   Loads xheights from -X option.
00168 //   Loads samples from .tr files in remaining command-line args.
00169 //   Deletes outliers and computes canonical samples.
00170 //   If FLAGS_output_trainer is set, saves the trainer for future use.
00171 // Computes canonical and cloud features.
00172 // If shape_table is not NULL, but failed to load, make a fake flat one,
00173 // as shape clustering was not run.
00174 MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
00175                                 bool replication,
00176                                 ShapeTable** shape_table,
00177                                 STRING* file_prefix) {
00178   InitFeatureDefs(&feature_defs);
00179   InitIntegerFX();
00180   *file_prefix = "";
00181   if (!FLAGS_D.empty()) {
00182     *file_prefix += FLAGS_D.c_str();
00183     *file_prefix += "/";
00184   }
00185   // If we are shape clustering (NULL shape_table) or we successfully load
00186   // a shape_table written by a previous shape clustering, then
00187   // shape_analysis will be true, meaning that the MasterTrainer will replace
00188   // some members of the unicharset with their fragments.
00189   bool shape_analysis = false;
00190   if (shape_table != NULL) {
00191     *shape_table = LoadShapeTable(*file_prefix);
00192     if (*shape_table != NULL)
00193       shape_analysis = true;
00194   } else {
00195     shape_analysis = true;
00196   }
00197   MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC,
00198                                              shape_analysis,
00199                                              replication,
00200                                              FLAGS_debug_level);
00201   IntFeatureSpace fs;
00202   fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
00203   if (FLAGS_T.empty()) {
00204     trainer->LoadUnicharset(FLAGS_U.c_str());
00205     // Get basic font information from font_properties.
00206     if (!FLAGS_F.empty()) {
00207       if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
00208         delete trainer;
00209         return NULL;
00210       }
00211     }
00212     if (!FLAGS_X.empty()) {
00213       if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
00214         delete trainer;
00215         return NULL;
00216       }
00217     }
00218     trainer->SetFeatureSpace(fs);
00219     const char* page_name;
00220     // Load training data from .tr files on the command line.
00221     while ((page_name = GetNextFilename(argc, argv)) != NULL) {
00222       tprintf("Reading %s ...\n", page_name);
00223       trainer->ReadTrainingSamples(page_name, feature_defs, false);
00224 
00225       // If there is a file with [lang].[fontname].exp[num].fontinfo present,
00226       // read font spacing information into fontinfo_table.
00227       int pagename_len = strlen(page_name);
00228       char *fontinfo_file_name = new char[pagename_len + 7];
00229       strncpy(fontinfo_file_name, page_name, pagename_len - 2);  // remove "tr"
00230       strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo");  // +"fontinfo"
00231       trainer->AddSpacingInfo(fontinfo_file_name);
00232       delete[] fontinfo_file_name;
00233 
00234       // Load the images into memory if required by the classifier.
00235       if (FLAGS_load_images) {
00236         STRING image_name = page_name;
00237         // Chop off the tr and replace with tif. Extension must be tif!
00238         image_name.truncate_at(image_name.length() - 2);
00239         image_name += "tif";
00240         trainer->LoadPageImages(image_name.string());
00241       }
00242     }
00243     trainer->PostLoadCleanup();
00244     // Write the master trainer if required.
00245     if (!FLAGS_output_trainer.empty()) {
00246       FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
00247       if (fp == NULL) {
00248         tprintf("Can't create saved trainer data!\n");
00249       } else {
00250         trainer->Serialize(fp);
00251         fclose(fp);
00252       }
00253     }
00254   } else {
00255     bool success = false;
00256     tprintf("Loading master trainer from file:%s\n",
00257             FLAGS_T.c_str());
00258     FILE* fp = fopen(FLAGS_T.c_str(), "rb");
00259     if (fp == NULL) {
00260       tprintf("Can't read file %s to initialize master trainer\n",
00261               FLAGS_T.c_str());
00262     } else {
00263       success = trainer->DeSerialize(false, fp);
00264       fclose(fp);
00265     }
00266     if (!success) {
00267       tprintf("Deserialize of master trainer failed!\n");
00268       delete trainer;
00269       return NULL;
00270     }
00271     trainer->SetFeatureSpace(fs);
00272   }
00273   trainer->PreTrainingSetup();
00274   if (!FLAGS_O.empty() &&
00275       !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
00276     fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
00277     delete trainer;
00278     return NULL;
00279   }
00280   if (shape_table != NULL) {
00281     // If we previously failed to load a shapetable, then shape clustering
00282     // wasn't run so make a flat one now.
00283     if (*shape_table == NULL) {
00284       *shape_table = new ShapeTable;
00285       trainer->SetupFlatShapeTable(*shape_table);
00286       tprintf("Flat shape table summary: %s\n",
00287               (*shape_table)->SummaryStr().string());
00288     }
00289     (*shape_table)->set_unicharset(trainer->unicharset());
00290   }
00291   return trainer;
00292 }
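// Example (illustrative sketch): classifier-training tools such as mftraining
// drive this helper roughly as follows, after ParseArguments() has run:
//
//   ShapeTable* shape_table = NULL;
//   STRING file_prefix;
//   MasterTrainer* trainer =
//       LoadTrainingData(argc, argv, false, &shape_table, &file_prefix);
//   if (trainer == NULL) return 1;  // errors were already reported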
00293 
00294 }  // namespace tesseract.
00295 
00296 /*---------------------------------------------------------------------------*/
00297 const char *GetNextFilename(int argc, const char* const * argv) {
00298   /*
00299    **   Parameters: argc, argv  command line arguments
00300    **   Globals:
00301    **           tessoptind                      index maintained by the tessopt module
00302    **   Operation:
00303    **           This routine returns the next command line argument.  If
00304    **           there are no remaining command line arguments, it returns
00305    **           NULL.  This routine should only be called after all option
00306    **           arguments have been parsed and removed with ParseArguments.
00307    **   Return: Next command line argument or NULL.
00308    **   Exceptions: none
00309    **   History: Fri Aug 18 09:34:12 1989, DSJ, Created.
00310    */
00311   if (tessoptind < argc)
00312     return argv[tessoptind++];
00313   else
00314     return NULL;
00315 }       /* GetNextFilename */
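// Typical call pattern (see LoadTrainingData above):
//   while ((page_name = GetNextFilename(argc, argv)) != NULL) { ... }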
00316 
00317 
00318 
00319 /*---------------------------------------------------------------------------*/
00320 LABELEDLIST FindList (
00321     LIST        List,
00322     char        *Label)
00323 
00324 /*
00325  **     Parameters:
00326  **             List            list to search
00327  **             Label           label to search for
00328  **     Globals: none
00329  **     Operation:
00330  **             This routine searches thru a list of labeled lists to find
00331  **             a list with the specified label.  If a matching labeled list
00332  **             cannot be found, NULL is returned.
00333  **     Return: Labeled list with the specified Label or NULL.
00334  **     Exceptions: none
00335  **     History: Fri Aug 18 15:57:41 1989, DSJ, Created.
00336  */
00337 
00338 {
00339   LABELEDLIST   LabeledList;
00340 
00341   iterate (List)
00342   {
00343     LabeledList = (LABELEDLIST) first_node (List);
00344     if (strcmp (LabeledList->Label, Label) == 0)
00345       return (LabeledList);
00346   }
00347   return (NULL);
00348 
00349 }       /* FindList */
00350 
00351 /*---------------------------------------------------------------------------*/
00352 LABELEDLIST NewLabeledList (
00353     const char  *Label)
00354 
00355 /*
00356  **     Parameters:
00357  **             Label   label for new list
00358  **     Globals: none
00359  **     Operation:
00360  **             This routine allocates a new, empty labeled list and gives
00361  **             it the specified label.
00362  **     Return: New, empty labeled list.
00363  **     Exceptions: none
00364  **     History: Fri Aug 18 16:08:46 1989, DSJ, Created.
00365  */
00366 
00367 {
00368   LABELEDLIST   LabeledList;
00369 
00370   LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE));
00371   LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
00372   strcpy (LabeledList->Label, Label);
00373   LabeledList->List = NIL_LIST;
00374   LabeledList->SampleCount = 0;
00375   LabeledList->font_sample_count = 0;
00376   return (LabeledList);
00377 
00378 }       /* NewLabeledList */
00379 
00380 /*---------------------------------------------------------------------------*/
00381 // TODO(rays) This is now used only by cntraining. Convert cntraining to use
00382 // the new method or get rid of it entirely.
00383 void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs,
00384                          const char *feature_name, int max_samples,
00385                          UNICHARSET* unicharset,
00386                          FILE* file, LIST* training_samples) {
00387 /*
00388 **  Parameters:
00389 **    file    open text file to read samples from
00390 **  Globals: none
00391 **  Operation:
00392 **    This routine reads training samples from a file and
00393 **    places them into a data structure which organizes the
00394 **    samples by FontName and CharName.  It then returns this
00395 **    data structure.
00396 **  Return: none
00397 **  Exceptions: none
00398 **  History: Fri Aug 18 13:11:39 1989, DSJ, Created.
00399 **       Tue May 17 1998 simplifications to structure, eliminated
00400 **        font and feature specification levels of structure.
00401 */
00402   char    buffer[2048];
00403   char    unichar[UNICHAR_LEN + 1];
00404   LABELEDLIST char_sample;
00405   FEATURE_SET feature_samples;
00406   CHAR_DESC char_desc;
00407   int   i;
00408   int feature_type = ShortNameToFeatureType(feature_defs, feature_name);
00409   // Zero out the font_sample_count for all the classes.
00410   LIST it = *training_samples;
00411   iterate(it) {
00412     char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
00413     char_sample->font_sample_count = 0;
00414   }
00415 
00416   while (fgets(buffer, 2048, file) != NULL) {
00417     if (buffer[0] == '\n')
00418       continue;
00419 
00420     sscanf(buffer, "%*s %s", unichar);
00421     if (unicharset != NULL && !unicharset->contains_unichar(unichar)) {
00422       unicharset->unichar_insert(unichar);
00423       if (unicharset->size() > MAX_NUM_CLASSES) {
00424         tprintf("Error: Size of unicharset in training is "
00425                 "greater than MAX_NUM_CLASSES\n");
00426         exit(1);
00427       }
00428     }
00429     char_sample = FindList(*training_samples, unichar);
00430     if (char_sample == NULL) {
00431       char_sample = NewLabeledList(unichar);
00432       *training_samples = push(*training_samples, char_sample);
00433     }
00434     char_desc = ReadCharDescription(feature_defs, file);
00435     feature_samples = char_desc->FeatureSets[feature_type];
00436     if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
00437       char_sample->List = push(char_sample->List, feature_samples);
00438       char_sample->SampleCount++;
00439       char_sample->font_sample_count++;
00440     } else {
00441       FreeFeatureSet(feature_samples);
00442     }
00443     for (i = 0; i < char_desc->NumFeatureSets; i++) {
00444       if (feature_type != i)
00445         FreeFeatureSet(char_desc->FeatureSets[i]);
00446     }
00447     free(char_desc);
00448   }
00449 }  // ReadTrainingSamples
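// Example (illustrative sketch, assuming the char-norm feature short name "cn"
// used by cntraining): read every sample in an open .tr file into a LIST of
// LABELEDLISTs keyed by unichar:
//
//   LIST training_samples = NIL_LIST;
//   FILE* fp = Efopen(page_name, "r");
//   ReadTrainingSamples(feature_defs, "cn", 0 /* no per-font cap */, NULL,
//                       fp, &training_samples);
//   fclose(fp);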
00450 
00451 
00452 /*---------------------------------------------------------------------------*/
00453 void FreeTrainingSamples(LIST CharList) {
00454 /*
00455  **     Parameters:
00456  **             CharList        list of labeled training samples to free
00457  **     Globals: none
00458  **     Operation:
00459  **             This routine deallocates all of the space allocated to
00460  **             the specified list of training samples.
00461  **     Return: none
00462  **     Exceptions: none
00463  **     History: Fri Aug 18 17:44:27 1989, DSJ, Created.
00464  */
00465   LABELEDLIST char_sample;
00466   FEATURE_SET FeatureSet;
00467   LIST FeatureList;
00468 
00469 
00470   iterate(CharList) {  /* iterate through all of the character samples */
00471     char_sample = (LABELEDLIST) first_node(CharList);
00472     FeatureList = char_sample->List;
00473     iterate(FeatureList) {  /* iterate through all of the feature sets */
00474       FeatureSet = (FEATURE_SET) first_node(FeatureList);
00475       FreeFeatureSet(FeatureSet);
00476     }
00477     FreeLabeledList(char_sample);
00478   }
00479   destroy(CharList);
00480 }  /* FreeTrainingSamples */
00481 
00482 /*---------------------------------------------------------------------------*/
00483 void FreeLabeledList(LABELEDLIST LabeledList) {
00484 /*
00485  **     Parameters:
00486  **             LabeledList     labeled list to be freed
00487  **     Globals: none
00488  **     Operation:
00489  **             This routine deallocates all of the memory consumed by
00490  **             a labeled list.  It does not free any memory which may be
00491  **             consumed by the items in the list.
00492  **     Return: none
00493  **     Exceptions: none
00494  **     History: Fri Aug 18 17:52:45 1989, DSJ, Created.
00495  */
00496   destroy(LabeledList->List);
00497   free(LabeledList->Label);
00498   free(LabeledList);
00499 }  /* FreeLabeledList */
00500 
00501 /*---------------------------------------------------------------------------*/
00502 CLUSTERER *SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs,
00503                               LABELEDLIST char_sample,
00504                               const char* program_feature_type) {
00505 /*
00506  **     Parameters:
00507  **             char_sample: LABELEDLIST that holds all the feature information for a
00508  **             given character.
00509  **     Globals:
00510  **             None
00511  **     Operation:
00512  **             This routine reads samples from a LABELEDLIST and enters
00513  **             those samples into a clusterer data structure.  This
00514  **             data structure is then returned to the caller.
00515  **     Return:
00516  **             Pointer to new clusterer data structure.
00517  **     Exceptions:
00518  **             None
00519  **     History:
00520  **             8/16/89, DSJ, Created.
00521  */
00522   uinT16 N;
00523   int i, j;
00524   FLOAT32 *Sample = NULL;
00525   CLUSTERER *Clusterer;
00526   inT32 CharID;
00527   LIST FeatureList = NULL;
00528   FEATURE_SET FeatureSet = NULL;
00529 
00530   int desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
00531   N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
00532   Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
00533 
00534   FeatureList = char_sample->List;
00535   CharID = 0;
00536   iterate(FeatureList) {
00537     FeatureSet = (FEATURE_SET) first_node(FeatureList);
00538     for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
00539       if (Sample == NULL)
00540         Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
00541       for (j = 0; j < N; j++)
00542         Sample[j] = FeatureSet->Features[i]->Params[j];
00543       MakeSample (Clusterer, Sample, CharID);
00544     }
00545     CharID++;
00546   }
00547   if ( Sample != NULL ) free( Sample );
00548   return( Clusterer );
00549 
00550 }       /* SetUpForClustering */
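// Example (illustrative sketch): the returned CLUSTERER normally feeds straight
// into the clustering API declared in cluster.h:
//
//   CLUSTERER* clusterer =
//       SetUpForClustering(feature_defs, char_sample, "cn");  // "cn" assumed
//   LIST proto_list = ClusterSamples(clusterer, &Config);
//   FreeClusterer(clusterer);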
00551 
00552 /*------------------------------------------------------------------------*/
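// Merges each insignificant ("red") prototype into the nearest unmerged
// prototype whose mean lies within 0.125: its samples are folded into a nearby
// red prototype, or dropped when the nearest match is already significant
// ("green"). Unmerged red prototypes that still hold at least
// Config->MinSamples * NumChar samples are then promoted to significant.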
00553 void MergeInsignificantProtos(LIST ProtoList, const char* label,
00554                               CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
00555   PROTOTYPE     *Prototype;
00556   bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
00557 
00558   LIST pProtoList = ProtoList;
00559   iterate(pProtoList) {
00560     Prototype = (PROTOTYPE *) first_node (pProtoList);
00561     if (Prototype->Significant || Prototype->Merged)
00562       continue;
00563     FLOAT32 best_dist = 0.125;
00564     PROTOTYPE* best_match = NULL;
00565     // Find the nearest alive prototype.
00566     LIST list_it = ProtoList;
00567     iterate(list_it) {
00568       PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
00569       if (test_p != Prototype && !test_p->Merged) {
00570         FLOAT32 dist = ComputeDistance(Clusterer->SampleSize,
00571                                        Clusterer->ParamDesc,
00572                                        Prototype->Mean, test_p->Mean);
00573         if (dist < best_dist) {
00574           best_match = test_p;
00575           best_dist = dist;
00576         }
00577       }
00578     }
00579     if (best_match != NULL && !best_match->Significant) {
00580       if (debug)
00581         tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
00582                 best_match->NumSamples, Prototype->NumSamples,
00583                 best_match->Mean[0], best_match->Mean[1],
00584                 Prototype->Mean[0], Prototype->Mean[1]);
00585       best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
00586                                              Clusterer->ParamDesc,
00587                                              best_match->NumSamples,
00588                                              Prototype->NumSamples,
00589                                              best_match->Mean,
00590                                              best_match->Mean, Prototype->Mean);
00591       Prototype->NumSamples = 0;
00592       Prototype->Merged = 1;
00593     } else if (best_match != NULL) {
00594       if (debug)
00595         tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
00596                 Prototype->Mean[0], Prototype->Mean[1],
00597                 best_match->Mean[0], best_match->Mean[1]);
00598       Prototype->Merged = 1;
00599     }
00600   }
00601   // Mark significant those that now have enough samples.
00602   int min_samples = (inT32) (Config->MinSamples * Clusterer->NumChar);
00603   pProtoList = ProtoList;
00604   iterate(pProtoList) {
00605     Prototype = (PROTOTYPE *) first_node (pProtoList);
00606     // Process insignificant protos that do not match a green one
00607     if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
00608         !Prototype->Merged) {
00609       if (debug)
00610         tprintf("Red proto at %g,%g becoming green\n",
00611                 Prototype->Mean[0], Prototype->Mean[1]);
00612       Prototype->Significant = true;
00613     }
00614   }
00615 }       /* MergeInsignificantProtos */
00616 
00617 /*-----------------------------------------------------------------------------*/
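// Frees the elliptical Variance, Magnitude and Weight arrays of every
// prototype in ProtoList and NULLs the corresponding pointers.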
00618 void CleanUpUnusedData(
00619     LIST ProtoList)
00620 {
00621   PROTOTYPE* Prototype;
00622 
00623   iterate(ProtoList)
00624   {
00625     Prototype = (PROTOTYPE *) first_node (ProtoList);
00626     if(Prototype->Variance.Elliptical != NULL)
00627     {
00628       memfree(Prototype->Variance.Elliptical);
00629       Prototype->Variance.Elliptical = NULL;
00630     }
00631     if(Prototype->Magnitude.Elliptical != NULL)
00632     {
00633       memfree(Prototype->Magnitude.Elliptical);
00634       Prototype->Magnitude.Elliptical = NULL;
00635     }
00636     if(Prototype->Weight.Elliptical != NULL)
00637     {
00638       memfree(Prototype->Weight.Elliptical);
00639       Prototype->Weight.Elliptical = NULL;
00640     }
00641   }
00642 }
00643 
00644 /*------------------------------------------------------------------------*/
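// Returns a new list containing deep copies (mean plus any elliptical
// variance, magnitude and weight vectors, all of dimension N) of the
// prototypes selected by KeepSigProtos / KeepInsigProtos, and frees the
// original ProtoList.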
00645 LIST RemoveInsignificantProtos(
00646     LIST ProtoList,
00647     BOOL8 KeepSigProtos,
00648     BOOL8 KeepInsigProtos,
00649     int N)
00650 
00651 {
00652   LIST NewProtoList = NIL_LIST;
00653   LIST pProtoList;
00654   PROTOTYPE* Proto;
00655   PROTOTYPE* NewProto;
00656   int i;
00657 
00658   pProtoList = ProtoList;
00659   iterate(pProtoList)
00660   {
00661     Proto = (PROTOTYPE *) first_node (pProtoList);
00662     if ((Proto->Significant && KeepSigProtos) ||
00663         (!Proto->Significant && KeepInsigProtos))
00664     {
00665       NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
00666 
00667       NewProto->Mean = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
00668       NewProto->Significant = Proto->Significant;
00669       NewProto->Style = Proto->Style;
00670       NewProto->NumSamples = Proto->NumSamples;
00671       NewProto->Cluster = NULL;
00672       NewProto->Distrib = NULL;
00673 
00674       for (i=0; i < N; i++)
00675         NewProto->Mean[i] = Proto->Mean[i];
00676       if (Proto->Variance.Elliptical != NULL)
00677       {
00678         NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
00679         for (i=0; i < N; i++)
00680           NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
00681       }
00682       else
00683         NewProto->Variance.Elliptical = NULL;
00684       //---------------------------------------------
00685       if (Proto->Magnitude.Elliptical != NULL)
00686       {
00687         NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
00688         for (i=0; i < N; i++)
00689           NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
00690       }
00691       else
00692         NewProto->Magnitude.Elliptical = NULL;
00693       //------------------------------------------------
00694       if (Proto->Weight.Elliptical != NULL)
00695       {
00696         NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
00697         for (i=0; i < N; i++)
00698           NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
00699       }
00700       else
00701         NewProto->Weight.Elliptical = NULL;
00702 
00703       NewProto->TotalMagnitude = Proto->TotalMagnitude;
00704       NewProto->LogMagnitude = Proto->LogMagnitude;
00705       NewProtoList = push_last(NewProtoList, NewProto);
00706     }
00707   }
00708   FreeProtoList(&ProtoList);
00709   return (NewProtoList);
00710 }       /* RemoveInsignificantProtos */
00711 
00712 /*----------------------------------------------------------------------------*/
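// Searches a list of MERGE_CLASS nodes for the one whose Label matches the
// given Label; returns NULL if no such class is found.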
00713 MERGE_CLASS FindClass (
00714     LIST        List,
00715     const char  *Label)
00716 {
00717   MERGE_CLASS   MergeClass;
00718 
00719   iterate (List)
00720   {
00721     MergeClass = (MERGE_CLASS) first_node (List);
00722     if (strcmp (MergeClass->Label, Label) == 0)
00723       return (MergeClass);
00724   }
00725   return (NULL);
00726 
00727 }       /* FindClass */
00728 
00729 /*---------------------------------------------------------------------------*/
00730 MERGE_CLASS NewLabeledClass (
00731     const char  *Label)
00732 {
00733   MERGE_CLASS   MergeClass;
00734 
00735   MergeClass = new MERGE_CLASS_NODE;
00736   MergeClass->Label = (char*)Emalloc (strlen (Label)+1);
00737   strcpy (MergeClass->Label, Label);
00738   MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
00739   return (MergeClass);
00740 
00741 }       /* NewLabeledClass */
00742 
00743 /*-----------------------------------------------------------------------------*/
00744 void FreeLabeledClassList (
00745     LIST        ClassList)
00746 
00747 /*
00748  **     Parameters:
00749  **             ClassList       list of labeled classes to free
00750  **     Globals: none
00751  **     Operation:
00752  **             This routine deallocates all of the space allocated to
00753  **             the specified list of labeled classes.
00754  **     Return: none
00755  **     Exceptions: none
00756  **     History: Fri Aug 18 17:44:27 1989, DSJ, Created.
00757  */
00758 
00759 {
00760   MERGE_CLASS   MergeClass;
00761 
00762   iterate (ClassList)           /* iterate through all of the classes */
00763   {
00764     MergeClass = (MERGE_CLASS) first_node (ClassList);
00765     free (MergeClass->Label);
00766     FreeClass(MergeClass->Class);
00767     delete MergeClass;
00768   }
00769   destroy (ClassList);
00770 
00771 }       /* FreeLabeledClassList */
00772 
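/*---------------------------------------------------------------------------*/
// Copies the merged floating-point prototypes and configurations in
// LabeledClassList into a newly allocated array of CLASS_STRUCTs indexed by
// unichar id, normalizing each proto's X/Y/Angle into A/B/C line coefficients
// along the way.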
00774 CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset,
00775                                 LIST LabeledClassList) {
00776   MERGE_CLASS   MergeClass;
00777   CLASS_TYPE            Class;
00778   int                           NumProtos;
00779   int                           NumConfigs;
00780   int                           NumWords;
00781   int                           i, j;
00782   float                 Values[3];
00783   PROTO                 NewProto;
00784   PROTO                 OldProto;
00785   BIT_VECTOR            NewConfig;
00786   BIT_VECTOR            OldConfig;
00787 
00788   //    printf("Float2Int ...\n");
00789 
00790   CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
00791   iterate(LabeledClassList)
00792   {
00793     UnicityTableEqEq<int>   font_set;
00794     MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
00795     Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
00796     NumProtos = MergeClass->Class->NumProtos;
00797     NumConfigs = MergeClass->Class->NumConfigs;
00798     font_set.move(&MergeClass->Class->font_set);
00799     Class->NumProtos = NumProtos;
00800     Class->MaxNumProtos = NumProtos;
00801     Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos);
00802     for(i=0; i < NumProtos; i++)
00803     {
00804       NewProto = ProtoIn(Class, i);
00805       OldProto = ProtoIn(MergeClass->Class, i);
00806       Values[0] = OldProto->X;
00807       Values[1] = OldProto->Y;
00808       Values[2] = OldProto->Angle;
00809       Normalize(Values);
00810       NewProto->X = OldProto->X;
00811       NewProto->Y = OldProto->Y;
00812       NewProto->Length = OldProto->Length;
00813       NewProto->Angle = OldProto->Angle;
00814       NewProto->A = Values[0];
00815       NewProto->B = Values[1];
00816       NewProto->C = Values[2];
00817     }
00818 
00819     Class->NumConfigs = NumConfigs;
00820     Class->MaxNumConfigs = NumConfigs;
00821     Class->font_set.move(&font_set);
00822     Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs);
00823     NumWords = WordsInVectorOfSize(NumProtos);
00824     for(i=0; i < NumConfigs; i++)
00825     {
00826       NewConfig = NewBitVector(NumProtos);
00827       OldConfig = MergeClass->Class->Configurations[i];
00828       for(j=0; j < NumWords; j++)
00829         NewConfig[j] = OldConfig[j];
00830       Class->Configurations[i] = NewConfig;
00831     }
00832   }
00833   return float_classes;
00834 } // SetUpForFloat2Int
00835 
00836 /*--------------------------------------------------------------------------*/
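// Converts Values = {X, Y, Angle} of a prototype into normalized line
// coefficients {A, B, C}: with slope m = tan(2*PI*Angle) and intercept
// b = Y - m*X, it stores A = m/sqrt(m^2+1), B = -1/sqrt(m^2+1) and
// C = b/sqrt(m^2+1), so A*x + B*y + C is the signed distance of (x, y)
// from the proto's line.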
00837 void Normalize (
00838     float  *Values)
00839 {
00840   register float Slope;
00841   register float Intercept;
00842   register float Normalizer;
00843 
00844   Slope      = tan (Values [2] * 2 * PI);
00845   Intercept  = Values [1] - Slope * Values [0];
00846   Normalizer = 1 / sqrt (Slope * Slope + 1.0);
00847 
00848   Values [0] = Slope * Normalizer;
00849   Values [1] = - Normalizer;
00850   Values [2] = Intercept * Normalizer;
00851 } // Normalize
00852 
00853 /*-------------------------------------------------------------------------*/
00854 void FreeNormProtoList (
00855     LIST        CharList)
00856 
00857 {
00858   LABELEDLIST   char_sample;
00859 
00860   iterate (CharList)            /* iterate through all of the character samples */
00861   {
00862     char_sample = (LABELEDLIST) first_node (CharList);
00863     FreeLabeledList (char_sample);
00864   }
00865   destroy (CharList);
00866 
00867 }       // FreeNormProtoList
00868 
00869 /*---------------------------------------------------------------------------*/
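// Creates a new labeled list tagged with CharName, pushes every prototype in
// ProtoList onto it, and pushes the result onto the front of *NormProtoList.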
00870 void AddToNormProtosList(
00871     LIST* NormProtoList,
00872     LIST ProtoList,
00873     char* CharName)
00874 {
00875   PROTOTYPE* Proto;
00876   LABELEDLIST LabeledProtoList;
00877 
00878   LabeledProtoList = NewLabeledList(CharName);
00879   iterate(ProtoList)
00880   {
00881     Proto = (PROTOTYPE *) first_node (ProtoList);
00882     LabeledProtoList->List = push(LabeledProtoList->List, Proto);
00883   }
00884   *NormProtoList = push(*NormProtoList, LabeledProtoList);
00885 }
00886 
00887 /*---------------------------------------------------------------------------*/
00888 int NumberOfProtos(
00889     LIST ProtoList,
00890     BOOL8       CountSigProtos,
00891     BOOL8       CountInsigProtos)
00892 {
00893   int N = 0;
00894   PROTOTYPE     *Proto;
00895 
00896   iterate(ProtoList)
00897   {
00898     Proto = (PROTOTYPE *) first_node ( ProtoList );
00899     if (( Proto->Significant && CountSigProtos )        ||
00900         ( ! Proto->Significant && CountInsigProtos ) )
00901       N++;
00902   }
00903   return(N);
00904 }