tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/classify/blobclass.cpp
Go to the documentation of this file.
00001 /******************************************************************************
00002  **      Filename:       blobclass.c
00003  **      Purpose:        High level blob classification and training routines.
00004  **      Author:         Dan Johnson
00005  **      History:        7/21/89, DSJ, Created.
00006  **
00007  **      (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00018 
00022 #include "blobclass.h"
00023 #include "extract.h"
00024 #include "efio.h"
00025 #include "featdefs.h"
00026 #include "callcpp.h"
00027 
00028 #include <math.h>
00029 #include <stdio.h>
00030 #include <signal.h>
00031 
00032 #define MAXFILENAME             80  // not referenced in the visible part of this file
00033 #define MAXMATCHES              10  // not referenced in the visible part of this file
00034 
00035 static const char kUnknownFontName[] = "UnknownFont";  // sentinel: LearnBlob treats this value as "no font name configured"
00036 
00037 STRING_VAR(classify_font_name, kUnknownFontName,
00038            "Default font name to be used in training");  // defaults to the sentinel above; LearnBlob falls back to the filename when it is unchanged
00039 
00043 /* name of current image file being processed */
00044 extern char imagefile[];  // declared here only; not read or written in the visible code
00045 
00050 /*---------------------------------------------------------------------------*/
00051 // As all TBLOBs, Blob is in baseline normalized coords.
00052 // See SetupBLCNDenorms in intfx.cpp for other args.
00053 void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
00054                TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
00055                const INT_FX_RESULT_STRUCT& fx_info, const char* BlobText) {
00056 /*
00057  **      Parameters:
00058  **              Blob            blob whose micro-features are to be learned
00059  **              Row             row of text that blob came from
00060  **              BlobText        text that corresponds to blob
00061  **              TextLength      number of characters in blob
00062  **      Globals:
00063  **              imagefile       base filename of the page being learned
00064  **              classify_font_name
00065  **                              name of font currently being trained on
00066  **      Operation:
00067  **              Extract micro-features from the specified blob and append
00068  **              them to the appropriate file.
00069  **      Return: none
00070  **      Exceptions: none
00071  **      History: 7/28/89, DSJ, Created.
00072  */
00073 #define TRAIN_SUFFIX    ".tr"
00074   static FILE *FeatureFile = NULL;
00075   STRING Filename(filename);
00076 
00077   // If no fontname was set, try to extract it from the filename
00078   STRING CurrFontName = classify_font_name;
00079   if (CurrFontName == kUnknownFontName) {
00080     // filename is expected to be of the form [lang].[fontname].exp[num]
00081     // The [lang], [fontname] and [num] fields should not have '.' characters.
00082     const char *basename = strrchr(filename.string(), '/');
00083     const char *firstdot = strchr(basename ? basename : filename.string(), '.');
00084     const char *lastdot  = strrchr(filename.string(), '.');
00085     if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) {
00086       ++firstdot;
00087       CurrFontName = firstdot;
00088       CurrFontName[lastdot - firstdot] = '\0';
00089     }
00090   }
00091 
00092   // if a feature file is not yet open, open it
00093   // the name of the file is the name of the image plus TRAIN_SUFFIX
00094   if (FeatureFile == NULL) {
00095     Filename += TRAIN_SUFFIX;
00096     FeatureFile = Efopen(Filename.string(), "wb");
00097     cprintf("TRAINING ... Font name = %s\n", CurrFontName.string());
00098   }
00099 
00100   LearnBlob(FeatureDefs, FeatureFile, Blob, bl_denorm, cn_denorm, fx_info,
00101             BlobText, CurrFontName.string());
00102 }                                // LearnBlob
00103 
00104 void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile,
00105                TBLOB* Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
00106                const INT_FX_RESULT_STRUCT& fx_info,
00107                const char* BlobText, const char* FontName) {
00108   CHAR_DESC CharDesc;
00109 
00110   ASSERT_HOST(FeatureFile != NULL);
00111 
00112   CharDesc = ExtractBlobFeatures(FeatureDefs, bl_denorm, cn_denorm, fx_info,
00113                                  Blob);
00114   if (CharDesc == NULL) {
00115     cprintf("LearnBLob: CharDesc was NULL. Aborting.\n");
00116     return;
00117   }
00118 
00119   if (ValidCharDescription(FeatureDefs, CharDesc)) {
00120     // label the features with a class name and font name
00121     fprintf(FeatureFile, "\n%s %s\n", FontName, BlobText);
00122 
00123     // write micro-features to file and clean up
00124     WriteCharDescription(FeatureDefs, FeatureFile, CharDesc);
00125   } else {
00126     tprintf("Blob learned was invalid!\n");
00127   }
00128   FreeCharDescription(CharDesc);
00129 
00130 }                                // LearnBlob
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines