tesseract
3.03
|
00001 /****************************************************************************** 00002 ** Filename: blobclass.c 00003 ** Purpose: High level blob classification and training routines. 00004 ** Author: Dan Johnson 00005 ** History: 7/21/89, DSJ, Created. 00006 ** 00007 ** (c) Copyright Hewlett-Packard Company, 1988. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 ******************************************************************************/ 00018 00022 #include "blobclass.h" 00023 #include "extract.h" 00024 #include "efio.h" 00025 #include "featdefs.h" 00026 #include "callcpp.h" 00027 00028 #include <math.h> 00029 #include <stdio.h> 00030 #include <signal.h> 00031 00032 #define MAXFILENAME 80 00033 #define MAXMATCHES 10 00034 00035 static const char kUnknownFontName[] = "UnknownFont"; 00036 00037 STRING_VAR(classify_font_name, kUnknownFontName, 00038 "Default font name to be used in training"); 00039 00043 /* name of current image file being processed */ 00044 extern char imagefile[]; 00045 00050 /*---------------------------------------------------------------------------*/ 00051 // As all TBLOBs, Blob is in baseline normalized coords. 00052 // See SetupBLCNDenorms in intfx.cpp for other args. 00053 void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename, 00054 TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm, 00055 const INT_FX_RESULT_STRUCT& fx_info, const char* BlobText) { 00056 /* 00057 ** Parameters: 00058 ** Blob blob whose micro-features are to be learned 00059 ** Row row of text that blob came from 00060 ** BlobText text that corresponds to blob 00061 ** TextLength number of characters in blob 00062 ** Globals: 00063 ** imagefile base filename of the page being learned 00064 ** classify_font_name 00065 ** name of font currently being trained on 00066 ** Operation: 00067 ** Extract micro-features from the specified blob and append 00068 ** them to the appropriate file. 00069 ** Return: none 00070 ** Exceptions: none 00071 ** History: 7/28/89, DSJ, Created. 00072 */ 00073 #define TRAIN_SUFFIX ".tr" 00074 static FILE *FeatureFile = NULL; 00075 STRING Filename(filename); 00076 00077 // If no fontname was set, try to extract it from the filename 00078 STRING CurrFontName = classify_font_name; 00079 if (CurrFontName == kUnknownFontName) { 00080 // filename is expected to be of the form [lang].[fontname].exp[num] 00081 // The [lang], [fontname] and [num] fields should not have '.' characters. 00082 const char *basename = strrchr(filename.string(), '/'); 00083 const char *firstdot = strchr(basename ? basename : filename.string(), '.'); 00084 const char *lastdot = strrchr(filename.string(), '.'); 00085 if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) { 00086 ++firstdot; 00087 CurrFontName = firstdot; 00088 CurrFontName[lastdot - firstdot] = '\0'; 00089 } 00090 } 00091 00092 // if a feature file is not yet open, open it 00093 // the name of the file is the name of the image plus TRAIN_SUFFIX 00094 if (FeatureFile == NULL) { 00095 Filename += TRAIN_SUFFIX; 00096 FeatureFile = Efopen(Filename.string(), "wb"); 00097 cprintf("TRAINING ... Font name = %s\n", CurrFontName.string()); 00098 } 00099 00100 LearnBlob(FeatureDefs, FeatureFile, Blob, bl_denorm, cn_denorm, fx_info, 00101 BlobText, CurrFontName.string()); 00102 } // LearnBlob 00103 00104 void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile, 00105 TBLOB* Blob, const DENORM& bl_denorm, const DENORM& cn_denorm, 00106 const INT_FX_RESULT_STRUCT& fx_info, 00107 const char* BlobText, const char* FontName) { 00108 CHAR_DESC CharDesc; 00109 00110 ASSERT_HOST(FeatureFile != NULL); 00111 00112 CharDesc = ExtractBlobFeatures(FeatureDefs, bl_denorm, cn_denorm, fx_info, 00113 Blob); 00114 if (CharDesc == NULL) { 00115 cprintf("LearnBLob: CharDesc was NULL. Aborting.\n"); 00116 return; 00117 } 00118 00119 if (ValidCharDescription(FeatureDefs, CharDesc)) { 00120 // label the features with a class name and font name 00121 fprintf(FeatureFile, "\n%s %s\n", FontName, BlobText); 00122 00123 // write micro-features to file and clean up 00124 WriteCharDescription(FeatureDefs, FeatureFile, CharDesc); 00125 } else { 00126 tprintf("Blob learned was invalid!\n"); 00127 } 00128 FreeCharDescription(CharDesc); 00129 00130 } // LearnBlob