tesseract
3.03
|
00001 00002 // File: tessdatamanager.h 00003 // Description: Functions to handle loading/combining tesseract data files. 00004 // Author: Daria Antonova 00005 // Created: Wed Jun 03 11:26:43 PST 2009 00006 // 00007 // (C) Copyright 2009, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_ 00021 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_ 00022 00023 #include <stdio.h> 00024 00025 #include "host.h" 00026 #include "strngs.h" 00027 #include "tprintf.h" 00028 00029 static const char kTrainedDataSuffix[] = "traineddata"; 00030 00031 // When adding new tessdata types and file suffixes, please make sure to 00032 // update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText. 00033 static const char kLangConfigFileSuffix[] = "config"; 00034 static const char kUnicharsetFileSuffix[] = "unicharset"; 00035 static const char kAmbigsFileSuffix[] = "unicharambigs"; 00036 static const char kBuiltInTemplatesFileSuffix[] = "inttemp"; 00037 static const char kBuiltInCutoffsFileSuffix[] = "pffmtable"; 00038 static const char kNormProtoFileSuffix[] = "normproto"; 00039 static const char kPuncDawgFileSuffix[] = "punc-dawg"; 00040 static const char kSystemDawgFileSuffix[] = "word-dawg"; 00041 static const char kNumberDawgFileSuffix[] = "number-dawg"; 00042 static const char kFreqDawgFileSuffix[] = "freq-dawg"; 00043 static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs"; 00044 static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset"; 00045 static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg"; 00046 static const char kShapeTableFileSuffix[] = "shapetable"; 00047 static const char kBigramDawgFileSuffix[] = "bigram-dawg"; 00048 static const char kUnambigDawgFileSuffix[] = "unambig-dawg"; 00049 static const char kParamsModelFileSuffix[] = "params-model"; 00050 00051 namespace tesseract { 00052 00053 enum TessdataType { 00054 TESSDATA_LANG_CONFIG, // 0 00055 TESSDATA_UNICHARSET, // 1 00056 TESSDATA_AMBIGS, // 2 00057 TESSDATA_INTTEMP, // 3 00058 TESSDATA_PFFMTABLE, // 4 00059 TESSDATA_NORMPROTO, // 5 00060 TESSDATA_PUNC_DAWG, // 6 00061 TESSDATA_SYSTEM_DAWG, // 7 00062 TESSDATA_NUMBER_DAWG, // 8 00063 TESSDATA_FREQ_DAWG, // 9 00064 TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated 00065 TESSDATA_CUBE_UNICHARSET, // 11 00066 TESSDATA_CUBE_SYSTEM_DAWG, // 12 00067 TESSDATA_SHAPE_TABLE, // 13 00068 TESSDATA_BIGRAM_DAWG, // 14 00069 TESSDATA_UNAMBIG_DAWG, // 15 00070 TESSDATA_PARAMS_MODEL, // 16 00071 00072 TESSDATA_NUM_ENTRIES 00073 }; 00074 00079 static const char * const kTessdataFileSuffixes[] = { 00080 kLangConfigFileSuffix, // 0 00081 kUnicharsetFileSuffix, // 1 00082 kAmbigsFileSuffix, // 2 00083 kBuiltInTemplatesFileSuffix, // 3 00084 kBuiltInCutoffsFileSuffix, // 4 00085 kNormProtoFileSuffix, // 5 00086 kPuncDawgFileSuffix, // 6 00087 kSystemDawgFileSuffix, // 7 00088 kNumberDawgFileSuffix, // 8 00089 kFreqDawgFileSuffix, // 9 00090 kFixedLengthDawgsFileSuffix, // 10 // deprecated 00091 kCubeUnicharsetFileSuffix, // 11 00092 kCubeSystemDawgFileSuffix, // 12 00093 kShapeTableFileSuffix, // 13 00094 kBigramDawgFileSuffix, // 14 00095 kUnambigDawgFileSuffix, // 15 00096 kParamsModelFileSuffix, // 16 00097 }; 00098 00103 static const bool kTessdataFileIsText[] = { 00104 true, // 0 00105 true, // 1 00106 true, // 2 00107 false, // 3 00108 true, // 4 00109 true, // 5 00110 false, // 6 00111 false, // 7 00112 false, // 8 00113 false, // 9 00114 false, // 10 // deprecated 00115 true, // 11 00116 false, // 12 00117 false, // 13 00118 false, // 14 00119 false, // 15 00120 true, // 16 00121 }; 00122 00130 static const int kMaxNumTessdataEntries = 1000; 00131 00132 00133 class TessdataManager { 00134 public: 00135 TessdataManager() { 00136 data_file_ = NULL; 00137 actual_tessdata_num_entries_ = 0; 00138 for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00139 offset_table_[i] = -1; 00140 } 00141 } 00142 ~TessdataManager() {} 00143 int DebugLevel() { return debug_level_; } 00144 00149 bool Init(const char *data_file_name, int debug_level); 00150 00151 // Return the name of the underlying data file. 00152 const STRING &GetDataFileName() const { return data_file_name_; } 00153 00155 inline FILE *GetDataFilePtr() const { return data_file_; } 00156 00162 inline bool SeekToStart(TessdataType tessdata_type) { 00163 if (debug_level_) { 00164 tprintf("TessdataManager: seek to offset %lld - start of tessdata" 00165 "type %d (%s))\n", offset_table_[tessdata_type], 00166 tessdata_type, kTessdataFileSuffixes[tessdata_type]); 00167 } 00168 if (offset_table_[tessdata_type] < 0) { 00169 return false; 00170 } else { 00171 ASSERT_HOST(fseek(data_file_, 00172 static_cast<size_t>(offset_table_[tessdata_type]), 00173 SEEK_SET) == 0); 00174 return true; 00175 } 00176 } 00178 inline inT64 GetEndOffset(TessdataType tessdata_type) const { 00179 int index = tessdata_type + 1; 00180 while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) { 00181 ++index; // skip tessdata types not present in the combined file 00182 } 00183 if (debug_level_) { 00184 tprintf("TessdataManager: end offset for type %d is %lld\n", 00185 tessdata_type, 00186 (index == actual_tessdata_num_entries_) ? -1 00187 : offset_table_[index]); 00188 } 00189 return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1; 00190 } 00192 inline void End() { 00193 if (data_file_ != NULL) { 00194 fclose(data_file_); 00195 data_file_ = NULL; 00196 } 00197 } 00198 bool swap() const { 00199 return swap_; 00200 } 00201 00203 static void WriteMetadata(inT64 *offset_table, 00204 const char *language_data_path_prefix, 00205 FILE *output_file); 00206 00212 static bool CombineDataFiles(const char *language_data_path_prefix, 00213 const char *output_filename); 00214 00220 bool OverwriteComponents(const char *new_traineddata_filename, 00221 char **component_filenames, 00222 int num_new_components); 00223 00234 bool ExtractToFile(const char *filename); 00235 00241 static void CopyFile(FILE *input_file, FILE *output_file, 00242 bool newline_end, inT64 num_bytes_to_copy); 00243 00252 static bool TessdataTypeFromFileSuffix(const char *suffix, 00253 TessdataType *type, 00254 bool *text_file); 00255 00260 static bool TessdataTypeFromFileName(const char *filename, 00261 TessdataType *type, 00262 bool *text_file); 00263 00264 private: 00265 00270 static FILE *GetFilePtr(const char *language_data_path_prefix, 00271 const char *file_suffix, bool text_file); 00272 00277 inT64 offset_table_[TESSDATA_NUM_ENTRIES]; 00286 inT32 actual_tessdata_num_entries_; 00287 STRING data_file_name_; // name of the data file. 00288 FILE *data_file_; 00289 int debug_level_; 00290 // True if the bytes need swapping. 00291 bool swap_; 00292 }; 00293 00294 00295 } // namespace tesseract 00296 00297 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_