tesseract
3.03
|
00001 00002 // File: tessdatamanager.cpp 00003 // Description: Functions to handle loading/combining tesseract data files. 00004 // Author: Daria Antonova 00005 // Created: Wed Jun 03 11:26:43 PST 2009 00006 // 00007 // (C) Copyright 2009, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifdef _MSC_VER 00021 #pragma warning(disable:4244) // Conversion warnings 00022 #endif 00023 00024 #include "tessdatamanager.h" 00025 00026 #include <stdio.h> 00027 00028 #include "helpers.h" 00029 #include "serialis.h" 00030 #include "strngs.h" 00031 #include "tprintf.h" 00032 #include "params.h" 00033 00034 namespace tesseract { 00035 00036 bool TessdataManager::Init(const char *data_file_name, int debug_level) { 00037 int i; 00038 debug_level_ = debug_level; 00039 data_file_name_ = data_file_name; 00040 data_file_ = fopen(data_file_name, "rb"); 00041 if (data_file_ == NULL) { 00042 tprintf("Error opening data file %s\n", data_file_name); 00043 tprintf("Please make sure the TESSDATA_PREFIX environment variable is set " 00044 "to the parent directory of your \"tessdata\" directory.\n"); 00045 return false; 00046 } 00047 fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_); 00048 swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries); 00049 if (swap_) { 00050 ReverseN(&actual_tessdata_num_entries_, 00051 sizeof(actual_tessdata_num_entries_)); 00052 } 00053 ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES); 00054 fread(offset_table_, sizeof(inT64), 00055 actual_tessdata_num_entries_, data_file_); 00056 if (swap_) { 00057 for (i = 0 ; i < actual_tessdata_num_entries_; ++i) { 00058 ReverseN(&offset_table_[i], sizeof(offset_table_[i])); 00059 } 00060 } 00061 if (debug_level_) { 00062 tprintf("TessdataManager loaded %d types of tesseract data files.\n", 00063 actual_tessdata_num_entries_); 00064 for (i = 0; i < actual_tessdata_num_entries_; ++i) { 00065 tprintf("Offset for type %d is %lld\n", i, offset_table_[i]); 00066 } 00067 } 00068 return true; 00069 } 00070 00071 void TessdataManager::CopyFile(FILE *input_file, FILE *output_file, 00072 bool newline_end, inT64 num_bytes_to_copy) { 00073 if (num_bytes_to_copy == 0) return; 00074 int buffer_size = 1024; 00075 if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) { 00076 buffer_size = num_bytes_to_copy; 00077 } 00078 inT64 num_bytes_copied = 0; 00079 char *chunk = new char[buffer_size]; 00080 int bytes_read; 00081 char last_char = 0x0; 00082 while ((bytes_read = fread(chunk, sizeof(char), 00083 buffer_size, input_file))) { 00084 fwrite(chunk, sizeof(char), bytes_read, output_file); 00085 last_char = chunk[bytes_read-1]; 00086 if (num_bytes_to_copy > 0) { 00087 num_bytes_copied += bytes_read; 00088 if (num_bytes_copied == num_bytes_to_copy) break; 00089 if (num_bytes_copied + buffer_size > num_bytes_to_copy) { 00090 buffer_size = num_bytes_to_copy - num_bytes_copied; 00091 } 00092 } 00093 } 00094 if (newline_end) ASSERT_HOST(last_char == '\n'); 00095 delete[] chunk; 00096 } 00097 00098 void TessdataManager::WriteMetadata(inT64 *offset_table, 00099 const char * language_data_path_prefix, 00100 FILE *output_file) { 00101 fseek(output_file, 0, SEEK_SET); 00102 inT32 num_entries = TESSDATA_NUM_ENTRIES; 00103 fwrite(&num_entries, sizeof(inT32), 1, output_file); 00104 fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file); 00105 fclose(output_file); 00106 00107 tprintf("TessdataManager combined tesseract data files.\n"); 00108 for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00109 tprintf("Offset for type %2d (%s%-22s) is %lld\n", i, 00110 language_data_path_prefix, kTessdataFileSuffixes[i], 00111 offset_table[i]); 00112 } 00113 } 00114 00115 bool TessdataManager::CombineDataFiles( 00116 const char *language_data_path_prefix, 00117 const char *output_filename) { 00118 int i; 00119 inT64 offset_table[TESSDATA_NUM_ENTRIES]; 00120 for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1; 00121 FILE *output_file = fopen(output_filename, "wb"); 00122 if (output_file == NULL) { 00123 tprintf("Error opening %s for writing\n", output_filename); 00124 return false; 00125 } 00126 // Leave some space for recording the offset_table. 00127 fseek(output_file, 00128 sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET); 00129 00130 TessdataType type = TESSDATA_NUM_ENTRIES; 00131 bool text_file = false; 00132 FILE *file_ptr[TESSDATA_NUM_ENTRIES]; 00133 00134 // Load individual tessdata components from files. 00135 for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00136 ASSERT_HOST(TessdataTypeFromFileSuffix( 00137 kTessdataFileSuffixes[i], &type, &text_file)); 00138 STRING filename = language_data_path_prefix; 00139 filename += kTessdataFileSuffixes[i]; 00140 file_ptr[i] = fopen(filename.string(), "rb"); 00141 if (file_ptr[i] != NULL) { 00142 offset_table[type] = ftell(output_file); 00143 CopyFile(file_ptr[i], output_file, text_file, -1); 00144 fclose(file_ptr[i]); 00145 } 00146 } 00147 00148 // Make sure that the required components are present. 00149 if (file_ptr[TESSDATA_UNICHARSET] == NULL) { 00150 tprintf("Error opening %sunicharset file\n", language_data_path_prefix); 00151 fclose(output_file); 00152 return false; 00153 } 00154 if (file_ptr[TESSDATA_INTTEMP] != NULL && 00155 (file_ptr[TESSDATA_PFFMTABLE] == NULL || 00156 file_ptr[TESSDATA_NORMPROTO] == NULL)) { 00157 tprintf("Error opening %spffmtable and/or %snormproto files" 00158 " while %sinttemp file was present\n", language_data_path_prefix, 00159 language_data_path_prefix, language_data_path_prefix); 00160 fclose(output_file); 00161 return false; 00162 } 00163 00164 WriteMetadata(offset_table, language_data_path_prefix, output_file); 00165 return true; 00166 } 00167 00168 bool TessdataManager::OverwriteComponents( 00169 const char *new_traineddata_filename, 00170 char **component_filenames, 00171 int num_new_components) { 00172 int i; 00173 inT64 offset_table[TESSDATA_NUM_ENTRIES]; 00174 TessdataType type = TESSDATA_NUM_ENTRIES; 00175 bool text_file = false; 00176 FILE *file_ptr[TESSDATA_NUM_ENTRIES]; 00177 for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00178 offset_table[i] = -1; 00179 file_ptr[i] = NULL; 00180 } 00181 FILE *output_file = fopen(new_traineddata_filename, "wb"); 00182 if (output_file == NULL) { 00183 tprintf("Error opening %s for writing\n", new_traineddata_filename); 00184 return false; 00185 } 00186 00187 // Leave some space for recording the offset_table. 00188 fseek(output_file, 00189 sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET); 00190 00191 // Open the files with the new components. 00192 for (i = 0; i < num_new_components; ++i) { 00193 if (TessdataTypeFromFileName(component_filenames[i], &type, &text_file)) 00194 file_ptr[type] = fopen(component_filenames[i], "rb"); 00195 } 00196 00197 // Write updated data to the output traineddata file. 00198 for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00199 if (file_ptr[i] != NULL) { 00200 // Get the data from the opened component file. 00201 offset_table[i] = ftell(output_file); 00202 CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1); 00203 fclose(file_ptr[i]); 00204 } else { 00205 // Get this data component from the loaded data file. 00206 if (SeekToStart(static_cast<TessdataType>(i))) { 00207 offset_table[i] = ftell(output_file); 00208 CopyFile(data_file_, output_file, kTessdataFileIsText[i], 00209 GetEndOffset(static_cast<TessdataType>(i)) - 00210 ftell(data_file_) + 1); 00211 } 00212 } 00213 } 00214 const char *language_data_path_prefix = strchr(new_traineddata_filename, '.'); 00215 WriteMetadata(offset_table, language_data_path_prefix, output_file); 00216 return true; 00217 } 00218 00219 bool TessdataManager::TessdataTypeFromFileSuffix( 00220 const char *suffix, TessdataType *type, bool *text_file) { 00221 for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00222 if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) { 00223 *type = static_cast<TessdataType>(i); 00224 *text_file = kTessdataFileIsText[i]; 00225 return true; 00226 } 00227 } 00228 tprintf("TessdataManager can't determine which tessdata" 00229 " component is represented by %s\n", suffix); 00230 return false; 00231 } 00232 00233 bool TessdataManager::TessdataTypeFromFileName( 00234 const char *filename, TessdataType *type, bool *text_file) { 00235 // Get the file suffix (extension) 00236 const char *suffix = strrchr(filename, '.'); 00237 if (suffix == NULL || *(++suffix) == '\0') return false; 00238 return TessdataTypeFromFileSuffix(suffix, type, text_file); 00239 } 00240 00241 bool TessdataManager::ExtractToFile(const char *filename) { 00242 TessdataType type = TESSDATA_NUM_ENTRIES; 00243 bool text_file = false; 00244 ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName( 00245 filename, &type, &text_file)); 00246 if (!SeekToStart(type)) return false; 00247 00248 FILE *output_file = fopen(filename, "wb"); 00249 if (output_file == NULL) { 00250 tprintf("Error opening %s\n", filename); 00251 exit(1); 00252 } 00253 inT64 begin_offset = ftell(GetDataFilePtr()); 00254 inT64 end_offset = GetEndOffset(type); 00255 tesseract::TessdataManager::CopyFile( 00256 GetDataFilePtr(), output_file, text_file, 00257 end_offset - begin_offset + 1); 00258 fclose(output_file); 00259 return true; 00260 } 00261 00262 } // namespace tesseract