tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccutil/tessdatamanager.cpp
Go to the documentation of this file.
00001 
00002 // File:        tessdatamanager.cpp
00003 // Description: Functions to handle loading/combining tesseract data files.
00004 // Author:      Daria Antonova
00005 // Created:     Wed Jun 03 11:26:43 PST 2009
00006 //
00007 // (C) Copyright 2009, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifdef _MSC_VER
00021 #pragma warning(disable:4244)  // Conversion warnings
00022 #endif
00023 
00024 #include "tessdatamanager.h"
00025 
00026 #include <stdio.h>
00027 
00028 #include "helpers.h"
00029 #include "serialis.h"
00030 #include "strngs.h"
00031 #include "tprintf.h"
00032 #include "params.h"
00033 
00034 namespace tesseract {
00035 
00036 bool TessdataManager::Init(const char *data_file_name, int debug_level) {
00037   int i;
00038   debug_level_ = debug_level;
00039   data_file_name_ = data_file_name;
00040   data_file_ = fopen(data_file_name, "rb");
00041   if (data_file_ == NULL) {
00042     tprintf("Error opening data file %s\n", data_file_name);
00043     tprintf("Please make sure the TESSDATA_PREFIX environment variable is set "
00044             "to the parent directory of your \"tessdata\" directory.\n");
00045     return false;
00046   }
00047   fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
00048   swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
00049   if (swap_) {
00050     ReverseN(&actual_tessdata_num_entries_,
00051              sizeof(actual_tessdata_num_entries_));
00052   }
00053   ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
00054   fread(offset_table_, sizeof(inT64),
00055         actual_tessdata_num_entries_, data_file_);
00056   if (swap_) {
00057     for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
00058       ReverseN(&offset_table_[i], sizeof(offset_table_[i]));
00059     }
00060   }
00061   if (debug_level_) {
00062     tprintf("TessdataManager loaded %d types of tesseract data files.\n",
00063             actual_tessdata_num_entries_);
00064     for (i = 0; i < actual_tessdata_num_entries_; ++i) {
00065       tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
00066     }
00067   }
00068   return true;
00069 }
00070 
00071 void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
00072                                bool newline_end, inT64 num_bytes_to_copy) {
00073   if (num_bytes_to_copy == 0) return;
00074   int buffer_size = 1024;
00075   if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
00076     buffer_size = num_bytes_to_copy;
00077   }
00078   inT64 num_bytes_copied = 0;
00079   char *chunk = new char[buffer_size];
00080   int bytes_read;
00081   char last_char = 0x0;
00082   while ((bytes_read = fread(chunk, sizeof(char),
00083                              buffer_size, input_file))) {
00084     fwrite(chunk, sizeof(char), bytes_read, output_file);
00085     last_char = chunk[bytes_read-1];
00086     if (num_bytes_to_copy > 0) {
00087       num_bytes_copied += bytes_read;
00088       if (num_bytes_copied == num_bytes_to_copy) break;
00089       if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
00090         buffer_size = num_bytes_to_copy - num_bytes_copied;
00091       }
00092     }
00093   }
00094   if (newline_end) ASSERT_HOST(last_char == '\n');
00095   delete[] chunk;
00096 }
00097 
00098 void TessdataManager::WriteMetadata(inT64 *offset_table,
00099                                     const char * language_data_path_prefix,
00100                                     FILE *output_file) {
00101   fseek(output_file, 0, SEEK_SET);
00102   inT32 num_entries = TESSDATA_NUM_ENTRIES;
00103   fwrite(&num_entries, sizeof(inT32), 1, output_file);
00104   fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file);
00105   fclose(output_file);
00106 
00107   tprintf("TessdataManager combined tesseract data files.\n");
00108   for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00109     tprintf("Offset for type %2d (%s%-22s) is %lld\n", i,
00110             language_data_path_prefix, kTessdataFileSuffixes[i],
00111             offset_table[i]);
00112   }
00113 }
00114 
00115 bool TessdataManager::CombineDataFiles(
00116     const char *language_data_path_prefix,
00117     const char *output_filename) {
00118   int i;
00119   inT64 offset_table[TESSDATA_NUM_ENTRIES];
00120   for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
00121   FILE *output_file = fopen(output_filename, "wb");
00122   if (output_file == NULL) {
00123     tprintf("Error opening %s for writing\n", output_filename);
00124     return false;
00125   }
00126   // Leave some space for recording the offset_table.
00127   fseek(output_file,
00128         sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
00129 
00130   TessdataType type = TESSDATA_NUM_ENTRIES;
00131   bool text_file = false;
00132   FILE *file_ptr[TESSDATA_NUM_ENTRIES];
00133 
00134   // Load individual tessdata components from files.
00135   for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00136     ASSERT_HOST(TessdataTypeFromFileSuffix(
00137         kTessdataFileSuffixes[i], &type, &text_file));
00138     STRING filename = language_data_path_prefix;
00139     filename += kTessdataFileSuffixes[i];
00140     file_ptr[i] =  fopen(filename.string(), "rb");
00141     if (file_ptr[i] != NULL) {
00142       offset_table[type] = ftell(output_file);
00143       CopyFile(file_ptr[i], output_file, text_file, -1);
00144       fclose(file_ptr[i]);
00145     }
00146   }
00147 
00148   // Make sure that the required components are present.
00149   if (file_ptr[TESSDATA_UNICHARSET] == NULL) {
00150     tprintf("Error opening %sunicharset file\n", language_data_path_prefix);
00151     fclose(output_file);
00152     return false;
00153   }
00154   if (file_ptr[TESSDATA_INTTEMP] != NULL &&
00155       (file_ptr[TESSDATA_PFFMTABLE] == NULL ||
00156        file_ptr[TESSDATA_NORMPROTO] == NULL)) {
00157     tprintf("Error opening %spffmtable and/or %snormproto files"
00158             " while %sinttemp file was present\n", language_data_path_prefix,
00159             language_data_path_prefix, language_data_path_prefix);
00160     fclose(output_file);
00161     return false;
00162   }
00163 
00164   WriteMetadata(offset_table, language_data_path_prefix, output_file);
00165   return true;
00166 }
00167 
00168 bool TessdataManager::OverwriteComponents(
00169     const char *new_traineddata_filename,
00170     char **component_filenames,
00171     int num_new_components) {
00172   int i;
00173   inT64 offset_table[TESSDATA_NUM_ENTRIES];
00174   TessdataType type = TESSDATA_NUM_ENTRIES;
00175   bool text_file = false;
00176   FILE *file_ptr[TESSDATA_NUM_ENTRIES];
00177   for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00178     offset_table[i] = -1;
00179     file_ptr[i] = NULL;
00180   }
00181   FILE *output_file = fopen(new_traineddata_filename, "wb");
00182   if (output_file == NULL) {
00183     tprintf("Error opening %s for writing\n", new_traineddata_filename);
00184     return false;
00185   }
00186 
00187   // Leave some space for recording the offset_table.
00188   fseek(output_file,
00189         sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
00190 
00191   // Open the files with the new components.
00192   for (i = 0; i < num_new_components; ++i) {
00193     if (TessdataTypeFromFileName(component_filenames[i], &type, &text_file))
00194       file_ptr[type] = fopen(component_filenames[i], "rb");
00195   }
00196 
00197   // Write updated data to the output traineddata file.
00198   for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00199     if (file_ptr[i] != NULL) {
00200       // Get the data from the opened component file.
00201       offset_table[i] = ftell(output_file);
00202       CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
00203       fclose(file_ptr[i]);
00204     } else {
00205       // Get this data component from the loaded data file.
00206       if (SeekToStart(static_cast<TessdataType>(i))) {
00207         offset_table[i] = ftell(output_file);
00208         CopyFile(data_file_, output_file, kTessdataFileIsText[i],
00209                  GetEndOffset(static_cast<TessdataType>(i)) -
00210                  ftell(data_file_) + 1);
00211       }
00212     }
00213   }
00214   const char *language_data_path_prefix = strchr(new_traineddata_filename, '.');
00215   WriteMetadata(offset_table, language_data_path_prefix, output_file);
00216   return true;
00217 }
00218 
00219 bool TessdataManager::TessdataTypeFromFileSuffix(
00220     const char *suffix, TessdataType *type, bool *text_file) {
00221   for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00222     if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
00223       *type = static_cast<TessdataType>(i);
00224       *text_file = kTessdataFileIsText[i];
00225       return true;
00226     }
00227   }
00228   tprintf("TessdataManager can't determine which tessdata"
00229          " component is represented by %s\n", suffix);
00230   return false;
00231 }
00232 
00233 bool TessdataManager::TessdataTypeFromFileName(
00234     const char *filename, TessdataType *type, bool *text_file) {
00235   // Get the file suffix (extension)
00236   const char *suffix = strrchr(filename, '.');
00237   if (suffix == NULL || *(++suffix) == '\0') return false;
00238   return TessdataTypeFromFileSuffix(suffix, type, text_file);
00239 }
00240 
00241 bool TessdataManager::ExtractToFile(const char *filename) {
00242   TessdataType type = TESSDATA_NUM_ENTRIES;
00243   bool text_file = false;
00244   ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(
00245       filename, &type, &text_file));
00246   if (!SeekToStart(type)) return false;
00247 
00248   FILE *output_file = fopen(filename, "wb");
00249   if (output_file == NULL) {
00250     tprintf("Error opening %s\n", filename);
00251     exit(1);
00252   }
00253   inT64 begin_offset = ftell(GetDataFilePtr());
00254   inT64 end_offset = GetEndOffset(type);
00255   tesseract::TessdataManager::CopyFile(
00256       GetDataFilePtr(), output_file, text_file,
00257       end_offset - begin_offset + 1);
00258   fclose(output_file);
00259   return true;
00260 }
00261 
00262 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines