tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccutil/tessdatamanager.h
Go to the documentation of this file.
00001 
00002 // File:        tessdatamanager.h
00003 // Description: Functions to handle loading/combining tesseract data files.
00004 // Author:      Daria Antonova
00005 // Created:     Wed Jun 03 11:26:43 PST 2009
00006 //
00007 // (C) Copyright 2009, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
00021 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
00022 
00023 #include <stdio.h>
00024 
00025 #include "host.h"
00026 #include "strngs.h"
00027 #include "tprintf.h"
00028 
// Suffix string for combined trained-data files.
static const char kTrainedDataSuffix[] = "traineddata";

// File suffix of each individual tessdata component.
//
// Adding a new component means touching four places that must stay in the
// same order: this list, the TessdataType enum, kTessdataFileSuffixes and
// kTessdataFileIsText.
static const char kLangConfigFileSuffix[]       = "config";
static const char kUnicharsetFileSuffix[]       = "unicharset";
static const char kAmbigsFileSuffix[]           = "unicharambigs";
static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
static const char kBuiltInCutoffsFileSuffix[]   = "pffmtable";
static const char kNormProtoFileSuffix[]        = "normproto";
// Dawg components.
static const char kPuncDawgFileSuffix[]         = "punc-dawg";
static const char kSystemDawgFileSuffix[]       = "word-dawg";
static const char kNumberDawgFileSuffix[]       = "number-dawg";
static const char kFreqDawgFileSuffix[]         = "freq-dawg";
static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";
// Cube components.
static const char kCubeUnicharsetFileSuffix[]   = "cube-unicharset";
static const char kCubeSystemDawgFileSuffix[]   = "cube-word-dawg";
// Remaining components.
static const char kShapeTableFileSuffix[]       = "shapetable";
static const char kBigramDawgFileSuffix[]       = "bigram-dawg";
static const char kUnambigDawgFileSuffix[]      = "unambig-dawg";
static const char kParamsModelFileSuffix[]      = "params-model";
00050 
00051 namespace tesseract {
00052 
// Identifies one component of a combined tessdata file. The numeric value
// of each enumerator is used as an index into offset_table_,
// kTessdataFileSuffixes and kTessdataFileIsText, so the values are spelled
// out explicitly and must not be reordered.
enum TessdataType {
  TESSDATA_LANG_CONFIG        = 0,
  TESSDATA_UNICHARSET         = 1,
  TESSDATA_AMBIGS             = 2,
  TESSDATA_INTTEMP            = 3,
  TESSDATA_PFFMTABLE          = 4,
  TESSDATA_NORMPROTO          = 5,
  TESSDATA_PUNC_DAWG          = 6,
  TESSDATA_SYSTEM_DAWG        = 7,
  TESSDATA_NUMBER_DAWG        = 8,
  TESSDATA_FREQ_DAWG          = 9,
  TESSDATA_FIXED_LENGTH_DAWGS = 10,  // deprecated
  TESSDATA_CUBE_UNICHARSET    = 11,
  TESSDATA_CUBE_SYSTEM_DAWG   = 12,
  TESSDATA_SHAPE_TABLE        = 13,
  TESSDATA_BIGRAM_DAWG        = 14,
  TESSDATA_UNAMBIG_DAWG       = 15,
  TESSDATA_PARAMS_MODEL       = 16,

  // One past the last real type, i.e. the number of types above.
  TESSDATA_NUM_ENTRIES
};
00074 
00079 static const char * const kTessdataFileSuffixes[] = {
00080   kLangConfigFileSuffix,        // 0
00081   kUnicharsetFileSuffix,        // 1
00082   kAmbigsFileSuffix,            // 2
00083   kBuiltInTemplatesFileSuffix,  // 3
00084   kBuiltInCutoffsFileSuffix,    // 4
00085   kNormProtoFileSuffix,         // 5
00086   kPuncDawgFileSuffix,          // 6
00087   kSystemDawgFileSuffix,        // 7
00088   kNumberDawgFileSuffix,        // 8
00089   kFreqDawgFileSuffix,          // 9
00090   kFixedLengthDawgsFileSuffix,  // 10  // deprecated
00091   kCubeUnicharsetFileSuffix,    // 11
00092   kCubeSystemDawgFileSuffix,    // 12
00093   kShapeTableFileSuffix,        // 13
00094   kBigramDawgFileSuffix,        // 14
00095   kUnambigDawgFileSuffix,       // 15
00096   kParamsModelFileSuffix,       // 16
00097 };
00098 
// For each TessdataType (indexed by the enumerator's value): true when the
// component is a text file, false otherwise.
static const bool kTessdataFileIsText[] = {
  true,   // TESSDATA_LANG_CONFIG
  true,   // TESSDATA_UNICHARSET
  true,   // TESSDATA_AMBIGS
  false,  // TESSDATA_INTTEMP
  true,   // TESSDATA_PFFMTABLE
  true,   // TESSDATA_NORMPROTO
  false,  // TESSDATA_PUNC_DAWG
  false,  // TESSDATA_SYSTEM_DAWG
  false,  // TESSDATA_NUMBER_DAWG
  false,  // TESSDATA_FREQ_DAWG
  false,  // TESSDATA_FIXED_LENGTH_DAWGS (deprecated)
  true,   // TESSDATA_CUBE_UNICHARSET
  false,  // TESSDATA_CUBE_SYSTEM_DAWG
  false,  // TESSDATA_SHAPE_TABLE
  false,  // TESSDATA_BIGRAM_DAWG
  false,  // TESSDATA_UNAMBIG_DAWG
  true,   // TESSDATA_PARAMS_MODEL
};
00122 
// Upper limit of 1000 on the number of tessdata entries.
// NOTE(review): not referenced anywhere in this header; presumably used by
// the implementation as a sanity bound on the entry count read from a data
// file - confirm against the .cpp before relying on this.
static const int kMaxNumTessdataEntries = 1000;
00131 
00132 
00133 class TessdataManager {
00134  public:
00135   TessdataManager() {
00136     data_file_ = NULL;
00137     actual_tessdata_num_entries_ = 0;
00138     for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00139       offset_table_[i] = -1;
00140     }
00141   }
00142   ~TessdataManager() {}
00143   int DebugLevel() { return debug_level_; }
00144 
00149   bool Init(const char *data_file_name, int debug_level);
00150 
00151   // Return the name of the underlying data file.
00152   const STRING &GetDataFileName() const { return data_file_name_; }
00153 
00155   inline FILE *GetDataFilePtr() const { return data_file_; }
00156 
00162   inline bool SeekToStart(TessdataType tessdata_type) {
00163     if (debug_level_) {
00164       tprintf("TessdataManager: seek to offset %lld - start of tessdata"
00165               "type %d (%s))\n", offset_table_[tessdata_type],
00166               tessdata_type, kTessdataFileSuffixes[tessdata_type]);
00167     }
00168     if (offset_table_[tessdata_type] < 0) {
00169       return false;
00170     } else {
00171       ASSERT_HOST(fseek(data_file_,
00172                         static_cast<size_t>(offset_table_[tessdata_type]),
00173                         SEEK_SET) == 0);
00174       return true;
00175     }
00176   }
00178   inline inT64 GetEndOffset(TessdataType tessdata_type) const {
00179     int index = tessdata_type + 1;
00180     while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
00181       ++index;  // skip tessdata types not present in the combined file
00182     }
00183     if (debug_level_) {
00184       tprintf("TessdataManager: end offset for type %d is %lld\n",
00185               tessdata_type,
00186               (index == actual_tessdata_num_entries_) ? -1
00187               : offset_table_[index]);
00188     }
00189     return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
00190   }
00192   inline void End() {
00193     if (data_file_ != NULL) {
00194       fclose(data_file_);
00195       data_file_ = NULL;
00196     }
00197   }
00198   bool swap() const {
00199     return swap_;
00200   }
00201 
00203   static void WriteMetadata(inT64 *offset_table,
00204                             const char *language_data_path_prefix,
00205                             FILE *output_file);
00206 
00212   static bool CombineDataFiles(const char *language_data_path_prefix,
00213                                const char *output_filename);
00214 
00220   bool OverwriteComponents(const char *new_traineddata_filename,
00221                             char **component_filenames,
00222                             int num_new_components);
00223 
00234   bool ExtractToFile(const char *filename);
00235 
00241   static void CopyFile(FILE *input_file, FILE *output_file,
00242                        bool newline_end, inT64 num_bytes_to_copy);
00243 
00252   static bool TessdataTypeFromFileSuffix(const char *suffix,
00253                                          TessdataType *type,
00254                                          bool *text_file);
00255 
00260   static bool TessdataTypeFromFileName(const char *filename,
00261                                        TessdataType *type,
00262                                        bool *text_file);
00263 
00264  private:
00265 
00270   static FILE *GetFilePtr(const char *language_data_path_prefix,
00271                           const char *file_suffix, bool text_file);
00272 
00277   inT64 offset_table_[TESSDATA_NUM_ENTRIES];
00286   inT32 actual_tessdata_num_entries_;
00287   STRING data_file_name_;  // name of the data file.
00288   FILE *data_file_;  
00289   int debug_level_;
00290   // True if the bytes need swapping.
00291   bool swap_;
00292 };
00293 
00294 
00295 }  // namespace tesseract
00296 
00297 #endif  // TESSERACT_CCUTIL_TESSDATAMANAGER_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines