tesseract
3.03
|
00001 00002 // File: combine_tessdata 00003 // Description: Creates a unified traineddata file from several 00004 // data files produced by the training process. 00005 // Author: Daria Antonova 00006 // Created: Wed Jun 03 11:26:43 PST 2009 00007 // 00008 // (C) Copyright 2009, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #include "tessdatamanager.h" 00022 00023 // Main program to combine/extract/overwrite tessdata components 00024 // in [lang].traineddata files. 00025 // 00026 // To combine all the individual tessdata components (unicharset, DAWGs, 00027 // classifier templates, ambiguities, language configs) located at, say, 00028 // /home/$USER/temp/eng.* run: 00029 // 00030 // combine_tessdata /home/$USER/temp/eng. 00031 // 00032 // The result will be a combined tessdata file /home/$USER/temp/eng.traineddata 00033 // 00034 // Specify option -e if you would like to extract individual components 00035 // from a combined traineddata file. 
For example, to extract language config 00036 // file and the unicharset from tessdata/eng.traineddata run: 00037 // 00038 // combine_tessdata -e tessdata/eng.traineddata 00039 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset 00040 // 00041 // The desired config file and unicharset will be written to 00042 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset 00043 // 00044 // Specify option -o to overwrite individual components of the given 00045 // [lang].traineddata file. For example, to overwrite language config 00046 // and unichar ambiguities files in tessdata/eng.traineddata use: 00047 // 00048 // combine_tessdata -o tessdata/eng.traineddata 00049 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs 00050 // 00051 // As a result, tessdata/eng.traineddata will contain the new language config 00052 // and unichar ambigs, plus all the original DAWGs, classifier templates, etc. 00053 // 00054 // Note: the file names of the files to extract to and to overwrite from should 00055 // have the appropriate file suffixes (extensions) indicating their tessdata 00056 // component type (.unicharset for the unicharset, .unicharambigs for unichar 00057 // ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h. 00058 // 00059 // Specify option -u to unpack all the components to the specified path: 00060 // 00061 // combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng. 00062 // 00063 // This will create /home/$USER/temp/eng.* files with individual tessdata 00064 // components from tessdata/eng.traineddata. 
00065 // 00066 int main(int argc, char **argv) { 00067 int i; 00068 if (argc == 2) { 00069 printf("Combining tessdata files\n"); 00070 STRING lang = argv[1]; 00071 char* last = &argv[1][strlen(argv[1])-1]; 00072 if (*last != '.') 00073 lang += '.'; 00074 STRING output_file = lang; 00075 output_file += kTrainedDataSuffix; 00076 if (!tesseract::TessdataManager::CombineDataFiles( 00077 lang.string(), output_file.string())) { 00078 printf("Error combining tessdata files into %s\n", 00079 output_file.string()); 00080 } else { 00081 printf("Output %s created sucessfully.\n", output_file.string()); 00082 } 00083 } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 || 00084 strcmp(argv[1], "-u") == 0)) { 00085 // Initialize TessdataManager with the data in the given traineddata file. 00086 tesseract::TessdataManager tm; 00087 tm.Init(argv[2], 0); 00088 printf("Extracting tessdata components from %s\n", argv[2]); 00089 if (strcmp(argv[1], "-e") == 0) { 00090 for (i = 3; i < argc; ++i) { 00091 if (tm.ExtractToFile(argv[i])) { 00092 printf("Wrote %s\n", argv[i]); 00093 } else { 00094 printf("Not extracting %s, since this component" 00095 " is not present\n", argv[i]); 00096 } 00097 } 00098 } else { // extract all the components 00099 for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) { 00100 STRING filename = argv[3]; 00101 char* last = &argv[3][strlen(argv[3])-1]; 00102 if (*last != '.') 00103 filename += '.'; 00104 filename += tesseract::kTessdataFileSuffixes[i]; 00105 if (tm.ExtractToFile(filename.string())) { 00106 printf("Wrote %s\n", filename.string()); 00107 } 00108 } 00109 } 00110 tm.End(); 00111 } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) { 00112 // Rename the current traineddata file to a temporary name. 
00113 const char *new_traineddata_filename = argv[2]; 00114 STRING traineddata_filename = new_traineddata_filename; 00115 traineddata_filename += ".__tmp__"; 00116 if (rename(new_traineddata_filename, traineddata_filename.string()) != 0) { 00117 tprintf("Failed to create a temporary file %s\n", 00118 traineddata_filename.string()); 00119 exit(1); 00120 } 00121 00122 // Initialize TessdataManager with the data in the given traineddata file. 00123 tesseract::TessdataManager tm; 00124 tm.Init(traineddata_filename.string(), 0); 00125 00126 // Write the updated traineddata file. 00127 tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3); 00128 tm.End(); 00129 } else { 00130 printf("Usage for combining tessdata components:\n" 00131 " %s language_data_path_prefix\n" 00132 " (e.g. %s tessdata/eng.)\n\n", argv[0], argv[0]); 00133 printf("Usage for extracting tessdata components:\n" 00134 " %s -e traineddata_file [output_component_file...]\n" 00135 " (e.g. %s -e eng.traineddata eng.unicharset)\n\n", 00136 argv[0], argv[0]); 00137 printf("Usage for overwriting tessdata components:\n" 00138 " %s -o traineddata_file [input_component_file...]\n" 00139 " (e.g. %s -o eng.traineddata eng.unicharset)\n\n", 00140 argv[0], argv[0]); 00141 printf("Usage for unpacking all tessdata components:\n" 00142 " %s -u traineddata_file output_path_prefix\n" 00143 " (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]); 00144 return 1; 00145 } 00146 }