tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/training/combine_tessdata.cpp
Go to the documentation of this file.
00001 
00002 // File:        combine_tessdata
00003 // Description: Creates a unified traineddata file from several
00004 //              data files produced by the training process.
00005 // Author:      Daria Antonova
00006 // Created:     Wed Jun 03 11:26:43 PST 2009
00007 //
00008 // (C) Copyright 2009, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #include "tessdatamanager.h"
00022 
00023 // Main program to combine/extract/overwrite tessdata components
00024 // in [lang].traineddata files.
00025 //
00026 // To combine all the individual tessdata components (unicharset, DAWGs,
00027 // classifier templates, ambiguities, language configs) located at, say,
00028 // /home/$USER/temp/eng.* run:
00029 //
00030 //   combine_tessdata /home/$USER/temp/eng.
00031 //
00032 // The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
00033 //
00034 // Specify option -e if you would like to extract individual components
00035 // from a combined traineddata file. For example, to extract language config
00036 // file and the unicharset from tessdata/eng.traineddata run:
00037 //
00038 //   combine_tessdata -e tessdata/eng.traineddata
00039 //   /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
00040 //
00041 // The desired config file and unicharset will be written to
00042 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
00043 //
00044 // Specify option -o to overwrite individual components of the given
00045 // [lang].traineddata file. For example, to overwrite language config
00046 // and unichar ambiguities files in tessdata/eng.traineddata use:
00047 //
00048 //   combine_tessdata -o tessdata/eng.traineddata
00049 //   /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
00050 //
00051 // As a result, tessdata/eng.traineddata will contain the new language config
00052 // and unichar ambigs, plus all the original DAWGs, classifier templates, etc.
00053 //
00054 // Note: the file names of the files to extract to and to overwrite from should
00055 // have the appropriate file suffixes (extensions) indicating their tessdata
00056 // component type (.unicharset for the unicharset, .unicharambigs for unichar
00057 // ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.
00058 //
00059 // Specify option -u to unpack all the components to the specified path:
00060 //
00061 //   combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
00062 //
00063 // This will create /home/$USER/temp/eng.* files with individual tessdata
00064 // components from tessdata/eng.traineddata.
00065 //
00066 int main(int argc, char **argv) {
00067   int i;
00068   if (argc == 2) {
00069     printf("Combining tessdata files\n");
00070     STRING lang = argv[1];
00071     char* last = &argv[1][strlen(argv[1])-1];
00072     if (*last != '.')
00073       lang += '.';
00074     STRING output_file = lang;
00075     output_file += kTrainedDataSuffix;
00076     if (!tesseract::TessdataManager::CombineDataFiles(
00077         lang.string(), output_file.string())) {
00078       printf("Error combining tessdata files into %s\n",
00079              output_file.string());
00080     } else {
00081       printf("Output %s created sucessfully.\n", output_file.string());
00082     }
00083   } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
00084                            strcmp(argv[1], "-u") == 0)) {
00085     // Initialize TessdataManager with the data in the given traineddata file.
00086     tesseract::TessdataManager tm;
00087     tm.Init(argv[2], 0);
00088     printf("Extracting tessdata components from %s\n", argv[2]);
00089     if (strcmp(argv[1], "-e") == 0) {
00090       for (i = 3; i < argc; ++i) {
00091         if (tm.ExtractToFile(argv[i])) {
00092           printf("Wrote %s\n", argv[i]);
00093         } else {
00094           printf("Not extracting %s, since this component"
00095                  " is not present\n", argv[i]);
00096         }
00097       }
00098     } else {  // extract all the components
00099       for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
00100         STRING filename = argv[3];
00101         char* last = &argv[3][strlen(argv[3])-1];
00102         if (*last != '.')
00103           filename += '.';
00104         filename += tesseract::kTessdataFileSuffixes[i];
00105         if (tm.ExtractToFile(filename.string())) {
00106           printf("Wrote %s\n", filename.string());
00107         }
00108       }
00109     }
00110     tm.End();
00111   } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
00112     // Rename the current traineddata file to a temporary name.
00113     const char *new_traineddata_filename = argv[2];
00114     STRING traineddata_filename = new_traineddata_filename;
00115     traineddata_filename += ".__tmp__";
00116     if (rename(new_traineddata_filename, traineddata_filename.string()) != 0) {
00117       tprintf("Failed to create a temporary file %s\n",
00118               traineddata_filename.string());
00119       exit(1);
00120     }
00121 
00122     // Initialize TessdataManager with the data in the given traineddata file.
00123     tesseract::TessdataManager tm;
00124     tm.Init(traineddata_filename.string(), 0);
00125 
00126     // Write the updated traineddata file.
00127     tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
00128     tm.End();
00129   } else {
00130     printf("Usage for combining tessdata components:\n"
00131            "  %s language_data_path_prefix\n"
00132            "  (e.g. %s tessdata/eng.)\n\n", argv[0], argv[0]);
00133     printf("Usage for extracting tessdata components:\n"
00134            "  %s -e traineddata_file [output_component_file...]\n"
00135            "  (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
00136            argv[0], argv[0]);
00137     printf("Usage for overwriting tessdata components:\n"
00138            "  %s -o traineddata_file [input_component_file...]\n"
00139            "  (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
00140            argv[0], argv[0]);
00141     printf("Usage for unpacking all tessdata components:\n"
00142            "  %s -u traineddata_file output_path_prefix\n"
00143            "  (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]);
00144     return 1;
00145   }
00146 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines