tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/tessedit.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        tessedit.cpp  (Formerly tessedit.c)
00003  * Description: Main program for merge of tess and editor.
00004  * Author:                                      Ray Smith
00005  * Created:                                     Tue Jan 07 15:21:46 GMT 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include          "stderr.h"
00021 #include          "basedir.h"
00022 #include          "tessvars.h"
00023 #include          "control.h"
00024 #include          "reject.h"
00025 #include          "pageres.h"
00026 #include          "nwmain.h"
00027 #include          "pgedit.h"
00028 #include          "tprintf.h"
00029 #include          "tessedit.h"
00030 #include "stopper.h"
00031 #include "intmatcher.h"
00032 #include "chop.h"
00033 #include "efio.h"
00034 #include "danerror.h"
00035 #include "globals.h"
00036 #include "tesseractclass.h"
00037 #include "params.h"
00038 
00039 #define VARDIR        "configs/" /*variables files */
00040                                  //config under api
00041 #define API_CONFIG      "configs/api_config"
00042 
00043 ETEXT_DESC *global_monitor = NULL;  // progress monitor
00044 
00045 namespace tesseract {
00046 
00047 // Read a "config" file containing a set of variable, value pairs.
00048 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
00049 // and also accepts a relative or absolute path name.
00050 void Tesseract::read_config_file(const char *filename,
00051                                  SetParamConstraint constraint) {
00052   STRING path = datadir;
00053   path += "configs/";
00054   path += filename;
00055   FILE* fp;
00056   if ((fp = fopen(path.string(), "rb")) != NULL) {
00057     fclose(fp);
00058   } else {
00059     path = datadir;
00060     path += "tessconfigs/";
00061     path += filename;
00062     if ((fp = fopen(path.string(), "rb")) != NULL) {
00063       fclose(fp);
00064     } else {
00065       path = filename;
00066     }
00067   }
00068   ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
00069 }
00070 
00071 // Returns false if a unicharset file for the specified language was not found
00072 // or was invalid.
00073 // This function initializes TessdataManager. After TessdataManager is
00074 // no longer needed, TessdataManager::End() should be called.
00075 //
00076 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
00077 // it is OEM_DEFAULT, in which case the value of the variable will be obtained
00078 // from the language-specific config file (stored in [lang].traineddata), from
00079 // the config files specified on the command line or left as the default
00080 // OEM_TESSERACT_ONLY if none of the configs specify this variable.
00081 bool Tesseract::init_tesseract_lang_data(
00082     const char *arg0, const char *textbase, const char *language,
00083     OcrEngineMode oem, char **configs, int configs_size,
00084     const GenericVector<STRING> *vars_vec,
00085     const GenericVector<STRING> *vars_values,
00086     bool set_only_non_debug_params) {
00087   // Set the basename, compute the data directory.
00088   main_setup(arg0, textbase);
00089 
00090   // Set the language data path prefix
00091   lang = language != NULL ? language : "eng";
00092   language_data_path_prefix = datadir;
00093   language_data_path_prefix += lang;
00094   language_data_path_prefix += ".";
00095 
00096   // Initialize TessdataManager.
00097   STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
00098   if (!tessdata_manager.Init(tessdata_path.string(),
00099                              tessdata_manager_debug_level)) {
00100     return false;
00101   }
00102 
00103   // If a language specific config file (lang.config) exists, load it in.
00104   if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
00105     ParamUtils::ReadParamsFromFp(
00106         tessdata_manager.GetDataFilePtr(),
00107         tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG),
00108         SET_PARAM_CONSTRAINT_NONE, this->params());
00109     if (tessdata_manager_debug_level) {
00110       tprintf("Loaded language config file\n");
00111     }
00112   }
00113 
00114   SetParamConstraint set_params_constraint = set_only_non_debug_params ?
00115       SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
00116   // Load tesseract variables from config files. This is done after loading
00117   // language-specific variables from [lang].traineddata file, so that custom
00118   // config files can override values in [lang].traineddata file.
00119   for (int i = 0; i < configs_size; ++i) {
00120     read_config_file(configs[i], set_params_constraint);
00121   }
00122 
00123   // Set params specified in vars_vec (done after setting params from config
00124   // files, so that params in vars_vec can override those from files).
00125   if (vars_vec != NULL && vars_values != NULL) {
00126     for (int i = 0; i < vars_vec->size(); ++i) {
00127       if (!ParamUtils::SetParam((*vars_vec)[i].string(),
00128                                 (*vars_values)[i].string(),
00129                                 set_params_constraint, this->params())) {
00130         tprintf("Error setting param %s\n", (*vars_vec)[i].string());
00131         exit(1);
00132       }
00133     }
00134   }
00135 
00136   if (((STRING &)tessedit_write_params_to_file).length() > 0) {
00137     FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
00138     if (params_file != NULL) {
00139       ParamUtils::PrintParams(params_file, this->params());
00140       fclose(params_file);
00141       if (tessdata_manager_debug_level > 0) {
00142         tprintf("Wrote parameters to %s\n",
00143                 tessedit_write_params_to_file.string());
00144       }
00145     } else {
00146       tprintf("Failed to open %s for writing params.\n",
00147               tessedit_write_params_to_file.string());
00148     }
00149   }
00150 
00151   // Determine which ocr engine(s) should be loaded and used for recognition.
00152   if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
00153   if (tessdata_manager_debug_level) {
00154     tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
00155             static_cast<int>(tessedit_ocr_engine_mode));
00156   }
00157 
00158   // If we are only loading the config file (and so not planning on doing any
00159   // recognition) then there's nothing else do here.
00160   if (tessedit_init_config_only) {
00161     if (tessdata_manager_debug_level) {
00162       tprintf("Returning after loading config file\n");
00163     }
00164     return true;
00165   }
00166 
00167   // Load the unicharset
00168   if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
00169       !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
00170     return false;
00171   }
00172   if (unicharset.size() > MAX_NUM_CLASSES) {
00173     tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
00174     return false;
00175   }
00176   if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
00177   right_to_left_ = unicharset.major_right_to_left();
00178 
00179   // Setup initial unichar ambigs table and read universal ambigs.
00180   UNICHARSET encoder_unicharset;
00181   encoder_unicharset.CopyFrom(unicharset);
00182   unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
00183   unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
00184 
00185   if (!tessedit_ambigs_training &&
00186       tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
00187     TFile ambigs_file;
00188     ambigs_file.Open(tessdata_manager.GetDataFilePtr(),
00189                      tessdata_manager.GetEndOffset(TESSDATA_AMBIGS) + 1);
00190     unichar_ambigs.LoadUnicharAmbigs(
00191         encoder_unicharset,
00192         &ambigs_file,
00193         ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
00194     if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
00195   }
00196 
00197   // Load Cube objects if necessary.
00198   if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
00199     ASSERT_HOST(init_cube_objects(false, &tessdata_manager));
00200     if (tessdata_manager_debug_level)
00201       tprintf("Loaded Cube w/out combiner\n");
00202   } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
00203     ASSERT_HOST(init_cube_objects(true, &tessdata_manager));
00204     if (tessdata_manager_debug_level)
00205       tprintf("Loaded Cube with combiner\n");
00206   }
00207 
00208   // Init ParamsModel.
00209   // Load pass1 and pass2 weights (for now these two sets are the same, but in
00210   // the future separate sets of weights can be generated).
00211   for (int p = ParamsModel::PTRAIN_PASS1;
00212       p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
00213     language_model_->getParamsModel().SetPass(
00214         static_cast<ParamsModel::PassEnum>(p));
00215     if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) {
00216       if (!language_model_->getParamsModel().LoadFromFp(
00217           lang.string(), tessdata_manager.GetDataFilePtr(),
00218           tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) {
00219         return false;
00220       }
00221     }
00222   }
00223   if (tessdata_manager_debug_level) language_model_->getParamsModel().Print();
00224 
00225   return true;
00226 }
00227 
00228 // Helper returns true if the given string is in the vector of strings.
00229 static bool IsStrInList(const STRING& str,
00230                         const GenericVector<STRING>& str_list) {
00231   for (int i = 0; i < str_list.size(); ++i) {
00232     if (str_list[i] == str)
00233       return true;
00234   }
00235   return false;
00236 }
00237 
00238 // Parse a string of the form [~]<lang>[+[~]<lang>]*.
00239 // Langs with no prefix get appended to to_load, provided they
00240 // are not in there already.
00241 // Langs with ~ prefix get appended to not_to_load, provided they are not in
00242 // there already.
00243 void Tesseract::ParseLanguageString(const char* lang_str,
00244                                     GenericVector<STRING>* to_load,
00245                                     GenericVector<STRING>* not_to_load) {
00246   STRING remains(lang_str);
00247   while (remains.length() > 0) {
00248     // Find the start of the lang code and which vector to add to.
00249     const char* start = remains.string();
00250     while (*start == '+')
00251       ++start;
00252     GenericVector<STRING>* target = to_load;
00253     if (*start == '~') {
00254       target = not_to_load;
00255       ++start;
00256     }
00257     // Find the index of the end of the lang code in string start.
00258     int end = strlen(start);
00259     const char* plus = strchr(start, '+');
00260     if (plus != NULL && plus - start < end)
00261       end = plus - start;
00262     STRING lang_code(start);
00263     lang_code.truncate_at(end);
00264     STRING next(start + end);
00265     remains = next;
00266     // Check whether lang_code is already in the target vector and add.
00267     if (!IsStrInList(lang_code, *target)) {
00268       if (tessdata_manager_debug_level)
00269         tprintf("Adding language '%s' to list\n", lang_code.string());
00270       target->push_back(lang_code);
00271     }
00272   }
00273 }
00274 
00275 // Initialize for potentially a set of languages defined by the language
00276 // string and recursively any additional languages required by any language
00277 // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
00278 // See init_tesseract_internal for args.
00279 int Tesseract::init_tesseract(
00280     const char *arg0, const char *textbase, const char *language,
00281     OcrEngineMode oem, char **configs, int configs_size,
00282     const GenericVector<STRING> *vars_vec,
00283     const GenericVector<STRING> *vars_values,
00284     bool set_only_non_debug_params) {
00285   GenericVector<STRING> langs_to_load;
00286   GenericVector<STRING> langs_not_to_load;
00287   ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
00288 
00289   sub_langs_.delete_data_pointers();
00290   sub_langs_.clear();
00291   // Find the first loadable lang and load into this.
00292   // Add any languages that this language requires
00293   bool loaded_primary = false;
00294   // Load the rest into sub_langs_.
00295   for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
00296     if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
00297       const char *lang_str = langs_to_load[lang_index].string();
00298       Tesseract *tess_to_init;
00299       if (!loaded_primary) {
00300         tess_to_init = this;
00301       } else {
00302         tess_to_init = new Tesseract;
00303       }
00304 
00305       int result = tess_to_init->init_tesseract_internal(
00306           arg0, textbase, lang_str, oem, configs, configs_size,
00307           vars_vec, vars_values, set_only_non_debug_params);
00308 
00309       if (!loaded_primary) {
00310         if (result < 0) {
00311           tprintf("Failed loading language '%s'\n", lang_str);
00312         } else {
00313           if (tessdata_manager_debug_level)
00314             tprintf("Loaded language '%s' as main language\n", lang_str);
00315           ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
00316                               &langs_to_load, &langs_not_to_load);
00317           loaded_primary = true;
00318         }
00319       } else {
00320         if (result < 0) {
00321           tprintf("Failed loading language '%s'\n", lang_str);
00322           delete tess_to_init;
00323         } else {
00324           if (tessdata_manager_debug_level)
00325             tprintf("Loaded language '%s' as secondary language\n", lang_str);
00326           sub_langs_.push_back(tess_to_init);
00327           // Add any languages that this language requires
00328           ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
00329                               &langs_to_load, &langs_not_to_load);
00330         }
00331       }
00332     }
00333   }
00334   if (!loaded_primary) {
00335     tprintf("Tesseract couldn't load any languages!\n");
00336     return -1;  // Couldn't load any language!
00337   }
00338   if (!sub_langs_.empty()) {
00339     // In multilingual mode word ratings have to be directly comparable,
00340     // so use the same language model weights for all languages:
00341     // use the primary language's params model if
00342     // tessedit_use_primary_params_model is set,
00343     // otherwise use default language model weights.
00344     if (tessedit_use_primary_params_model) {
00345       for (int s = 0; s < sub_langs_.size(); ++s) {
00346         sub_langs_[s]->language_model_->getParamsModel().Copy(
00347             this->language_model_->getParamsModel());
00348       }
00349       tprintf("Using params model of the primary language\n");
00350       if (tessdata_manager_debug_level)  {
00351         this->language_model_->getParamsModel().Print();
00352       }
00353     } else {
00354       this->language_model_->getParamsModel().Clear();
00355       for (int s = 0; s < sub_langs_.size(); ++s) {
00356         sub_langs_[s]->language_model_->getParamsModel().Clear();
00357       }
00358       tprintf("Using default language params\n");
00359     }
00360   }
00361 
00362   SetupUniversalFontIds();
00363   return 0;
00364 }
00365 
00366 // Common initialization for a single language.
00367 // arg0 is the datapath for the tessdata directory, which could be the
00368 // path of the tessdata directory with no trailing /, or (if tessdata
00369 // lives in the same directory as the executable, the path of the executable,
00370 // hence the name arg0.
00371 // textbase is an optional output file basename (used only for training)
00372 // language is the language code to load.
00373 // oem controls which engine(s) will operate on the image
00374 // configs (argv) is an array of config filenames to load variables from.
00375 // May be NULL.
00376 // configs_size (argc) is the number of elements in configs.
00377 // vars_vec is an optional vector of variables to set.
00378 // vars_values is an optional corresponding vector of values for the variables
00379 // in vars_vec.
00380 // If set_only_init_params is true, then only the initialization variables
00381 // will be set.
00382 int Tesseract::init_tesseract_internal(
00383     const char *arg0, const char *textbase, const char *language,
00384     OcrEngineMode oem, char **configs, int configs_size,
00385     const GenericVector<STRING> *vars_vec,
00386     const GenericVector<STRING> *vars_values,
00387     bool set_only_non_debug_params) {
00388   if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
00389                                 configs_size, vars_vec, vars_values,
00390                                 set_only_non_debug_params)) {
00391     return -1;
00392   }
00393   if (tessedit_init_config_only) {
00394     tessdata_manager.End();
00395     return 0;
00396   }
00397   // If only Cube will be used, skip loading Tesseract classifier's
00398   // pre-trained templates.
00399   bool init_tesseract_classifier =
00400     (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY ||
00401      tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED);
00402   // If only Cube will be used and if it has its own Unicharset,
00403   // skip initializing permuter and loading Tesseract Dawgs.
00404   bool init_dict =
00405     !(tessedit_ocr_engine_mode == OEM_CUBE_ONLY &&
00406       tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET));
00407   program_editup(textbase, init_tesseract_classifier, init_dict);
00408   tessdata_manager.End();
00409   return 0;                      //Normal exit
00410 }
00411 
00412 // Helper builds the all_fonts table by adding new fonts from new_fonts.
00413 static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
00414                          UnicityTable<FontInfo>* all_fonts) {
00415   for (int i = 0; i < new_fonts.size(); ++i) {
00416     // UnicityTable uniques as we go.
00417     all_fonts->push_back(new_fonts.get(i));
00418   }
00419 }
00420 
00421 // Helper assigns an id to lang_fonts using the index in all_fonts table.
00422 static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
00423                       UnicityTable<FontInfo>* lang_fonts) {
00424   for (int i = 0; i < lang_fonts->size(); ++i) {
00425     int index = all_fonts.get_id(lang_fonts->get(i));
00426     lang_fonts->get_mutable(i)->universal_id = index;
00427   }
00428 }
00429 
00430 // Set the universal_id member of each font to be unique among all
00431 // instances of the same font loaded.
00432 void Tesseract::SetupUniversalFontIds() {
00433   // Note that we can get away with bitwise copying FontInfo in
00434   // all_fonts, as it is a temporary structure and we avoid setting the
00435   // delete callback.
00436   UnicityTable<FontInfo> all_fonts;
00437   all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo));
00438 
00439   // Create the universal ID table.
00440   CollectFonts(get_fontinfo_table(), &all_fonts);
00441   for (int i = 0; i < sub_langs_.size(); ++i) {
00442     CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
00443   }
00444   // Assign ids from the table to each font table.
00445   AssignIds(all_fonts, &get_fontinfo_table());
00446   for (int i = 0; i < sub_langs_.size(); ++i) {
00447     AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
00448   }
00449   font_table_size_ = all_fonts.size();
00450 }
00451 
00452 // init the LM component
00453 int Tesseract::init_tesseract_lm(const char *arg0,
00454                    const char *textbase,
00455                    const char *language) {
00456   if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
00457                                 NULL, 0, NULL, NULL, false))
00458     return -1;
00459   getDict().Load(Dict::GlobalDawgCache());
00460   tessdata_manager.End();
00461   return 0;
00462 }
00463 
00464 void Tesseract::end_tesseract() {
00465   end_recog();
00466 }
00467 
// Command type identifiers.
enum CMD_EVENTS {
  ACTION_1_CMD_EVENT = 0,
  RECOG_WERDS,
  RECOG_PSEUDO,
  ACTION_2_CMD_EVENT
};
00477 
00478 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines