tesseract
3.03
|
00001 /********************************************************************** 00002 * File: tessedit.cpp (Formerly tessedit.c) 00003 * Description: Main program for merge of tess and editor. 00004 * Author: Ray Smith 00005 * Created: Tue Jan 07 15:21:46 GMT 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #include "stderr.h" 00021 #include "basedir.h" 00022 #include "tessvars.h" 00023 #include "control.h" 00024 #include "reject.h" 00025 #include "pageres.h" 00026 #include "nwmain.h" 00027 #include "pgedit.h" 00028 #include "tprintf.h" 00029 #include "tessedit.h" 00030 #include "stopper.h" 00031 #include "intmatcher.h" 00032 #include "chop.h" 00033 #include "efio.h" 00034 #include "danerror.h" 00035 #include "globals.h" 00036 #include "tesseractclass.h" 00037 #include "params.h" 00038 00039 #define VARDIR "configs/" /*variables files */ 00040 //config under api 00041 #define API_CONFIG "configs/api_config" 00042 00043 ETEXT_DESC *global_monitor = NULL; // progress monitor 00044 00045 namespace tesseract { 00046 00047 // Read a "config" file containing a set of variable, value pairs. 00048 // Searches the standard places: tessdata/configs, tessdata/tessconfigs 00049 // and also accepts a relative or absolute path name. 00050 void Tesseract::read_config_file(const char *filename, 00051 SetParamConstraint constraint) { 00052 STRING path = datadir; 00053 path += "configs/"; 00054 path += filename; 00055 FILE* fp; 00056 if ((fp = fopen(path.string(), "rb")) != NULL) { 00057 fclose(fp); 00058 } else { 00059 path = datadir; 00060 path += "tessconfigs/"; 00061 path += filename; 00062 if ((fp = fopen(path.string(), "rb")) != NULL) { 00063 fclose(fp); 00064 } else { 00065 path = filename; 00066 } 00067 } 00068 ParamUtils::ReadParamsFile(path.string(), constraint, this->params()); 00069 } 00070 00071 // Returns false if a unicharset file for the specified language was not found 00072 // or was invalid. 00073 // This function initializes TessdataManager. After TessdataManager is 00074 // no longer needed, TessdataManager::End() should be called. 00075 // 00076 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless 00077 // it is OEM_DEFAULT, in which case the value of the variable will be obtained 00078 // from the language-specific config file (stored in [lang].traineddata), from 00079 // the config files specified on the command line or left as the default 00080 // OEM_TESSERACT_ONLY if none of the configs specify this variable. 00081 bool Tesseract::init_tesseract_lang_data( 00082 const char *arg0, const char *textbase, const char *language, 00083 OcrEngineMode oem, char **configs, int configs_size, 00084 const GenericVector<STRING> *vars_vec, 00085 const GenericVector<STRING> *vars_values, 00086 bool set_only_non_debug_params) { 00087 // Set the basename, compute the data directory. 00088 main_setup(arg0, textbase); 00089 00090 // Set the language data path prefix 00091 lang = language != NULL ? language : "eng"; 00092 language_data_path_prefix = datadir; 00093 language_data_path_prefix += lang; 00094 language_data_path_prefix += "."; 00095 00096 // Initialize TessdataManager. 00097 STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix; 00098 if (!tessdata_manager.Init(tessdata_path.string(), 00099 tessdata_manager_debug_level)) { 00100 return false; 00101 } 00102 00103 // If a language specific config file (lang.config) exists, load it in. 00104 if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) { 00105 ParamUtils::ReadParamsFromFp( 00106 tessdata_manager.GetDataFilePtr(), 00107 tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG), 00108 SET_PARAM_CONSTRAINT_NONE, this->params()); 00109 if (tessdata_manager_debug_level) { 00110 tprintf("Loaded language config file\n"); 00111 } 00112 } 00113 00114 SetParamConstraint set_params_constraint = set_only_non_debug_params ? 00115 SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE; 00116 // Load tesseract variables from config files. This is done after loading 00117 // language-specific variables from [lang].traineddata file, so that custom 00118 // config files can override values in [lang].traineddata file. 00119 for (int i = 0; i < configs_size; ++i) { 00120 read_config_file(configs[i], set_params_constraint); 00121 } 00122 00123 // Set params specified in vars_vec (done after setting params from config 00124 // files, so that params in vars_vec can override those from files). 00125 if (vars_vec != NULL && vars_values != NULL) { 00126 for (int i = 0; i < vars_vec->size(); ++i) { 00127 if (!ParamUtils::SetParam((*vars_vec)[i].string(), 00128 (*vars_values)[i].string(), 00129 set_params_constraint, this->params())) { 00130 tprintf("Error setting param %s\n", (*vars_vec)[i].string()); 00131 exit(1); 00132 } 00133 } 00134 } 00135 00136 if (((STRING &)tessedit_write_params_to_file).length() > 0) { 00137 FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb"); 00138 if (params_file != NULL) { 00139 ParamUtils::PrintParams(params_file, this->params()); 00140 fclose(params_file); 00141 if (tessdata_manager_debug_level > 0) { 00142 tprintf("Wrote parameters to %s\n", 00143 tessedit_write_params_to_file.string()); 00144 } 00145 } else { 00146 tprintf("Failed to open %s for writing params.\n", 00147 tessedit_write_params_to_file.string()); 00148 } 00149 } 00150 00151 // Determine which ocr engine(s) should be loaded and used for recognition. 00152 if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem); 00153 if (tessdata_manager_debug_level) { 00154 tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n", 00155 static_cast<int>(tessedit_ocr_engine_mode)); 00156 } 00157 00158 // If we are only loading the config file (and so not planning on doing any 00159 // recognition) then there's nothing else do here. 00160 if (tessedit_init_config_only) { 00161 if (tessdata_manager_debug_level) { 00162 tprintf("Returning after loading config file\n"); 00163 } 00164 return true; 00165 } 00166 00167 // Load the unicharset 00168 if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) || 00169 !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) { 00170 return false; 00171 } 00172 if (unicharset.size() > MAX_NUM_CLASSES) { 00173 tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n"); 00174 return false; 00175 } 00176 if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n"); 00177 right_to_left_ = unicharset.major_right_to_left(); 00178 00179 // Setup initial unichar ambigs table and read universal ambigs. 00180 UNICHARSET encoder_unicharset; 00181 encoder_unicharset.CopyFrom(unicharset); 00182 unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption); 00183 unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset); 00184 00185 if (!tessedit_ambigs_training && 00186 tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) { 00187 TFile ambigs_file; 00188 ambigs_file.Open(tessdata_manager.GetDataFilePtr(), 00189 tessdata_manager.GetEndOffset(TESSDATA_AMBIGS) + 1); 00190 unichar_ambigs.LoadUnicharAmbigs( 00191 encoder_unicharset, 00192 &ambigs_file, 00193 ambigs_debug_level, use_ambigs_for_adaption, &unicharset); 00194 if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n"); 00195 } 00196 00197 // Load Cube objects if necessary. 00198 if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { 00199 ASSERT_HOST(init_cube_objects(false, &tessdata_manager)); 00200 if (tessdata_manager_debug_level) 00201 tprintf("Loaded Cube w/out combiner\n"); 00202 } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { 00203 ASSERT_HOST(init_cube_objects(true, &tessdata_manager)); 00204 if (tessdata_manager_debug_level) 00205 tprintf("Loaded Cube with combiner\n"); 00206 } 00207 00208 // Init ParamsModel. 00209 // Load pass1 and pass2 weights (for now these two sets are the same, but in 00210 // the future separate sets of weights can be generated). 00211 for (int p = ParamsModel::PTRAIN_PASS1; 00212 p < ParamsModel::PTRAIN_NUM_PASSES; ++p) { 00213 language_model_->getParamsModel().SetPass( 00214 static_cast<ParamsModel::PassEnum>(p)); 00215 if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) { 00216 if (!language_model_->getParamsModel().LoadFromFp( 00217 lang.string(), tessdata_manager.GetDataFilePtr(), 00218 tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) { 00219 return false; 00220 } 00221 } 00222 } 00223 if (tessdata_manager_debug_level) language_model_->getParamsModel().Print(); 00224 00225 return true; 00226 } 00227 00228 // Helper returns true if the given string is in the vector of strings. 00229 static bool IsStrInList(const STRING& str, 00230 const GenericVector<STRING>& str_list) { 00231 for (int i = 0; i < str_list.size(); ++i) { 00232 if (str_list[i] == str) 00233 return true; 00234 } 00235 return false; 00236 } 00237 00238 // Parse a string of the form [~]<lang>[+[~]<lang>]*. 00239 // Langs with no prefix get appended to to_load, provided they 00240 // are not in there already. 00241 // Langs with ~ prefix get appended to not_to_load, provided they are not in 00242 // there already. 00243 void Tesseract::ParseLanguageString(const char* lang_str, 00244 GenericVector<STRING>* to_load, 00245 GenericVector<STRING>* not_to_load) { 00246 STRING remains(lang_str); 00247 while (remains.length() > 0) { 00248 // Find the start of the lang code and which vector to add to. 00249 const char* start = remains.string(); 00250 while (*start == '+') 00251 ++start; 00252 GenericVector<STRING>* target = to_load; 00253 if (*start == '~') { 00254 target = not_to_load; 00255 ++start; 00256 } 00257 // Find the index of the end of the lang code in string start. 00258 int end = strlen(start); 00259 const char* plus = strchr(start, '+'); 00260 if (plus != NULL && plus - start < end) 00261 end = plus - start; 00262 STRING lang_code(start); 00263 lang_code.truncate_at(end); 00264 STRING next(start + end); 00265 remains = next; 00266 // Check whether lang_code is already in the target vector and add. 00267 if (!IsStrInList(lang_code, *target)) { 00268 if (tessdata_manager_debug_level) 00269 tprintf("Adding language '%s' to list\n", lang_code.string()); 00270 target->push_back(lang_code); 00271 } 00272 } 00273 } 00274 00275 // Initialize for potentially a set of languages defined by the language 00276 // string and recursively any additional languages required by any language 00277 // traineddata file (via tessedit_load_sublangs in its config) that is loaded. 00278 // See init_tesseract_internal for args. 00279 int Tesseract::init_tesseract( 00280 const char *arg0, const char *textbase, const char *language, 00281 OcrEngineMode oem, char **configs, int configs_size, 00282 const GenericVector<STRING> *vars_vec, 00283 const GenericVector<STRING> *vars_values, 00284 bool set_only_non_debug_params) { 00285 GenericVector<STRING> langs_to_load; 00286 GenericVector<STRING> langs_not_to_load; 00287 ParseLanguageString(language, &langs_to_load, &langs_not_to_load); 00288 00289 sub_langs_.delete_data_pointers(); 00290 sub_langs_.clear(); 00291 // Find the first loadable lang and load into this. 00292 // Add any languages that this language requires 00293 bool loaded_primary = false; 00294 // Load the rest into sub_langs_. 00295 for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) { 00296 if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) { 00297 const char *lang_str = langs_to_load[lang_index].string(); 00298 Tesseract *tess_to_init; 00299 if (!loaded_primary) { 00300 tess_to_init = this; 00301 } else { 00302 tess_to_init = new Tesseract; 00303 } 00304 00305 int result = tess_to_init->init_tesseract_internal( 00306 arg0, textbase, lang_str, oem, configs, configs_size, 00307 vars_vec, vars_values, set_only_non_debug_params); 00308 00309 if (!loaded_primary) { 00310 if (result < 0) { 00311 tprintf("Failed loading language '%s'\n", lang_str); 00312 } else { 00313 if (tessdata_manager_debug_level) 00314 tprintf("Loaded language '%s' as main language\n", lang_str); 00315 ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), 00316 &langs_to_load, &langs_not_to_load); 00317 loaded_primary = true; 00318 } 00319 } else { 00320 if (result < 0) { 00321 tprintf("Failed loading language '%s'\n", lang_str); 00322 delete tess_to_init; 00323 } else { 00324 if (tessdata_manager_debug_level) 00325 tprintf("Loaded language '%s' as secondary language\n", lang_str); 00326 sub_langs_.push_back(tess_to_init); 00327 // Add any languages that this language requires 00328 ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), 00329 &langs_to_load, &langs_not_to_load); 00330 } 00331 } 00332 } 00333 } 00334 if (!loaded_primary) { 00335 tprintf("Tesseract couldn't load any languages!\n"); 00336 return -1; // Couldn't load any language! 00337 } 00338 if (!sub_langs_.empty()) { 00339 // In multilingual mode word ratings have to be directly comparable, 00340 // so use the same language model weights for all languages: 00341 // use the primary language's params model if 00342 // tessedit_use_primary_params_model is set, 00343 // otherwise use default language model weights. 00344 if (tessedit_use_primary_params_model) { 00345 for (int s = 0; s < sub_langs_.size(); ++s) { 00346 sub_langs_[s]->language_model_->getParamsModel().Copy( 00347 this->language_model_->getParamsModel()); 00348 } 00349 tprintf("Using params model of the primary language\n"); 00350 if (tessdata_manager_debug_level) { 00351 this->language_model_->getParamsModel().Print(); 00352 } 00353 } else { 00354 this->language_model_->getParamsModel().Clear(); 00355 for (int s = 0; s < sub_langs_.size(); ++s) { 00356 sub_langs_[s]->language_model_->getParamsModel().Clear(); 00357 } 00358 tprintf("Using default language params\n"); 00359 } 00360 } 00361 00362 SetupUniversalFontIds(); 00363 return 0; 00364 } 00365 00366 // Common initialization for a single language. 00367 // arg0 is the datapath for the tessdata directory, which could be the 00368 // path of the tessdata directory with no trailing /, or (if tessdata 00369 // lives in the same directory as the executable, the path of the executable, 00370 // hence the name arg0. 00371 // textbase is an optional output file basename (used only for training) 00372 // language is the language code to load. 00373 // oem controls which engine(s) will operate on the image 00374 // configs (argv) is an array of config filenames to load variables from. 00375 // May be NULL. 00376 // configs_size (argc) is the number of elements in configs. 00377 // vars_vec is an optional vector of variables to set. 00378 // vars_values is an optional corresponding vector of values for the variables 00379 // in vars_vec. 00380 // If set_only_init_params is true, then only the initialization variables 00381 // will be set. 00382 int Tesseract::init_tesseract_internal( 00383 const char *arg0, const char *textbase, const char *language, 00384 OcrEngineMode oem, char **configs, int configs_size, 00385 const GenericVector<STRING> *vars_vec, 00386 const GenericVector<STRING> *vars_values, 00387 bool set_only_non_debug_params) { 00388 if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs, 00389 configs_size, vars_vec, vars_values, 00390 set_only_non_debug_params)) { 00391 return -1; 00392 } 00393 if (tessedit_init_config_only) { 00394 tessdata_manager.End(); 00395 return 0; 00396 } 00397 // If only Cube will be used, skip loading Tesseract classifier's 00398 // pre-trained templates. 00399 bool init_tesseract_classifier = 00400 (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY || 00401 tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED); 00402 // If only Cube will be used and if it has its own Unicharset, 00403 // skip initializing permuter and loading Tesseract Dawgs. 00404 bool init_dict = 00405 !(tessedit_ocr_engine_mode == OEM_CUBE_ONLY && 00406 tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET)); 00407 program_editup(textbase, init_tesseract_classifier, init_dict); 00408 tessdata_manager.End(); 00409 return 0; //Normal exit 00410 } 00411 00412 // Helper builds the all_fonts table by adding new fonts from new_fonts. 00413 static void CollectFonts(const UnicityTable<FontInfo>& new_fonts, 00414 UnicityTable<FontInfo>* all_fonts) { 00415 for (int i = 0; i < new_fonts.size(); ++i) { 00416 // UnicityTable uniques as we go. 00417 all_fonts->push_back(new_fonts.get(i)); 00418 } 00419 } 00420 00421 // Helper assigns an id to lang_fonts using the index in all_fonts table. 00422 static void AssignIds(const UnicityTable<FontInfo>& all_fonts, 00423 UnicityTable<FontInfo>* lang_fonts) { 00424 for (int i = 0; i < lang_fonts->size(); ++i) { 00425 int index = all_fonts.get_id(lang_fonts->get(i)); 00426 lang_fonts->get_mutable(i)->universal_id = index; 00427 } 00428 } 00429 00430 // Set the universal_id member of each font to be unique among all 00431 // instances of the same font loaded. 00432 void Tesseract::SetupUniversalFontIds() { 00433 // Note that we can get away with bitwise copying FontInfo in 00434 // all_fonts, as it is a temporary structure and we avoid setting the 00435 // delete callback. 00436 UnicityTable<FontInfo> all_fonts; 00437 all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo)); 00438 00439 // Create the universal ID table. 00440 CollectFonts(get_fontinfo_table(), &all_fonts); 00441 for (int i = 0; i < sub_langs_.size(); ++i) { 00442 CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts); 00443 } 00444 // Assign ids from the table to each font table. 00445 AssignIds(all_fonts, &get_fontinfo_table()); 00446 for (int i = 0; i < sub_langs_.size(); ++i) { 00447 AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table()); 00448 } 00449 font_table_size_ = all_fonts.size(); 00450 } 00451 00452 // init the LM component 00453 int Tesseract::init_tesseract_lm(const char *arg0, 00454 const char *textbase, 00455 const char *language) { 00456 if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY, 00457 NULL, 0, NULL, NULL, false)) 00458 return -1; 00459 getDict().Load(Dict::GlobalDawgCache()); 00460 tessdata_manager.End(); 00461 return 0; 00462 } 00463 00464 void Tesseract::end_tesseract() { 00465 end_recog(); 00466 } 00467 00468 /* Define command type identifiers */ 00469 00470 enum CMD_EVENTS 00471 { 00472 ACTION_1_CMD_EVENT, 00473 RECOG_WERDS, 00474 RECOG_PSEUDO, 00475 ACTION_2_CMD_EVENT 00476 }; 00477 00478 } // namespace tesseract