tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/api/baseapi.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        baseapi.cpp
00003  * Description: Simple API for calling tesseract.
00004  * Author:      Ray Smith
00005  * Created:     Fri Oct 06 15:35:01 PDT 2006
00006  *
00007  * (C) Copyright 2006, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 // Include automatically generated configuration file if running autoconf.
00021 #ifdef HAVE_CONFIG_H
00022 #include "config_auto.h"
00023 #endif
00024 
00025 #ifdef __linux__
00026 #include <signal.h>
00027 #endif
00028 
00029 #if defined(_WIN32)
00030 #ifdef _MSC_VER
00031 #include "mathfix.h"
00032 #elif MINGW
00033 // workaround for stdlib.h with -std=c++11 for _splitpath and _MAX_FNAME
00034 #undef __STRICT_ANSI__
00035 #endif  // _MSC_VER
00036 #include <stdlib.h>
00037 #include <windows.h>
00038 #else
00039 #include <dirent.h>
00040 #include <libgen.h>
00041 #include <string.h>
00042 #endif  // _WIN32
00043 
00044 #if !defined(VERSION)
00045 #include "version.h"
00046 #endif
00047 
00048 #include "allheaders.h"
00049 
00050 #include "baseapi.h"
00051 #include "resultiterator.h"
00052 #include "mutableiterator.h"
00053 #include "thresholder.h"
00054 #include "tesseractclass.h"
00055 #include "pageres.h"
00056 #include "paragraphs.h"
00057 #include "tessvars.h"
00058 #include "control.h"
00059 #include "dict.h"
00060 #include "pgedit.h"
00061 #include "paramsd.h"
00062 #include "output.h"
00063 #include "globaloc.h"
00064 #include "globals.h"
00065 #include "edgblob.h"
00066 #include "equationdetect.h"
00067 #include "tessbox.h"
00068 #include "makerow.h"
00069 #include "otsuthr.h"
00070 #include "osdetect.h"
00071 #include "params.h"
00072 #include "renderer.h"
00073 #include "strngs.h"
00074 #include "openclwrapper.h"
00075 
00076 namespace tesseract {
00077 
// Smallest width/height (in pixels) that TesseractRect will process.
int const kMinRectSize = 10;
// Presumably the character emitted for a rejected result in Tesseract
// output -- TODO confirm against the output code.
char const kTesseractReject = '~';
// Presumably the reject marker used in UNLV-format output -- TODO confirm.
char const kUNLVReject = '~';
// Presumably the suspect marker used in UNLV-format output -- TODO confirm.
char const kUNLVSuspect = '^';
// Presumably the fallback input file name -- TODO confirm where it is used.
char const* kInputFile = "noname.tif";
// Presumably the file that receives variables after a failure -- TODO confirm.
char const* kOldVarsFile = "failed_vars.txt";
// Size of the buffer ProcessPages uses to format a page number as text.
int const kMaxIntSize = 22;
// Presumably the lower bound on a believable image resolution (ppi).
int const kMinCredibleResolution = 70;
// Presumably the upper bound on a believable image resolution (ppi).
int const kMaxCredibleResolution = 2400;
00104 
00105 TessBaseAPI::TessBaseAPI()
00106   : tesseract_(NULL),
00107     osd_tesseract_(NULL),
00108     equ_detect_(NULL),
00109     // Thresholder is initialized to NULL here, but will be set before use by:
00110     // A constructor of a derived API,  SetThresholder(), or
00111     // created implicitly when used in InternalSetImage.
00112     thresholder_(NULL),
00113     paragraph_models_(NULL),
00114     block_list_(NULL),
00115     page_res_(NULL),
00116     input_file_(NULL),
00117     input_image_(NULL),
00118     output_file_(NULL),
00119     datapath_(NULL),
00120     language_(NULL),
00121     last_oem_requested_(OEM_DEFAULT),
00122     recognition_done_(false),
00123     truth_cb_(NULL),
00124     rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0),
00125     image_width_(0), image_height_(0) {
00126 }
00127 
00128 TessBaseAPI::~TessBaseAPI() {
00129   End();
00130 }
00131 
00135 const char* TessBaseAPI::Version() {
00136   return VERSION;
00137 }
00138 
00146 #ifdef USE_OPENCL
00147 #if USE_DEVICE_SELECTION
00148 #include "opencl_device_selection.h"
00149 #endif
00150 #endif
00151 size_t TessBaseAPI::getOpenCLDevice(void **data) {
00152 #ifdef USE_OPENCL
00153 #if USE_DEVICE_SELECTION
00154   ds_device device = OpenclDevice::getDeviceSelection();
00155   if (device.type == DS_DEVICE_OPENCL_DEVICE) {
00156     *data = reinterpret_cast<void*>(new cl_device_id);
00157     memcpy(*data, &device.oclDeviceID, sizeof(cl_device_id));
00158     return sizeof(cl_device_id);
00159   }
00160 #endif
00161 #endif
00162 
00163   *data = NULL;
00164   return 0;
00165 }
00166 
00171 void TessBaseAPI::CatchSignals() {
00172 #ifdef __linux__
00173   struct sigaction action;
00174   memset(&action, 0, sizeof(action));
00175   action.sa_handler = &signal_exit;
00176   action.sa_flags = SA_RESETHAND;
00177   sigaction(SIGSEGV, &action, NULL);
00178   sigaction(SIGFPE, &action, NULL);
00179   sigaction(SIGBUS, &action, NULL);
00180 #else
00181   // Warn API users that an implementation is needed.
00182   tprintf("CatchSignals has no non-linux implementation!\n");
00183 #endif
00184 }
00185 
00190 void TessBaseAPI::SetInputName(const char* name) {
00191   if (input_file_ == NULL)
00192     input_file_ = new STRING(name);
00193   else
00194     *input_file_ = name;
00195 }
00196 
00198 void TessBaseAPI::SetOutputName(const char* name) {
00199   if (output_file_ == NULL)
00200     output_file_ = new STRING(name);
00201   else
00202     *output_file_ = name;
00203 }
00204 
00205 bool TessBaseAPI::SetVariable(const char* name, const char* value) {
00206   if (tesseract_ == NULL) tesseract_ = new Tesseract;
00207   return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
00208                               tesseract_->params());
00209 }
00210 
00211 bool TessBaseAPI::SetDebugVariable(const char* name, const char* value) {
00212   if (tesseract_ == NULL) tesseract_ = new Tesseract;
00213   return ParamUtils::SetParam(name, value, SET_PARAM_CONSTRAINT_DEBUG_ONLY,
00214                               tesseract_->params());
00215 }
00216 
00217 bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
00218   IntParam *p = ParamUtils::FindParam<IntParam>(
00219       name, GlobalParams()->int_params, tesseract_->params()->int_params);
00220   if (p == NULL) return false;
00221   *value = (inT32)(*p);
00222   return true;
00223 }
00224 
00225 bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
00226   BoolParam *p = ParamUtils::FindParam<BoolParam>(
00227       name, GlobalParams()->bool_params, tesseract_->params()->bool_params);
00228   if (p == NULL) return false;
00229   *value = (BOOL8)(*p);
00230   return true;
00231 }
00232 
00233 const char *TessBaseAPI::GetStringVariable(const char *name) const {
00234   StringParam *p = ParamUtils::FindParam<StringParam>(
00235       name, GlobalParams()->string_params, tesseract_->params()->string_params);
00236   return (p != NULL) ? p->string() : NULL;
00237 }
00238 
00239 bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
00240   DoubleParam *p = ParamUtils::FindParam<DoubleParam>(
00241       name, GlobalParams()->double_params, tesseract_->params()->double_params);
00242   if (p == NULL) return false;
00243   *value = (double)(*p);
00244   return true;
00245 }
00246 
00248 bool TessBaseAPI::GetVariableAsString(const char *name, STRING *val) {
00249   return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
00250 }
00251 
00253 void TessBaseAPI::PrintVariables(FILE *fp) const {
00254   ParamUtils::PrintParams(fp, tesseract_->params());
00255 }
00256 
00265 int TessBaseAPI::Init(const char* datapath, const char* language,
00266                       OcrEngineMode oem, char **configs, int configs_size,
00267                       const GenericVector<STRING> *vars_vec,
00268                       const GenericVector<STRING> *vars_values,
00269                       bool set_only_non_debug_params) {
00270   PERF_COUNT_START("TessBaseAPI::Init")
00271   // Default language is "eng".
00272   if (language == NULL) language = "eng";
00273   // If the datapath, OcrEngineMode or the language have changed - start again.
00274   // Note that the language_ field stores the last requested language that was
00275   // initialized successfully, while tesseract_->lang stores the language
00276   // actually used. They differ only if the requested language was NULL, in
00277   // which case tesseract_->lang is set to the Tesseract default ("eng").
00278   if (tesseract_ != NULL &&
00279       (datapath_ == NULL || language_ == NULL ||
00280        *datapath_ != datapath || last_oem_requested_ != oem ||
00281        (*language_ != language && tesseract_->lang != language))) {
00282     delete tesseract_;
00283     tesseract_ = NULL;
00284   }
00285   // PERF_COUNT_SUB("delete tesseract_")
00286 #ifdef USE_OPENCL
00287   OpenclDevice od;
00288   od.InitEnv();
00289 #endif
00290   PERF_COUNT_SUB("OD::InitEnv()")
00291   bool reset_classifier = true;
00292   if (tesseract_ == NULL) {
00293     reset_classifier = false;
00294     tesseract_ = new Tesseract;
00295     if (tesseract_->init_tesseract(
00296         datapath, output_file_ != NULL ? output_file_->string() : NULL,
00297         language, oem, configs, configs_size, vars_vec, vars_values,
00298         set_only_non_debug_params) != 0) {
00299       return -1;
00300     }
00301   }
00302   PERF_COUNT_SUB("update tesseract_")
00303   // Update datapath and language requested for the last valid initialization.
00304   if (datapath_ == NULL)
00305     datapath_ = new STRING(datapath);
00306   else
00307     *datapath_ = datapath;
00308   if ((strcmp(datapath_->string(), "") == 0) &&
00309       (strcmp(tesseract_->datadir.string(), "") != 0))
00310      *datapath_ = tesseract_->datadir;
00311 
00312   if (language_ == NULL)
00313     language_ = new STRING(language);
00314   else
00315     *language_ = language;
00316   last_oem_requested_ = oem;
00317   // PERF_COUNT_SUB("update last_oem_requested_")
00318   // For same language and datapath, just reset the adaptive classifier.
00319   if (reset_classifier) {
00320     tesseract_->ResetAdaptiveClassifier();
00321     PERF_COUNT_SUB("tesseract_->ResetAdaptiveClassifier()")
00322   }
00323   PERF_COUNT_END
00324   return 0;
00325 }
00326 
00335 const char* TessBaseAPI::GetInitLanguagesAsString() const {
00336   return (language_ == NULL || language_->string() == NULL) ?
00337       "" : language_->string();
00338 }
00339 
00345 void TessBaseAPI::GetLoadedLanguagesAsVector(
00346     GenericVector<STRING>* langs) const {
00347   langs->clear();
00348   if (tesseract_ != NULL) {
00349     langs->push_back(tesseract_->lang);
00350     int num_subs = tesseract_->num_sub_langs();
00351     for (int i = 0; i < num_subs; ++i)
00352       langs->push_back(tesseract_->get_sub_lang(i)->lang);
00353   }
00354 }
00355 
00359 void TessBaseAPI::GetAvailableLanguagesAsVector(
00360     GenericVector<STRING>* langs) const {
00361   langs->clear();
00362   if (tesseract_ != NULL) {
00363 #ifdef _WIN32
00364     STRING pattern = tesseract_->datadir + "/*." + kTrainedDataSuffix;
00365     char fname[_MAX_FNAME];
00366     WIN32_FIND_DATA data;
00367     BOOL result = TRUE;
00368     HANDLE handle = FindFirstFile(pattern.string(), &data);
00369     if (handle != INVALID_HANDLE_VALUE) {
00370       for (; result; result = FindNextFile(handle, &data)) {
00371         _splitpath(data.cFileName, NULL, NULL, fname, NULL);
00372         langs->push_back(STRING(fname));
00373       }
00374       FindClose(handle);
00375     }
00376 #else  // _WIN32
00377     DIR *dir;
00378     struct dirent *dirent;
00379     char *dot;
00380 
00381     STRING extension = STRING(".") + kTrainedDataSuffix;
00382 
00383     dir = opendir(tesseract_->datadir.string());
00384     if (dir != NULL) {
00385       while ((dirent = readdir(dir))) {
00386         // Skip '.', '..', and hidden files
00387         if (dirent->d_name[0] != '.') {
00388           if (strstr(dirent->d_name, extension.string()) != NULL) {
00389             dot = strrchr(dirent->d_name, '.');
00390             // This ensures that .traineddata is at the end of the file name
00391             if (strncmp(dot, extension.string(),
00392                         strlen(extension.string())) == 0) {
00393               *dot = '\0';
00394               langs->push_back(STRING(dirent->d_name));
00395             }
00396           }
00397         }
00398       }
00399       closedir(dir);
00400     }
00401 #endif
00402   }
00403 }
00404 
00411 int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
00412   if (tesseract_ == NULL)
00413     tesseract_ = new Tesseract;
00414   else
00415     ParamUtils::ResetToDefaults(tesseract_->params());
00416   return tesseract_->init_tesseract_lm(datapath, NULL, language);
00417 }
00418 
00423 void TessBaseAPI::InitForAnalysePage() {
00424   if (tesseract_ == NULL) {
00425     tesseract_ = new Tesseract;
00426     tesseract_->InitAdaptiveClassifier(false);
00427   }
00428 }
00429 
00435 void TessBaseAPI::ReadConfigFile(const char* filename) {
00436   tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_NON_INIT_ONLY);
00437 }
00438 
00440 void TessBaseAPI::ReadDebugConfigFile(const char* filename) {
00441   tesseract_->read_config_file(filename, SET_PARAM_CONSTRAINT_DEBUG_ONLY);
00442 }
00443 
00449 void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
00450   if (tesseract_ == NULL)
00451     tesseract_ = new Tesseract;
00452   tesseract_->tessedit_pageseg_mode.set_value(mode);
00453 }
00454 
00456 PageSegMode TessBaseAPI::GetPageSegMode() const {
00457   if (tesseract_ == NULL)
00458     return PSM_SINGLE_BLOCK;
00459   return static_cast<PageSegMode>(
00460     static_cast<int>(tesseract_->tessedit_pageseg_mode));
00461 }
00462 
00476 char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
00477                                  int bytes_per_pixel,
00478                                  int bytes_per_line,
00479                                  int left, int top,
00480                                  int width, int height) {
00481   if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize)
00482     return NULL;  // Nothing worth doing.
00483 
00484   // Since this original api didn't give the exact size of the image,
00485   // we have to invent a reasonable value.
00486   int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
00487   SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top,
00488            bytes_per_pixel, bytes_per_line);
00489   SetRectangle(left, top, width, height);
00490 
00491   return GetUTF8Text();
00492 }
00493 
00498 void TessBaseAPI::ClearAdaptiveClassifier() {
00499   if (tesseract_ == NULL)
00500     return;
00501   tesseract_->ResetAdaptiveClassifier();
00502   tesseract_->ResetDocumentDictionary();
00503 }
00504 
00514 void TessBaseAPI::SetImage(const unsigned char* imagedata,
00515                            int width, int height,
00516                            int bytes_per_pixel, int bytes_per_line) {
00517   if (InternalSetImage())
00518     thresholder_->SetImage(imagedata, width, height,
00519                            bytes_per_pixel, bytes_per_line);
00520 }
00521 
00522 void TessBaseAPI::SetSourceResolution(int ppi) {
00523   if (thresholder_)
00524     thresholder_->SetSourceYResolution(ppi);
00525   else
00526     tprintf("Please call SetImage before SetSourceResolution.\n");
00527 }
00528 
00539 void TessBaseAPI::SetImage(const Pix* pix) {
00540   if (InternalSetImage())
00541     thresholder_->SetImage(pix);
00542 }
00543 
00549 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
00550   if (thresholder_ == NULL)
00551     return;
00552   thresholder_->SetRectangle(left, top, width, height);
00553   ClearResults();
00554 }
00555 
00560 Pix* TessBaseAPI::GetThresholdedImage() {
00561   if (tesseract_ == NULL)
00562     return NULL;
00563   if (tesseract_->pix_binary() == NULL)
00564     Threshold(tesseract_->mutable_pix_binary());
00565   return pixClone(tesseract_->pix_binary());
00566 }
00567 
00573 Boxa* TessBaseAPI::GetRegions(Pixa** pixa) {
00574   return GetComponentImages(RIL_BLOCK, false, pixa, NULL);
00575 }
00576 
00585 Boxa* TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding,
00586                                 Pixa** pixa, int** blockids, int** paraids) {
00587   return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding,
00588                             pixa, blockids, paraids);
00589 }
00590 
00599 Boxa* TessBaseAPI::GetStrips(Pixa** pixa, int** blockids) {
00600   return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
00601 }
00602 
00608 Boxa* TessBaseAPI::GetWords(Pixa** pixa) {
00609   return GetComponentImages(RIL_WORD, true, pixa, NULL);
00610 }
00611 
00618 Boxa* TessBaseAPI::GetConnectedComponents(Pixa** pixa) {
00619   return GetComponentImages(RIL_SYMBOL, true, pixa, NULL);
00620 }
00621 
00630 Boxa* TessBaseAPI::GetComponentImages(PageIteratorLevel level,
00631                                       bool text_only, bool raw_image,
00632                                       const int raw_padding,
00633                                       Pixa** pixa, int** blockids,
00634                                       int** paraids) {
00635   PageIterator* page_it = GetIterator();
00636   if (page_it == NULL)
00637     page_it = AnalyseLayout();
00638   if (page_it == NULL)
00639     return NULL;  // Failed.
00640 
00641   // Count the components to get a size for the arrays.
00642   int component_count = 0;
00643   int left, top, right, bottom;
00644 
00645   TessResultCallback<bool>* get_bbox = NULL;
00646   if (raw_image) {
00647     // Get bounding box in original raw image with padding.
00648     get_bbox = NewPermanentTessCallback(page_it, &PageIterator::BoundingBox,
00649                                         level, raw_padding,
00650                                         &left, &top, &right, &bottom);
00651   } else {
00652     // Get bounding box from binarized imaged. Note that this could be
00653     // differently scaled from the original image.
00654     get_bbox = NewPermanentTessCallback(page_it,
00655                                         &PageIterator::BoundingBoxInternal,
00656                                         level, &left, &top, &right, &bottom);
00657   }
00658   do {
00659     if (get_bbox->Run() &&
00660         (!text_only || PTIsTextType(page_it->BlockType())))
00661       ++component_count;
00662   } while (page_it->Next(level));
00663 
00664   Boxa* boxa = boxaCreate(component_count);
00665   if (pixa != NULL)
00666     *pixa = pixaCreate(component_count);
00667   if (blockids != NULL)
00668     *blockids = new int[component_count];
00669   if (paraids != NULL)
00670     *paraids = new int[component_count];
00671 
00672   int blockid = 0;
00673   int paraid = 0;
00674   int component_index = 0;
00675   page_it->Begin();
00676   do {
00677     if (get_bbox->Run() &&
00678         (!text_only || PTIsTextType(page_it->BlockType()))) {
00679       Box* lbox = boxCreate(left, top, right - left, bottom - top);
00680       boxaAddBox(boxa, lbox, L_INSERT);
00681       if (pixa != NULL) {
00682         Pix* pix = NULL;
00683         if (raw_image) {
00684           pix = page_it->GetImage(level, raw_padding, &left, &top);
00685         } else {
00686           pix = page_it->GetBinaryImage(level);
00687         }
00688         pixaAddPix(*pixa, pix, L_INSERT);
00689         pixaAddBox(*pixa, lbox, L_CLONE);
00690       }
00691       if (paraids != NULL) {
00692         (*paraids)[component_index] = paraid;
00693         if (page_it->IsAtFinalElement(RIL_PARA, level))
00694           ++paraid;
00695       }
00696       if (blockids != NULL) {
00697         (*blockids)[component_index] = blockid;
00698         if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {
00699           ++blockid;
00700           paraid = 0;
00701         }
00702       }
00703       ++component_index;
00704     }
00705   } while (page_it->Next(level));
00706   delete page_it;
00707   delete get_bbox;
00708   return boxa;
00709 }
00710 
00711 int TessBaseAPI::GetThresholdedImageScaleFactor() const {
00712   if (thresholder_ == NULL) {
00713     return 0;
00714   }
00715   return thresholder_->GetScaleFactor();
00716 }
00717 
00719 void TessBaseAPI::DumpPGM(const char* filename) {
00720   if (tesseract_ == NULL)
00721     return;
00722   FILE *fp = fopen(filename, "wb");
00723   Pix* pix = tesseract_->pix_binary();
00724   int width = pixGetWidth(pix);
00725   int height = pixGetHeight(pix);
00726   l_uint32* data = pixGetData(pix);
00727   fprintf(fp, "P5 %d %d 255\n", width, height);
00728   for (int y = 0; y < height; ++y, data += pixGetWpl(pix)) {
00729     for (int x = 0; x < width; ++x) {
00730       uinT8 b = GET_DATA_BIT(data, x) ? 0 : 255;
00731       fwrite(&b, 1, 1, fp);
00732     }
00733   }
00734   fclose(fp);
00735 }
00736 
00743 int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks,
00744                 Boxa* boxa_words, Pixa* pixa_words,
00745                 const FCOORD& reskew, Pix* page_pix,
00746                 PAGE_RES* page_res) {
00747   int block_count = boxaGetCount(boxa_blocks);
00748   ASSERT_HOST(block_count == pixaGetCount(pixa_blocks));
00749   // Write each block to the current directory as junk_write_display.nnn.png.
00750   for (int i = 0; i < block_count; ++i) {
00751     Pix* pix = pixaGetPix(pixa_blocks, i, L_CLONE);
00752     pixDisplayWrite(pix, 1);
00753   }
00754   int word_count = boxaGetCount(boxa_words);
00755   ASSERT_HOST(word_count == pixaGetCount(pixa_words));
00756   int pr_word = 0;
00757   PAGE_RES_IT page_res_it(page_res);
00758   for (page_res_it.restart_page(); page_res_it.word () != NULL;
00759        page_res_it.forward(), ++pr_word) {
00760     WERD_RES *word = page_res_it.word();
00761     WERD_CHOICE* choice = word->best_choice;
00762     // Write the first 100 words to files names wordims/<wordstring>.tif.
00763     if (pr_word < 100) {
00764       STRING filename("wordims/");
00765       if (choice != NULL) {
00766         filename += choice->unichar_string();
00767       } else {
00768         char numbuf[32];
00769         filename += "unclassified";
00770         snprintf(numbuf, 32, "%03d", pr_word);
00771         filename += numbuf;
00772       }
00773       filename += ".tif";
00774       Pix* pix = pixaGetPix(pixa_words, pr_word, L_CLONE);
00775       pixWrite(filename.string(), pix, IFF_TIFF_G4);
00776     }
00777   }
00778   ASSERT_HOST(pr_word == word_count);
00779   return 0;
00780 }
00781 
00793 PageIterator* TessBaseAPI::AnalyseLayout() {
00794   if (FindLines() == 0) {
00795     if (block_list_->empty())
00796       return NULL;  // The page was empty.
00797     page_res_ = new PAGE_RES(block_list_, NULL);
00798     DetectParagraphs(false);
00799     return new PageIterator(
00800         page_res_, tesseract_, thresholder_->GetScaleFactor(),
00801         thresholder_->GetScaledYResolution(),
00802         rect_left_, rect_top_, rect_width_, rect_height_);
00803   }
00804   return NULL;
00805 }
00806 
00811 int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
00812   if (tesseract_ == NULL)
00813     return -1;
00814   if (FindLines() != 0)
00815     return -1;
00816   if (page_res_ != NULL)
00817     delete page_res_;
00818   if (block_list_->empty()) {
00819     page_res_ = new PAGE_RES(block_list_, &tesseract_->prev_word_best_choice_);
00820     return 0; // Empty page.
00821   }
00822 
00823   tesseract_->SetBlackAndWhitelist();
00824   recognition_done_ = true;
00825   if (tesseract_->tessedit_resegment_from_line_boxes)
00826     page_res_ = tesseract_->ApplyBoxes(*input_file_, true, block_list_);
00827   else if (tesseract_->tessedit_resegment_from_boxes)
00828     page_res_ = tesseract_->ApplyBoxes(*input_file_, false, block_list_);
00829   else
00830     page_res_ = new PAGE_RES(block_list_, &tesseract_->prev_word_best_choice_);
00831   if (tesseract_->tessedit_make_boxes_from_boxes) {
00832     tesseract_->CorrectClassifyWords(page_res_);
00833     return 0;
00834   }
00835 
00836   if (truth_cb_ != NULL) {
00837     tesseract_->wordrec_run_blamer.set_value(true);
00838     PageIterator *page_it = new PageIterator(
00839             page_res_, tesseract_, thresholder_->GetScaleFactor(),
00840             thresholder_->GetScaledYResolution(),
00841             rect_left_, rect_top_, rect_width_, rect_height_);
00842     truth_cb_->Run(tesseract_->getDict().getUnicharset(),
00843                    image_height_, page_it, this->tesseract()->pix_grey());
00844     delete page_it;
00845   }
00846 
00847   int result = 0;
00848   if (tesseract_->interactive_display_mode) {
00849     #ifndef GRAPHICS_DISABLED
00850     tesseract_->pgeditor_main(rect_width_, rect_height_, page_res_);
00851     #endif  // GRAPHICS_DISABLED
00852     // The page_res is invalid after an interactive session, so cleanup
00853     // in a way that lets us continue to the next page without crashing.
00854     delete page_res_;
00855     page_res_ = NULL;
00856     return -1;
00857   } else if (tesseract_->tessedit_train_from_boxes) {
00858     tesseract_->ApplyBoxTraining(*output_file_, page_res_);
00859   } else if (tesseract_->tessedit_ambigs_training) {
00860     FILE *training_output_file = tesseract_->init_recog_training(*input_file_);
00861     // OCR the page segmented into words by tesseract.
00862     tesseract_->recog_training_segmented(
00863         *input_file_, page_res_, monitor, training_output_file);
00864     fclose(training_output_file);
00865   } else {
00866     // Now run the main recognition.
00867     bool wait_for_text = true;
00868     GetBoolVariable("paragraph_text_based", &wait_for_text);
00869     if (!wait_for_text) DetectParagraphs(false);
00870     if (tesseract_->recog_all_words(page_res_, monitor, NULL, NULL, 0)) {
00871       if (wait_for_text) DetectParagraphs(true);
00872     } else {
00873       result = -1;
00874     }
00875   }
00876   return result;
00877 }
00878 
00880 int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) {
00881   if (tesseract_ == NULL)
00882     return -1;
00883   if (thresholder_ == NULL || thresholder_->IsEmpty()) {
00884     tprintf("Please call SetImage before attempting recognition.");
00885     return -1;
00886   }
00887   if (page_res_ != NULL)
00888     ClearResults();
00889   if (FindLines() != 0)
00890     return -1;
00891   // Additional conditions under which chopper test cannot be run
00892   if (tesseract_->interactive_display_mode) return -1;
00893 
00894   recognition_done_ = true;
00895 
00896   page_res_ = new PAGE_RES(block_list_, &(tesseract_->prev_word_best_choice_));
00897 
00898   PAGE_RES_IT page_res_it(page_res_);
00899 
00900   while (page_res_it.word() != NULL) {
00901     WERD_RES *word_res = page_res_it.word();
00902     GenericVector<TBOX> boxes;
00903     tesseract_->MaximallyChopWord(boxes, page_res_it.block()->block,
00904                                   page_res_it.row()->row, word_res);
00905     page_res_it.forward();
00906   }
00907   return 0;
00908 }
00909 
00926 bool TessBaseAPI::ProcessPages(const char* filename,
00927                                const char* retry_config, int timeout_millisec,
00928                                STRING* text_out) {
00929   TessResultRenderer* renderer = NewRenderer();
00930 
00931   if (!ProcessPages(filename, retry_config, timeout_millisec, renderer)) {
00932     delete renderer;
00933     return false;
00934   }
00935 
00936   const char* out_data;
00937   inT32 out_len;
00938   bool success = renderer->GetOutput(&out_data, &out_len);
00939   if (success) {
00940     // TODO(ewiseblatt): 20111103
00941     // if text_out->size() != out_len then we have binary data which STRING wont
00942     // support so this should fail. Really want to eliminate this interface
00943     // alltogether so not worrying about at this time.
00944     text_out->assign(out_data, out_len);
00945   }
00946   delete renderer;
00947   return success;
00948 }
00949 
00950 void TessBaseAPI::SetInputImage(Pix *pix) {
00951   if (input_image_)
00952     pixDestroy(&input_image_);
00953   input_image_ = pixClone(pix);
00954 }
00955 
00956 Pix* TessBaseAPI::GetInputImage() {
00957   return input_image_;
00958 }
00959 
00960 const char * TessBaseAPI::GetInputName() {
00961   if (input_file_)
00962     return input_file_->c_str();
00963   return NULL;
00964 }
00965 
00966 const char *  TessBaseAPI::GetDatapath() {
00967   return tesseract_->datadir.c_str();
00968 }
00969 
00970 int TessBaseAPI::GetSourceYResolution() {
00971   return thresholder_->GetSourceYResolution();
00972 }
00973 
00974 bool TessBaseAPI::ProcessPages(const char* filename,
00975                                const char* retry_config, int timeout_millisec,
00976                                TessResultRenderer* renderer) {
00977   PERF_COUNT_START("ProcessPages")
00978   int page = tesseract_->tessedit_page_number;
00979   if (page < 0)
00980     page = 0;
00981   FILE* fp = fopen(filename, "rb");
00982   if (fp == NULL) {
00983     tprintf("Image file %s cannot be opened!\n", filename);
00984     return false;
00985   }
00986   // Find the number of pages if a tiff file, or zero otherwise.
00987   int npages = 0;
00988   int format;
00989   Pix *pix;
00990   pix = pixRead(filename);
00991   format = pixGetInputFormat(pix);
00992   if (format == IFF_TIFF || format == IFF_TIFF_PACKBITS ||
00993       format == IFF_TIFF_RLE || format == IFF_TIFF_G3 ||
00994       format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
00995       format == IFF_TIFF_ZIP)
00996     tiffGetCount(fp, &npages);
00997   fclose(fp);
00998 
00999   bool success = true;
01000   const char* kUnknownTitle = "";
01001   if (renderer && !renderer->BeginDocument(kUnknownTitle)) {
01002     success = false;
01003   }
01004 
01005 #ifdef USE_OPENCL
01006   OpenclDevice od;
01007 #endif
01008 
01009   if (npages > 0) {
01010     pixDestroy(&pix);
01011     for (; page < npages; ++page) {
01012       // only use opencl if compiled w/ OpenCL and selected device is opencl
01013 #ifdef USE_OPENCL
01014       if ( od.selectedDeviceIsOpenCL() ) {
01015         pix = od.pixReadTiffCl(filename, page);
01016       } else {
01017 #endif
01018         pix = pixReadTiff(filename, page);
01019 #ifdef USE_OPENCL
01020       }
01021 #endif
01022 
01023       if (pix == NULL) break;
01024 
01025       if ((page >= 0) && (npages > 1))
01026         tprintf("Page %d of %d\n", page + 1, npages);
01027       char page_str[kMaxIntSize];
01028       snprintf(page_str, kMaxIntSize - 1, "%d", page);
01029       SetVariable("applybox_page", page_str);
01030       success &= ProcessPage(pix, page, filename, retry_config,
01031                              timeout_millisec, renderer);
01032       pixDestroy(&pix);
01033       if (tesseract_->tessedit_page_number >= 0 || npages == 1) {
01034         break;
01035       }
01036     }
01037   } else {
01038     // The file is not a tiff file.
01039     if (pix != NULL) {
01040       success &= ProcessPage(pix, 0, filename, retry_config,
01041                              timeout_millisec, renderer);
01042       pixDestroy(&pix);
01043     } else {
01044       // The file is not an image file, so try it as a list of filenames.
01045       FILE* fimg = fopen(filename, "rb");
01046       if (fimg == NULL) {
01047         tprintf("File %s cannot be opened!\n", filename);
01048         return false;
01049       }
01050       tprintf("Reading %s as a list of filenames...\n", filename);
01051       char pagename[MAX_PATH];
01052       // Skip to the requested page number.
01053       for (int i = 0; i < page &&
01054            fgets(pagename, sizeof(pagename), fimg) != NULL;
01055            ++i);
01056       while (fgets(pagename, sizeof(pagename), fimg) != NULL) {
01057         chomp_string(pagename);
01058         pix = pixRead(pagename);
01059         if (pix == NULL) {
01060           tprintf("Image file %s cannot be read!\n", pagename);
01061           fclose(fimg);
01062           return false;
01063         }
01064         tprintf("Page %d : %s\n", page, pagename);
01065         success &= ProcessPage(pix, page, pagename, retry_config,
01066                                timeout_millisec, renderer);
01067         pixDestroy(&pix);
01068         ++page;
01069       }
01070       fclose(fimg);
01071     }
01072   }
01073 
01074   bool all_ok = success;
01075   if (renderer && !renderer->EndDocument()) {
01076     all_ok = false;
01077   }
01078   PERF_COUNT_END
01079   return all_ok;
01080 }
01081 
01093 bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
01094                               const char* retry_config, int timeout_millisec,
01095                               STRING* text_out) {
01096   TessResultRenderer* renderer = NewRenderer();
01097 
01098   if (!ProcessPage(pix, page_index, filename, retry_config, timeout_millisec,
01099                    renderer)) {
01100     return false;
01101   }
01102 
01103   const char* out_data;
01104   inT32 out_len;
01105   if (!renderer->GetOutput(&out_data, &out_len)) {
01106     return false;
01107   }
01108 
01109   // TODO(ewiseblatt): 20111103
01110   // if text_out->size() != out_len then we have binary data which STRING wont
01111   // support so this should fail. Really want to eliminate this interface
01112   // alltogether so not worrying about at this time.
01113   text_out->assign(out_data, out_len);
01114 
01115   return true;
01116 }
01117 
01129 bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
01130                               const char* retry_config, int timeout_millisec,
01131                               TessResultRenderer* renderer) {
01132   PERF_COUNT_START("ProcessPage")
01133   SetInputName(filename);
01134   SetImage(pix);
01135   SetInputImage(pix);
01136   bool failed = false;
01137   if (timeout_millisec > 0) {
01138     // Running with a timeout.
01139     ETEXT_DESC monitor;
01140     monitor.cancel = NULL;
01141     monitor.cancel_this = NULL;
01142     monitor.set_deadline_msecs(timeout_millisec);
01143     // Now run the main recognition.
01144     failed = Recognize(&monitor) < 0;
01145   } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY ||
01146              tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {
01147     // Disabled character recognition.
01148     PageIterator* it = AnalyseLayout();
01149     if (it == NULL) {
01150       failed = true;
01151     } else {
01152       delete it;
01153       PERF_COUNT_END
01154       return true;
01155     }
01156   } else {
01157     // Normal layout and character recognition with no timeout.
01158     failed = Recognize(NULL) < 0;
01159   }
01160   if (tesseract_->tessedit_write_images) {
01161     Pix* page_pix = GetThresholdedImage();
01162     pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4);
01163   }
01164   if (failed && retry_config != NULL && retry_config[0] != '\0') {
01165     // Save current config variables before switching modes.
01166     FILE* fp = fopen(kOldVarsFile, "wb");
01167     PrintVariables(fp);
01168     fclose(fp);
01169     // Switch to alternate mode for retry.
01170     ReadConfigFile(retry_config);
01171     SetImage(pix);
01172     Recognize(NULL);
01173     // Restore saved config variables.
01174     ReadConfigFile(kOldVarsFile);
01175   }
01176 
01177   if (renderer) {
01178     if (failed) {
01179       renderer->AddError(this);
01180     } else {
01181       failed = !renderer->AddImage(this);
01182     }
01183   }
01184   PERF_COUNT_END
01185   return !failed;
01186 }
01187 
01192 LTRResultIterator* TessBaseAPI::GetLTRIterator() {
01193   if (tesseract_ == NULL || page_res_ == NULL)
01194     return NULL;
01195   return new LTRResultIterator(
01196       page_res_, tesseract_,
01197       thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),
01198       rect_left_, rect_top_, rect_width_, rect_height_);
01199 }
01200 
01209 ResultIterator* TessBaseAPI::GetIterator() {
01210   if (tesseract_ == NULL || page_res_ == NULL)
01211     return NULL;
01212   return ResultIterator::StartOfParagraph(LTRResultIterator(
01213       page_res_, tesseract_,
01214       thresholder_->GetScaleFactor(), thresholder_->GetScaledYResolution(),
01215       rect_left_, rect_top_, rect_width_, rect_height_));
01216 }
01217 
01226 MutableIterator* TessBaseAPI::GetMutableIterator() {
01227   if (tesseract_ == NULL || page_res_ == NULL)
01228     return NULL;
01229   return new MutableIterator(page_res_, tesseract_,
01230                              thresholder_->GetScaleFactor(),
01231                              thresholder_->GetScaledYResolution(),
01232                              rect_left_, rect_top_, rect_width_, rect_height_);
01233 }
01234 
01236 char* TessBaseAPI::GetUTF8Text() {
01237   if (tesseract_ == NULL ||
01238       (!recognition_done_ && Recognize(NULL) < 0))
01239     return NULL;
01240   STRING text("");
01241   ResultIterator *it = GetIterator();
01242   do {
01243     if (it->Empty(RIL_PARA)) continue;
01244     char *para_text = it->GetUTF8Text(RIL_PARA);
01245     text += para_text;
01246     delete []para_text;
01247   } while (it->Next(RIL_PARA));
01248   char* result = new char[text.length() + 1];
01249   strncpy(result, text.string(), text.length() + 1);
01250   delete it;
01251   return result;
01252 }
01253 
01257 static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
01258   tesseract::Orientation orientation;
01259   tesseract::WritingDirection writing_direction;
01260   tesseract::TextlineOrder textline_order;
01261   float deskew_angle;
01262   it->Orientation(&orientation, &writing_direction, &textline_order,
01263                   &deskew_angle);
01264   return orientation;
01265 }
01266 
01275 static void AddBaselineCoordsTohOCR(const PageIterator *it,
01276                                     PageIteratorLevel level,
01277                                     STRING* hocr_str) {
01278   tesseract::Orientation orientation = GetBlockTextOrientation(it);
01279   if (orientation != ORIENTATION_PAGE_UP) {
01280     hocr_str->add_str_int("; textangle ", 360 - orientation * 90);
01281     return;
01282   }
01283 
01284   int left, top, right, bottom;
01285   it->BoundingBox(level, &left, &top, &right, &bottom);
01286 
01287   // Try to get the baseline coordinates at this level.
01288   int x1, y1, x2, y2;
01289   if (!it->Baseline(level, &x1, &y1, &x2, &y2))
01290     return;
01291   // Following the description of this field of the hOCR spec, we convert the
01292   // baseline coordinates so that "the bottom left of the bounding box is the
01293   // origin".
01294   x1 -= left;
01295   x2 -= left;
01296   y1 -= bottom;
01297   y2 -= bottom;
01298 
01299   // Now fit a line through the points so we can extract coefficients for the
01300   // equation:  y = p1 x + p0
01301   double p1 = 0;
01302   double p0 = 0;
01303   if (x1 == x2) {
01304     // Problem computing the polynomial coefficients.
01305     return;
01306   }
01307   p1 = (y2 - y1) / static_cast<double>(x2 - x1);
01308   p0 = y1 - static_cast<double>(p1 * x1);
01309 
01310   hocr_str->add_str_double("; baseline ", round(p1 * 1000.0) / 1000.0);
01311   hocr_str->add_str_double(" ", round(p0 * 1000.0) / 1000.0);
01312 }
01313 
01314 static void AddBoxTohOCR(const PageIterator *it,
01315                          PageIteratorLevel level,
01316                          STRING* hocr_str) {
01317   int left, top, right, bottom;
01318   it->BoundingBox(level, &left, &top, &right, &bottom);
01319   hocr_str->add_str_int("' title=\"bbox ", left);
01320   hocr_str->add_str_int(" ", top);
01321   hocr_str->add_str_int(" ", right);
01322   hocr_str->add_str_int(" ", bottom);
01323   // Add baseline coordinates for textlines only.
01324   if (level == RIL_TEXTLINE)
01325     AddBaselineCoordsTohOCR(it, level, hocr_str);
01326   *hocr_str += "\">";
01327 }
01328 
// Builds an hOCR (HTML) fragment for the current page.
//
// Runs Recognize() first when there is no page_res_ yet. The fragment is one
// <div class='ocr_page'> containing nested ocr_carea / ocr_par / ocr_line /
// ocrx_word elements, each with an id of the form <kind>_<page>_<counter>
// and a bbox in its title attribute.
//
// page_number is 0-based; element ids use page_number + 1, while the
// "ppageno" entry reports page_number itself.
//
// Returns a buffer allocated with new[] holding the NUL-terminated markup,
// or NULL if tesseract_ is not set or recognition fails.
01337 char* TessBaseAPI::GetHOCRText(int page_number) {
01338   if (tesseract_ == NULL ||
01339       (page_res_ == NULL && Recognize(NULL) < 0))
01340     return NULL;
01341 
        // Running counters for line, block, paragraph and word ids.
01342   int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
01343   int page_id = page_number + 1;  // hOCR uses 1-based page numbers.
01344 
01345   STRING hocr_str("");
01346 
01347   if (input_file_ == NULL)
01348       SetInputName(NULL);
01349 
01350 #ifdef _WIN32
01351   // convert input name from ANSI encoding to utf-8
        // NOTE(review): the converted name is written back into *input_file_,
        // so a second call converts the already converted name again -
        // confirm whether a local copy should be used instead.
01352   int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
01353                                       NULL, NULL);
01354   wchar_t *uni16_str = new WCHAR[str16_len];
01355   str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
01356                                   uni16_str, str16_len);
01357   int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL,
01358                                      NULL, NULL, NULL);
01359   char *utf8_str = new char[utf8_len];
01360   WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
01361                       utf8_len, NULL, NULL);
01362   *input_file_ = utf8_str;
01363   delete[] uni16_str;
01364   delete[] utf8_str;
01365 #endif
01366 
        // Page header: id, image name, page bounding box and page number.
        // NOTE(review): the image name is inserted without HTML escaping.
01367   hocr_str.add_str_int("  <div class='ocr_page' id='page_", page_id);
01368   hocr_str += "' title='image \"";
01369   hocr_str += input_file_ ? *input_file_ : "unknown";
01370   hocr_str.add_str_int("\"; bbox ", rect_left_);
01371   hocr_str.add_str_int(" ", rect_top_);
01372   hocr_str.add_str_int(" ", rect_width_);
01373   hocr_str.add_str_int(" ", rect_height_);
01374   hocr_str.add_str_int("; ppageno ", page_number);
01375   hocr_str += "'>\n";
01376 
        // NOTE(review): GetIterator() returns NULL when page_res_ is NULL;
        // the result is used without a check - confirm that page_res_ is
        // always set once Recognize() has succeeded.
01377   ResultIterator *res_it = GetIterator();
01378   while (!res_it->Empty(RIL_BLOCK)) {
          // Skip positions that hold no word.
01379     if (res_it->Empty(RIL_WORD)) {
01380       res_it->Next(RIL_WORD);
01381       continue;
01382     }
01383 
01384     // Open any new block/paragraph/textline.
01385     if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
01386       hocr_str.add_str_int("   <div class='ocr_carea' id='block_", page_id);
01387       hocr_str.add_str_int("_", bcnt);
01388       AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
01389     }
01390     if (res_it->IsAtBeginningOf(RIL_PARA)) {
01391       if (res_it->ParagraphIsLtr()) {
01392         hocr_str.add_str_int("\n    <p class='ocr_par' dir='ltr' id='par_",
01393                              page_id);
01394         hocr_str.add_str_int("_", pcnt);
01395       } else {
01396         hocr_str.add_str_int("\n    <p class='ocr_par' dir='rtl' id='par_",
01397                              page_id);
01398         hocr_str.add_str_int("_", pcnt);
01399       }
01400       AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
01401     }
01402     if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
01403       hocr_str.add_str_int("\n     <span class='ocr_line' id='line_", page_id);
01404       hocr_str.add_str_int("_", lcnt);
01405       AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
01406     }
01407 
01408     // Now, process the word...
01409     hocr_str.add_str_int("<span class='ocrx_word' id='word_", page_id);
01410     hocr_str.add_str_int("_", wcnt);
01411     int left, top, right, bottom;
01412     res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
01413     hocr_str.add_str_int("' title='bbox ", left);
01414     hocr_str.add_str_int(" ", top);
01415     hocr_str.add_str_int(" ", right);
01416     hocr_str.add_str_int(" ", bottom);
01417     hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
01418     hocr_str += "'";
01419     if (res_it->WordRecognitionLanguage()) {
01420       hocr_str += " lang='";
01421       hocr_str += res_it->WordRecognitionLanguage();
01422       hocr_str += "'";
01423     }
01424     switch (res_it->WordDirection()) {
01425       case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break;
01426       case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break;
01427       default:  // Do nothing.
01428         break;
01429     }
01430     hocr_str += ">";
          // Only the bold and italic attributes are used below; the other
          // outputs of WordFontAttributes() are fetched and ignored.
01431     const char *font_name;
01432     bool bold, italic, underlined, monospace, serif, smallcaps;
01433     int pointsize, font_id;
01434     font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
01435                                            &monospace, &serif, &smallcaps,
01436                                            &pointsize, &font_id);
          // These flags must be sampled now: the symbol loop below advances
          // the iterator past the current word.
01437     bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
01438     bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
01439     bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
01440     if (bold) hocr_str += "<strong>";
01441     if (italic) hocr_str += "<em>";
          // Emit the word symbol by symbol, escaping the single-byte
          // characters that are special in HTML; multi-byte graphemes are
          // copied through unchanged.
01442     do {
01443       const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
01444       if (grapheme && grapheme[0] != 0) {
01445         if (grapheme[1] == 0) {
01446           switch (grapheme[0]) {
01447             case '<': hocr_str += "&lt;"; break;
01448             case '>': hocr_str += "&gt;"; break;
01449             case '&': hocr_str += "&amp;"; break;
01450             case '"': hocr_str += "&quot;"; break;
01451             case '\'': hocr_str += "&#39;"; break;
01452             default: hocr_str += grapheme;
01453           }
01454         } else {
01455           hocr_str += grapheme;
01456         }
01457       }
01458       delete []grapheme;
01459       res_it->Next(RIL_SYMBOL);
01460     } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
01461     if (italic) hocr_str += "</em>";
01462     if (bold) hocr_str += "</strong>";
01463     hocr_str += "</span> ";
01464     wcnt++;
01465     // Close any ending block/paragraph/textline.
01466     if (last_word_in_line) {
01467       hocr_str += "\n     </span>";
01468       lcnt++;
01469     }
01470     if (last_word_in_para) {
01471       hocr_str += "\n    </p>\n";
01472       pcnt++;
01473     }
01474     if (last_word_in_block) {
01475       hocr_str += "   </div>\n";
01476       bcnt++;
01477     }
01478   }
01479   hocr_str += "  </div>\n";
01480 
        // Copy the assembled markup into a plain new[] buffer for the caller.
01481   char *ret = new char[hocr_str.length() + 1];
01482   strcpy(ret, hocr_str.string());
01483   delete res_it;
01484   return ret;
01485 }
01486 
// Sizing constants for the box-file text produced by GetBoxText().

// Integer fields written per box line: left, bottom, right, top, page.
01488 const int kNumbersPerBlob = 5;
// Characters budgeted per integer field. Presumably the digit count of the
// largest accepted image dimension (FindLines() rejects images larger than
// MAX_INT16) - TODO confirm.
01493 const int kBytesPerNumber = 5;
// Each field plus one separator character, plus one extra byte per line.
// NOTE(review): not referenced in the visible part of this file.
01499 const int kBytesPerBlob = kNumbersPerBlob * (kBytesPerNumber + 1) + 1;
// Same value as kBytesPerBlob; used by GetBoxText() to size its buffer,
// excluding the UTF-8 text of the symbol itself.
01500 const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1;
// Decimal digits needed for the largest unsigned 64-bit value.
01502 const int kBytesPer64BitNumber = 20;
// Worst-case length of a single box line: five 64-bit-sized fields with
// separators, one extra byte, and UNICHAR_LEN bytes for the symbol text.
// GetBoxText() keeps this much headroom free in its output buffer.
01509 const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 +
01510     UNICHAR_LEN;
01511 
01517 char* TessBaseAPI::GetBoxText(int page_number) {
01518   if (tesseract_ == NULL ||
01519       (!recognition_done_ && Recognize(NULL) < 0))
01520     return NULL;
01521   int blob_count;
01522   int utf8_length = TextLength(&blob_count);
01523   int total_length = blob_count * kBytesPerBoxFileLine + utf8_length +
01524       kMaxBytesPerLine;
01525   char* result = new char[total_length];
01526   strcpy(result, "\0");
01527   int output_length = 0;
01528   LTRResultIterator* it = GetLTRIterator();
01529   do {
01530     int left, top, right, bottom;
01531     if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
01532       char* text = it->GetUTF8Text(RIL_SYMBOL);
01533       // Tesseract uses space for recognition failure. Fix to a reject
01534       // character, kTesseractReject so we don't create illegal box files.
01535       for (int i = 0; text[i] != '\0'; ++i) {
01536         if (text[i] == ' ')
01537           text[i] = kTesseractReject;
01538       }
01539       snprintf(result + output_length, total_length - output_length,
01540                "%s %d %d %d %d %d\n",
01541                text, left, image_height_ - bottom,
01542                right, image_height_ - top, page_number);
01543       output_length += strlen(result + output_length);
01544       delete [] text;
01545       // Just in case...
01546       if (output_length + kMaxBytesPerLine > total_length)
01547         break;
01548     }
01549   } while (it->Next(RIL_SYMBOL));
01550   delete it;
01551   return result;
01552 }
01553 
// Unicode code points that GetUNLVText() replaces before writing its 8-bit
// output: euro sign, left/right double quotes, left/right single quotes,
// bullet and em dash. The list is terminated by 0.
01559 const int kUniChs[] = {
01560   0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
01561 };
// Replacement for the entry at the same index in kUniChs: cent sign,
// straight double quote (twice), apostrophe (twice), middle dot and hyphen.
// Must stay the same length as kUniChs.
01563 const int kLatinChs[] = {
01564   0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
01565 };
01566 
// Returns the recognized text in UNLV format, one byte per character.
//
// Runs Recognize() first if recognition has not been done yet. Words that
// were crunched are represented by at most one kUNLVReject per run;
// characters flagged in the word's reject map are preceded by kUNLVSuspect;
// characters listed in kUniChs are replaced by their kLatinChs counterpart
// and any remaining character above 0xff becomes kUNLVReject.
//
// Returns a buffer allocated with new[] holding the NUL-terminated text, or
// NULL if tesseract_ is not set or recognition fails.
// NOTE(review): the buffer is sized by TextLength(NULL), which is not fully
// visible here - confirm it leaves room for the suspect markers, separators
// and the trailing "\n\0" written below.
01572 char* TessBaseAPI::GetUNLVText() {
01573   if (tesseract_ == NULL ||
01574       (!recognition_done_ && Recognize(NULL) < 0))
01575     return NULL;
        // State carried across words so that reject characters and newlines
        // are not duplicated.
01576   bool tilde_crunch_written = false;
01577   bool last_char_was_newline = true;
01578   bool last_char_was_tilde = false;
01579 
01580   int total_length = TextLength(NULL);
01581   PAGE_RES_IT   page_res_it(page_res_);
01582   char* result = new char[total_length];
01583   char* ptr = result;
01584   for (page_res_it.restart_page(); page_res_it.word () != NULL;
01585        page_res_it.forward()) {
01586     WERD_RES *word = page_res_it.word();
01587     // Process the current word.
01588     if (word->unlv_crunch_mode != CR_NONE) {
            // Crunched word: emit a reject marker unless the word is deleted
            // or a marker was already written for this run (a CR_KEEP_SPACE
            // word with a real preceding space starts a new run).
01589       if (word->unlv_crunch_mode != CR_DELETE &&
01590           (!tilde_crunch_written ||
01591            (word->unlv_crunch_mode == CR_KEEP_SPACE &&
01592             word->word->space() > 0 &&
01593             !word->word->flag(W_FUZZY_NON) &&
01594             !word->word->flag(W_FUZZY_SP)))) {
01595         if (!word->word->flag(W_BOL) &&
01596             word->word->space() > 0 &&
01597             !word->word->flag(W_FUZZY_NON) &&
01598             !word->word->flag(W_FUZZY_SP)) {
01599           /* Write a space to separate from preceding good text */
01600           *ptr++ = ' ';
01601           last_char_was_tilde = false;
01602         }
01603         if (!last_char_was_tilde) {
01604           // Write a reject char.
01605           last_char_was_tilde = true;
01606           *ptr++ = kUNLVReject;
01607           tilde_crunch_written = true;
01608           last_char_was_newline = false;
01609         }
01610       }
01611     } else {
01612       // NORMAL PROCESSING of non tilde crunched words.
01613       tilde_crunch_written = false;
01614       tesseract_->set_unlv_suspects(word);
            // wordstr holds the UTF-8 bytes of the best choice; lengths[i] is
            // the byte length of its i-th character.
01615       const char* wordstr = word->best_choice->unichar_string().string();
01616       const STRING& lengths = word->best_choice->unichar_lengths();
01617       int length = lengths.length();
01618       int i = 0;
01619       int offset = 0;
01620 
01621       if (last_char_was_tilde &&
01622           word->word->space() == 0 && wordstr[offset] == ' ') {
01623         // Prevent adjacent tilde across words - we know that adjacent tildes
01624         // within words have been removed.
01625         // Skip the first character.
01626         offset = lengths[i++];
01627       }
01628       if (i < length && wordstr[offset] != 0) {
              // Separate words with a single space, except at line start.
01629         if (!last_char_was_newline)
01630           *ptr++ = ' ';
01631         else
01632           last_char_was_newline = false;
01633         for (; i < length; offset += lengths[i++]) {
01634           if (wordstr[offset] == ' ' ||
01635               wordstr[offset] == kTesseractReject) {
01636             *ptr++ = kUNLVReject;
01637             last_char_was_tilde = true;
01638           } else {
01639             if (word->reject_map[i].rejected())
01640               *ptr++ = kUNLVSuspect;
01641             UNICHAR ch(wordstr + offset, lengths[i]);
01642             int uni_ch = ch.first_uni();
                  // Substitute characters that have an 8-bit stand-in.
01643             for (int j = 0; kUniChs[j] != 0; ++j) {
01644               if (kUniChs[j] == uni_ch) {
01645                 uni_ch = kLatinChs[j];
01646                 break;
01647               }
01648             }
01649             if (uni_ch <= 0xff) {
01650               *ptr++ = static_cast<char>(uni_ch);
01651               last_char_was_tilde = false;
01652             } else {
                    // Not representable in one byte: write a reject instead.
01653               *ptr++ = kUNLVReject;
01654               last_char_was_tilde = true;
01655             }
01656           }
01657         }
01658       }
01659     }
01660     if (word->word->flag(W_EOL) && !last_char_was_newline) {
01661       /* Add a new line output */
01662       *ptr++ = '\n';
01663       tilde_crunch_written = false;
01664       last_char_was_newline = true;
01665       last_char_was_tilde = false;
01666     }
01667   }
        // Always terminate the output with a newline and a NUL.
01668   *ptr++ = '\n';
01669   *ptr = '\0';
01670   return result;
01671 }
01672 
01674 int TessBaseAPI::MeanTextConf() {
01675   int* conf = AllWordConfidences();
01676   if (!conf) return 0;
01677   int sum = 0;
01678   int *pt = conf;
01679   while (*pt >= 0) sum += *pt++;
01680   if (pt != conf) sum /= pt - conf;
01681   delete [] conf;
01682   return sum;
01683 }
01684 
01686 int* TessBaseAPI::AllWordConfidences() {
01687   if (tesseract_ == NULL ||
01688       (!recognition_done_ && Recognize(NULL) < 0))
01689     return NULL;
01690   int n_word = 0;
01691   PAGE_RES_IT res_it(page_res_);
01692   for (res_it.restart_page(); res_it.word() != NULL; res_it.forward())
01693     n_word++;
01694 
01695   int* conf = new int[n_word+1];
01696   n_word = 0;
01697   for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) {
01698     WERD_RES *word = res_it.word();
01699     WERD_CHOICE* choice = word->best_choice;
01700     int w_conf = static_cast<int>(100 + 5 * choice->certainty());
01701                  // This is the eq for converting Tesseract confidence to 1..100
01702     if (w_conf < 0) w_conf = 0;
01703     if (w_conf > 100) w_conf = 100;
01704     conf[n_word++] = w_conf;
01705   }
01706   conf[n_word] = -1;
01707   return conf;
01708 }
01709 
01720 bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char* wordstr) {
01721   int debug = 0;
01722   GetIntVariable("applybox_debug", &debug);
01723   bool success = true;
01724   PageSegMode current_psm = GetPageSegMode();
01725   SetPageSegMode(mode);
01726   SetVariable("classify_enable_learning", "0");
01727   char* text = GetUTF8Text();
01728   if (debug) {
01729     tprintf("Trying to adapt \"%s\" to \"%s\"\n", text, wordstr);
01730   }
01731   if (text != NULL) {
01732     PAGE_RES_IT it(page_res_);
01733     WERD_RES* word_res = it.word();
01734     if (word_res != NULL) {
01735       word_res->word->set_text(wordstr);
01736     } else {
01737       success = false;
01738     }
01739     // Check to see if text matches wordstr.
01740     int w = 0;
01741     int t = 0;
01742     for (t = 0; text[t] != '\0'; ++t) {
01743       if (text[t] == '\n' || text[t] == ' ')
01744         continue;
01745       while (wordstr[w] != '\0' && wordstr[w] == ' ')
01746         ++w;
01747       if (text[t] != wordstr[w])
01748         break;
01749       ++w;
01750     }
01751     if (text[t] != '\0' || wordstr[w] != '\0') {
01752       // No match.
01753       delete page_res_;
01754       GenericVector<TBOX> boxes;
01755       page_res_ = tesseract_->SetupApplyBoxes(boxes, block_list_);
01756       tesseract_->ReSegmentByClassification(page_res_);
01757       tesseract_->TidyUp(page_res_);
01758       PAGE_RES_IT pr_it(page_res_);
01759       if (pr_it.word() == NULL)
01760         success = false;
01761       else
01762         word_res = pr_it.word();
01763     } else {
01764       word_res->BestChoiceToCorrectText();
01765     }
01766     if (success) {
01767       tesseract_->EnableLearning = true;
01768       tesseract_->LearnWord(NULL, word_res);
01769     }
01770     delete [] text;
01771   } else {
01772     success = false;
01773   }
01774   SetPageSegMode(current_psm);
01775   return success;
01776 }
01777 
01784 void TessBaseAPI::Clear() {
01785   if (thresholder_ != NULL)
01786     thresholder_->Clear();
01787   ClearResults();
01788 }
01789 
01796 void TessBaseAPI::End() {
01797   if (thresholder_ != NULL) {
01798     delete thresholder_;
01799     thresholder_ = NULL;
01800   }
01801   if (page_res_ != NULL) {
01802     delete page_res_;
01803     page_res_ = NULL;
01804   }
01805   if (block_list_ != NULL) {
01806     delete block_list_;
01807     block_list_ = NULL;
01808   }
01809   if (paragraph_models_ != NULL) {
01810     paragraph_models_->delete_data_pointers();
01811     delete paragraph_models_;
01812     paragraph_models_ = NULL;
01813   }
01814   if (tesseract_ != NULL) {
01815     delete tesseract_;
01816     if (osd_tesseract_ == tesseract_)
01817       osd_tesseract_ = NULL;
01818     tesseract_ = NULL;
01819   }
01820   if (osd_tesseract_ != NULL) {
01821     delete osd_tesseract_;
01822     osd_tesseract_ = NULL;
01823   }
01824   if (equ_detect_ != NULL) {
01825     delete equ_detect_;
01826     equ_detect_ = NULL;
01827   }
01828   if (input_file_ != NULL) {
01829     delete input_file_;
01830     input_file_ = NULL;
01831   }
01832   if (output_file_ != NULL) {
01833     delete output_file_;
01834     output_file_ = NULL;
01835   }
01836   if (datapath_ != NULL) {
01837     delete datapath_;
01838     datapath_ = NULL;
01839   }
01840   if (language_ != NULL) {
01841     delete language_;
01842     language_ = NULL;
01843   }
01844 }
01845 
01846 // Clears any library-level memory caches.
01847 // A variety of expensive-to-load constant data structures (mostly
01848 // language dictionaries) are cached globally, surviving the Init()
01849 // and End() of individual TessBaseAPI instances. This function asks the
01850 // global dawg cache to delete the dawgs that are no longer in use.
01851 void TessBaseAPI::ClearPersistentCache() {
01852   Dict::GlobalDawgCache()->DeleteUnusedDawgs();
01853 }
01854 
01859 int TessBaseAPI::IsValidWord(const char *word) {
01860   return tesseract_->getDict().valid_word(word);
01861 }
01862 
01863 
01864 // TODO(rays) Obsolete this function and replace with a more aptly named
01865 // function that returns image coordinates rather than tesseract coordinates.
01866 bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) {
01867   PageIterator* it = AnalyseLayout();
01868   if (it == NULL) {
01869     return false;
01870   }
01871   int x1, x2, y1, y2;
01872   it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
01873   // Calculate offset and slope (NOTE: Kind of ugly)
01874   if (x2 <= x1) x2 = x1 + 1;
01875   // Convert the point pair to slope/offset of the baseline (in image coords.)
01876   *out_slope = static_cast<float>(y2 - y1) / (x2 - x1);
01877   *out_offset = static_cast<int>(y1 - *out_slope * x1);
01878   // Get the y-coord of the baseline at the left and right edges of the
01879   // textline's bounding box.
01880   int left, top, right, bottom;
01881   if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom))
01882     return false;
01883   int left_y = IntCastRounded(*out_slope * left + *out_offset);
01884   int right_y = IntCastRounded(*out_slope * right + *out_offset);
01885   // Shift the baseline down so it passes through the nearest bottom-corner
01886   // of the textline's bounding box. This is the difference between the y
01887   // at the lowest (max) edge of the box and the actual box bottom.
01888   *out_offset += bottom - MAX(left_y, right_y);
01889   // Switch back to bottom-up tesseract coordinates. Requires negation of
01890   // the slope and height - offset for the offset.
01891   *out_slope = -*out_slope;
01892   *out_offset = rect_height_ - *out_offset;
01893   delete it;
01894 
01895   return true;
01896 }
01897 
01899 void TessBaseAPI::SetDictFunc(DictFunc f) {
01900   if (tesseract_ != NULL) {
01901     tesseract_->getDict().letter_is_okay_ = f;
01902   }
01903 }
01904 
01909 void TessBaseAPI::SetProbabilityInContextFunc(ProbabilityInContextFunc f) {
01910   if (tesseract_ != NULL) {
01911     tesseract_->getDict().probability_in_context_ = f;
01912     // Set it for the sublangs too.
01913     int num_subs = tesseract_->num_sub_langs();
01914     for (int i = 0; i < num_subs; ++i) {
01915       tesseract_->get_sub_lang(i)->getDict().probability_in_context_ = f;
01916     }
01917   }
01918 }
01919 
01921 void TessBaseAPI::SetFillLatticeFunc(FillLatticeFunc f) {
01922   if (tesseract_ != NULL) tesseract_->fill_lattice_ = f;
01923 }
01924 
01926 bool TessBaseAPI::InternalSetImage() {
01927   if (tesseract_ == NULL) {
01928     tprintf("Please call Init before attempting to send an image.");
01929     return false;
01930   }
01931   if (thresholder_ == NULL)
01932     thresholder_ = new ImageThresholder;
01933   ClearResults();
01934   return true;
01935 }
01936 
01943 void TessBaseAPI::Threshold(Pix** pix) {
01944   ASSERT_HOST(pix != NULL);
01945   if (*pix != NULL)
01946     pixDestroy(pix);
01947   // Zero resolution messes up the algorithms, so make sure it is credible.
01948   int y_res = thresholder_->GetScaledYResolution();
01949   if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {
01950     // Use the minimum default resolution, as it is safer to under-estimate
01951     // than over-estimate resolution.
01952     thresholder_->SetSourceYResolution(kMinCredibleResolution);
01953   }
01954   thresholder_->ThresholdToPix(pix);
01955   thresholder_->GetImageSizes(&rect_left_, &rect_top_,
01956                               &rect_width_, &rect_height_,
01957                               &image_width_, &image_height_);
01958   if (!thresholder_->IsBinary()) {
01959     tesseract_->set_pix_thresholds(thresholder_->GetPixRectThresholds());
01960     tesseract_->set_pix_grey(thresholder_->GetPixRectGrey());
01961   } else {
01962     tesseract_->set_pix_thresholds(NULL);
01963     tesseract_->set_pix_grey(NULL);
01964   }
01965   // Set the internal resolution that is used for layout parameters from the
01966   // estimated resolution, rather than the image resolution, which may be
01967   // fabricated, but we will use the image resolution, if there is one, to
01968   // report output point sizes.
01969   int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),
01970                                   kMinCredibleResolution,
01971                                   kMaxCredibleResolution);
01972   if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {
01973     tprintf("Estimated resolution %d out of range! Corrected to %d\n",
01974             thresholder_->GetScaledEstimatedResolution(), estimated_res);
01975   }
01976   tesseract_->set_source_resolution(estimated_res);
01977   SavePixForCrash(estimated_res, *pix);
01978 }
01979 
01981 int TessBaseAPI::FindLines() {
01982   if (thresholder_ == NULL || thresholder_->IsEmpty()) {
01983     tprintf("Please call SetImage before attempting recognition.");
01984     return -1;
01985   }
01986   if (recognition_done_)
01987     ClearResults();
01988   if (!block_list_->empty()) {
01989     return 0;
01990   }
01991   if (tesseract_ == NULL) {
01992     tesseract_ = new Tesseract;
01993     tesseract_->InitAdaptiveClassifier(false);
01994   }
01995   if (tesseract_->pix_binary() == NULL)
01996     Threshold(tesseract_->mutable_pix_binary());
01997   if (tesseract_->ImageWidth() > MAX_INT16 ||
01998       tesseract_->ImageHeight() > MAX_INT16) {
01999     tprintf("Image too large: (%d, %d)\n",
02000             tesseract_->ImageWidth(), tesseract_->ImageHeight());
02001     return -1;
02002   }
02003 
02004   tesseract_->PrepareForPageseg();
02005 
02006   if (tesseract_->textord_equation_detect) {
02007     if (equ_detect_ == NULL && datapath_ != NULL) {
02008       equ_detect_ = new EquationDetect(datapath_->string(), NULL);
02009     }
02010     tesseract_->SetEquationDetect(equ_detect_);
02011   }
02012 
02013   Tesseract* osd_tess = osd_tesseract_;
02014   OSResults osr;
02015   if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == NULL) {
02016     if (strcmp(language_->string(), "osd") == 0) {
02017       osd_tess = tesseract_;
02018     } else {
02019       osd_tesseract_ = new Tesseract;
02020       if (osd_tesseract_->init_tesseract(
02021           datapath_->string(), NULL, "osd", OEM_TESSERACT_ONLY,
02022           NULL, 0, NULL, NULL, false) == 0) {
02023         osd_tess = osd_tesseract_;
02024         osd_tesseract_->set_source_resolution(
02025             thresholder_->GetSourceYResolution());
02026       } else {
02027         tprintf("Warning: Auto orientation and script detection requested,"
02028                 " but osd language failed to load\n");
02029         delete osd_tesseract_;
02030         osd_tesseract_ = NULL;
02031       }
02032     }
02033   }
02034 
02035   if (tesseract_->SegmentPage(input_file_, block_list_, osd_tess, &osr) < 0)
02036     return -1;
02037   // If Devanagari is being recognized, we use different images for page seg
02038   // and for OCR.
02039   tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
02040   return 0;
02041 }
02042 
02044 void TessBaseAPI::ClearResults() {
02045   if (tesseract_ != NULL) {
02046     tesseract_->Clear();
02047   }
02048   if (page_res_ != NULL) {
02049     delete page_res_;
02050     page_res_ = NULL;
02051   }
02052   recognition_done_ = false;
02053   if (block_list_ == NULL)
02054     block_list_ = new BLOCK_LIST;
02055   else
02056     block_list_->clear();
02057   if (paragraph_models_ != NULL) {
02058     paragraph_models_->delete_data_pointers();
02059     delete paragraph_models_;
02060     paragraph_models_ = NULL;
02061   }
02062   SavePixForCrash(0, NULL);
02063 }
02064 
02072 int TessBaseAPI::TextLength(int* blob_count) {
02073   if (tesseract_ == NULL || page_res_ == NULL)
02074     return 0;
02075 
02076   PAGE_RES_IT   page_res_it(page_res_);
02077   int total_length = 2;
02078   int total_blobs = 0;
02079   // Iterate over the data structures to extract the recognition result.
02080   for (page_res_it.restart_page(); page_res_it.word () != NULL;
02081        page_res_it.forward()) {
02082     WERD_RES *word = page_res_it.word();
02083     WERD_CHOICE* choice = word->best_choice;
02084     if (choice != NULL) {
02085       total_blobs += choice->length() + 2;
02086       total_length += choice->unichar_string().length() + 2;
02087       for (int i = 0; i < word->reject_map.length(); ++i) {
02088         if (word->reject_map[i].rejected())
02089           ++total_length;
02090       }
02091     }
02092   }
02093   if (blob_count != NULL)
02094     *blob_count = total_blobs;
02095   return total_length;
02096 }
02097 
02102 bool TessBaseAPI::DetectOS(OSResults* osr) {
02103   if (tesseract_ == NULL)
02104     return false;
02105   ClearResults();
02106   if (tesseract_->pix_binary() == NULL)
02107     Threshold(tesseract_->mutable_pix_binary());
02108   if (input_file_ == NULL)
02109     input_file_ = new STRING(kInputFile);
02110   return orientation_and_script_detection(*input_file_, osr, tesseract_);
02111 }
02112 
02113 void TessBaseAPI::set_min_orientation_margin(double margin) {
02114   tesseract_->min_orientation_margin.set_value(margin);
02115 }
02116 
02131 void TessBaseAPI::GetBlockTextOrientations(int** block_orientation,
02132                                            bool** vertical_writing) {
02133   delete[] *block_orientation;
02134   *block_orientation = NULL;
02135   delete[] *vertical_writing;
02136   *vertical_writing = NULL;
02137   BLOCK_IT block_it(block_list_);
02138 
02139   block_it.move_to_first();
02140   int num_blocks = 0;
02141   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
02142     if (!block_it.data()->poly_block()->IsText()) {
02143       continue;
02144     }
02145     ++num_blocks;
02146   }
02147   if (!num_blocks) {
02148     tprintf("WARNING: Found no blocks\n");
02149     return;
02150   }
02151   *block_orientation = new int[num_blocks];
02152   *vertical_writing = new bool[num_blocks];
02153   block_it.move_to_first();
02154   int i = 0;
02155   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
02156        block_it.forward()) {
02157     if (!block_it.data()->poly_block()->IsText()) {
02158       continue;
02159     }
02160     FCOORD re_rotation = block_it.data()->re_rotation();
02161     float re_theta = re_rotation.angle();
02162     FCOORD classify_rotation = block_it.data()->classify_rotation();
02163     float classify_theta = classify_rotation.angle();
02164     double rot_theta = - (re_theta - classify_theta) * 2.0 / PI;
02165     if (rot_theta < 0) rot_theta += 4;
02166     int num_rotations = static_cast<int>(rot_theta + 0.5);
02167     (*block_orientation)[i] = num_rotations;
02168     // The classify_rotation is non-zero only if the text has vertical
02169     // writing direction.
02170     (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
02171     ++i;
02172   }
02173 }
02174 
02175 // ____________________________________________________________________________
02176 // Ocropus add-ons.
02177 
02179 BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() {
02180   FindLines();
02181   BLOCK_LIST* result = block_list_;
02182   block_list_ = NULL;
02183   return result;
02184 }
02185 
02191 void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
02192   delete block_list;
02193 }
02194 
02195 
02196 ROW *TessBaseAPI::MakeTessOCRRow(float baseline,
02197                                  float xheight,
02198                                  float descender,
02199                                  float ascender) {
02200   inT32 xstarts[] = {-32000};
02201   double quad_coeffs[] = {0, 0, baseline};
02202   return new ROW(1,
02203                  xstarts,
02204                  quad_coeffs,
02205                  xheight,
02206                  ascender - (baseline + xheight),
02207                  descender - baseline,
02208                  0,
02209                  0);
02210 }
02211 
02213 TBLOB *TessBaseAPI::MakeTBLOB(Pix *pix) {
02214   int width = pixGetWidth(pix);
02215   int height = pixGetHeight(pix);
02216   BLOCK block("a character", TRUE, 0, 0, 0, 0, width, height);
02217 
02218   // Create C_BLOBs from the page
02219   extract_edges(pix, &block);
02220 
02221   // Merge all C_BLOBs
02222   C_BLOB_LIST *list = block.blob_list();
02223   C_BLOB_IT c_blob_it(list);
02224   if (c_blob_it.empty())
02225     return NULL;
02226   // Move all the outlines to the first blob.
02227   C_OUTLINE_IT ol_it(c_blob_it.data()->out_list());
02228   for (c_blob_it.forward();
02229        !c_blob_it.at_first();
02230        c_blob_it.forward()) {
02231       C_BLOB *c_blob = c_blob_it.data();
02232       ol_it.add_list_after(c_blob->out_list());
02233   }
02234   // Convert the first blob to the output TBLOB.
02235   return TBLOB::PolygonalCopy(false, c_blob_it.data());
02236 }
02237 
02243 void TessBaseAPI::NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode) {
02244   TBOX box = tblob->bounding_box();
02245   float x_center = (box.left() + box.right()) / 2.0f;
02246   float baseline = row->base_line(x_center);
02247   float scale = kBlnXHeight / row->x_height();
02248   tblob->Normalize(NULL, NULL, NULL, x_center, baseline, scale, scale,
02249                    0.0f, static_cast<float>(kBlnBaselineOffset), false, NULL);
02250 }
02251 
02256 TBLOB *make_tesseract_blob(float baseline, float xheight,
02257                            float descender, float ascender,
02258                            bool numeric_mode, Pix* pix) {
02259   TBLOB *tblob = TessBaseAPI::MakeTBLOB(pix);
02260 
02261   // Normalize TBLOB
02262   ROW *row =
02263       TessBaseAPI::MakeTessOCRRow(baseline, xheight, descender, ascender);
02264   TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode);
02265   delete row;
02266   return tblob;
02267 }
02268 
02274 void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
02275                                    int length,
02276                                    float baseline,
02277                                    float xheight,
02278                                    float descender,
02279                                    float ascender) {
02280   UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length);
02281   TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender,
02282                                     tesseract_->classify_bln_numeric_mode,
02283                                     tesseract_->pix_binary());
02284   float threshold;
02285   float best_rating = -100;
02286 
02287 
02288   // Classify to get a raw choice.
02289   BLOB_CHOICE_LIST choices;
02290   tesseract_->AdaptiveClassifier(blob, &choices);
02291   BLOB_CHOICE_IT choice_it;
02292   choice_it.set_to_list(&choices);
02293   for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
02294        choice_it.forward()) {
02295     if (choice_it.data()->rating() > best_rating) {
02296       best_rating = choice_it.data()->rating();
02297     }
02298   }
02299 
02300   threshold = tesseract_->matcher_good_threshold;
02301 
02302   if (blob->outlines)
02303     tesseract_->AdaptToChar(blob, id, kUnknownFontinfoId, threshold);
02304   delete blob;
02305 }
02306 
02307 
02308 PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
02309   PAGE_RES *page_res = new PAGE_RES(block_list,
02310                                     &(tesseract_->prev_word_best_choice_));
02311   tesseract_->recog_all_words(page_res, NULL, NULL, NULL, 1);
02312   return page_res;
02313 }
02314 
02315 PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
02316                                         PAGE_RES* pass1_result) {
02317   if (!pass1_result)
02318     pass1_result = new PAGE_RES(block_list,
02319                                 &(tesseract_->prev_word_best_choice_));
02320   tesseract_->recog_all_words(pass1_result, NULL, NULL, NULL, 2);
02321   return pass1_result;
02322 }
02323 
02324 void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
02325   int debug_level = 0;
02326   GetIntVariable("paragraph_debug_level", &debug_level);
02327   if (paragraph_models_ == NULL)
02328     paragraph_models_ = new GenericVector<ParagraphModel*>;
02329   MutableIterator *result_it = GetMutableIterator();
02330   do {  // Detect paragraphs for this block
02331     GenericVector<ParagraphModel *> models;
02332     ::tesseract::DetectParagraphs(debug_level, after_text_recognition,
02333                                   result_it, &models);
02334     *paragraph_models_ += models;
02335   } while (result_it->Next(RIL_BLOCK));
02336   delete result_it;
02337 }
02338 
02339 struct TESS_CHAR : ELIST_LINK {
02340   char *unicode_repr;
02341   int length;  // of unicode_repr
02342   float cost;
02343   TBOX box;
02344 
02345   TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
02346     length = (len == -1 ? strlen(repr) : len);
02347     unicode_repr = new char[length + 1];
02348     strncpy(unicode_repr, repr, length);
02349   }
02350 
02351   TESS_CHAR() {  // Satisfies ELISTIZE.
02352   }
02353   ~TESS_CHAR() {
02354     delete [] unicode_repr;
02355   }
02356 };
02357 
02358 ELISTIZEH(TESS_CHAR)
02359 ELISTIZE(TESS_CHAR)
02360 
02361 static void add_space(TESS_CHAR_IT* it) {
02362   TESS_CHAR *t = new TESS_CHAR(0, " ");
02363   it->add_after_then_move(t);
02364 }
02365 
02366 
/**
 * Convert a rating into a cost by shifting it up by 100 and clamping the
 * result at zero from below.
 */
static float rating_to_cost(float rating) {
  const float shifted = 100 + rating;
  // Ratings worse than -100 are not expected, but clamp them just in case.
  return shifted < 0 ? 0 : shifted;
}
02375 
02380 static void extract_result(TESS_CHAR_IT* out,
02381                            PAGE_RES* page_res) {
02382   PAGE_RES_IT page_res_it(page_res);
02383   int word_count = 0;
02384   while (page_res_it.word() != NULL) {
02385     WERD_RES *word = page_res_it.word();
02386     const char *str = word->best_choice->unichar_string().string();
02387     const char *len = word->best_choice->unichar_lengths().string();
02388     TBOX real_rect = word->word->bounding_box();
02389 
02390     if (word_count)
02391       add_space(out);
02392     int n = strlen(len);
02393     for (int i = 0; i < n; i++) {
02394       TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()),
02395                                     str, *len);
02396       tc->box = real_rect.intersection(word->box_word->BlobBox(i));
02397       out->add_after_then_move(tc);
02398        str += *len;
02399       len++;
02400     }
02401     page_res_it.forward();
02402     word_count++;
02403   }
02404 }
02405 
02410 int TessBaseAPI::TesseractExtractResult(char** text,
02411                                         int** lengths,
02412                                         float** costs,
02413                                         int** x0,
02414                                         int** y0,
02415                                         int** x1,
02416                                         int** y1,
02417                                         PAGE_RES* page_res) {
02418   TESS_CHAR_LIST tess_chars;
02419   TESS_CHAR_IT tess_chars_it(&tess_chars);
02420   extract_result(&tess_chars_it, page_res);
02421   tess_chars_it.move_to_first();
02422   int n = tess_chars.length();
02423   int text_len = 0;
02424   *lengths = new int[n];
02425   *costs = new float[n];
02426   *x0 = new int[n];
02427   *y0 = new int[n];
02428   *x1 = new int[n];
02429   *y1 = new int[n];
02430   int i = 0;
02431   for (tess_chars_it.mark_cycle_pt();
02432        !tess_chars_it.cycled_list();
02433        tess_chars_it.forward(), i++) {
02434     TESS_CHAR *tc = tess_chars_it.data();
02435     text_len += (*lengths)[i] = tc->length;
02436     (*costs)[i] = tc->cost;
02437     (*x0)[i] = tc->box.left();
02438     (*y0)[i] = tc->box.bottom();
02439     (*x1)[i] = tc->box.right();
02440     (*y1)[i] = tc->box.top();
02441   }
02442   char *p = *text = new char[text_len];
02443 
02444   tess_chars_it.move_to_first();
02445   for (tess_chars_it.mark_cycle_pt();
02446         !tess_chars_it.cycled_list();
02447        tess_chars_it.forward()) {
02448     TESS_CHAR *tc = tess_chars_it.data();
02449     strncpy(p, tc->unicode_repr, tc->length);
02450     p += tc->length;
02451   }
02452   return n;
02453 }
02454 
02456 // The resulting features are returned in int_features, which must be
02457 // of size MAX_NUM_INT_FEATURES. The number of features is returned in
02458 // num_features (or 0 if there was a failure).
02459 // On return feature_outline_index is filled with an index of the outline
02460 // corresponding to each feature in int_features.
02461 // TODO(rays) Fix the caller to out outline_counts instead.
02462 void TessBaseAPI::GetFeaturesForBlob(TBLOB* blob,
02463                                      INT_FEATURE_STRUCT* int_features,
02464                                      int* num_features,
02465                                      int* feature_outline_index) {
02466   GenericVector<int> outline_counts;
02467   GenericVector<INT_FEATURE_STRUCT> bl_features;
02468   GenericVector<INT_FEATURE_STRUCT> cn_features;
02469   INT_FX_RESULT_STRUCT fx_info;
02470   tesseract_->ExtractFeatures(*blob, false, &bl_features,
02471                               &cn_features, &fx_info, &outline_counts);
02472   if (cn_features.size() == 0 || cn_features.size() > MAX_NUM_INT_FEATURES) {
02473     *num_features = 0;
02474     return;  // Feature extraction failed.
02475   }
02476   *num_features = cn_features.size();
02477   memcpy(int_features, &cn_features[0], *num_features * sizeof(cn_features[0]));
02478   // TODO(rays) Pass outline_counts back and simplify the calling code.
02479   if (feature_outline_index != NULL) {
02480     int f = 0;
02481     for (int i = 0; i < outline_counts.size(); ++i) {
02482       while (f < outline_counts[i])
02483         feature_outline_index[f++] = i;
02484     }
02485   }
02486 }
02487 
02488 // This method returns the row to which a box of specified dimensions would
02489 // belong. If no good match is found, it returns NULL.
02490 ROW* TessBaseAPI::FindRowForBox(BLOCK_LIST* blocks,
02491                                 int left, int top, int right, int bottom) {
02492   TBOX box(left, bottom, right, top);
02493   BLOCK_IT b_it(blocks);
02494   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
02495     BLOCK* block = b_it.data();
02496     if (!box.major_overlap(block->bounding_box()))
02497       continue;
02498     ROW_IT r_it(block->row_list());
02499     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
02500       ROW* row = r_it.data();
02501       if (!box.major_overlap(row->bounding_box()))
02502         continue;
02503       WERD_IT w_it(row->word_list());
02504       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
02505         WERD* word = w_it.data();
02506         if (box.major_overlap(word->bounding_box()))
02507           return row;
02508       }
02509     }
02510   }
02511   return NULL;
02512 }
02513 
02515 void TessBaseAPI::RunAdaptiveClassifier(TBLOB* blob,
02516                                         int num_max_matches,
02517                                         int* unichar_ids,
02518                                         float* ratings,
02519                                         int* num_matches_returned) {
02520   BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
02521   tesseract_->AdaptiveClassifier(blob, choices);
02522   BLOB_CHOICE_IT choices_it(choices);
02523   int& index = *num_matches_returned;
02524   index = 0;
02525   for (choices_it.mark_cycle_pt();
02526        !choices_it.cycled_list() && index < num_max_matches;
02527        choices_it.forward()) {
02528     BLOB_CHOICE* choice = choices_it.data();
02529     unichar_ids[index] = choice->unichar_id();
02530     ratings[index] = choice->rating();
02531     ++index;
02532   }
02533   *num_matches_returned = index;
02534   delete choices;
02535 }
02536 
02538 const char* TessBaseAPI::GetUnichar(int unichar_id) {
02539   return tesseract_->unicharset.id_to_unichar(unichar_id);
02540 }
02541 
02543 const Dawg *TessBaseAPI::GetDawg(int i) const {
02544   if (tesseract_ == NULL || i >= NumDawgs()) return NULL;
02545   return tesseract_->getDict().GetDawg(i);
02546 }
02547 
02549 int TessBaseAPI::NumDawgs() const {
02550   return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs();
02551 }
02552 
02554 CubeRecoContext *TessBaseAPI::GetCubeRecoContext() const {
02555   return (tesseract_ == NULL) ? NULL : tesseract_->GetCubeRecoContext();
02556 }
02557 
02558 TessResultRenderer* TessBaseAPI::NewRenderer() {
02559   if (tesseract_->tessedit_create_boxfile
02560       || tesseract_->tessedit_make_boxes_from_boxes) {
02561     return new TessBoxTextRenderer();
02562   } else if (tesseract_->tessedit_create_hocr) {
02563     return new TessHOcrRenderer();
02564   } else if (tesseract_->tessedit_create_pdf) {
02565     return new TessPDFRenderer(tesseract_->datadir.c_str());
02566   } else if (tesseract_->tessedit_write_unlv) {
02567     return new TessUnlvRenderer();
02568   } else if (tesseract_->tessedit_create_boxfile) {
02569     return new TessBoxTextRenderer();
02570   } else {
02571     return new TessTextRenderer();
02572   }
02573 }
02574 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines