tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/api/renderer.cpp
Go to the documentation of this file.
00001 // Include automatically generated configuration file if running autoconf.
00002 #ifdef HAVE_CONFIG_H
00003 #include "config_auto.h"
00004 #endif
00005 
00006 #include <string.h>
00007 #include "baseapi.h"
00008 #include "genericvector.h"
00009 #include "renderer.h"
00010 
00011 #if !defined(VERSION)
00012 #include "version.h"
00013 #endif
00014 
00015 namespace tesseract {
00016 
// Initial size, in bytes, of a renderer's output buffer: 4 KiB. That is
// generous for one page of plain text; other formats and multi-page
// documents may need the buffer to grow.
static const int kInitialAlloc = 4 * 1024;
00020 
00021 /**********************************************************************
00022  * Base Renderer interface implementation
00023  **********************************************************************/
00024 TessResultRenderer::TessResultRenderer(const char* type, const char* extension)
00025     : full_typename_(type), file_extension_(extension),
00026       title_(""), imagenum_(-1),
00027       output_data_(NULL),
00028       next_(NULL) {
00029   ResetData();
00030 }
00031 
00032 TessResultRenderer::~TessResultRenderer() {
00033   delete[] output_data_;
00034   delete next_;
00035 }
00036 
00037 void TessResultRenderer::insert(TessResultRenderer* next) {
00038   if (next == NULL) return;
00039 
00040   TessResultRenderer* remainder = next_;
00041   next_ = next;
00042   if (remainder) {
00043     while (next->next_ != NULL) {
00044       next = next->next_;
00045     }
00046     next->next_ = remainder;
00047   }
00048 }
00049 
00050 bool TessResultRenderer::BeginDocument(const char* title) {
00051   ResetData();
00052 
00053   title_ = title;
00054   imagenum_ = -1;
00055   bool ok = BeginDocumentHandler();
00056   if (next_) {
00057     ok = next_->BeginDocument(title) && ok;
00058   }
00059   return ok;
00060 }
00061 
00062 bool TessResultRenderer::AddImage(TessBaseAPI* api) {
00063   ++imagenum_;
00064   bool ok = AddImageHandler(api);
00065   if (next_) {
00066     ok = next_->AddImage(api) && ok;
00067   }
00068   return ok;
00069 }
00070 
00071 bool TessResultRenderer::AddError(TessBaseAPI* api) {
00072   ++imagenum_;
00073   bool ok = AddErrorHandler(api);
00074   if (next_) {
00075     ok = next_->AddError(api) && ok;
00076   }
00077   return ok;
00078 }
00079 
00080 bool TessResultRenderer::EndDocument() {
00081   bool ok = EndDocumentHandler();
00082   if (next_) {
00083     ok = next_->EndDocument() && ok;
00084   }
00085   return ok;
00086 }
00087 
00088 bool TessResultRenderer::GetOutput(const char** data, int* data_len) const {
00089   *data = output_data_;
00090   *data_len = output_len_;
00091   return true;
00092 }
00093 
00094 void TessResultRenderer::ResetData() {
00095   delete[] output_data_;
00096   output_data_ = new char[kInitialAlloc];
00097   output_alloc_ = kInitialAlloc;
00098   output_len_ = 0;
00099 }
00100 
00101 void TessResultRenderer::ReserveAdditionalData(int relative_len) {
00102   int total = relative_len + output_len_;
00103   if (total <= output_alloc_)
00104     return;
00105 
00106   if (total < 2 * output_alloc_) {
00107     total = 2 * output_alloc_;
00108   }
00109 
00110   char* new_data = new char[total];
00111   memcpy(new_data, output_data_, output_len_);
00112   delete[] output_data_;
00113   output_data_ = new_data;
00114 }
00115 
00116 void TessResultRenderer::AppendString(const char* s) {
00117   AppendData(s, strlen(s));
00118 }
00119 
00120 void TessResultRenderer::AppendData(const char* s, int len) {
00121   ReserveAdditionalData(len);
00122   memcpy(output_data_ + output_len_, s, len);
00123   output_len_ += len;
00124 }
00125 
00126 bool TessResultRenderer::BeginDocumentHandler() {
00127   return true;
00128 }
00129 
00130 bool TessResultRenderer::AddErrorHandler(TessBaseAPI* api) {
00131   return true;
00132 }
00133 
00134 bool TessResultRenderer::EndDocumentHandler() {
00135   return true;
00136 }
00137 
00138 
00139 /**********************************************************************
00140  * UTF8 Text Renderer interface implementation
00141  **********************************************************************/
00142 TessTextRenderer::TessTextRenderer()
00143     : TessResultRenderer("Text", "txt") {
00144 }
00145 
00146 bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) {
00147   char* utf8 = api->GetUTF8Text();
00148   if (utf8 == NULL) {
00149     return false;
00150   }
00151 
00152   AppendString(utf8);
00153   delete[] utf8;
00154 
00155   return true;
00156 }
00157 
00158 /**********************************************************************
00159  * HOcr Text Renderer interface implementation
00160  **********************************************************************/
00161 TessHOcrRenderer::TessHOcrRenderer()
00162     : TessResultRenderer("HOcr", "hocr") {
00163 }
00164 
00165 bool TessHOcrRenderer::BeginDocumentHandler() {
00166   AppendString(
00167         "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
00168         "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
00169         "    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
00170         "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
00171         "lang=\"en\">\n <head>\n  <title>\n");
00172   AppendString(title());
00173   AppendString(
00174       "</title>\n"
00175       "<meta http-equiv=\"Content-Type\" content=\"text/html;"
00176       "charset=utf-8\" />\n"
00177       "  <meta name='ocr-system' content='tesseract " VERSION "' />\n"
00178       "  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
00179       " ocr_line ocrx_word'/>\n"
00180       "</head>\n<body>\n");
00181 
00182   return true;
00183 }
00184 
00185 bool TessHOcrRenderer::EndDocumentHandler() {
00186   AppendString(" </body>\n</html>\n");
00187 
00188   return true;
00189 }
00190 
00191 bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) {
00192   char* hocr = api->GetHOCRText(imagenum());
00193   if (hocr == NULL) return false;
00194 
00195   AppendString(hocr);
00196   delete[] hocr;
00197 
00198   return true;
00199 }
00200 
00201 /**********************************************************************
00202  * UNLV Text Renderer interface implementation
00203  **********************************************************************/
00204 TessUnlvRenderer::TessUnlvRenderer()
00205     : TessResultRenderer("UNLV", "unlv") {
00206 }
00207 
00208 bool TessUnlvRenderer::AddImageHandler(TessBaseAPI* api) {
00209   char* unlv = api->GetUNLVText();
00210   if (unlv == NULL) return false;
00211 
00212   AppendString(unlv);
00213   delete[] unlv;
00214 
00215   return true;
00216 }
00217 
00218 /**********************************************************************
00219  * BoxText Renderer interface implementation
00220  **********************************************************************/
00221 TessBoxTextRenderer::TessBoxTextRenderer()
00222     : TessResultRenderer("Box Text", "box") {
00223 }
00224 
00225 bool TessBoxTextRenderer::AddImageHandler(TessBaseAPI* api) {
00226   char* text = api->GetBoxText(imagenum());
00227   if (text == NULL) return false;
00228 
00229   AppendString(text);
00230   delete[] text;
00231 
00232   return true;
00233 }
00234 
00235 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines