tesseract
3.03
|
00001 // Include automatically generated configuration file if running autoconf. 00002 #ifdef HAVE_CONFIG_H 00003 #include "config_auto.h" 00004 #endif 00005 00006 #include <string.h> 00007 #include "baseapi.h" 00008 #include "genericvector.h" 00009 #include "renderer.h" 00010 00011 #if !defined(VERSION) 00012 #include "version.h" 00013 #endif 00014 00015 namespace tesseract { 00016 00017 // Start with a 4K output buffer which should be pretty big for a page of text 00018 // though might need to grow for other formats or multi-page documents. 00019 static const int kInitialAlloc = 1 << 12; 00020 00021 /********************************************************************** 00022 * Base Renderer interface implementation 00023 **********************************************************************/ 00024 TessResultRenderer::TessResultRenderer(const char* type, const char* extension) 00025 : full_typename_(type), file_extension_(extension), 00026 title_(""), imagenum_(-1), 00027 output_data_(NULL), 00028 next_(NULL) { 00029 ResetData(); 00030 } 00031 00032 TessResultRenderer::~TessResultRenderer() { 00033 delete[] output_data_; 00034 delete next_; 00035 } 00036 00037 void TessResultRenderer::insert(TessResultRenderer* next) { 00038 if (next == NULL) return; 00039 00040 TessResultRenderer* remainder = next_; 00041 next_ = next; 00042 if (remainder) { 00043 while (next->next_ != NULL) { 00044 next = next->next_; 00045 } 00046 next->next_ = remainder; 00047 } 00048 } 00049 00050 bool TessResultRenderer::BeginDocument(const char* title) { 00051 ResetData(); 00052 00053 title_ = title; 00054 imagenum_ = -1; 00055 bool ok = BeginDocumentHandler(); 00056 if (next_) { 00057 ok = next_->BeginDocument(title) && ok; 00058 } 00059 return ok; 00060 } 00061 00062 bool TessResultRenderer::AddImage(TessBaseAPI* api) { 00063 ++imagenum_; 00064 bool ok = AddImageHandler(api); 00065 if (next_) { 00066 ok = next_->AddImage(api) && ok; 00067 } 00068 return ok; 00069 } 00070 00071 bool TessResultRenderer::AddError(TessBaseAPI* api) { 00072 ++imagenum_; 00073 bool ok = AddErrorHandler(api); 00074 if (next_) { 00075 ok = next_->AddError(api) && ok; 00076 } 00077 return ok; 00078 } 00079 00080 bool TessResultRenderer::EndDocument() { 00081 bool ok = EndDocumentHandler(); 00082 if (next_) { 00083 ok = next_->EndDocument() && ok; 00084 } 00085 return ok; 00086 } 00087 00088 bool TessResultRenderer::GetOutput(const char** data, int* data_len) const { 00089 *data = output_data_; 00090 *data_len = output_len_; 00091 return true; 00092 } 00093 00094 void TessResultRenderer::ResetData() { 00095 delete[] output_data_; 00096 output_data_ = new char[kInitialAlloc]; 00097 output_alloc_ = kInitialAlloc; 00098 output_len_ = 0; 00099 } 00100 00101 void TessResultRenderer::ReserveAdditionalData(int relative_len) { 00102 int total = relative_len + output_len_; 00103 if (total <= output_alloc_) 00104 return; 00105 00106 if (total < 2 * output_alloc_) { 00107 total = 2 * output_alloc_; 00108 } 00109 00110 char* new_data = new char[total]; 00111 memcpy(new_data, output_data_, output_len_); 00112 delete[] output_data_; 00113 output_data_ = new_data; 00114 } 00115 00116 void TessResultRenderer::AppendString(const char* s) { 00117 AppendData(s, strlen(s)); 00118 } 00119 00120 void TessResultRenderer::AppendData(const char* s, int len) { 00121 ReserveAdditionalData(len); 00122 memcpy(output_data_ + output_len_, s, len); 00123 output_len_ += len; 00124 } 00125 00126 bool TessResultRenderer::BeginDocumentHandler() { 00127 return true; 00128 } 00129 00130 bool TessResultRenderer::AddErrorHandler(TessBaseAPI* api) { 00131 return true; 00132 } 00133 00134 bool TessResultRenderer::EndDocumentHandler() { 00135 return true; 00136 } 00137 00138 00139 /********************************************************************** 00140 * UTF8 Text Renderer interface implementation 00141 **********************************************************************/ 00142 TessTextRenderer::TessTextRenderer() 00143 : TessResultRenderer("Text", "txt") { 00144 } 00145 00146 bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) { 00147 char* utf8 = api->GetUTF8Text(); 00148 if (utf8 == NULL) { 00149 return false; 00150 } 00151 00152 AppendString(utf8); 00153 delete[] utf8; 00154 00155 return true; 00156 } 00157 00158 /********************************************************************** 00159 * HOcr Text Renderer interface implementation 00160 **********************************************************************/ 00161 TessHOcrRenderer::TessHOcrRenderer() 00162 : TessResultRenderer("HOcr", "hocr") { 00163 } 00164 00165 bool TessHOcrRenderer::BeginDocumentHandler() { 00166 AppendString( 00167 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" 00168 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n" 00169 " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" 00170 "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" " 00171 "lang=\"en\">\n <head>\n <title>\n"); 00172 AppendString(title()); 00173 AppendString( 00174 "</title>\n" 00175 "<meta http-equiv=\"Content-Type\" content=\"text/html;" 00176 "charset=utf-8\" />\n" 00177 " <meta name='ocr-system' content='tesseract " VERSION "' />\n" 00178 " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par" 00179 " ocr_line ocrx_word'/>\n" 00180 "</head>\n<body>\n"); 00181 00182 return true; 00183 } 00184 00185 bool TessHOcrRenderer::EndDocumentHandler() { 00186 AppendString(" </body>\n</html>\n"); 00187 00188 return true; 00189 } 00190 00191 bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) { 00192 char* hocr = api->GetHOCRText(imagenum()); 00193 if (hocr == NULL) return false; 00194 00195 AppendString(hocr); 00196 delete[] hocr; 00197 00198 return true; 00199 } 00200 00201 /********************************************************************** 00202 * UNLV Text Renderer interface implementation 00203 **********************************************************************/ 00204 TessUnlvRenderer::TessUnlvRenderer() 00205 : TessResultRenderer("UNLV", "unlv") { 00206 } 00207 00208 bool TessUnlvRenderer::AddImageHandler(TessBaseAPI* api) { 00209 char* unlv = api->GetUNLVText(); 00210 if (unlv == NULL) return false; 00211 00212 AppendString(unlv); 00213 delete[] unlv; 00214 00215 return true; 00216 } 00217 00218 /********************************************************************** 00219 * BoxText Renderer interface implementation 00220 **********************************************************************/ 00221 TessBoxTextRenderer::TessBoxTextRenderer() 00222 : TessResultRenderer("Box Text", "box") { 00223 } 00224 00225 bool TessBoxTextRenderer::AddImageHandler(TessBaseAPI* api) { 00226 char* text = api->GetBoxText(imagenum()); 00227 if (text == NULL) return false; 00228 00229 AppendString(text); 00230 delete[] text; 00231 00232 return true; 00233 } 00234 00235 } // namespace tesseract