tesseract
3.03
|
#include <renderer.h>
Public Member Functions | |
TessPDFRenderer (const char *datadir) | |
Protected Member Functions | |
virtual bool | BeginDocumentHandler () |
virtual bool | AddImageHandler (TessBaseAPI *api) |
virtual bool | EndDocumentHandler () |
Renders Tesseract output into a searchable PDF.
Definition at line 186 of file renderer.h.
tesseract::TessPDFRenderer::TessPDFRenderer | ( | const char * | datadir | ) |
Definition at line 32 of file pdfrenderer.cpp.
: TessResultRenderer("PDF", "pdf") {
  // Directory that is later searched for the embedded font file (pdf.ttf).
  datadir_ = datadir;

  // No PDF objects have been written yet.
  obj_ = 0;

  // Seed the offset table with a single zero entry.
  // NOTE(review): presumably this is the byte offset at which the first
  // appended object (the file header) starts -- confirm against
  // AppendPDFObject.
  offsets_.push_back(0);
}
bool tesseract::TessPDFRenderer::AddImageHandler | ( | TessBaseAPI * | api | ) | [protected, virtual] |
Implements tesseract::TessResultRenderer.
Definition at line 583 of file pdfrenderer.cpp.
{
  // Writes the three PDF objects that make up one page: the /Page
  // dictionary, the compressed content stream holding the recognized text,
  // and the page image itself.
  //
  // api: recognizer holding the current input image and its results.
  // Returns false if there is no input image, the resolution is not
  // positive, the text stream cannot be compressed, or the image cannot be
  // converted into a PDF object; true otherwise.
  char buf[kBasicBufSize];
  Pix *pix = api->GetInputImage();
  // The cast only drops constness; presumably fileToPDFObj takes a
  // non-const char* -- TODO confirm.
  char *filename = const_cast<char *>(api->GetInputName());
  int ppi = api->GetSourceYResolution();
  if (!pix || ppi <= 0)
    return false;

  // Page dimensions converted from pixels to 1/72 inch units.
  double width = pixGetWidth(pix) * 72.0 / ppi;
  double height = pixGetHeight(pix) * 72.0 / ppi;

  // PAGE
  snprintf(buf, sizeof(buf),
           "%ld 0 obj\n"
           "<<\n"
           " /Type /Page\n"
           " /Parent %ld 0 R\n"
           " /MediaBox [0 0 %.2f %.2f]\n"
           " /Contents %ld 0 R\n"
           " /Resources\n"
           " <<\n"
           " /XObject << /Im1 %ld 0 R >>\n"
           " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
           " /Font << /f-0-0 %ld 0 R >>\n"
           " >>\n"
           ">>\n"
           "endobj\n",
           obj_,
           2L,        // Pages object
           width,
           height,
           obj_ + 1,  // Contents object
           obj_ + 2,  // Image object
           3L);       // Type0 Font
  pages_.push_back(obj_);
  AppendPDFObject(buf);

  // CONTENTS
  char *pdftext = GetPDFTextObjects(api, width, height, imagenum());
  long pdftext_len = strlen(pdftext);
  unsigned char *pdftext_casted = reinterpret_cast<unsigned char *>(pdftext);
  size_t len = 0;
  unsigned char *comp_pdftext =
      zlibCompress(pdftext_casted, pdftext_len, &len);
  if (!comp_pdftext) {
    // Compression failed: do not hand a null buffer and an undefined
    // length to AppendData.
    delete[] pdftext;
    return false;
  }
  long comp_pdftext_len = len;
  snprintf(buf, sizeof(buf),
           "%ld 0 obj\n"
           "<<\n"
           " /Length %ld /Filter /FlateDecode\n"
           ">>\n"
           "stream\n",
           obj_, comp_pdftext_len);
  AppendString(buf);
  // The object is written piecewise, so its total size is accumulated by
  // hand and reported through AppendPDFObjectDIY.
  long objsize = strlen(buf);
  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
  objsize += comp_pdftext_len;
  lept_free(comp_pdftext);
  delete[] pdftext;
  snprintf(buf, sizeof(buf),
           "endstream\n"
           "endobj\n");
  AppendString(buf);
  objsize += strlen(buf);
  AppendPDFObjectDIY(objsize);

  // IMAGE
  // Try to build the image object from the input file first and fall back
  // to the in-memory pix if that fails.
  char *pdf_object;
  if (!fileToPDFObj(filename, obj_, &pdf_object, &objsize)) {
    if (!pixToPDFObj(pix, obj_, &pdf_object, &objsize)) {
      return false;
    }
  }
  AppendData(pdf_object, objsize);
  AppendPDFObjectDIY(objsize);
  delete[] pdf_object;
  return true;
}
bool tesseract::TessPDFRenderer::BeginDocumentHandler | ( | ) | [protected, virtual] |
Reimplemented from tesseract::TessResultRenderer.
Definition at line 254 of file pdfrenderer.cpp.
{
  // Writes the fixed preamble of the PDF: the file header, the catalog
  // (object 1), a placeholder for the /Pages object (object 2), and the
  // font objects 3-7, the last of which embeds <datadir_>/pdf.ttf.
  //
  // Returns false if the font file cannot be opened, sized, or read
  // completely; true otherwise.
  char buf[kBasicBufSize];

  // HEADER
  snprintf(buf, sizeof(buf),
           "%%PDF-1.5\n"
           "%%%c%c%c%c\n",
           0xDE, 0xAD, 0xBE, 0xEB);
  AppendPDFObject(buf);

  // CATALOG
  snprintf(buf, sizeof(buf),
           "1 0 obj\n"
           "<<\n"
           " /Type /Catalog\n"
           " /Pages %ld 0 R\n"
           ">>\n"
           "endobj\n",
           2L);
  AppendPDFObject(buf);

  // We are reserving object #2 for the /Pages
  // object, which I am going to create and write
  // at the end of the PDF file.
  AppendPDFObject("");

  // TYPE0 FONT
  snprintf(buf, sizeof(buf),
           "3 0 obj\n"
           "<<\n"
           " /BaseFont /GlyphLessFont\n"
           " /DescendantFonts [ %ld 0 R ]\n"
           " /Encoding /Identity-H\n"
           " /Subtype /Type0\n"
           " /ToUnicode %ld 0 R\n"
           " /Type /Font\n"
           ">>\n"
           "endobj\n",
           4L,  // CIDFontType2 font
           5L   // ToUnicode
           );
  AppendPDFObject(buf);

  // CIDFONTTYPE2
  snprintf(buf, sizeof(buf),
           "4 0 obj\n"
           "<<\n"
           " /BaseFont /GlyphLessFont\n"
           " /CIDToGIDMap /Identity\n"
           " /CIDSystemInfo\n"
           " <<\n"
           " /Ordering (Identity)\n"
           " /Registry (Adobe)\n"
           " /Supplement 0\n"
           " >>\n"
           " /FontDescriptor %ld 0 R\n"
           " /Subtype /CIDFontType2\n"
           " /Type /Font\n"
           " /DW 1000\n"
           ">>\n"
           "endobj\n",
           6L  // Font descriptor
           );
  AppendPDFObject(buf);

  const char *stream =
      "/CIDInit /ProcSet findresource begin\n"
      "12 dict begin\n"
      "begincmap\n"
      "/CIDSystemInfo\n"
      "<<\n"
      " /Registry (Adobe)\n"
      " /Ordering (UCS)\n"
      " /Supplement 0\n"
      ">> def\n"
      "/CMapName /Adobe-Identify-UCS def\n"
      "/CMapType 2 def\n"
      "1 begincodespacerange\n"
      "<0000> <FFFF>\n"
      "endcodespacerange\n"
      "1 beginbfrange\n"
      "<0000> <FFFF> <0000>\n"
      "endbfrange\n"
      "endcmap\n"
      "CMapName currentdict /CMap defineresource pop\n"
      "end\n"
      "end\n";

  // TOUNICODE
  snprintf(buf, sizeof(buf),
           "5 0 obj\n"
           "<< /Length %lu >>\n"
           "stream\n"
           "%s"
           "endstream\n"
           "endobj\n",
           static_cast<unsigned long>(strlen(stream)), stream);
  AppendPDFObject(buf);

  // TODO(jbreiden) Fix the FontBBox entry. And of course make
  // the font data match the descriptor.

  // FONT DESCRIPTOR
  snprintf(buf, sizeof(buf),
           "6 0 obj\n"
           "<<\n"
           " /Ascent 1000\n"
           " /CapHeight 1000\n"
           " /Descent 0\n"  // Nothing goes below baseline
           " /Flags 4\n"
           " /FontBBox [ 0 0 1000 1000 ]\n"
           " /FontFile2 %ld 0 R\n"
           " /FontName /GlyphLessFont\n"
           " /ItalicAngle 0\n"
           " /StemV 80\n"
           " /Type /FontDescriptor\n"
           ">>\n"
           "endobj\n",
           7L  // Font data
           );
  AppendPDFObject(buf);

  // Load the whole font file into memory so it can be embedded verbatim.
  snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_);
  FILE *fp = fopen(buf, "rb");
  if (!fp)
    return false;
  fseek(fp, 0, SEEK_END);
  long int size = ftell(fp);
  if (size < 0) {
    // ftell failed; a negative size must not reach new[] or fread.
    fclose(fp);
    return false;
  }
  fseek(fp, 0, SEEK_SET);
  char *buffer = new char[size];
  if (fread(buffer, 1, size, fp) != static_cast<size_t>(size)) {
    // A short read would embed partly uninitialized font data.
    delete[] buffer;
    fclose(fp);
    return false;
  }
  fclose(fp);

  // FONTFILE2
  snprintf(buf, sizeof(buf),
           "7 0 obj\n"
           "<<\n"
           " /Length %ld\n"
           " /Length1 %ld\n"
           ">>\n"
           "stream\n",
           size, size);
  AppendString(buf);
  // The object is written piecewise, so its total size is accumulated by
  // hand and reported through AppendPDFObjectDIY.
  size_t objsize = strlen(buf);
  AppendData(buffer, size);
  // The font bytes have been handed to AppendData; release our copy.
  // NOTE(review): assumes AppendData does not retain the pointer -- confirm.
  delete[] buffer;
  objsize += size;
  snprintf(buf, sizeof(buf),
           "endstream\n"
           "endobj\n");
  AppendString(buf);
  objsize += strlen(buf);
  AppendPDFObjectDIY(objsize);
  return true;
}
bool tesseract::TessPDFRenderer::EndDocumentHandler | ( | ) | [protected, virtual] |
Reimplemented from tesseract::TessResultRenderer.
Definition at line 662 of file pdfrenderer.cpp.
{
  // Finishes the PDF: writes the deferred /Pages object, the document
  // information dictionary, the cross-reference table and the trailer.
  // Always returns true.
  char buf[kBasicBufSize];

  // We reserved the /Pages object number early, so that the /Page
  // objects could refer to their parent. We finally have enough
  // information to go fill it in. Using lower level calls to manipulate
  // the offset record in two spots, because we are placing objects
  // out of order in the file.

  // PAGES
  const long int kPagesObjectNumber = 2;
  offsets_[kPagesObjectNumber] = offsets_.back();  // manipulation #1
  snprintf(buf, sizeof(buf),
           "%ld 0 obj\n"
           "<<\n"
           " /Type /Pages\n"
           " /Kids [ ",
           kPagesObjectNumber);
  AppendString(buf);
  size_t pages_objsize = strlen(buf);
  for (size_t i = 0; i < pages_.size(); i++) {
    snprintf(buf, sizeof(buf), "%ld 0 R ", pages_[i]);
    AppendString(buf);
    pages_objsize += strlen(buf);
  }
  // The size type of pages_ is not visible here, so convert explicitly to
  // the type the conversion specifier expects.
  snprintf(buf, sizeof(buf),
           "]\n"
           " /Count %ld\n"
           ">>\n"
           "endobj\n",
           static_cast<long>(pages_.size()));
  AppendString(buf);
  pages_objsize += strlen(buf);
  offsets_.back() += pages_objsize;  // manipulation #2

  // INFO
  char *datestr = l_getFormattedDate();
  // NOTE(review): title() is inserted without escaping; a title containing
  // parentheses or backslashes would produce a malformed string object.
  snprintf(buf, sizeof(buf),
           "%ld 0 obj\n"
           "<<\n"
           " /Producer (Tesseract %s)\n"
           " /CreationDate (D:%s)\n"
           " /Title (%s)"
           ">>\n"
           "endobj\n",
           obj_, VERSION,
           datestr ? datestr : "",  // never pass NULL to %s
           title());
  lept_free(datestr);
  AppendPDFObject(buf);

  // XREF
  // Entry 0 is the mandatory free entry; the remaining entries list the
  // recorded byte offset of each object.
  snprintf(buf, sizeof(buf),
           "xref\n"
           "0 %ld\n"
           "0000000000 65535 f \n",
           obj_);
  AppendString(buf);
  for (long i = 1; i < obj_; i++) {
    snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]);
    AppendString(buf);
  }

  // TRAILER
  snprintf(buf, sizeof(buf),
           "trailer\n"
           "<<\n"
           " /Size %ld\n"
           " /Root %ld 0 R\n"
           " /Info %ld 0 R\n"
           ">>\n"
           "startxref\n"
           "%ld\n"
           "%%%%EOF\n",
           obj_,
           1L,        // catalog
           obj_ - 1,  // info
           offsets_.back());
  AppendString(buf);
  return true;
}