tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/api/pdfrenderer.cpp
Go to the documentation of this file.
00001 // Include automatically generated configuration file if running autoconf.
00002 #ifdef HAVE_CONFIG_H
00003 #include "config_auto.h"
00004 #endif
00005 
00006 #include "baseapi.h"
00007 #include "renderer.h"
00008 #include "math.h"
00009 #include "strngs.h"
00010 #include "cube_utils.h"
00011 #include "allheaders.h"
00012 
00013 #if !defined(VERSION)
00014 #include "version.h"
00015 #endif
00016 
00017 #ifdef _MSC_VER
00018 #include "mathfix.h"
00019 #endif
00020 
00021 namespace tesseract {
00022 
// Capacity, in bytes, of the scratch buffers used to format PDF object
// fragments. It has to be big enough to hold a colormap with 256
// colors written out in the verbose PDF representation.
const int kBasicBufSize = 2 * 1024;
00027 
00028 /**********************************************************************
00029  * PDF Renderer interface implementation
00030  **********************************************************************/
00031 
00032 TessPDFRenderer::TessPDFRenderer(const char *datadir)
00033     : TessResultRenderer("PDF", "pdf") {
00034   obj_  = 0;
00035   datadir_ = datadir;
00036   offsets_.push_back(0);
00037 }
00038 
00039 void TessPDFRenderer::AppendPDFObjectDIY(size_t objectsize) {
00040   offsets_.push_back(objectsize + offsets_.back());
00041   obj_++;
00042 }
00043 
00044 void TessPDFRenderer::AppendPDFObject(const char *data) {
00045   AppendPDFObjectDIY(strlen(data));
00046   AppendString((const char *)data);
00047 }
00048 
// Helper function to prevent us from accidentally writing
// scientific notation to an HOCR or PDF file. Rounds x to three
// decimal places, which is all the precision we really need, and
// never returns negative zero.
double prec(double x) {
  const double kPrecision = 1000.0;
  const double rounded = round(x * kPrecision) / kPrecision;
  // The comparison holds for both +0.0 and -0.0; hand back a plain zero.
  return (rounded == 0) ? 0 : rounded;
}
00059 
// Returns the squared Euclidean distance between (x1, y1) and (x2, y2).
// The deltas are widened to long before they are multiplied so that the
// squares are not computed (and overflowed) in int arithmetic.
long dist2(int x1, int y1, int x2, int y2) {
  const long dx = static_cast<long>(x2) - x1;
  const long dy = static_cast<long>(y2) - y1;
  return dx * dx + dy * dy;
}
00063 
00064 char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
00065                                          double width, double height,
00066                                          int page_number) {
00067   double ppi = api->GetSourceYResolution();
00068   STRING pdf_str("");
00069   double old_x = 0.0, old_y = 0.0;
00070   int old_pointsize = 0;
00071 
00072   // TODO(jbreiden) Slightly cleaner from an abstraction standpoint
00073   // if this were to live inside a separate text object.
00074   pdf_str += "q ";
00075   pdf_str.add_str_double("", prec(width));
00076   pdf_str += " 0 0 ";
00077   pdf_str.add_str_double("", prec(height));
00078   pdf_str += " 0 0 cm /Im1 Do Q\n";
00079 
00080   ResultIterator *res_it = api->GetIterator();
00081 
00082   while (!res_it->Empty(RIL_BLOCK)) {
00083     if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
00084       pdf_str += "BT\n3 Tr\n";  // Begin text object, use invisible ink
00085       old_pointsize = 0.0;      // Every block will declare its font
00086     }
00087 
00088     int line_x1, line_y1, line_x2, line_y2;
00089     if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
00090       res_it->Baseline(RIL_TEXTLINE,
00091                      &line_x1, &line_y1, &line_x2, &line_y2);
00092     }
00093 
00094     if (res_it->Empty(RIL_WORD)) {
00095       res_it->Next(RIL_WORD);
00096       continue;
00097     }
00098 
00099     int word_x1, word_y1, word_x2, word_y2;
00100     res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
00101 
00102     // The critical one is writing_direction
00103     tesseract::Orientation orientation;
00104     tesseract::WritingDirection writing_direction;
00105     tesseract::TextlineOrder textline_order;
00106     float deskew_angle;
00107     res_it->Orientation(&orientation, &writing_direction,
00108                         &textline_order, &deskew_angle);
00109 
00110     // Unlike Tesseract, we always want the word baseline in reading order.
00111     if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
00112       Swap(&word_x1, &word_x2);
00113       Swap(&word_y1, &word_y2);
00114     }
00115 
00116     // Viewers like evince can get really confused during copy-paste
00117     // when the baseline wanders around. I've decided to force every
00118     // word to match the (straight) baseline.  The math below is just
00119     // projecting the word origin onto the baseline.  All numbers are
00120     // in the native PDF coordinate system, which has the origin in
00121     // the bottom left and the unit is points, which is 1/72 inch.
00122     double word_length;
00123     double x, y;
00124     {
00125       int px = word_x1;
00126       int py = word_y1;
00127       double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
00128       if (l2 == 0) {
00129         x = line_x1;
00130         y = line_y1;
00131       } else {
00132         double t = ((px - line_x2) * (line_x2 - line_x1) +
00133                     (py - line_y2) * (line_y2 - line_y1)) / l2;
00134         x = line_x2 + t * (line_x2 - line_x1);
00135         y = line_y2 + t * (line_y2 - line_y1);
00136       }
00137       word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1,
00138                                                    word_x2, word_y2)));
00139       word_length = word_length * 72.0 / ppi;
00140       x = x * 72 / ppi;
00141       y = height - (y * 72.0 / ppi);
00142     }
00143 
00144     int pointsize = 0;
00145     if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
00146       // Calculate the rotation angle in the PDF cooordinate system,
00147       // which has the origin in the bottom left. The Tesseract
00148       // coordinate system has the origin in the upper left.
00149       //
00150       // PDF is kind of a like turtle graphics, and we orient the
00151       // turtle (errr... initial cursor position) with an affine
00152       // transformation.
00153       //
00154       //                                Rotate              RTL    Translate
00155       //
00156       // [ x' y' 1 ]  = [ x y 1 ] [ cos𝜃 -sin𝜃 0 ]  [ -1 0 0 ] [ 1 0 0 ]
00157       //                          [ sin𝜃  cos𝜃 0 ]  [  0 1 0 ] [ 0 1 0 ]
00158       //                          [   0    0   1 ]  [  0 0 1 ] [ x y 1 ]
00159       //
00160       double theta = atan2(static_cast<double>(line_y1 - line_y2),
00161                            static_cast<double>(line_x2 - line_x1));
00162       double a, b, c, d;
00163       a = cos(theta);
00164       b = sin(theta);
00165       c = -sin(theta);
00166       d = cos(theta);
00167       switch(writing_direction) {
00168         case WRITING_DIRECTION_RIGHT_TO_LEFT:
00169           a = -a;
00170           b = -b;
00171           c = -c;
00172           break;
00173         case WRITING_DIRECTION_TOP_TO_BOTTOM:
00174           // TODO(jbreiden) Consider switching PDF writing mode to vertical.
00175           break;
00176         default:
00177           break;
00178       }
00179 
00180       pdf_str.add_str_double("",  prec(a));  // . This affine matrix
00181       pdf_str.add_str_double(" ", prec(b));  // . sets the coordinate
00182       pdf_str.add_str_double(" ", prec(c));  // . system for all
00183       pdf_str.add_str_double(" ", prec(d));  // . text in the entire
00184       pdf_str.add_str_double(" ", prec(x));  // . line.
00185       pdf_str.add_str_double(" ", prec(y));  // .
00186       pdf_str += (" Tm ");                   // Place cursor absolutely
00187     } else {
00188       double offset = sqrt(static_cast<double>(dist2(old_x, old_y, x, y)));
00189       pdf_str.add_str_double(" ", prec(offset));  // Delta x in pts
00190       pdf_str.add_str_double(" ", 0);             // Delta y in pts
00191       pdf_str += (" Td ");                        // Relative moveto
00192     }
00193     old_x = x;
00194     old_y = y;
00195 
00196     // Adjust font size on a per word granularity. Pay attention to
00197     // pointsize, old_pointsize, and pdf_str.
00198     {
00199       bool bold, italic, underlined, monospace, serif, smallcaps;
00200       int font_id;
00201       res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
00202                                  &serif, &smallcaps, &pointsize, &font_id);
00203       if (pointsize != old_pointsize) {
00204         char textfont[20];
00205         snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", pointsize);
00206         pdf_str += textfont;
00207         old_pointsize = pointsize;
00208       }
00209     }
00210 
00211     bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
00212     bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
00213     STRING pdf_word("");
00214     int pdf_word_len = 0;
00215     do {
00216       const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
00217       if (grapheme && grapheme[0] != 0) {
00218         // TODO(jbreiden) Do a real UTF-16BE conversion
00219         // http://en.wikipedia.org/wiki/UTF-16#Example_UTF-16_encoding_procedure
00220         string_32 utf32;
00221         CubeUtils::UTF8ToUTF32(grapheme, &utf32);
00222         char utf16[20];
00223         for (int i = 0; i < static_cast<int>(utf32.length()); i++) {
00224           snprintf(utf16, sizeof(utf16), "<%04X>", utf32[i]);
00225           pdf_word += utf16;
00226           pdf_word_len++;
00227         }
00228       }
00229       delete []grapheme;
00230       res_it->Next(RIL_SYMBOL);
00231     } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
00232     if (word_length > 0 && pdf_word_len > 0 && pointsize > 0) {
00233       double h_stretch =
00234           prec(100.0 * word_length / (pointsize * pdf_word_len));
00235       pdf_str.add_str_double("", h_stretch);
00236       pdf_str += " Tz";          // horizontal stretch
00237       pdf_str += " [ ";
00238       pdf_str += pdf_word;       // UTF-16BE representation
00239       pdf_str += " ] TJ";        // show the text
00240     }
00241     if (last_word_in_line) {
00242       pdf_str += " \n";
00243     }
00244     if (last_word_in_block) {
00245       pdf_str += "ET\n";         // end the text object
00246     }
00247   }
00248   char *ret = new char[pdf_str.length() + 1];
00249   strcpy(ret, pdf_str.string());
00250   delete res_it;
00251   return ret;
00252 }
00253 
00254 bool TessPDFRenderer::BeginDocumentHandler() {
00255   fprintf(stderr, "XXX 2");
00256 
00257   char buf[kBasicBufSize];
00258 
00259   snprintf(buf, sizeof(buf),
00260            "%%PDF-1.5\n"
00261            "%%%c%c%c%c\n",
00262            0xDE, 0xAD, 0xBE, 0xEB);
00263   AppendPDFObject(buf);
00264 
00265   // CATALOG
00266   snprintf(buf, sizeof(buf),
00267            "1 0 obj\n"
00268            "<<\n"
00269            "  /Type /Catalog\n"
00270            "  /Pages %ld 0 R\n"
00271            ">>\n"
00272            "endobj\n", 2L);
00273   AppendPDFObject(buf);
00274 
00275   // We are reserving object #2 for the /Pages
00276   // object, which I am going to create and write
00277   // at the end of the PDF file.
00278   AppendPDFObject("");
00279 
00280   // TYPE0 FONT
00281   snprintf(buf, sizeof(buf),
00282            "3 0 obj\n"
00283            "<<\n"
00284            "  /BaseFont /GlyphLessFont\n"
00285            "  /DescendantFonts [ %ld 0 R ]\n"
00286            "  /Encoding /Identity-H\n"
00287            "  /Subtype /Type0\n"
00288            "  /ToUnicode %ld 0 R\n"
00289            "  /Type /Font\n"
00290            ">>\n"
00291            "endobj\n",
00292            4L,          // CIDFontType2 font
00293            5L           // ToUnicode
00294            );
00295   AppendPDFObject(buf);
00296 
00297   // CIDFONTTYPE2
00298   snprintf(buf, sizeof(buf),
00299            "4 0 obj\n"
00300            "<<\n"
00301            "  /BaseFont /GlyphLessFont\n"
00302            "  /CIDToGIDMap /Identity\n"
00303            "  /CIDSystemInfo\n"
00304            "  <<\n"
00305            "     /Ordering (Identity)\n"
00306            "     /Registry (Adobe)\n"
00307            "     /Supplement 0\n"
00308            "  >>\n"
00309            "  /FontDescriptor %ld 0 R\n"
00310            "  /Subtype /CIDFontType2\n"
00311            "  /Type /Font\n"
00312            "  /DW 1000\n"
00313            ">>\n"
00314            "endobj\n",
00315            6L         // Font descriptor
00316            );
00317   AppendPDFObject(buf);
00318 
00319   const char *stream =
00320       "/CIDInit /ProcSet findresource begin\n"
00321       "12 dict begin\n"
00322       "begincmap\n"
00323       "/CIDSystemInfo\n"
00324       "<<\n"
00325       "  /Registry (Adobe)\n"
00326       "  /Ordering (UCS)\n"
00327       "  /Supplement 0\n"
00328       ">> def\n"
00329       "/CMapName /Adobe-Identify-UCS def\n"
00330       "/CMapType 2 def\n"
00331       "1 begincodespacerange\n"
00332       "<0000> <FFFF>\n"
00333       "endcodespacerange\n"
00334       "1 beginbfrange\n"
00335       "<0000> <FFFF> <0000>\n"
00336       "endbfrange\n"
00337       "endcmap\n"
00338       "CMapName currentdict /CMap defineresource pop\n"
00339       "end\n"
00340       "end\n";
00341 
00342   // TOUNICODE
00343   snprintf(buf, sizeof(buf),
00344            "5 0 obj\n"
00345            "<< /Length %lu >>\n"
00346            "stream\n"
00347            "%s"
00348            "endstream\n"
00349            "endobj\n", (unsigned long) strlen(stream), stream);
00350   AppendPDFObject(buf);
00351 
00352   // TODO(jbreiden) Fix the FontBBox entry. And of course make
00353   // the font data match the descriptor.
00354   // FONT DESCRIPTOR
00355   snprintf(buf, sizeof(buf),
00356            "6 0 obj\n"
00357            "<<\n"
00358            "  /Ascent 1000\n"
00359            "  /CapHeight 1000\n"
00360            "  /Descent 0\n"          // Nothing goes below baseline
00361            "  /Flags 4\n"
00362            "  /FontBBox  [ 0 0 1000 1000 ]\n"
00363            "  /FontFile2 %ld 0 R\n"
00364            "  /FontName /GlyphLessFont\n"
00365            "  /ItalicAngle 0\n"
00366            "  /StemV 80\n"
00367            "  /Type /FontDescriptor\n"
00368            ">>\n"
00369            "endobj\n",
00370            7L      // Font data
00371            );
00372   AppendPDFObject(buf);
00373 
00374   snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_);
00375   FILE *fp = fopen(buf, "rb");
00376   if (!fp)
00377     return false;
00378   fseek(fp, 0, SEEK_END);
00379   long int size = ftell(fp);
00380   fseek(fp, 0, SEEK_SET);
00381   char *buffer = new char[size];
00382   fread(buffer, 1, size, fp);
00383   fclose(fp);
00384   // FONTFILE2
00385   snprintf(buf, sizeof(buf),
00386            "7 0 obj\n"
00387            "<<\n"
00388            "  /Length %ld\n"
00389            "  /Length1 %ld\n"
00390            ">>\n"
00391            "stream\n", size, size);
00392   AppendString(buf);
00393   size_t objsize  = strlen(buf);
00394   AppendData(buffer, size);
00395   objsize += size;
00396   snprintf(buf, sizeof(buf),
00397            "endstream\n"
00398            "endobj\n");
00399   AppendString(buf);
00400   objsize += strlen(buf);
00401   AppendPDFObjectDIY(objsize);
00402   return true;
00403 }
00404 
00405 // TODO(jbreiden) I hear that you can pull the flate stream out
00406 // of a PNG file and, by mentioning the predictor in the PDF object,
00407 // make most of them work without transcoding. If so that's a big win
00408 // versus what we do now. Try it out.
00409 bool TessPDFRenderer::fileToPDFObj(char *filename, long int objnum,
00410                                    char **pdf_object,
00411                                    long int *pdf_object_size) {
00412   char b1[kBasicBufSize];
00413   char b2[kBasicBufSize];
00414   if (!pdf_object_size || !pdf_object)
00415     return false;
00416   *pdf_object = NULL;
00417   *pdf_object_size = 0;
00418   if (!filename)
00419     return false;
00420   FILE *fp = fopen(filename, "rb");
00421   if (!fp)
00422     return false;
00423   int format;
00424 
00425   findFileFormatStream(fp, &format);
00426   if (format != IFF_JFIF_JPEG) {
00427     fclose(fp);
00428     return false;
00429   }
00430 
00431   fseek(fp, 0, SEEK_END);
00432   long int jpeg_size = ftell(fp);
00433   fseek(fp, 0, SEEK_SET);
00434 
00435   int spp, cmyk, w, h;
00436   freadHeaderJpeg(fp, &w, &h, &spp, NULL, &cmyk);
00437   const char *colorspace;
00438   switch (spp) {
00439     case 1:
00440       colorspace = "/DeviceGray";
00441       break;
00442     case 3:
00443       colorspace = "/DeviceRGB";
00444       break;
00445     case 4:
00446       if (cmyk)
00447         colorspace = "/DeviceCMYK";
00448       else
00449         return false;
00450       break;
00451     default:
00452       return false;
00453   }
00454 
00455   // IMAGE
00456   snprintf(b1, sizeof(b1),
00457            "%ld 0 obj\n"
00458            "<<\n"
00459            "  /Length %ld\n"
00460            "  /Subtype /Image\n"
00461            "  /ColorSpace %s\n"
00462            "  /Width %d\n"
00463            "  /Height %d\n"
00464            "  /BitsPerComponent 8\n"
00465            "  /Filter /DCTDecode\n"
00466            ">>\n"
00467            "stream\n", objnum, jpeg_size,
00468            colorspace, w, h);
00469   size_t b1_len = strlen(b1);
00470 
00471   snprintf(b2, sizeof(b2),
00472            "\n"
00473            "endstream\n"
00474            "endobj\n");
00475   size_t b2_len = strlen(b2);
00476 
00477   *pdf_object_size = b1_len + jpeg_size + b2_len;
00478   *pdf_object = new char[*pdf_object_size];
00479   if (!pdf_object)
00480     return false;
00481   memcpy(*pdf_object, b1, b1_len);
00482   if (static_cast<int>(fread(*pdf_object + b1_len, 1, jpeg_size, fp)) !=
00483       jpeg_size) {
00484     delete[] pdf_object;
00485     return false;
00486   }
00487   memcpy(*pdf_object + b1_len + jpeg_size, b2, b2_len);
00488   fclose(fp);
00489   fprintf(stderr, "XXX 1");
00490   return true;
00491 }
00492 
00493 bool TessPDFRenderer::pixToPDFObj(Pix *pix, long int objnum,
00494                                   char **pdf_object,
00495                                   long int *pdf_object_size) {
00496   if (!pdf_object_size || !pdf_object)
00497     return false;
00498   *pdf_object = NULL;
00499   *pdf_object_size = 0;
00500   char b0[kBasicBufSize];
00501   char b1[kBasicBufSize * 2];
00502   char b2[kBasicBufSize];
00503   L_COMP_DATA *cid;
00504   int encoding_type;
00505   const int kJpegQuality = 85;
00506   if (selectDefaultPdfEncoding(pix, &encoding_type) != 0)
00507     return false;
00508   if (pixGenerateCIData(pix, encoding_type, kJpegQuality, 0, &cid) != 0)
00509     return false;
00510 
00511   const char *filter;
00512   switch(encoding_type) {
00513     case L_FLATE_ENCODE:
00514       filter = "/FlateDecode";
00515       break;
00516     case L_JPEG_ENCODE:
00517       filter = "/DCTDecode";
00518       break;
00519     case L_G4_ENCODE:
00520       filter = "/CCITTFaxDecode";
00521       break;
00522     default:
00523       return false;
00524   }
00525 
00526   const char *colorspace;
00527   if (cid->ncolors > 0) {
00528     snprintf(b0, sizeof(b0), "[ /Indexed /DeviceRGB %d %s ]",
00529              cid->ncolors - 1, cid->cmapdatahex);
00530     colorspace = b0;
00531   } else {
00532     switch (cid->spp) {
00533       case 1:
00534         colorspace = "/DeviceGray";
00535         break;
00536       case 3:
00537         colorspace = "/DeviceRGB";
00538         break;
00539       default:
00540         return false;
00541     }
00542   }
00543 
00544   snprintf(b1, sizeof(b1),
00545            "%ld 0 obj\n"
00546            "<<\n"
00547            "  /Length %lu\n"
00548            "  /Subtype /Image\n"
00549            "  /ColorSpace %s\n"
00550            "  /Width %d\n"
00551            "  /Height %d\n"
00552            "  /BitsPerComponent %d\n"
00553            "  /Filter %s\n"
00554            "  /DecodeParms\n"
00555            "  <<\n"
00556            "    /K -1\n"
00557            "    /Columns %d\n"
00558            "  >>\n"
00559            ">>\n"
00560            "stream\n",
00561            objnum, (unsigned long) cid->nbytescomp, colorspace,
00562            cid->w, cid->h, cid->bps, filter, cid->w);
00563   size_t b1_len = strlen(b1);
00564 
00565   snprintf(b2, sizeof(b2),
00566            "\n"
00567            "endstream\n"
00568            "endobj\n");
00569   size_t b2_len = strlen(b2);
00570 
00571   *pdf_object_size = b1_len + cid->nbytescomp + b2_len;
00572   *pdf_object = new char[*pdf_object_size];
00573   if (!pdf_object)
00574     return false;
00575   memcpy(*pdf_object, b1, b1_len);
00576   memcpy(*pdf_object + b1_len, cid->datacomp, cid->nbytescomp);
00577   memcpy(*pdf_object + b1_len + cid->nbytescomp, b2, b2_len);
00578 
00579   return true;
00580 }
00581 
00582 
00583 bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
00584   char buf[kBasicBufSize];
00585   Pix *pix = api->GetInputImage();
00586   char *filename = (char *)api->GetInputName();
00587   int ppi = api->GetSourceYResolution();
00588   if (!pix || ppi <= 0)
00589     return false;
00590   double width = pixGetWidth(pix) * 72.0 / ppi;
00591   double height = pixGetHeight(pix) * 72.0 / ppi;
00592 
00593   // PAGE
00594   snprintf(buf, sizeof(buf),
00595            "%ld 0 obj\n"
00596            "<<\n"
00597            "  /Type /Page\n"
00598            "  /Parent %ld 0 R\n"
00599            "  /MediaBox [0 0 %.2f %.2f]\n"
00600            "  /Contents %ld 0 R\n"
00601            "  /Resources\n"
00602            "  <<\n"
00603            "    /XObject << /Im1 %ld 0 R >>\n"
00604            "    /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
00605            "    /Font << /f-0-0 %ld 0 R >>\n"
00606            "  >>\n"
00607            ">>\n"
00608            "endobj\n",
00609            obj_,
00610            2L,            // Pages object
00611            width,
00612            height,
00613            obj_ + 1,      // Contents object
00614            obj_ + 2,      // Image object
00615            3L);           // Type0 Font
00616   pages_.push_back(obj_);
00617   AppendPDFObject(buf);
00618 
00619   // CONTENTS
00620   char* pdftext = GetPDFTextObjects(api, width, height, imagenum());
00621   long pdftext_len = strlen(pdftext);
00622   unsigned char *pdftext_casted = reinterpret_cast<unsigned char *>(pdftext);
00623   size_t len;
00624   unsigned char *comp_pdftext =
00625       zlibCompress(pdftext_casted,
00626                    pdftext_len,
00627                    &len);
00628   long comp_pdftext_len = len;
00629   snprintf(buf, sizeof(buf),
00630            "%ld 0 obj\n"
00631            "<<\n"
00632            "  /Length %ld /Filter /FlateDecode\n"
00633            ">>\n"
00634            "stream\n", obj_, comp_pdftext_len);
00635   AppendString(buf);
00636   long objsize = strlen(buf);
00637   AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
00638   objsize += comp_pdftext_len;
00639   lept_free(comp_pdftext);
00640 
00641   delete[] pdftext;
00642   snprintf(buf, sizeof(buf),
00643            "endstream\n"
00644            "endobj\n");
00645   AppendString(buf);
00646   objsize += strlen(buf);
00647   AppendPDFObjectDIY(objsize);
00648 
00649   char *pdf_object;
00650   if (!fileToPDFObj(filename, obj_, &pdf_object, &objsize)) {
00651     if (!pixToPDFObj(pix, obj_, &pdf_object, &objsize)) {
00652       return false;
00653     }
00654   }
00655   AppendData(pdf_object, objsize);
00656   AppendPDFObjectDIY(objsize);
00657   delete[] pdf_object;
00658   return true;
00659 }
00660 
00661 
00662 bool TessPDFRenderer::EndDocumentHandler() {
00663   char buf[kBasicBufSize];
00664 
00665   // We reserved the /Pages object number early, so that the /Page
00666   // objects could refer to their parent. We finally have enough
00667   // information to go fill it in. Using lower level calls to manipulate
00668   // the offset record in two spots, because we are placing objects
00669   // out of order in the file.
00670 
00671   // PAGES
00672   const long int kPagesObjectNumber = 2;
00673   offsets_[kPagesObjectNumber] = offsets_.back();  // manipulation #1
00674   snprintf(buf, sizeof(buf),
00675            "%ld 0 obj\n"
00676            "<<\n"
00677            "  /Type /Pages\n"
00678            "  /Kids [ ", kPagesObjectNumber);
00679   AppendString(buf);
00680   size_t pages_objsize  = strlen(buf);
00681   for (size_t i = 0; i < pages_.size(); i++) {
00682     snprintf(buf, sizeof(buf),
00683              "%ld 0 R ", pages_[i]);
00684     AppendString(buf);
00685     pages_objsize += strlen(buf);
00686   }
00687   snprintf(buf, sizeof(buf),
00688            "]\n"
00689            "  /Count %d\n"
00690            ">>\n"
00691            "endobj\n", pages_.size());
00692   AppendString(buf);
00693   pages_objsize += strlen(buf);
00694   offsets_.back() += pages_objsize;    // manipulation #2
00695 
00696   // INFO
00697   char* datestr = l_getFormattedDate();
00698   snprintf(buf, sizeof(buf),
00699            "%ld 0 obj\n"
00700            "<<\n"
00701            "  /Producer (Tesseract %s)\n"
00702            "  /CreationDate (D:%s)\n"
00703            "  /Title (%s)"
00704            ">>\n"
00705            "endobj\n", obj_, VERSION, datestr, title());
00706   lept_free(datestr);
00707   AppendPDFObject(buf);
00708 
00709   snprintf(buf, sizeof(buf),
00710            "xref\n"
00711            "0 %ld\n"
00712            "0000000000 65535 f \n", obj_);
00713   AppendString(buf);
00714   for (int i = 1; i < obj_; i++) {
00715     snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]);
00716     AppendString(buf);
00717   }
00718   snprintf(buf, sizeof(buf),
00719            "trailer\n"
00720            "<<\n"
00721            "  /Size %ld\n"
00722            "  /Root %ld 0 R\n"
00723            "  /Info %ld 0 R\n"
00724            ">>\n"
00725            "startxref\n"
00726            "%ld\n"
00727            "%%%%EOF\n",
00728            obj_,
00729            1L,               // catalog
00730            obj_ - 1,         // info
00731            offsets_.back());
00732 
00733   AppendString(buf);
00734   return true;
00735 }
00736 
00737 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines