tesseract
3.03
|
00001 // Include automatically generated configuration file if running autoconf. 00002 #ifdef HAVE_CONFIG_H 00003 #include "config_auto.h" 00004 #endif 00005 00006 #include "baseapi.h" 00007 #include "renderer.h" 00008 #include "math.h" 00009 #include "strngs.h" 00010 #include "cube_utils.h" 00011 #include "allheaders.h" 00012 00013 #if !defined(VERSION) 00014 #include "version.h" 00015 #endif 00016 00017 #ifdef _MSC_VER 00018 #include "mathfix.h" 00019 #endif 00020 00021 namespace tesseract { 00022 00023 // Use for PDF object fragments. Must be large enough 00024 // to hold a colormap with 256 colors in the verbose 00025 // PDF representation. 00026 const int kBasicBufSize = 2048; 00027 00028 /********************************************************************** 00029 * PDF Renderer interface implementation 00030 **********************************************************************/ 00031 00032 TessPDFRenderer::TessPDFRenderer(const char *datadir) 00033 : TessResultRenderer("PDF", "pdf") { 00034 obj_ = 0; 00035 datadir_ = datadir; 00036 offsets_.push_back(0); 00037 } 00038 00039 void TessPDFRenderer::AppendPDFObjectDIY(size_t objectsize) { 00040 offsets_.push_back(objectsize + offsets_.back()); 00041 obj_++; 00042 } 00043 00044 void TessPDFRenderer::AppendPDFObject(const char *data) { 00045 AppendPDFObjectDIY(strlen(data)); 00046 AppendString((const char *)data); 00047 } 00048 00049 // Helper function to prevent us from accidentaly writing 00050 // scientific notation to an HOCR or PDF file. Besides, three 00051 // decimal points are all you really need. 00052 double prec(double x) { 00053 double kPrecision = 1000.0; 00054 double a = round(x * kPrecision) / kPrecision; 00055 if (a == -0) 00056 return 0; 00057 return a; 00058 } 00059 00060 long dist2(int x1, int y1, int x2, int y2) { 00061 return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1); 00062 } 00063 00064 char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api, 00065 double width, double height, 00066 int page_number) { 00067 double ppi = api->GetSourceYResolution(); 00068 STRING pdf_str(""); 00069 double old_x = 0.0, old_y = 0.0; 00070 int old_pointsize = 0; 00071 00072 // TODO(jbreiden) Slightly cleaner from an abstraction standpoint 00073 // if this were to live inside a separate text object. 00074 pdf_str += "q "; 00075 pdf_str.add_str_double("", prec(width)); 00076 pdf_str += " 0 0 "; 00077 pdf_str.add_str_double("", prec(height)); 00078 pdf_str += " 0 0 cm /Im1 Do Q\n"; 00079 00080 ResultIterator *res_it = api->GetIterator(); 00081 00082 while (!res_it->Empty(RIL_BLOCK)) { 00083 if (res_it->IsAtBeginningOf(RIL_BLOCK)) { 00084 pdf_str += "BT\n3 Tr\n"; // Begin text object, use invisible ink 00085 old_pointsize = 0.0; // Every block will declare its font 00086 } 00087 00088 int line_x1, line_y1, line_x2, line_y2; 00089 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { 00090 res_it->Baseline(RIL_TEXTLINE, 00091 &line_x1, &line_y1, &line_x2, &line_y2); 00092 } 00093 00094 if (res_it->Empty(RIL_WORD)) { 00095 res_it->Next(RIL_WORD); 00096 continue; 00097 } 00098 00099 int word_x1, word_y1, word_x2, word_y2; 00100 res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2); 00101 00102 // The critical one is writing_direction 00103 tesseract::Orientation orientation; 00104 tesseract::WritingDirection writing_direction; 00105 tesseract::TextlineOrder textline_order; 00106 float deskew_angle; 00107 res_it->Orientation(&orientation, &writing_direction, 00108 &textline_order, &deskew_angle); 00109 00110 // Unlike Tesseract, we always want the word baseline in reading order. 00111 if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) { 00112 Swap(&word_x1, &word_x2); 00113 Swap(&word_y1, &word_y2); 00114 } 00115 00116 // Viewers like evince can get really confused during copy-paste 00117 // when the baseline wanders around. I've decided to force every 00118 // word to match the (straight) baseline. The math below is just 00119 // projecting the word origin onto the baseline. All numbers are 00120 // in the native PDF coordinate system, which has the origin in 00121 // the bottom left and the unit is points, which is 1/72 inch. 00122 double word_length; 00123 double x, y; 00124 { 00125 int px = word_x1; 00126 int py = word_y1; 00127 double l2 = dist2(line_x1, line_y1, line_x2, line_y2); 00128 if (l2 == 0) { 00129 x = line_x1; 00130 y = line_y1; 00131 } else { 00132 double t = ((px - line_x2) * (line_x2 - line_x1) + 00133 (py - line_y2) * (line_y2 - line_y1)) / l2; 00134 x = line_x2 + t * (line_x2 - line_x1); 00135 y = line_y2 + t * (line_y2 - line_y1); 00136 } 00137 word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1, 00138 word_x2, word_y2))); 00139 word_length = word_length * 72.0 / ppi; 00140 x = x * 72 / ppi; 00141 y = height - (y * 72.0 / ppi); 00142 } 00143 00144 int pointsize = 0; 00145 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { 00146 // Calculate the rotation angle in the PDF cooordinate system, 00147 // which has the origin in the bottom left. The Tesseract 00148 // coordinate system has the origin in the upper left. 00149 // 00150 // PDF is kind of a like turtle graphics, and we orient the 00151 // turtle (errr... initial cursor position) with an affine 00152 // transformation. 00153 // 00154 // Rotate RTL Translate 00155 // 00156 // [ x' y' 1 ] = [ x y 1 ] [ cos𝜃 -sin𝜃 0 ] [ -1 0 0 ] [ 1 0 0 ] 00157 // [ sin𝜃 cos𝜃 0 ] [ 0 1 0 ] [ 0 1 0 ] 00158 // [ 0 0 1 ] [ 0 0 1 ] [ x y 1 ] 00159 // 00160 double theta = atan2(static_cast<double>(line_y1 - line_y2), 00161 static_cast<double>(line_x2 - line_x1)); 00162 double a, b, c, d; 00163 a = cos(theta); 00164 b = sin(theta); 00165 c = -sin(theta); 00166 d = cos(theta); 00167 switch(writing_direction) { 00168 case WRITING_DIRECTION_RIGHT_TO_LEFT: 00169 a = -a; 00170 b = -b; 00171 c = -c; 00172 break; 00173 case WRITING_DIRECTION_TOP_TO_BOTTOM: 00174 // TODO(jbreiden) Consider switching PDF writing mode to vertical. 00175 break; 00176 default: 00177 break; 00178 } 00179 00180 pdf_str.add_str_double("", prec(a)); // . This affine matrix 00181 pdf_str.add_str_double(" ", prec(b)); // . sets the coordinate 00182 pdf_str.add_str_double(" ", prec(c)); // . system for all 00183 pdf_str.add_str_double(" ", prec(d)); // . text in the entire 00184 pdf_str.add_str_double(" ", prec(x)); // . line. 00185 pdf_str.add_str_double(" ", prec(y)); // . 00186 pdf_str += (" Tm "); // Place cursor absolutely 00187 } else { 00188 double offset = sqrt(static_cast<double>(dist2(old_x, old_y, x, y))); 00189 pdf_str.add_str_double(" ", prec(offset)); // Delta x in pts 00190 pdf_str.add_str_double(" ", 0); // Delta y in pts 00191 pdf_str += (" Td "); // Relative moveto 00192 } 00193 old_x = x; 00194 old_y = y; 00195 00196 // Adjust font size on a per word granularity. Pay attention to 00197 // pointsize, old_pointsize, and pdf_str. 00198 { 00199 bool bold, italic, underlined, monospace, serif, smallcaps; 00200 int font_id; 00201 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, 00202 &serif, &smallcaps, &pointsize, &font_id); 00203 if (pointsize != old_pointsize) { 00204 char textfont[20]; 00205 snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", pointsize); 00206 pdf_str += textfont; 00207 old_pointsize = pointsize; 00208 } 00209 } 00210 00211 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); 00212 bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); 00213 STRING pdf_word(""); 00214 int pdf_word_len = 0; 00215 do { 00216 const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL); 00217 if (grapheme && grapheme[0] != 0) { 00218 // TODO(jbreiden) Do a real UTF-16BE conversion 00219 // http://en.wikipedia.org/wiki/UTF-16#Example_UTF-16_encoding_procedure 00220 string_32 utf32; 00221 CubeUtils::UTF8ToUTF32(grapheme, &utf32); 00222 char utf16[20]; 00223 for (int i = 0; i < static_cast<int>(utf32.length()); i++) { 00224 snprintf(utf16, sizeof(utf16), "<%04X>", utf32[i]); 00225 pdf_word += utf16; 00226 pdf_word_len++; 00227 } 00228 } 00229 delete []grapheme; 00230 res_it->Next(RIL_SYMBOL); 00231 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); 00232 if (word_length > 0 && pdf_word_len > 0 && pointsize > 0) { 00233 double h_stretch = 00234 prec(100.0 * word_length / (pointsize * pdf_word_len)); 00235 pdf_str.add_str_double("", h_stretch); 00236 pdf_str += " Tz"; // horizontal stretch 00237 pdf_str += " [ "; 00238 pdf_str += pdf_word; // UTF-16BE representation 00239 pdf_str += " ] TJ"; // show the text 00240 } 00241 if (last_word_in_line) { 00242 pdf_str += " \n"; 00243 } 00244 if (last_word_in_block) { 00245 pdf_str += "ET\n"; // end the text object 00246 } 00247 } 00248 char *ret = new char[pdf_str.length() + 1]; 00249 strcpy(ret, pdf_str.string()); 00250 delete res_it; 00251 return ret; 00252 } 00253 00254 bool TessPDFRenderer::BeginDocumentHandler() { 00255 fprintf(stderr, "XXX 2"); 00256 00257 char buf[kBasicBufSize]; 00258 00259 snprintf(buf, sizeof(buf), 00260 "%%PDF-1.5\n" 00261 "%%%c%c%c%c\n", 00262 0xDE, 0xAD, 0xBE, 0xEB); 00263 AppendPDFObject(buf); 00264 00265 // CATALOG 00266 snprintf(buf, sizeof(buf), 00267 "1 0 obj\n" 00268 "<<\n" 00269 " /Type /Catalog\n" 00270 " /Pages %ld 0 R\n" 00271 ">>\n" 00272 "endobj\n", 2L); 00273 AppendPDFObject(buf); 00274 00275 // We are reserving object #2 for the /Pages 00276 // object, which I am going to create and write 00277 // at the end of the PDF file. 00278 AppendPDFObject(""); 00279 00280 // TYPE0 FONT 00281 snprintf(buf, sizeof(buf), 00282 "3 0 obj\n" 00283 "<<\n" 00284 " /BaseFont /GlyphLessFont\n" 00285 " /DescendantFonts [ %ld 0 R ]\n" 00286 " /Encoding /Identity-H\n" 00287 " /Subtype /Type0\n" 00288 " /ToUnicode %ld 0 R\n" 00289 " /Type /Font\n" 00290 ">>\n" 00291 "endobj\n", 00292 4L, // CIDFontType2 font 00293 5L // ToUnicode 00294 ); 00295 AppendPDFObject(buf); 00296 00297 // CIDFONTTYPE2 00298 snprintf(buf, sizeof(buf), 00299 "4 0 obj\n" 00300 "<<\n" 00301 " /BaseFont /GlyphLessFont\n" 00302 " /CIDToGIDMap /Identity\n" 00303 " /CIDSystemInfo\n" 00304 " <<\n" 00305 " /Ordering (Identity)\n" 00306 " /Registry (Adobe)\n" 00307 " /Supplement 0\n" 00308 " >>\n" 00309 " /FontDescriptor %ld 0 R\n" 00310 " /Subtype /CIDFontType2\n" 00311 " /Type /Font\n" 00312 " /DW 1000\n" 00313 ">>\n" 00314 "endobj\n", 00315 6L // Font descriptor 00316 ); 00317 AppendPDFObject(buf); 00318 00319 const char *stream = 00320 "/CIDInit /ProcSet findresource begin\n" 00321 "12 dict begin\n" 00322 "begincmap\n" 00323 "/CIDSystemInfo\n" 00324 "<<\n" 00325 " /Registry (Adobe)\n" 00326 " /Ordering (UCS)\n" 00327 " /Supplement 0\n" 00328 ">> def\n" 00329 "/CMapName /Adobe-Identify-UCS def\n" 00330 "/CMapType 2 def\n" 00331 "1 begincodespacerange\n" 00332 "<0000> <FFFF>\n" 00333 "endcodespacerange\n" 00334 "1 beginbfrange\n" 00335 "<0000> <FFFF> <0000>\n" 00336 "endbfrange\n" 00337 "endcmap\n" 00338 "CMapName currentdict /CMap defineresource pop\n" 00339 "end\n" 00340 "end\n"; 00341 00342 // TOUNICODE 00343 snprintf(buf, sizeof(buf), 00344 "5 0 obj\n" 00345 "<< /Length %lu >>\n" 00346 "stream\n" 00347 "%s" 00348 "endstream\n" 00349 "endobj\n", (unsigned long) strlen(stream), stream); 00350 AppendPDFObject(buf); 00351 00352 // TODO(jbreiden) Fix the FontBBox entry. And of course make 00353 // the font data match the descriptor. 00354 // FONT DESCRIPTOR 00355 snprintf(buf, sizeof(buf), 00356 "6 0 obj\n" 00357 "<<\n" 00358 " /Ascent 1000\n" 00359 " /CapHeight 1000\n" 00360 " /Descent 0\n" // Nothing goes below baseline 00361 " /Flags 4\n" 00362 " /FontBBox [ 0 0 1000 1000 ]\n" 00363 " /FontFile2 %ld 0 R\n" 00364 " /FontName /GlyphLessFont\n" 00365 " /ItalicAngle 0\n" 00366 " /StemV 80\n" 00367 " /Type /FontDescriptor\n" 00368 ">>\n" 00369 "endobj\n", 00370 7L // Font data 00371 ); 00372 AppendPDFObject(buf); 00373 00374 snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_); 00375 FILE *fp = fopen(buf, "rb"); 00376 if (!fp) 00377 return false; 00378 fseek(fp, 0, SEEK_END); 00379 long int size = ftell(fp); 00380 fseek(fp, 0, SEEK_SET); 00381 char *buffer = new char[size]; 00382 fread(buffer, 1, size, fp); 00383 fclose(fp); 00384 // FONTFILE2 00385 snprintf(buf, sizeof(buf), 00386 "7 0 obj\n" 00387 "<<\n" 00388 " /Length %ld\n" 00389 " /Length1 %ld\n" 00390 ">>\n" 00391 "stream\n", size, size); 00392 AppendString(buf); 00393 size_t objsize = strlen(buf); 00394 AppendData(buffer, size); 00395 objsize += size; 00396 snprintf(buf, sizeof(buf), 00397 "endstream\n" 00398 "endobj\n"); 00399 AppendString(buf); 00400 objsize += strlen(buf); 00401 AppendPDFObjectDIY(objsize); 00402 return true; 00403 } 00404 00405 // TODO(jbreiden) I hear that you can pull the flate stream out 00406 // of a PNG file and, by mentioning the predictor in the PDF object, 00407 // make most of them work without transcoding. If so that's a big win 00408 // versus what we do now. Try it out. 00409 bool TessPDFRenderer::fileToPDFObj(char *filename, long int objnum, 00410 char **pdf_object, 00411 long int *pdf_object_size) { 00412 char b1[kBasicBufSize]; 00413 char b2[kBasicBufSize]; 00414 if (!pdf_object_size || !pdf_object) 00415 return false; 00416 *pdf_object = NULL; 00417 *pdf_object_size = 0; 00418 if (!filename) 00419 return false; 00420 FILE *fp = fopen(filename, "rb"); 00421 if (!fp) 00422 return false; 00423 int format; 00424 00425 findFileFormatStream(fp, &format); 00426 if (format != IFF_JFIF_JPEG) { 00427 fclose(fp); 00428 return false; 00429 } 00430 00431 fseek(fp, 0, SEEK_END); 00432 long int jpeg_size = ftell(fp); 00433 fseek(fp, 0, SEEK_SET); 00434 00435 int spp, cmyk, w, h; 00436 freadHeaderJpeg(fp, &w, &h, &spp, NULL, &cmyk); 00437 const char *colorspace; 00438 switch (spp) { 00439 case 1: 00440 colorspace = "/DeviceGray"; 00441 break; 00442 case 3: 00443 colorspace = "/DeviceRGB"; 00444 break; 00445 case 4: 00446 if (cmyk) 00447 colorspace = "/DeviceCMYK"; 00448 else 00449 return false; 00450 break; 00451 default: 00452 return false; 00453 } 00454 00455 // IMAGE 00456 snprintf(b1, sizeof(b1), 00457 "%ld 0 obj\n" 00458 "<<\n" 00459 " /Length %ld\n" 00460 " /Subtype /Image\n" 00461 " /ColorSpace %s\n" 00462 " /Width %d\n" 00463 " /Height %d\n" 00464 " /BitsPerComponent 8\n" 00465 " /Filter /DCTDecode\n" 00466 ">>\n" 00467 "stream\n", objnum, jpeg_size, 00468 colorspace, w, h); 00469 size_t b1_len = strlen(b1); 00470 00471 snprintf(b2, sizeof(b2), 00472 "\n" 00473 "endstream\n" 00474 "endobj\n"); 00475 size_t b2_len = strlen(b2); 00476 00477 *pdf_object_size = b1_len + jpeg_size + b2_len; 00478 *pdf_object = new char[*pdf_object_size]; 00479 if (!pdf_object) 00480 return false; 00481 memcpy(*pdf_object, b1, b1_len); 00482 if (static_cast<int>(fread(*pdf_object + b1_len, 1, jpeg_size, fp)) != 00483 jpeg_size) { 00484 delete[] pdf_object; 00485 return false; 00486 } 00487 memcpy(*pdf_object + b1_len + jpeg_size, b2, b2_len); 00488 fclose(fp); 00489 fprintf(stderr, "XXX 1"); 00490 return true; 00491 } 00492 00493 bool TessPDFRenderer::pixToPDFObj(Pix *pix, long int objnum, 00494 char **pdf_object, 00495 long int *pdf_object_size) { 00496 if (!pdf_object_size || !pdf_object) 00497 return false; 00498 *pdf_object = NULL; 00499 *pdf_object_size = 0; 00500 char b0[kBasicBufSize]; 00501 char b1[kBasicBufSize * 2]; 00502 char b2[kBasicBufSize]; 00503 L_COMP_DATA *cid; 00504 int encoding_type; 00505 const int kJpegQuality = 85; 00506 if (selectDefaultPdfEncoding(pix, &encoding_type) != 0) 00507 return false; 00508 if (pixGenerateCIData(pix, encoding_type, kJpegQuality, 0, &cid) != 0) 00509 return false; 00510 00511 const char *filter; 00512 switch(encoding_type) { 00513 case L_FLATE_ENCODE: 00514 filter = "/FlateDecode"; 00515 break; 00516 case L_JPEG_ENCODE: 00517 filter = "/DCTDecode"; 00518 break; 00519 case L_G4_ENCODE: 00520 filter = "/CCITTFaxDecode"; 00521 break; 00522 default: 00523 return false; 00524 } 00525 00526 const char *colorspace; 00527 if (cid->ncolors > 0) { 00528 snprintf(b0, sizeof(b0), "[ /Indexed /DeviceRGB %d %s ]", 00529 cid->ncolors - 1, cid->cmapdatahex); 00530 colorspace = b0; 00531 } else { 00532 switch (cid->spp) { 00533 case 1: 00534 colorspace = "/DeviceGray"; 00535 break; 00536 case 3: 00537 colorspace = "/DeviceRGB"; 00538 break; 00539 default: 00540 return false; 00541 } 00542 } 00543 00544 snprintf(b1, sizeof(b1), 00545 "%ld 0 obj\n" 00546 "<<\n" 00547 " /Length %lu\n" 00548 " /Subtype /Image\n" 00549 " /ColorSpace %s\n" 00550 " /Width %d\n" 00551 " /Height %d\n" 00552 " /BitsPerComponent %d\n" 00553 " /Filter %s\n" 00554 " /DecodeParms\n" 00555 " <<\n" 00556 " /K -1\n" 00557 " /Columns %d\n" 00558 " >>\n" 00559 ">>\n" 00560 "stream\n", 00561 objnum, (unsigned long) cid->nbytescomp, colorspace, 00562 cid->w, cid->h, cid->bps, filter, cid->w); 00563 size_t b1_len = strlen(b1); 00564 00565 snprintf(b2, sizeof(b2), 00566 "\n" 00567 "endstream\n" 00568 "endobj\n"); 00569 size_t b2_len = strlen(b2); 00570 00571 *pdf_object_size = b1_len + cid->nbytescomp + b2_len; 00572 *pdf_object = new char[*pdf_object_size]; 00573 if (!pdf_object) 00574 return false; 00575 memcpy(*pdf_object, b1, b1_len); 00576 memcpy(*pdf_object + b1_len, cid->datacomp, cid->nbytescomp); 00577 memcpy(*pdf_object + b1_len + cid->nbytescomp, b2, b2_len); 00578 00579 return true; 00580 } 00581 00582 00583 bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) { 00584 char buf[kBasicBufSize]; 00585 Pix *pix = api->GetInputImage(); 00586 char *filename = (char *)api->GetInputName(); 00587 int ppi = api->GetSourceYResolution(); 00588 if (!pix || ppi <= 0) 00589 return false; 00590 double width = pixGetWidth(pix) * 72.0 / ppi; 00591 double height = pixGetHeight(pix) * 72.0 / ppi; 00592 00593 // PAGE 00594 snprintf(buf, sizeof(buf), 00595 "%ld 0 obj\n" 00596 "<<\n" 00597 " /Type /Page\n" 00598 " /Parent %ld 0 R\n" 00599 " /MediaBox [0 0 %.2f %.2f]\n" 00600 " /Contents %ld 0 R\n" 00601 " /Resources\n" 00602 " <<\n" 00603 " /XObject << /Im1 %ld 0 R >>\n" 00604 " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n" 00605 " /Font << /f-0-0 %ld 0 R >>\n" 00606 " >>\n" 00607 ">>\n" 00608 "endobj\n", 00609 obj_, 00610 2L, // Pages object 00611 width, 00612 height, 00613 obj_ + 1, // Contents object 00614 obj_ + 2, // Image object 00615 3L); // Type0 Font 00616 pages_.push_back(obj_); 00617 AppendPDFObject(buf); 00618 00619 // CONTENTS 00620 char* pdftext = GetPDFTextObjects(api, width, height, imagenum()); 00621 long pdftext_len = strlen(pdftext); 00622 unsigned char *pdftext_casted = reinterpret_cast<unsigned char *>(pdftext); 00623 size_t len; 00624 unsigned char *comp_pdftext = 00625 zlibCompress(pdftext_casted, 00626 pdftext_len, 00627 &len); 00628 long comp_pdftext_len = len; 00629 snprintf(buf, sizeof(buf), 00630 "%ld 0 obj\n" 00631 "<<\n" 00632 " /Length %ld /Filter /FlateDecode\n" 00633 ">>\n" 00634 "stream\n", obj_, comp_pdftext_len); 00635 AppendString(buf); 00636 long objsize = strlen(buf); 00637 AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len); 00638 objsize += comp_pdftext_len; 00639 lept_free(comp_pdftext); 00640 00641 delete[] pdftext; 00642 snprintf(buf, sizeof(buf), 00643 "endstream\n" 00644 "endobj\n"); 00645 AppendString(buf); 00646 objsize += strlen(buf); 00647 AppendPDFObjectDIY(objsize); 00648 00649 char *pdf_object; 00650 if (!fileToPDFObj(filename, obj_, &pdf_object, &objsize)) { 00651 if (!pixToPDFObj(pix, obj_, &pdf_object, &objsize)) { 00652 return false; 00653 } 00654 } 00655 AppendData(pdf_object, objsize); 00656 AppendPDFObjectDIY(objsize); 00657 delete[] pdf_object; 00658 return true; 00659 } 00660 00661 00662 bool TessPDFRenderer::EndDocumentHandler() { 00663 char buf[kBasicBufSize]; 00664 00665 // We reserved the /Pages object number early, so that the /Page 00666 // objects could refer to their parent. We finally have enough 00667 // information to go fill it in. Using lower level calls to manipulate 00668 // the offset record in two spots, because we are placing objects 00669 // out of order in the file. 00670 00671 // PAGES 00672 const long int kPagesObjectNumber = 2; 00673 offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1 00674 snprintf(buf, sizeof(buf), 00675 "%ld 0 obj\n" 00676 "<<\n" 00677 " /Type /Pages\n" 00678 " /Kids [ ", kPagesObjectNumber); 00679 AppendString(buf); 00680 size_t pages_objsize = strlen(buf); 00681 for (size_t i = 0; i < pages_.size(); i++) { 00682 snprintf(buf, sizeof(buf), 00683 "%ld 0 R ", pages_[i]); 00684 AppendString(buf); 00685 pages_objsize += strlen(buf); 00686 } 00687 snprintf(buf, sizeof(buf), 00688 "]\n" 00689 " /Count %d\n" 00690 ">>\n" 00691 "endobj\n", pages_.size()); 00692 AppendString(buf); 00693 pages_objsize += strlen(buf); 00694 offsets_.back() += pages_objsize; // manipulation #2 00695 00696 // INFO 00697 char* datestr = l_getFormattedDate(); 00698 snprintf(buf, sizeof(buf), 00699 "%ld 0 obj\n" 00700 "<<\n" 00701 " /Producer (Tesseract %s)\n" 00702 " /CreationDate (D:%s)\n" 00703 " /Title (%s)" 00704 ">>\n" 00705 "endobj\n", obj_, VERSION, datestr, title()); 00706 lept_free(datestr); 00707 AppendPDFObject(buf); 00708 00709 snprintf(buf, sizeof(buf), 00710 "xref\n" 00711 "0 %ld\n" 00712 "0000000000 65535 f \n", obj_); 00713 AppendString(buf); 00714 for (int i = 1; i < obj_; i++) { 00715 snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]); 00716 AppendString(buf); 00717 } 00718 snprintf(buf, sizeof(buf), 00719 "trailer\n" 00720 "<<\n" 00721 " /Size %ld\n" 00722 " /Root %ld 0 R\n" 00723 " /Info %ld 0 R\n" 00724 ">>\n" 00725 "startxref\n" 00726 "%ld\n" 00727 "%%%%EOF\n", 00728 obj_, 00729 1L, // catalog 00730 obj_ - 1, // info 00731 offsets_.back()); 00732 00733 AppendString(buf); 00734 return true; 00735 } 00736 00737 } // namespace tesseract