tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/training/stringrenderer.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        stringrenderer.cpp
00003  * Description: Class for rendering UTF-8 text to an image, and retrieving
00004  *              bounding boxes around each grapheme cluster.
00005  * Author:      Ranjith Unnikrishnan
00006  * Created:     Mon Nov 18 2013
00007  *
00008  * (C) Copyright 2013, Google Inc.
00009  * Licensed under the Apache License, Version 2.0 (the "License");
00010  * you may not use this file except in compliance with the License.
00011  * You may obtain a copy of the License at
00012  * http://www.apache.org/licenses/LICENSE-2.0
00013  * Unless required by applicable law or agreed to in writing, software
00014  * distributed under the License is distributed on an "AS IS" BASIS,
00015  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016  * See the License for the specific language governing permissions and
00017  * limitations under the License.
00018  *
00019  **********************************************************************/
00020 
00021 #include "stringrenderer.h"
00022 
00023 #include <stdio.h>
00024 #include <string.h>
00025 #include <algorithm>
00026 #include <map>
00027 #include <utility>
00028 #include <vector>
00029 
00030 #include "allheaders.h"     // from leptonica
00031 #include "boxchar.h"
00032 #include "ligature_table.h"
00033 #include "normstrngs.h"
00034 #include "pango/pango-font.h"
00035 #include "pango/pango-glyph-item.h"
00036 #include "tlog.h"
00037 #include "unichar.h"
00038 #include "unicode/uchar.h"  // from libicu
00039 #include "util.h"
00040 
00041 #ifndef USE_STD_NAMESPACE
00042 // Compatibility with pango 1.20.
00043 #include "pango/pango-glyph-item-private.h"
00044 #define pango_glyph_item_iter_init_start _pango_glyph_item_iter_init_start
00045 #define pango_glyph_item_iter_next_cluster _pango_glyph_item_iter_next_cluster
00046 #else
00047 using std::map;
00048 using std::max;
00049 using std::min;
00050 using std::swap;
00051 #endif
00052 
00053 namespace tesseract {
00054 
00055 static const int kDefaultOutputResolution = 300;
00056 
00057 // Word joiner (U+2060) inserted after letters in ngram mode, as per
00058 // recommendation in http://unicode.org/reports/tr14/ to avoid line-breaks at
00059 // hyphens and other non-alpha characters.
00060 static const char* kWordJoinerUTF8 = "\u2060";
00061 static const char32 kWordJoiner = 0x2060;
00062 
00063 static bool IsCombiner(int ch) {
00064   const int char_type = u_charType(ch);
00065   return ((char_type == U_NON_SPACING_MARK) ||
00066           (char_type == U_ENCLOSING_MARK) ||
00067           (char_type == U_COMBINING_SPACING_MARK));
00068 }
00069 
00070 static string EncodeAsUTF8(const char32 ch32) {
00071   UNICHAR uni_ch(ch32);
00072   return string(uni_ch.utf8(), uni_ch.utf8_len());
00073 }
00074 
00075 
00076 /* static */
00077 Pix* CairoARGB32ToPixFormat(cairo_surface_t *surface) {
00078   if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
00079     printf("Unexpected surface format %d\n",
00080            cairo_image_surface_get_format(surface));
00081     return NULL;
00082   }
00083   const int width = cairo_image_surface_get_width(surface);
00084   const int height = cairo_image_surface_get_height(surface);
00085   Pix* pix = pixCreate(width, height, 32);
00086   int byte_stride = cairo_image_surface_get_stride(surface);
00087 
00088   for (int i = 0; i < height; ++i) {
00089     memcpy(reinterpret_cast<unsigned char*>(pix->data + i * pix->wpl) + 1,
00090            cairo_image_surface_get_data(surface) + i * byte_stride,
00091            byte_stride - ((i == height - 1) ? 1 : 0));
00092   }
00093   return pix;
00094 }
00095 
00096 StringRenderer::StringRenderer(const string& font_desc, int page_width,
00097                                int page_height) :
00098     page_width_(page_width), page_height_(page_height),
00099     h_margin_(50), v_margin_(50), char_spacing_(0), leading_(0),
00100     vertical_text_(false), gravity_hint_strong_(false),
00101     render_fullwidth_latin_(false) ,drop_uncovered_chars_(true),
00102     strip_unrenderable_words_(false), add_ligatures_(false),
00103     output_word_boxes_(false), surface_(NULL), cr_(NULL),
00104     layout_(NULL), start_box_(0), page_(0), box_padding_(0),
00105     total_chars_(0), font_index_(0), last_offset_(0) {
00106   pen_color_[0] = 0.0;
00107   pen_color_[1] = 0.0;
00108   pen_color_[2] = 0.0;
00109   set_font(font_desc);
00110   set_resolution(kDefaultOutputResolution);
00111   page_boxes_ = NULL;
00112 }
00113 
00114 bool StringRenderer::set_font(const string& desc) {
00115   bool success = font_.ParseFontDescriptionName(desc);
00116   font_.set_resolution(resolution_);
00117   return success;
00118 }
00119 
00120 void StringRenderer::set_resolution(const int resolution) {
00121   resolution_ = resolution;
00122   font_.set_resolution(resolution);
00123 }
00124 
00125 StringRenderer::~StringRenderer() {
00126   ClearBoxes();
00127   FreePangoCairo();
00128 }
00129 
00130 void StringRenderer::InitPangoCairo() {
00131   FreePangoCairo();
00132   surface_ = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, page_width_,
00133                                         page_height_);
00134   cr_ = cairo_create(surface_);
00135   layout_ = pango_cairo_create_layout(cr_);
00136 
00137   if (vertical_text_) {
00138     PangoContext* context = pango_layout_get_context(layout_);
00139     pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST);
00140     if (gravity_hint_strong_) {
00141       pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG);
00142     }
00143     pango_layout_context_changed(layout_);
00144   }
00145 
00146   SetLayoutProperties();
00147 }
00148 
00149 void StringRenderer::SetLayoutProperties() {
00150   string font_desc = font_.DescriptionName();
00151   // Specify the font via a description name
00152   PangoFontDescription *desc =
00153       pango_font_description_from_string(font_desc.c_str());
00154   // Assign the font description to the layout
00155   pango_layout_set_font_description(layout_, desc);
00156   pango_font_description_free(desc);  // free the description
00157   pango_cairo_context_set_resolution(pango_layout_get_context(layout_),
00158                                      resolution_);
00159 
00160   int max_width = page_width_ - 2 * h_margin_;
00161   int max_height = page_height_ - 2 * v_margin_;
00162   tlog(3, "max_width = %d, max_height = %d\n", max_width, max_height);
00163   if (vertical_text_) {
00164     swap(max_width, max_height);
00165   }
00166   pango_layout_set_width(layout_, max_width * PANGO_SCALE);
00167   pango_layout_set_wrap(layout_, PANGO_WRAP_WORD);
00168 
00169   // Adjust character spacing
00170   PangoAttrList* attr_list = pango_attr_list_new();
00171   if (char_spacing_) {
00172     PangoAttribute* spacing_attr = pango_attr_letter_spacing_new(
00173         static_cast<int>(char_spacing_ * PANGO_SCALE + 0.5));
00174     spacing_attr->start_index = 0;
00175     spacing_attr->end_index = static_cast<guint>(-1);
00176     pango_attr_list_change(attr_list, spacing_attr);
00177   }
00178   pango_layout_set_attributes(layout_, attr_list);
00179   pango_attr_list_unref(attr_list);
00180   // Adjust line spacing
00181   if (leading_) {
00182     pango_layout_set_spacing(layout_, leading_ * PANGO_SCALE);
00183   }
00184 }
00185 
00186 void StringRenderer::FreePangoCairo() {
00187   if (layout_) {
00188     g_object_unref(layout_);
00189     layout_ = NULL;
00190   }
00191   if (cr_) {
00192     cairo_destroy(cr_);
00193     cr_ = NULL;
00194   }
00195   if (surface_) {
00196     cairo_surface_destroy(surface_);
00197     surface_ = NULL;
00198   }
00199 }
00200 
00201 
00202 // Returns offset in utf8 bytes to first page.
00203 int StringRenderer::FindFirstPageBreakOffset(const char* text,
00204                                              int text_length) {
00205   if (!text_length) return 0;
00206   const int max_height = (page_height_ - 2 * v_margin_);
00207   const int max_width = (page_width_ - 2 * h_margin_);
00208   const int max_layout_height = vertical_text_ ? max_width : max_height;
00209 
00210   UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
00211   const UNICHAR::const_iterator it_end = UNICHAR::end(text, text_length);
00212   const int kMaxUnicodeBufLength = 15000;
00213   for (int i = 0; i < kMaxUnicodeBufLength && it != it_end; ++it, ++i);
00214   int buf_length = it.utf8_data() - text;
00215   tlog(1, "len = %d  buf_len = %d\n", text_length, buf_length);
00216   pango_layout_set_text(layout_, text, buf_length);
00217 
00218   PangoLayoutIter* line_iter = NULL;
00219   { // Fontconfig caches some info here that is not freed before exit.
00220     DISABLE_HEAP_LEAK_CHECK;
00221     line_iter = pango_layout_get_iter(layout_);
00222   }
00223   bool first_page = true;
00224   int page_top = 0;
00225   int offset = buf_length;
00226   do {
00227     // Get bounding box of the current line
00228     PangoRectangle line_ink_rect;
00229     pango_layout_iter_get_line_extents(line_iter, &line_ink_rect, NULL);
00230     pango_extents_to_pixels(&line_ink_rect, NULL);
00231     PangoLayoutLine* line = pango_layout_iter_get_line_readonly(line_iter);
00232     if (first_page) {
00233       page_top = line_ink_rect.y;
00234       first_page = false;
00235     }
00236     int line_bottom = line_ink_rect.y + line_ink_rect.height;
00237     if (line_bottom - page_top > max_layout_height) {
00238       offset = line->start_index;
00239       tlog(1, "Found offset = %d\n", offset);
00240       break;
00241     }
00242   } while (pango_layout_iter_next_line(line_iter));
00243   pango_layout_iter_free(line_iter);
00244   return offset;
00245 }
00246 
00247 const vector<BoxChar*>& StringRenderer::GetBoxes() const {
00248     return boxchars_;
00249 }
00250 
00251 Boxa* StringRenderer::GetPageBoxes() const {
00252     return page_boxes_;
00253 }
00254 
00255 void StringRenderer::RotatePageBoxes(float rotation) {
00256   BoxChar::RotateBoxes(rotation, page_width_ / 2, page_height_ / 2,
00257                        start_box_, boxchars_.size(), &boxchars_);
00258 }
00259 
00260 
00261 void StringRenderer::ClearBoxes() {
00262   for (int i = 0; i < boxchars_.size(); ++i)
00263     delete boxchars_[i];
00264   boxchars_.clear();
00265   boxaDestroy(&page_boxes_);
00266 }
00267 
00268 void StringRenderer::WriteAllBoxes(const string& filename) const {
00269   BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_);
00270 }
00271 
00272 // Returns cluster strings in logical order.
00273 bool StringRenderer::GetClusterStrings(vector<string>* cluster_text) {
00274   map<int, string> start_byte_to_text;
00275   PangoLayoutIter* run_iter = pango_layout_get_iter(layout_);
00276   const char* full_text = pango_layout_get_text(layout_);
00277   do {
00278     PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
00279     if (!run) {
00280       // End of line NULL run marker
00281       tlog(2, "Found end of line marker\n");
00282       continue;
00283     }
00284     PangoGlyphItemIter cluster_iter;
00285     gboolean have_cluster;
00286     for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
00287                                                           run, full_text);
00288          have_cluster;
00289          have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
00290       const int start_byte_index = cluster_iter.start_index;
00291       const int end_byte_index = cluster_iter.end_index;
00292       string text = string(full_text + start_byte_index,
00293                            end_byte_index - start_byte_index);
00294       if (IsUTF8Whitespace(text.c_str())) {
00295         tlog(2, "Found whitespace\n");
00296         text = " ";
00297       }
00298       tlog(2, "start_byte=%d end_byte=%d : '%s'\n", start_byte_index,
00299            end_byte_index, text.c_str());
00300       if (add_ligatures_) {
00301         // Make sure the output box files have ligatured text in case the font
00302         // decided to use an unmapped glyph.
00303         text = LigatureTable::Get()->AddLigatures(text, NULL);
00304       }
00305       start_byte_to_text[start_byte_index] = text;
00306     }
00307   } while (pango_layout_iter_next_run(run_iter));
00308   pango_layout_iter_free(run_iter);
00309 
00310   cluster_text->clear();
00311   for (map<int, string>::const_iterator it = start_byte_to_text.begin();
00312        it != start_byte_to_text.end(); ++it) {
00313     cluster_text->push_back(it->second);
00314   }
00315   return cluster_text->size();
00316 }
00317 
00318 // Merges an array of BoxChars into words based on the identification of
00319 // BoxChars containing the space character as inter-word separators.
00320 //
00321 // Sometime two adjacent characters in the sequence may be detected as lying on
00322 // different lines based on their spatial positions. This may be the result of a
00323 // newline character at end of the last word on a line in the source text, or of
00324 // a discretionary line-break created by Pango at intra-word locations like
00325 // hyphens. When this is detected the word is split at that location into
00326 // multiple BoxChars. Otherwise, each resulting BoxChar will contain a word and
00327 // its bounding box.
00328 static void MergeBoxCharsToWords(vector<BoxChar*>* boxchars) {
00329   vector<BoxChar*> result;
00330   bool started_word = false;
00331   for (int i = 0; i < boxchars->size(); ++i) {
00332     if (boxchars->at(i)->ch() == " " ||
00333         boxchars->at(i)->box() == NULL) {
00334       result.push_back(boxchars->at(i));
00335       boxchars->at(i) = NULL;
00336       started_word = false;
00337       continue;
00338     }
00339 
00340     if (!started_word) {
00341       // Begin new word
00342       started_word = true;
00343       result.push_back(boxchars->at(i));
00344       boxchars->at(i) = NULL;
00345     } else {
00346       BoxChar* last_boxchar = result.back();
00347       // Compute bounding box union
00348       const Box* box = boxchars->at(i)->box();
00349       Box* last_box = last_boxchar->mutable_box();
00350       int left = min(last_box->x, box->x);
00351       int right = max(last_box->x + last_box->w, box->x + box->w);
00352       int top = min(last_box->y, box->y);
00353       int bottom = max(last_box->y + last_box->h, box->y + box->h);
00354       // Conclude that the word was broken to span multiple lines based on the
00355       // size of the merged bounding box in relation to those of the individual
00356       // characters seen so far.
00357       if (right - left > last_box->w + 5 * box->w) {
00358         tlog(1, "Found line break after '%s'", last_boxchar->ch().c_str());
00359         // Insert a fake interword space and start a new word with the current
00360         // boxchar.
00361         result.push_back(new BoxChar(" ", 1));
00362         result.push_back(boxchars->at(i));
00363         boxchars->at(i) = NULL;
00364         continue;
00365       }
00366       // Append to last word
00367       last_boxchar->mutable_ch()->append(boxchars->at(i)->ch());
00368       last_box->x = left;
00369       last_box->w = right - left;
00370       last_box->y = top;
00371       last_box->h = bottom - top;
00372       delete boxchars->at(i);
00373       boxchars->at(i) = NULL;
00374     }
00375   }
00376   boxchars->swap(result);
00377 }
00378 
00379 
00380 void StringRenderer::ComputeClusterBoxes() {
00381   const char* text = pango_layout_get_text(layout_);
00382   PangoLayoutIter* cluster_iter = pango_layout_get_iter(layout_);
00383 
00384   // Do a first pass to store cluster start indexes.
00385   vector<int> cluster_start_indices;
00386   do {
00387     cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter));
00388     tlog(3, "Added %d\n", cluster_start_indices.back());
00389   } while (pango_layout_iter_next_cluster(cluster_iter));
00390   pango_layout_iter_free(cluster_iter);
00391   cluster_start_indices.push_back(strlen(text));
00392   tlog(3, "Added last index %d\n", cluster_start_indices.back());
00393   // Sort the indices and create a map from start to end indices.
00394   sort(cluster_start_indices.begin(), cluster_start_indices.end());
00395   map<int, int> cluster_start_to_end_index;
00396   for (int i = 0; i < cluster_start_indices.size() - 1; ++i) {
00397     cluster_start_to_end_index[cluster_start_indices[i]]
00398         = cluster_start_indices[i + 1];
00399   }
00400 
00401   // Iterate again to compute cluster boxes and their text with the obtained
00402   // cluster extent information.
00403   cluster_iter = pango_layout_get_iter(layout_);
00404   // Store BoxChars* sorted by their byte start positions
00405   map<int, BoxChar*> start_byte_to_box;
00406   do {
00407     PangoRectangle cluster_rect;
00408     pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect,
00409                                           NULL);
00410     pango_extents_to_pixels(&cluster_rect, NULL);
00411     const int start_byte_index = pango_layout_iter_get_index(cluster_iter);
00412     const int end_byte_index = cluster_start_to_end_index[start_byte_index];
00413     string cluster_text = string(text + start_byte_index,
00414                                  end_byte_index - start_byte_index);
00415     if (cluster_text.size() && cluster_text[0] == '\n') {
00416       tlog(2, "Skipping newlines at start of text.\n");
00417       continue;
00418     }
00419     if (!cluster_rect.width || !cluster_rect.height ||
00420         IsUTF8Whitespace(cluster_text.c_str())) {
00421       tlog(2, "Skipping whitespace with boxdim (%d,%d) '%s'\n",
00422            cluster_rect.width, cluster_rect.height, cluster_text.c_str());
00423       BoxChar* boxchar = new BoxChar(" ", 1);
00424       boxchar->set_page(page_);
00425       start_byte_to_box[start_byte_index] = boxchar;
00426       continue;
00427     }
00428     // Prepare a boxchar for addition at this byte position.
00429     tlog(2, "[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n",
00430          cluster_rect.x, cluster_rect.y,
00431          cluster_rect.width, cluster_rect.height,
00432          start_byte_index, end_byte_index,
00433          cluster_text.c_str());
00434     ASSERT_HOST_MSG(cluster_rect.width,
00435                     "cluster_text:%s  start_byte_index:%d\n",
00436                     cluster_text.c_str(), start_byte_index);
00437     ASSERT_HOST_MSG(cluster_rect.height,
00438                     "cluster_text:%s  start_byte_index:%d\n",
00439                     cluster_text.c_str(), start_byte_index);
00440     if (box_padding_) {
00441       cluster_rect.x = max(0, cluster_rect.x - box_padding_);
00442       cluster_rect.width += 2 * box_padding_;
00443       cluster_rect.y = max(0, cluster_rect.y - box_padding_);
00444       cluster_rect.height += 2 * box_padding_;
00445     }
00446     if (add_ligatures_) {
00447       // Make sure the output box files have ligatured text in case the font
00448       // decided to use an unmapped glyph.
00449       cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, NULL);
00450     }
00451     BoxChar* boxchar = new BoxChar(cluster_text.c_str(), cluster_text.size());
00452     boxchar->set_page(page_);
00453     boxchar->AddBox(cluster_rect.x, cluster_rect.y,
00454                     cluster_rect.width, cluster_rect.height);
00455     start_byte_to_box[start_byte_index] = boxchar;
00456   } while (pango_layout_iter_next_cluster(cluster_iter));
00457   pango_layout_iter_free(cluster_iter);
00458 
00459   // There is a subtle bug in the cluster text reported by the PangoLayoutIter
00460   // on ligatured characters (eg. The word "Lam-Aliph" in arabic). To work
00461   // around this, we use text reported using the PangoGlyphIter which is
00462   // accurate.
00463   // TODO(ranjith): Revisit whether this is still needed in newer versions of
00464   // pango.
00465   vector<string> cluster_text;
00466   if (GetClusterStrings(&cluster_text)) {
00467     ASSERT_HOST(cluster_text.size() == start_byte_to_box.size());
00468     int ind = 0;
00469     for (map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
00470          it != start_byte_to_box.end(); ++it, ++ind) {
00471       it->second->mutable_ch()->swap(cluster_text[ind]);
00472     }
00473   }
00474 
00475   // Append to the boxchars list in byte order.
00476   vector<BoxChar*> page_boxchars;
00477   page_boxchars.reserve(start_byte_to_box.size());
00478   string last_ch;
00479   for (map<int, BoxChar*>::const_iterator it = start_byte_to_box.begin();
00480        it != start_byte_to_box.end(); ++it) {
00481     if (it->second->ch() == kWordJoinerUTF8) {
00482       // Skip zero-width joiner characters (ZWJs) here.
00483       delete it->second;
00484     } else {
00485       page_boxchars.push_back(it->second);
00486     }
00487   }
00488   CorrectBoxPositionsToLayout(&page_boxchars);
00489 
00490   if (render_fullwidth_latin_) {
00491     for (map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
00492          it != start_byte_to_box.end(); ++it) {
00493       // Convert fullwidth Latin characters to their halfwidth forms.
00494       string half(ConvertFullwidthLatinToBasicLatin(it->second->ch()));
00495       it->second->mutable_ch()->swap(half);
00496     }
00497   }
00498 
00499   // Merge the character boxes into word boxes if we are rendering n-grams.
00500   if (output_word_boxes_) {
00501     MergeBoxCharsToWords(&page_boxchars);
00502   }
00503 
00504   boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end());
00505 
00506   // Compute the page bounding box
00507   Box* page_box = NULL;
00508   Boxa* all_boxes = NULL;
00509   for (int i = 0; i < page_boxchars.size(); ++i) {
00510     if (page_boxchars[i]->box() == NULL) continue;
00511     if (all_boxes == NULL)
00512       all_boxes = boxaCreate(0);
00513     boxaAddBox(all_boxes, page_boxchars[i]->mutable_box(), L_CLONE);
00514   }
00515   boxaGetExtent(all_boxes, NULL, NULL, &page_box);
00516   boxaDestroy(&all_boxes);
00517   if (page_boxes_ == NULL)
00518     page_boxes_ = boxaCreate(0);
00519   boxaAddBox(page_boxes_, page_box, L_INSERT);
00520 }
00521 
00522 
00523 void StringRenderer::CorrectBoxPositionsToLayout(vector<BoxChar*>* boxchars) {
00524   if (vertical_text_) {
00525     const double rotation = - pango_gravity_to_rotation(
00526         pango_context_get_base_gravity(pango_layout_get_context(layout_)));
00527     BoxChar::TranslateBoxes(page_width_ - h_margin_, v_margin_, boxchars);
00528     BoxChar::RotateBoxes(rotation, page_width_ - h_margin_, v_margin_,
00529                          0, boxchars->size(), boxchars);
00530   } else {
00531     BoxChar::TranslateBoxes(h_margin_, v_margin_, boxchars);
00532   }
00533 }
00534 
00535 int StringRenderer::StripUnrenderableWords(string* utf8_text) const {
00536   string output_text;
00537   const char* text = utf8_text->c_str();
00538   int offset = 0;
00539   int num_dropped = 0;
00540   while (offset < utf8_text->length()) {
00541     int space_len = SpanUTF8Whitespace(text + offset);
00542     output_text.append(text + offset, space_len);
00543     offset += space_len;
00544     if (offset == utf8_text->length()) break;
00545 
00546     int word_len = SpanUTF8NotWhitespace(text + offset);
00547     if (font_.CanRenderString(text + offset, word_len)) {
00548       output_text.append(text + offset, word_len);
00549     } else {
00550       ++num_dropped;
00551     }
00552     offset += word_len;
00553   }
00554   utf8_text->swap(output_text);
00555 
00556   if (num_dropped > 0) {
00557     tprintf("Stripped %d unrenderable words\n", num_dropped);
00558   }
00559   return num_dropped;
00560 }
00561 
00562 int StringRenderer::RenderToBinaryImage(const char* text, int text_length,
00563                                         int threshold, Pix** pix) {
00564   Pix *orig_pix = NULL;
00565   int offset = RenderToImage(text, text_length, &orig_pix);
00566   if (orig_pix) {
00567     Pix* gray_pix = pixConvertTo8(orig_pix, false);
00568     pixDestroy(&orig_pix);
00569     *pix = pixThresholdToBinary(gray_pix, threshold);
00570     pixDestroy(&gray_pix);
00571   } else {
00572     *pix = orig_pix;
00573   }
00574   return offset;
00575 }
00576 
00577 // Add word joiner (WJ) characters between adjacent non-space characters except
00578 // immediately before a combiner.
00579 /* static */
00580 string StringRenderer::InsertWordJoiners(const string& text) {
00581   string out_str;
00582   const UNICHAR::const_iterator it_end = UNICHAR::end(text.c_str(),
00583                                                       text.length());
00584   for (UNICHAR::const_iterator it = UNICHAR::begin(text.c_str(), text.length());
00585        it != it_end; ++it) {
00586     // Add the symbol to the output string.
00587     out_str.append(it.utf8_data(), it.utf8_len());
00588     // Check the next symbol.
00589     UNICHAR::const_iterator next_it = it;
00590     ++next_it;
00591     bool next_char_is_boundary = (next_it == it_end || *next_it == ' ');
00592     bool next_char_is_combiner = (next_it == it_end) ?
00593         false : IsCombiner(*next_it);
00594     if (*it != ' ' && *it != '\n' && !next_char_is_boundary &&
00595         !next_char_is_combiner) {
00596       out_str += kWordJoinerUTF8;
00597     }
00598   }
00599   return out_str;
00600 }
00601 
00602 // Convert halfwidth Basic Latin characters to their fullwidth forms.
00603 string StringRenderer::ConvertBasicLatinToFullwidthLatin(const string& str) {
00604   string full_str;
00605   const UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(),
00606                                                       str.length());
00607   for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
00608        it != it_end; ++it) {
00609     // Convert printable and non-space 7-bit ASCII characters to
00610     // their fullwidth forms.
00611     if (IsInterchangeValid7BitAscii(*it) && isprint(*it) && !isspace(*it)) {
00612       // Convert by adding 0xFEE0 to the codepoint of 7-bit ASCII.
00613       char32 full_char = *it + 0xFEE0;
00614       full_str.append(EncodeAsUTF8(full_char));
00615     } else {
00616       full_str.append(it.utf8_data(), it.utf8_len());
00617     }
00618   }
00619   return full_str;
00620 }
00621 
00622 // Convert fullwidth Latin characters to their halfwidth forms.
00623 string StringRenderer::ConvertFullwidthLatinToBasicLatin(const string& str) {
00624   string half_str;
00625   UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
00626   for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
00627        it != it_end; ++it) {
00628     char32 half_char = FullwidthToHalfwidth(*it);
00629     // Convert fullwidth Latin characters to their halfwidth forms
00630     // only if halfwidth forms are printable and non-space 7-bit ASCII.
00631     if (IsInterchangeValid7BitAscii(half_char) &&
00632         isprint(half_char) && !isspace(half_char)) {
00633       half_str.append(EncodeAsUTF8(half_char));
00634     } else {
00635       half_str.append(it.utf8_data(), it.utf8_len());
00636     }
00637   }
00638   return half_str;
00639 }
00640 
00641 // Returns offset to end of text substring rendered in this method.
00642 int StringRenderer::RenderToImage(const char* text, int text_length,
00643                                   Pix** pix) {
00644   if (pix && *pix) pixDestroy(pix);
00645   InitPangoCairo();
00646 
00647   const int page_offset = FindFirstPageBreakOffset(text, text_length);
00648   if (!page_offset) {
00649     return 0;
00650   }
00651   start_box_ = boxchars_.size();
00652 
00653   if (!vertical_text_) {
00654     // Translate by the specified margin
00655     cairo_translate(cr_, h_margin_, v_margin_);
00656   } else {
00657     // Vertical text rendering is achieved by a two-step process of first
00658     // performing regular horizontal layout with character orientation set to
00659     // EAST, and then translating and rotating the layout before rendering onto
00660     // the desired image surface. The settings required for the former step are
00661     // done within InitPangoCairo().
00662     //
00663     // Translate to the top-right margin of page
00664     cairo_translate(cr_, page_width_ - h_margin_, v_margin_);
00665     // Rotate the layout
00666     double rotation = - pango_gravity_to_rotation(
00667         pango_context_get_base_gravity(pango_layout_get_context(layout_)));
00668     tlog(2, "Rotating by %f radians\n", rotation);
00669     cairo_rotate(cr_, rotation);
00670     pango_cairo_update_layout(cr_, layout_);
00671   }
00672   string page_text(text, page_offset);
00673   if (render_fullwidth_latin_) {
00674     // Convert Basic Latin to their fullwidth forms.
00675     page_text = ConvertBasicLatinToFullwidthLatin(page_text);
00676   }
00677   if (strip_unrenderable_words_) {
00678     StripUnrenderableWords(&page_text);
00679   }
00680   if (drop_uncovered_chars_ &&
00681       !font_.CoversUTF8Text(page_text.c_str(), page_text.length())) {
00682     int num_dropped = font_.DropUncoveredChars(&page_text);
00683     if (num_dropped) {
00684       tprintf("WARNING: Dropped %d uncovered characters\n", num_dropped);
00685     }
00686   }
00687   if (add_ligatures_) {
00688     // Add ligatures wherever possible, including custom ligatures.
00689     page_text = LigatureTable::Get()->AddLigatures(page_text, &font_);
00690   }
00691 
00692   pango_layout_set_text(layout_, page_text.c_str(), page_text.length());
00693 
00694   if (pix) {
00695     // Set a white background for the target image surface.
00696     cairo_set_source_rgb(cr_, 1.0, 1.0, 1.0);  // sets drawing colour to white
00697     // Fill the surface with the active colour (if you don't do this, you will
00698     // be given a surface with a transparent background to draw on)
00699     cairo_paint(cr_);
00700     // Set the ink color to black
00701     cairo_set_source_rgb(cr_, pen_color_[0], pen_color_[1], pen_color_[2]);
00702     // If the target surface or transformation properties of the cairo instance
00703     // have changed, update the pango layout to reflect this
00704     pango_cairo_update_layout(cr_, layout_);
00705     // Draw the pango layout onto the cairo surface
00706     pango_cairo_show_layout(cr_, layout_);
00707     *pix = CairoARGB32ToPixFormat(surface_);
00708   }
00709   ComputeClusterBoxes();
00710   FreePangoCairo();
00711   // Update internal state variables.
00712   ++page_;
00713   return page_offset;
00714 }
00715 
00716 // Render a string to an image, returning it as an 8 bit pix.  Behaves as
00717 // RenderString, except that it ignores the font set at construction and works
00718 // through all the fonts, returning 0 until they are exhausted, at which point
00719 // it returns the value it should have returned all along, but no pix this time.
00720 // Fonts that don't contain a large proportion of the characters in the string
00721 // get skipped.
00722 // Fonts that work each get rendered and the font name gets added
00723 // to the image.
00724 // NOTE that no boxes are produced by this function.
00725 //
00726 // Example usage: To render a null terminated char-array "txt"
00727 //
00728 // int offset = 0;
00729 // do {
00730 //   Pix *pix;
00731 //   offset += renderer.RenderAllFontsToImage(txt + offset,
00732 //                                            strlen(txt + offset), NULL, &pix);
00733 //   ...
00734 // } while (offset < strlen(text));
00735 //
00736 int StringRenderer::RenderAllFontsToImage(const char* text, int text_length,
00737                                           string* font_used, Pix** image) {
00738   // Select a suitable font to render the title with.
00739   const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%";
00740   string title_font;
00741   if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate),
00742                              &title_font, NULL)) {
00743     tprintf("WARNING: Could not find a font to render image title with!\n");
00744     title_font = "Arial";
00745   }
00746   title_font += " 8";
00747   tlog(1, "Selected title font: %s\n", title_font.c_str());
00748   if (font_used) font_used->clear();
00749 
00750   string orig_font = font_.DescriptionName();
00751   if (char_map_.empty()) {
00752     total_chars_ = 0;
00753     // Fill the hash table and use that for computing which fonts to use.
00754     for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
00755          it != UNICHAR::end(text, text_length); ++it) {
00756       ++total_chars_;
00757       ++char_map_[*it];
00758     }
00759     tprintf("Total chars = %d\n", total_chars_);
00760   }
00761   const vector<string>& all_fonts = FontUtils::ListAvailableFonts();
00762   for (int i = font_index_; i < all_fonts.size(); ++i) {
00763     ++font_index_;
00764     int raw_score = 0;
00765     int ok_chars = FontUtils::FontScore(char_map_, all_fonts[i], &raw_score,
00766                                         NULL);
00767     if (ok_chars > 0 && ok_chars == total_chars_) {
00768       set_font(all_fonts[i]);
00769       int offset = RenderToBinaryImage(text, text_length, 128, image);
00770       ClearBoxes();  // Get rid of them as they are garbage.
00771       const int kMaxTitleLength = 1024;
00772       char title[kMaxTitleLength];
00773       snprintf(title, kMaxTitleLength, kTitleTemplate,
00774                all_fonts[i].c_str(), ok_chars,
00775                100.0 * ok_chars / total_chars_, raw_score,
00776                100.0 * raw_score / char_map_.size());
00777       tprintf("%s\n", title);
00778       // This is a good font! Store the offset to return once we've tried all
00779       // the fonts.
00780       if (offset) {
00781         last_offset_ = offset;
00782         if (font_used) *font_used = all_fonts[i];
00783       }
00784       // Add the font to the image.
00785       set_font(title_font);
00786       v_margin_ /= 8;
00787       Pix* title_image = NULL;
00788       RenderToBinaryImage(title, strlen(title), 128, &title_image);
00789       pixOr(*image, *image, title_image);
00790       pixDestroy(&title_image);
00791 
00792       v_margin_ *= 8;
00793       set_font(orig_font);
00794       // We return the real offset only after cycling through the list of fonts.
00795       return 0;
00796     } else {
00797       tprintf("Font %s failed with %d hits = %.2f%%\n",
00798               all_fonts[i].c_str(), ok_chars, 100.0 * ok_chars / total_chars_);
00799     }
00800   }
00801   *image = NULL;
00802   font_index_ = 0;
00803   char_map_.clear();
00804   return last_offset_;
00805 }
00806 
00807 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines