tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/training/stringrenderer.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        stringrenderer.h
00003  * Description: Class for rendering UTF-8 text to an image, and retrieving
00004  *              bounding boxes around each grapheme cluster.
00005  *
00006  *              Instances are created using a font description string
00007  *              (eg. "Arial Italic 12"; see pango_font_info.h for the format)
00008  *              and the page dimensions. Other renderer properties such as
00009  *              spacing, ligaturization, as well as preprocessing behavior such
00010  *              as removal of unrenderable words and a special n-gram mode may
00011  *              be set using respective set_* methods.
00012  *
00013  * Author:      Ranjith Unnikrishnan
00014  * Created:     Mon Nov 18 2013
00015  *
00016  * (C) Copyright 2013, Google Inc.
00017  * Licensed under the Apache License, Version 2.0 (the "License");
00018  * you may not use this file except in compliance with the License.
00019  * You may obtain a copy of the License at
00020  * http://www.apache.org/licenses/LICENSE-2.0
00021  * Unless required by applicable law or agreed to in writing, software
00022  * distributed under the License is distributed on an "AS IS" BASIS,
00023  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00024  * See the License for the specific language governing permissions and
00025  * limitations under the License.
00026  *
00027  **********************************************************************/
00028 
00029 #ifndef TESSERACT_TRAINING_STRINGRENDERER_H_
00030 #define TESSERACT_TRAINING_STRINGRENDERER_H_
00031 
00032 #include <string>
00033 #include <vector>
00034 
00035 #include "hashfn.h"
00036 #include "host.h"
00037 #include "pango_font_info.h"
00038 #include "pango/pango-layout.h"
00039 #include "pango/pangocairo.h"
00040 
00041 struct Boxa;
00042 struct Pix;
00043 
00044 namespace tesseract {
00045 
00046 class BoxChar;
00047 
00048 class StringRenderer {
00049  public:
00050   StringRenderer(const string& font_desc, int page_width, int page_height);
00051   ~StringRenderer();
00052 
00053   // Renders the text with the chosen font and returns the byte offset upto
00054   // which the text could be rendered so as to fit the specified page
00055   // dimensions.
00056   int RenderToImage(const char* text, int text_length, Pix** pix);
00057   int RenderToBinaryImage(const char* text, int text_length, int threshold,
00058                           Pix** pix);
00059   // Renders a line of text with all available fonts that were able to render
00060   // the text.
00061   int RenderAllFontsToImage(const char* text, int text_length,
00062                             string* font_used, Pix** pix);
00063 
00064   bool set_font(const string& desc);
00065   void set_char_spacing(double char_spacing) {
00066     char_spacing_ = char_spacing;
00067   }
00068   void set_leading(int leading) {
00069     leading_ = leading;
00070   }
00071   void set_resolution(const int resolution);
00072   void set_vertical_text(bool vertical_text) {
00073     vertical_text_ = vertical_text;
00074   }
00075   void set_gravity_hint_strong(bool gravity_hint_strong) {
00076     gravity_hint_strong_ = gravity_hint_strong;
00077   }
00078   void set_render_fullwidth_latin(bool render_fullwidth_latin) {
00079     render_fullwidth_latin_ = render_fullwidth_latin;
00080   }
00081   void set_page(int page) {
00082     page_ = page;
00083   }
00084   void set_box_padding(int val) {
00085     box_padding_ = val;
00086   }
00087   void set_drop_uncovered_chars(bool val) {
00088     drop_uncovered_chars_ = val;
00089   }
00090   void set_strip_unrenderable_words(bool val) {
00091     strip_unrenderable_words_ = val;
00092   }
00093   void set_output_word_boxes(bool val) {
00094     output_word_boxes_ = val;
00095   }
00096   // Before rendering the string, replace latin characters with their optional
00097   // ligatured forms (such as "fi", "ffi" etc.) if the font_ covers those
00098   // unicodes.
00099   void set_add_ligatures(bool add_ligatures) {
00100     add_ligatures_ = add_ligatures;
00101   }
00102   // Set the rgb value of the text ink. Values range in [0, 1.0]
00103   void set_pen_color(double r, double g, double b) {
00104     pen_color_[0] = r;
00105     pen_color_[1] = g;
00106     pen_color_[2] = b;
00107   }
00108   void set_h_margin(const int h_margin) {
00109     h_margin_ = h_margin;
00110   }
00111   void set_v_margin(const int v_margin) {
00112     v_margin_ = v_margin;
00113   }
00114   const PangoFontInfo& font() const {
00115     return font_;
00116   }
00117   const int h_margin() const {
00118     return h_margin_;
00119   }
00120   const int v_margin() const {
00121     return v_margin_;
00122   }
00123 
00124   // Get the boxchars of all clusters rendered thus far (or since the last call
00125   // to ClearBoxes()).
00126   const vector<BoxChar*>& GetBoxes() const;
00127   // Get the rendered page bounding boxes of all pages created thus far (or
00128   // since last call to ClearBoxes()).
00129   Boxa* GetPageBoxes() const;
00130 
00131   // Rotate the boxes on the most recent page by the given rotation.
00132   void RotatePageBoxes(float rotation);
00133   // Delete all boxes.
00134   void ClearBoxes();
00135   void WriteAllBoxes(const string& filename) const;
00136   // Removes space-delimited words from the string that are not renderable by
00137   // the current font and returns the count of such words.
00138   int StripUnrenderableWords(string* utf8_text) const;
00139 
00140   // Insert a Word Joiner symbol (U+2060) between adjacent characters, excluding
00141   // spaces and combining types, in each word before rendering to ensure words
00142   // are not broken across lines. The output boxchars will not contain the
00143   // joiner.
00144   static string InsertWordJoiners(const string& text);
00145 
00146   // Helper functions to convert fullwidth Latin and halfwidth Basic Latin.
00147   static string ConvertBasicLatinToFullwidthLatin(const string& text);
00148   static string ConvertFullwidthLatinToBasicLatin(const string& text);
00149 
00150  protected:
00151   // Init and free local renderer objects.
00152   void InitPangoCairo();
00153   void SetLayoutProperties();
00154   void FreePangoCairo();
00155   // Compute bounding boxes around grapheme clusters.
00156   void ComputeClusterBoxes();
00157   void CorrectBoxPositionsToLayout(vector<BoxChar*>* boxchars);
00158   bool GetClusterStrings(vector<string>* cluster_text);
00159   int FindFirstPageBreakOffset(const char* text, int text_length);
00160 
00161   PangoFontInfo font_;
00162   // Page properties
00163   int page_width_, page_height_, h_margin_, v_margin_;
00164   // Text rendering properties
00165   int pen_color_[3];
00166   double char_spacing_;
00167   int leading_, resolution_;
00168   bool vertical_text_;
00169   bool gravity_hint_strong_;
00170   bool render_fullwidth_latin_;
00171   // Text filtering options
00172   bool drop_uncovered_chars_;
00173   bool strip_unrenderable_words_;
00174   bool add_ligatures_;
00175   bool output_word_boxes_;
00176   // Pango and cairo specific objects
00177   cairo_surface_t* surface_;
00178   cairo_t* cr_;
00179   PangoLayout* layout_;
00180   // Internal state of current page number, updated on successive calls to
00181   // RenderToImage()
00182   int start_box_;
00183   int page_;
00184   // Boxes and associated text for all pages rendered with RenderToImage() since
00185   // the last call to ClearBoxes().
00186   vector<BoxChar*> boxchars_;
00187   int box_padding_;
00188   // Bounding boxes for pages since the last call to ClearBoxes().
00189   Boxa* page_boxes_;
00190 
00191   // Objects cached for subsequent calls to RenderAllFontsToImage()
00192   hash_map<char32, inT64> char_map_;  // Time-saving char histogram.
00193   int total_chars_;   // Number in the string to be rendered.
00194   int font_index_;    // Index of next font to use in font list.
00195   int last_offset_;   // Offset returned from last successful rendering
00196 
00197  private:
00198   StringRenderer(const StringRenderer&);
00199   void operator=(const StringRenderer&);
00200 };
00201 }  // namespace tesseract
00202 
00203 #endif  // TESSERACT_TRAINING_STRINGRENDERER_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines