tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/training/pango_font_info.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        pango_font_info.h
00003  * Description: Font-related objects and helper functions
00004  * Author:      Ranjith Unnikrishnan
00005  * Created:     Mon Nov 18 2013
00006  *
00007  * (C) Copyright 2013, Google Inc.
00008  * Licensed under the Apache License, Version 2.0 (the "License");
00009  * you may not use this file except in compliance with the License.
00010  * You may obtain a copy of the License at
00011  * http://www.apache.org/licenses/LICENSE-2.0
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an "AS IS" BASIS,
00014  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifndef TESSERACT_TRAINING_PANGO_FONT_INFO_H_
00021 #define TESSERACT_TRAINING_PANGO_FONT_INFO_H_
00022 
00023 #include <string>
00024 #include <utility>
00025 #include <vector>
00026 
00027 #include "hashfn.h"
00028 #include "host.h"
00029 #include "util.h"
00030 #include "pango/pango-font.h"
00031 
// Character-code type used as the map key by FontUtils::BestFonts() and
// FontUtils::FontScore() below. Aliased to the platform's (signed) int.
typedef int char32;
00033 
00034 namespace tesseract {
00035 
00036 // Data holder class for a font, intented to avoid having to work with Pango or
00037 // FontConfig-specific objects directly.
00038 class PangoFontInfo {
00039  public:
00040   enum FontTypeEnum {
00041     UNKNOWN,
00042     SERIF,
00043     SANS_SERIF,
00044     DECORATIVE,
00045   };
00046   PangoFontInfo();
00047   // Initialize from parsing a font description name, defined as a string of the
00048   // format:
00049   //   "FamilyName [FaceName] [PointSize]"
00050   // where a missing FaceName implies the default regular face.
00051   // eg. "Arial Italic 12", "Verdana"
00052   //
00053   // FaceName is a combination of:
00054   //   [StyleName] [Variant] [Weight] [Stretch]
00055   // with (all optional) Pango-defined values of:
00056   // StyleName: Oblique, Italic
00057   // Variant  : Small-Caps
00058   // Weight   : Ultra-Light, Light, Medium, Semi-Bold, Bold, Ultra-Bold, Heavy
00059   // Stretch  : Ultra-Condensed, Extra-Condensed, Condensed, Semi-Condensed,
00060   //            Semi-Expanded, Expanded, Extra-Expanded, Ultra-Expanded.
00061   explicit PangoFontInfo(const string& name);
00062   bool ParseFontDescriptionName(const string& name);
00063 
00064   // Returns true if the font have codepoint coverage for the specified text.
00065   bool CoversUTF8Text(const char* utf8_text, int byte_length) const;
00066   // Modifies string to remove unicode points that are not covered by the
00067   // font. Returns the number of characters dropped.
00068   int DropUncoveredChars(string* utf8_text) const;
00069 
00070   // Returns true if the entire string can be rendered by the font with full
00071   // character coverage and no unknown glyph or dotted-circle glyph
00072   // substitutions on encountering a badly formed unicode sequence.
00073   // If true, returns individual graphemes. Any whitespace characters in the
00074   // original string are also included in the list.
00075   bool CanRenderString(const char* utf8_word, int len,
00076                        vector<string>* graphemes) const;
00077   bool CanRenderString(const char* utf8_word, int len) const;
00078 
00079   // Retrieves the x_bearing and x_advance for the given utf8 character in the
00080   // font. Returns false if the glyph for the character could not be found in
00081   // the font.
00082   // Ref: http://freetype.sourceforge.net/freetype2/docs/glyphs/glyphs-3.html
00083   bool GetSpacingProperties(const string& utf8_char,
00084                             int* x_bearing, int* x_advance) const;
00085 
00086   // Accessors
00087   string DescriptionName() const;
00088   // Font Family name eg. "Arial"
00089   const string& family_name() const    { return family_name_; }
00090   // Size in points (1/72"), rounded to the nearest integer.
00091   const int font_size() const          { return font_size_; }
00092   const bool is_bold() const           { return is_bold_; }
00093   const bool is_italic() const         { return is_italic_; }
00094   const bool is_smallcaps() const      { return is_smallcaps_; }
00095   const bool is_monospace() const      { return is_monospace_; }
00096   const bool is_fraktur() const        { return is_fraktur_; }
00097   const FontTypeEnum font_type() const { return font_type_; }
00098 
00099   const int resolution() const         { return resolution_; }
00100   void set_resolution(const int resolution) {
00101     resolution_ = resolution;
00102   }
00103 
00104  private:
00105   friend class FontUtils;
00106   void Clear();
00107   bool ParseFontDescription(const PangoFontDescription* desc);
00108   // Returns the PangoFont structure corresponding to the closest available font
00109   // in the font map.
00110   PangoFont* ToPangoFont() const;
00111 
00112   // Font properties set automatically from parsing the font description name.
00113   string family_name_;
00114   int font_size_;
00115   bool is_bold_;
00116   bool is_italic_;
00117   bool is_smallcaps_;
00118   bool is_monospace_;
00119   bool is_fraktur_;
00120   FontTypeEnum font_type_;
00121   // The Pango description that was used to initialize the instance.
00122   PangoFontDescription* desc_;
00123   // Default output resolution to assume for GetSpacingProperties() and any
00124   // other methods that returns pixel values.
00125   int resolution_;
00126 
00127  private:
00128   PangoFontInfo(const PangoFontInfo&);
00129   void operator=(const PangoFontInfo&);
00130 };
00131 
00132 // Static utility methods for querying font availability and font-selection
00133 // based on codepoint coverage.
00134 class FontUtils {
00135  public:
00136   // Returns true if the font of the given description name is available in the
00137   // target directory specified by --fonts_dir
00138   static bool IsAvailableFont(const char* font_desc);
00139   // Outputs description names of available fonts.
00140   static const vector<string>& ListAvailableFonts();
00141 
00142   // Picks font among available fonts that covers and can render the given word,
00143   // and returns the font description name and the decomposition of the word to
00144   // graphemes. Returns false if no suitable font was found.
00145   static bool SelectFont(const char* utf8_word, const int utf8_len,
00146                          string* font_name, vector<string>* graphemes);
00147 
00148   // Picks font among all_fonts that covers and can render the given word,
00149   // and returns the font description name and the decomposition of the word to
00150   // graphemes. Returns false if no suitable font was found.
00151   static bool SelectFont(const char* utf8_word, const int utf8_len,
00152                          const vector<string>& all_fonts,
00153                          string* font_name, vector<string>* graphemes);
00154 
00155   // Returns a bitmask where the value of true at index 'n' implies that unicode
00156   // value 'n' is renderable by at least one available font.
00157   static void GetAllRenderableCharacters(vector<bool>* unichar_bitmap);
00158   // Variant of the above function that inspects only the provided font names.
00159   static void GetAllRenderableCharacters(const vector<string>& font_names,
00160                                          vector<bool>* unichar_bitmap);
00161   static void GetAllRenderableCharacters(const string& font_name,
00162                                          vector<bool>* unichar_bitmap);
00163 
00164   // NOTE: The following utilities were written to be backward compatible with
00165   // StringRender.
00166 
00167   // BestFonts returns a font name and a bit vector of the characters it
00168   // can render for the fonts that score within some fraction of the best
00169   // font on the characters in the given hash map.
00170   // In the flags vector, each flag is set according to whether the
00171   // corresponding character (in order of iterating ch_map) can be rendered.
00172   // The return string is a list of the acceptable fonts that were used.
00173   static string BestFonts(const unordered_map<char32, inT64>& ch_map,
00174       vector<std::pair<const char*, vector<bool> > >* font_flag);
00175 
00176   // FontScore returns the weighted renderability score of the given
00177   // hash map character table in the given font. The unweighted score
00178   // is also returned in raw_score.
00179   // The values in the bool vector ch_flags correspond to whether the
00180   // corresponding character (in order of iterating ch_map) can be rendered.
00181   static int FontScore(const unordered_map<char32, inT64>& ch_map,
00182                        const string& fontname, int* raw_score,
00183                        vector<bool>* ch_flags);
00184 };
00185 }  // namespace tesseract
00186 
00187 #endif  // TESSERACT_TRAINING_PANGO_FONT_INFO_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines