tesseract
3.03
|
00001 /********************************************************************** 00002 * File: pango_font_info.h 00003 * Description: Font-related objects and helper functions 00004 * Author: Ranjith Unnikrishnan 00005 * Created: Mon Nov 18 2013 00006 * 00007 * (C) Copyright 2013, Google Inc. 00008 * Licensed under the Apache License, Version 2.0 (the "License"); 00009 * you may not use this file except in compliance with the License. 00010 * You may obtain a copy of the License at 00011 * http://www.apache.org/licenses/LICENSE-2.0 00012 * Unless required by applicable law or agreed to in writing, software 00013 * distributed under the License is distributed on an "AS IS" BASIS, 00014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 * See the License for the specific language governing permissions and 00016 * limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifndef TESSERACT_TRAINING_PANGO_FONT_INFO_H_ 00021 #define TESSERACT_TRAINING_PANGO_FONT_INFO_H_ 00022 00023 #include <string> 00024 #include <utility> 00025 #include <vector> 00026 00027 #include "hashfn.h" 00028 #include "host.h" 00029 #include "util.h" 00030 #include "pango/pango-font.h" 00031 00032 typedef signed int char32; 00033 00034 namespace tesseract { 00035 00036 // Data holder class for a font, intented to avoid having to work with Pango or 00037 // FontConfig-specific objects directly. 00038 class PangoFontInfo { 00039 public: 00040 enum FontTypeEnum { 00041 UNKNOWN, 00042 SERIF, 00043 SANS_SERIF, 00044 DECORATIVE, 00045 }; 00046 PangoFontInfo(); 00047 // Initialize from parsing a font description name, defined as a string of the 00048 // format: 00049 // "FamilyName [FaceName] [PointSize]" 00050 // where a missing FaceName implies the default regular face. 00051 // eg. "Arial Italic 12", "Verdana" 00052 // 00053 // FaceName is a combination of: 00054 // [StyleName] [Variant] [Weight] [Stretch] 00055 // with (all optional) Pango-defined values of: 00056 // StyleName: Oblique, Italic 00057 // Variant : Small-Caps 00058 // Weight : Ultra-Light, Light, Medium, Semi-Bold, Bold, Ultra-Bold, Heavy 00059 // Stretch : Ultra-Condensed, Extra-Condensed, Condensed, Semi-Condensed, 00060 // Semi-Expanded, Expanded, Extra-Expanded, Ultra-Expanded. 00061 explicit PangoFontInfo(const string& name); 00062 bool ParseFontDescriptionName(const string& name); 00063 00064 // Returns true if the font have codepoint coverage for the specified text. 00065 bool CoversUTF8Text(const char* utf8_text, int byte_length) const; 00066 // Modifies string to remove unicode points that are not covered by the 00067 // font. Returns the number of characters dropped. 00068 int DropUncoveredChars(string* utf8_text) const; 00069 00070 // Returns true if the entire string can be rendered by the font with full 00071 // character coverage and no unknown glyph or dotted-circle glyph 00072 // substitutions on encountering a badly formed unicode sequence. 00073 // If true, returns individual graphemes. Any whitespace characters in the 00074 // original string are also included in the list. 00075 bool CanRenderString(const char* utf8_word, int len, 00076 vector<string>* graphemes) const; 00077 bool CanRenderString(const char* utf8_word, int len) const; 00078 00079 // Retrieves the x_bearing and x_advance for the given utf8 character in the 00080 // font. Returns false if the glyph for the character could not be found in 00081 // the font. 00082 // Ref: http://freetype.sourceforge.net/freetype2/docs/glyphs/glyphs-3.html 00083 bool GetSpacingProperties(const string& utf8_char, 00084 int* x_bearing, int* x_advance) const; 00085 00086 // Accessors 00087 string DescriptionName() const; 00088 // Font Family name eg. "Arial" 00089 const string& family_name() const { return family_name_; } 00090 // Size in points (1/72"), rounded to the nearest integer. 00091 const int font_size() const { return font_size_; } 00092 const bool is_bold() const { return is_bold_; } 00093 const bool is_italic() const { return is_italic_; } 00094 const bool is_smallcaps() const { return is_smallcaps_; } 00095 const bool is_monospace() const { return is_monospace_; } 00096 const bool is_fraktur() const { return is_fraktur_; } 00097 const FontTypeEnum font_type() const { return font_type_; } 00098 00099 const int resolution() const { return resolution_; } 00100 void set_resolution(const int resolution) { 00101 resolution_ = resolution; 00102 } 00103 00104 private: 00105 friend class FontUtils; 00106 void Clear(); 00107 bool ParseFontDescription(const PangoFontDescription* desc); 00108 // Returns the PangoFont structure corresponding to the closest available font 00109 // in the font map. 00110 PangoFont* ToPangoFont() const; 00111 00112 // Font properties set automatically from parsing the font description name. 00113 string family_name_; 00114 int font_size_; 00115 bool is_bold_; 00116 bool is_italic_; 00117 bool is_smallcaps_; 00118 bool is_monospace_; 00119 bool is_fraktur_; 00120 FontTypeEnum font_type_; 00121 // The Pango description that was used to initialize the instance. 00122 PangoFontDescription* desc_; 00123 // Default output resolution to assume for GetSpacingProperties() and any 00124 // other methods that returns pixel values. 00125 int resolution_; 00126 00127 private: 00128 PangoFontInfo(const PangoFontInfo&); 00129 void operator=(const PangoFontInfo&); 00130 }; 00131 00132 // Static utility methods for querying font availability and font-selection 00133 // based on codepoint coverage. 00134 class FontUtils { 00135 public: 00136 // Returns true if the font of the given description name is available in the 00137 // target directory specified by --fonts_dir 00138 static bool IsAvailableFont(const char* font_desc); 00139 // Outputs description names of available fonts. 00140 static const vector<string>& ListAvailableFonts(); 00141 00142 // Picks font among available fonts that covers and can render the given word, 00143 // and returns the font description name and the decomposition of the word to 00144 // graphemes. Returns false if no suitable font was found. 00145 static bool SelectFont(const char* utf8_word, const int utf8_len, 00146 string* font_name, vector<string>* graphemes); 00147 00148 // Picks font among all_fonts that covers and can render the given word, 00149 // and returns the font description name and the decomposition of the word to 00150 // graphemes. Returns false if no suitable font was found. 00151 static bool SelectFont(const char* utf8_word, const int utf8_len, 00152 const vector<string>& all_fonts, 00153 string* font_name, vector<string>* graphemes); 00154 00155 // Returns a bitmask where the value of true at index 'n' implies that unicode 00156 // value 'n' is renderable by at least one available font. 00157 static void GetAllRenderableCharacters(vector<bool>* unichar_bitmap); 00158 // Variant of the above function that inspects only the provided font names. 00159 static void GetAllRenderableCharacters(const vector<string>& font_names, 00160 vector<bool>* unichar_bitmap); 00161 static void GetAllRenderableCharacters(const string& font_name, 00162 vector<bool>* unichar_bitmap); 00163 00164 // NOTE: The following utilities were written to be backward compatible with 00165 // StringRender. 00166 00167 // BestFonts returns a font name and a bit vector of the characters it 00168 // can render for the fonts that score within some fraction of the best 00169 // font on the characters in the given hash map. 00170 // In the flags vector, each flag is set according to whether the 00171 // corresponding character (in order of iterating ch_map) can be rendered. 00172 // The return string is a list of the acceptable fonts that were used. 00173 static string BestFonts(const unordered_map<char32, inT64>& ch_map, 00174 vector<std::pair<const char*, vector<bool> > >* font_flag); 00175 00176 // FontScore returns the weighted renderability score of the given 00177 // hash map character table in the given font. The unweighted score 00178 // is also returned in raw_score. 00179 // The values in the bool vector ch_flags correspond to whether the 00180 // corresponding character (in order of iterating ch_map) can be rendered. 00181 static int FontScore(const unordered_map<char32, inT64>& ch_map, 00182 const string& fontname, int* raw_score, 00183 vector<bool>* ch_flags); 00184 }; 00185 } // namespace tesseract 00186 00187 #endif // TESSERACT_TRAINING_PANGO_FONT_INFO_H_