tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/training/pango_font_info.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        pango_font_info.cpp
00003  * Description: Font-related objects and helper functions
00004  * Author:      Ranjith Unnikrishnan
00005  * Created:     Mon Nov 18 2013
00006  *
00007  * (C) Copyright 2013, Google Inc.
00008  * Licensed under the Apache License, Version 2.0 (the "License");
00009  * you may not use this file except in compliance with the License.
00010  * You may obtain a copy of the License at
00011  * http://www.apache.org/licenses/LICENSE-2.0
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an "AS IS" BASIS,
00014  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 // Include automatically generated configuration file if running autoconf.
00021 #ifdef HAVE_CONFIG_H
00022 #include "config_auto.h"
00023 #endif
00024 
00025 #ifdef MINGW
00026 // workaround for stdlib.h and putenv
00027 #undef __STRICT_ANSI__
00028 #include "strcasestr.h"
00029 #endif  // MINGW
00030 #include <stdlib.h>
00031 #include <stdio.h>
00032 #include <string.h>
00033 #include <sys/param.h>
00034 #include <algorithm>
00035 
00036 #include "pango_font_info.h"
00037 #include "commandlineflags.h"
00038 #include "fileio.h"
00039 #include "normstrngs.h"
00040 #include "tlog.h"
00041 #include "unichar.h"
00042 #include "util.h"
00043 #include "pango/pango-context.h"
00044 #include "pango/pango-font.h"
00045 #include "pango/pango-glyph-item.h"
00046 #include "pango/pango-glyph.h"
00047 #include "pango/pango-layout.h"
00048 #include "pango/pango-utils.h"
00049 #include "pango/pangocairo.h"
00050 #include "pango/pangofc-font.h"
00051 
00052 STRING_PARAM_FLAG(fonts_dir, "/auto/ocr-data/tesstraining/fonts",
00053                   "Overrides system default font location");
00054 STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp",
00055                   "Overrides fontconfig default temporary dir");
00056 BOOL_PARAM_FLAG(fontconfig_refresh_cache, false,
00057                 "Does a one-time deletion of cache files from the "
00058                 "fontconfig_tmpdir before initializing fontconfig.");
00059 
00060 #ifndef USE_STD_NAMESPACE
00061 #include "ocr/trainingdata/typesetting/legacy_fonts.h"
00062 BOOL_PARAM_FLAG(use_only_legacy_fonts, false,
00063                 "Overrides --fonts_dir and sets the known universe of fonts to"
00064                 "the list in legacy_fonts.h");
00065 // Compatability with pango 1.20.
00066 #include "pango/pango-glyph-item-private.h"
00067 #define pango_glyph_item_iter_init_start _pango_glyph_item_iter_init_start
00068 #define pango_glyph_item_iter_next_cluster _pango_glyph_item_iter_next_cluster
00069 #else
00070 using std::pair;
00071 #endif
00072 
00073 namespace tesseract {
00074 
// Output resolution assumed when the caller does not supply one. It is needed
// only so that font metrics can be reported in pixels.
const int kDefaultResolution = 300;
00078 
00079 PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) {
00080   Clear();
00081 }
00082 
00083 PangoFontInfo::PangoFontInfo(const string& desc)
00084     : desc_(NULL), resolution_(kDefaultResolution) {
00085   if (!ParseFontDescriptionName(desc)) {
00086     tprintf("ERROR: Could not parse %s\n", desc.c_str());
00087     Clear();
00088   }
00089 }
00090 
00091 void PangoFontInfo::Clear() {
00092   font_size_ = 0;
00093   is_bold_ = false;
00094   is_italic_ = false;
00095   is_smallcaps_ = false;
00096   is_monospace_ = false;
00097   family_name_.clear();
00098   font_type_ = UNKNOWN;
00099   if (desc_) {
00100     pango_font_description_free(desc_);
00101     desc_ = NULL;
00102   }
00103 }
00104 
00105 string PangoFontInfo::DescriptionName() const {
00106   if (!desc_) return "";
00107   char* desc_str = pango_font_description_to_string(desc_);
00108   string desc_name(desc_str);
00109   g_free(desc_str);
00110   return desc_name;
00111 }
00112 
00113 // Initializes Fontconfig for use by writing a fake fonts.conf file into the
00114 // FLAGS_fontconfigs_tmpdir directory, that points to the supplied
00115 // FLAGS_fonts_dir, and then overrides the FONTCONFIG_PATH environment variable
00116 // to point to this fonts.conf file.
00117 static void InitFontconfig() {
00118   static bool init_fontconfig = false;
00119   if (init_fontconfig || FLAGS_fonts_dir.empty()) {
00120     init_fontconfig = true;
00121     return;
00122   }
00123   if (FLAGS_fontconfig_refresh_cache) {
00124     tprintf("Deleting cache files from %s\n", FLAGS_fontconfig_tmpdir.c_str());
00125     File::DeleteMatchingFiles(File::JoinPath(
00126         FLAGS_fontconfig_tmpdir.c_str(), "*cache-2").c_str());
00127   }
00128   tprintf("Initializing fontconfig\n");
00129   string fonts_dir = File::JoinPath(
00130       FLAGS_fonts_dir.c_str(), "google3/ocr/trainingdata/typesetting/testdata");
00131   const int MAX_FONTCONF_FILESIZE = 1024;
00132   char fonts_conf_template[MAX_FONTCONF_FILESIZE];
00133   snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
00134            "<?xml version=\"1.0\"?>\n"
00135            "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
00136            "<fontconfig>\n"
00137            "<dir>%s</dir>\n"
00138            "<cachedir>%s</cachedir>\n"
00139            "<config></config>\n"
00140            "</fontconfig>", FLAGS_fonts_dir.c_str(),
00141            FLAGS_fontconfig_tmpdir.c_str());
00142   string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(),
00143                                           "fonts.conf");
00144   File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
00145 #ifdef _WIN32
00146   std::string env("FONTCONFIG_PATH=");
00147   env.append(FLAGS_fontconfig_tmpdir.c_str());
00148   putenv(env.c_str());
00149   putenv("LANG=en_US.utf8");
00150 #else
00151   setenv("FONTCONFIG_PATH", FLAGS_fontconfig_tmpdir.c_str(), true);
00152   // Fix the locale so that the reported font names are consistent.
00153   setenv("LANG", "en_US.utf8", true);
00154 #endif  // _WIN32
00155   init_fontconfig = true;
00156 }
00157 
00158 static void ListFontFamilies(PangoFontFamily*** families,
00159                              int* n_families) {
00160   InitFontconfig();
00161   PangoFontMap* font_map = pango_cairo_font_map_get_default();
00162   DISABLE_HEAP_LEAK_CHECK;
00163   pango_font_map_list_families(font_map, families, n_families);
00164 }
00165 
00166 // Inspects whether a given font family is monospace. If the font is not
00167 // available, it cannot make a decision and returns false by default.
00168 static bool IsMonospaceFontFamily(const char* family_name) {
00169   PangoFontFamily** families = 0;
00170   int n_families = 0;
00171   bool is_monospace = false;
00172   ListFontFamilies(&families, &n_families);
00173   ASSERT_HOST(n_families > 0);
00174   bool found = false;
00175   for (int i = 0; i < n_families; ++i) {
00176     if (!strcasecmp(family_name, pango_font_family_get_name(families[i]))) {
00177       is_monospace = pango_font_family_is_monospace(families[i]);
00178       found = true;
00179       break;
00180     }
00181   }
00182   if (!found) {
00183     tlog(1, "Could not find monospace property of family %s\n", family_name);
00184   }
00185   g_free(families);
00186   return is_monospace;
00187 }
00188 
00189 bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) {
00190   Clear();
00191   const char* family = pango_font_description_get_family(desc);
00192   if (!family) {
00193     char* desc_str = pango_font_description_to_string(desc);
00194     tprintf("WARNING: Could not parse family name from description: '%s'\n",
00195             desc_str);
00196     g_free(desc_str);
00197     return false;
00198   }
00199   family_name_ = string(family);
00200   desc_ = pango_font_description_copy(desc);
00201   is_monospace_ = IsMonospaceFontFamily(family);
00202 
00203   // Set font size in points
00204   font_size_ = pango_font_description_get_size(desc);
00205   if (!pango_font_description_get_size_is_absolute(desc)) {
00206     font_size_ /= PANGO_SCALE;
00207   }
00208 
00209   PangoStyle style = pango_font_description_get_style(desc);
00210   is_italic_ = (PANGO_STYLE_ITALIC == style ||
00211                 PANGO_STYLE_OBLIQUE == style);
00212   is_smallcaps_ = (pango_font_description_get_variant(desc)
00213                    == PANGO_VARIANT_SMALL_CAPS);
00214 
00215   is_bold_ = (pango_font_description_get_weight(desc) >= PANGO_WEIGHT_BOLD);
00216   // We dont have a way to detect whether a font is of type Fraktur. The fonts
00217   // we currently use all have "Fraktur" in their family name, so we do a
00218   // fragile but functional check for that here.
00219   is_fraktur_ = (strcasestr(family, "Fraktur") != NULL);
00220   return true;
00221 }
00222 
00223 bool PangoFontInfo::ParseFontDescriptionName(const string& name) {
00224   PangoFontDescription *desc = pango_font_description_from_string(name.c_str());
00225   bool success = ParseFontDescription(desc);
00226   pango_font_description_free(desc);
00227   return success;
00228 }
00229 
00230 // Returns the PangoFont structure corresponding to the closest available font
00231 // in the font map. Note that if the font is wholly missing, this could
00232 // correspond to a completely different font family and face.
00233 PangoFont* PangoFontInfo::ToPangoFont() const {
00234   InitFontconfig();
00235   PangoFontMap* font_map = pango_cairo_font_map_get_default();
00236   PangoContext* context = pango_context_new();
00237   pango_cairo_context_set_resolution(context, resolution_);
00238   pango_context_set_font_map(context, font_map);
00239   PangoFont* font = NULL;
00240   {
00241     DISABLE_HEAP_LEAK_CHECK;
00242     font = pango_font_map_load_font(font_map, context, desc_);
00243   }
00244   g_object_unref(context);
00245   return font;
00246 }
00247 
00248 bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const {
00249   PangoFont* font = ToPangoFont();
00250   PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
00251   for (UNICHAR::const_iterator it = UNICHAR::begin(utf8_text, byte_length);
00252        it != UNICHAR::end(utf8_text, byte_length);
00253        ++it) {
00254     if (IsWhitespace(*it) || pango_is_zero_width(*it))
00255       continue;
00256     if (pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
00257       char tmp[5];
00258       int len = it.get_utf8(tmp);
00259       tmp[len] = '\0';
00260       tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
00261       return false;
00262     }
00263   }
00264   return true;
00265 }
00266 
00267 int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
00268   PangoFont* font = ToPangoFont();
00269   PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
00270   int num_dropped_chars = 0;
00271   // Maintain two iterators that point into the string. For space efficiency, we
00272   // will repeatedly copy one covered UTF8 character from one to the other, and
00273   // at the end resize the string to the right length.
00274   char* out = const_cast<char*>(utf8_text->c_str());
00275   const UNICHAR::const_iterator it_begin =
00276       UNICHAR::begin(utf8_text->c_str(), utf8_text->length());
00277   const UNICHAR::const_iterator it_end =
00278       UNICHAR::end(utf8_text->c_str(), utf8_text->length());
00279   for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
00280     if (!IsWhitespace(*it) && !pango_is_zero_width(*it) &&
00281         pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
00282       if (TLOG_IS_ON(2)) {
00283         char tmp[5];
00284         int len = it.get_utf8(tmp);
00285         tmp[len] = '\0';
00286         tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
00287       }
00288       ++num_dropped_chars;
00289       continue;
00290     }
00291     strncpy(out, it.utf8_data(), it.utf8_len());
00292     out += it.utf8_len();
00293   }
00294   utf8_text->resize(out - utf8_text->c_str());
00295   return num_dropped_chars;
00296 }
00297 
00298 bool PangoFontInfo::GetSpacingProperties(const string& utf8_char,
00299                                          int* x_bearing, int* x_advance) const {
00300   // Convert to equivalent PangoFont structure
00301   PangoFont* font = ToPangoFont();
00302   // Find the glyph index in the font for the supplied utf8 character.
00303   int total_advance = 0;
00304   int min_bearing = 0;
00305   // Handle multi-unicode strings by reporting the left-most position of the
00306   // x-bearing, and right-most position of the x-advance if the string were to
00307   // be rendered.
00308   const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_char.c_str(),
00309                                                           utf8_char.length());
00310   const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_char.c_str(),
00311                                                       utf8_char.length());
00312   for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
00313     PangoGlyph glyph_index = pango_fc_font_get_glyph(
00314         reinterpret_cast<PangoFcFont*>(font), *it);
00315     if (!glyph_index) {
00316       // Glyph for given unicode character doesn't exist in font.
00317       return false;
00318     }
00319     // Find the ink glyph extents for the glyph
00320     PangoRectangle ink_rect, logical_rect;
00321     pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect);
00322     pango_extents_to_pixels(&ink_rect, NULL);
00323     pango_extents_to_pixels(&logical_rect, NULL);
00324 
00325     int bearing = total_advance + PANGO_LBEARING(ink_rect);
00326     if (it == it_begin || bearing < min_bearing) {
00327       min_bearing = bearing;
00328     }
00329     total_advance += PANGO_RBEARING(logical_rect);
00330   }
00331   *x_bearing = min_bearing;
00332   *x_advance = total_advance;
00333   return true;
00334 }
00335 
00336 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len) const {
00337   vector<string> graphemes;
00338   return CanRenderString(utf8_word, len, &graphemes);
00339 }
00340 
00341 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
00342                                     vector<string>* graphemes) const {
00343   if (graphemes) graphemes->clear();
00344   // We check for font coverage of the text first, as otherwise Pango could
00345   // (undesirably) fall back to another font that does have the required
00346   // coverage.
00347   if (!CoversUTF8Text(utf8_word, len)) {
00348     return false;
00349   }
00350   // U+25CC dotted circle character that often (but not always) gets rendered
00351   // when there is an illegal grapheme sequence.
00352   const char32 kDottedCircleGlyph = 9676;
00353   bool bad_glyph = false;
00354   PangoFontMap* font_map = pango_cairo_font_map_get_default();
00355   PangoContext* context = pango_context_new();
00356   pango_context_set_font_map(context, font_map);
00357   PangoLayout* layout = pango_layout_new(context);
00358   if (desc_) {
00359     pango_layout_set_font_description(layout, desc_);
00360   } else {
00361     PangoFontDescription *desc = pango_font_description_from_string(
00362         DescriptionName().c_str());
00363     pango_layout_set_font_description(layout, desc);
00364     pango_font_description_free(desc);
00365   }
00366   pango_layout_set_text(layout, utf8_word, len);
00367   PangoLayoutIter* run_iter = NULL;
00368   { // Fontconfig caches some information here that is not freed before exit.
00369     DISABLE_HEAP_LEAK_CHECK;
00370     run_iter = pango_layout_get_iter(layout);
00371   }
00372   do {
00373     PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
00374     if (!run) {
00375       tlog(2, "Found end of line NULL run marker\n");
00376       continue;
00377     }
00378     PangoGlyph dotted_circle_glyph;
00379     PangoFont* font = run->item->analysis.font;
00380     dotted_circle_glyph = pango_fc_font_get_glyph(
00381         reinterpret_cast<PangoFcFont*>(font), kDottedCircleGlyph);
00382     if (TLOG_IS_ON(2)) {
00383       PangoFontDescription* desc = pango_font_describe(font);
00384       char* desc_str = pango_font_description_to_string(desc);
00385       tlog(2, "Desc of font in run: %s\n", desc_str);
00386       g_free(desc_str);
00387       pango_font_description_free(desc);
00388     }
00389 
00390     PangoGlyphItemIter cluster_iter;
00391     gboolean have_cluster;
00392     for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
00393                                                          run, utf8_word);
00394          have_cluster && !bad_glyph;
00395          have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
00396       const int start_byte_index = cluster_iter.start_index;
00397       const int end_byte_index = cluster_iter.end_index;
00398       int start_glyph_index = cluster_iter.start_glyph;
00399       int end_glyph_index = cluster_iter.end_glyph;
00400       string cluster_text = string(utf8_word + start_byte_index,
00401                                    end_byte_index - start_byte_index);
00402       if (graphemes) graphemes->push_back(cluster_text);
00403       if (IsUTF8Whitespace(cluster_text.c_str())) {
00404         tlog(2, "Skipping whitespace\n");
00405         continue;
00406       }
00407       if (TLOG_IS_ON(2)) {
00408         printf("start_byte=%d end_byte=%d start_glyph=%d end_glyph=%d ",
00409                start_byte_index, end_byte_index,
00410                start_glyph_index, end_glyph_index);
00411       }
00412       for (int i = start_glyph_index,
00413                step = (end_glyph_index > start_glyph_index) ? 1 : -1;
00414            !bad_glyph && i != end_glyph_index; i+= step) {
00415         const bool unknown_glyph =
00416             (cluster_iter.glyph_item->glyphs->glyphs[i].glyph &
00417              PANGO_GLYPH_UNKNOWN_FLAG);
00418         const bool illegal_glyph =
00419             (cluster_iter.glyph_item->glyphs->glyphs[i].glyph ==
00420              dotted_circle_glyph);
00421         bad_glyph = unknown_glyph || illegal_glyph;
00422         if (TLOG_IS_ON(2)) {
00423           printf("(%d=%d)", cluster_iter.glyph_item->glyphs->glyphs[i].glyph,
00424                  bad_glyph ? 1 : 0);
00425         }
00426       }
00427       if (TLOG_IS_ON(2)) {
00428         printf("  '%s'\n", cluster_text.c_str());
00429       }
00430       if (bad_glyph)
00431         tlog(1, "Found illegal glyph!\n");
00432     }
00433   } while (!bad_glyph && pango_layout_iter_next_run(run_iter));
00434 
00435   pango_layout_iter_free(run_iter);
00436   g_object_unref(context);
00437   g_object_unref(layout);
00438   if (bad_glyph && graphemes) graphemes->clear();
00439   return !bad_glyph;
00440 }
00441 
00442 
00443 // ------------------------ FontUtils ------------------------------------
00444 
00445 // Returns whether the specified font description is available in the fonts
00446 // directory.
00447 //
00448 // The generated list of font families and faces includes "synthesized" font
00449 // faces that are not truly loadable. Pango versions >=1.18 have a
00450 // pango_font_face_is_synthesized method that can be used to prune the list.
00451 // Until then, we are restricted to using a hack where we try to load the font
00452 // from the font_map, and then check what we loaded to see if it has the
00453 // description we expected. If it is not, then the font is deemed unavailable.
00454 /* static */
00455 bool FontUtils::IsAvailableFont(const char* query_desc) {
00456   PangoFontDescription *desc = pango_font_description_from_string(query_desc);
00457   PangoFont* selected_font = NULL;
00458   {
00459     InitFontconfig();
00460     PangoFontMap* font_map = pango_cairo_font_map_get_default();
00461     PangoContext* context = pango_context_new();
00462     pango_context_set_font_map(context, font_map);
00463     {
00464       DISABLE_HEAP_LEAK_CHECK;
00465       selected_font = pango_font_map_load_font(font_map, context, desc);
00466     }
00467     g_object_unref(context);
00468   }
00469   PangoFontDescription* selected_desc = pango_font_describe(selected_font);
00470 
00471   bool equal = pango_font_description_equal(desc, selected_desc);
00472   tlog(3, "query weight = %d \t selected weight =%d\n",
00473        pango_font_description_get_weight(desc),
00474        pango_font_description_get_weight(selected_desc));
00475 
00476   char* selected_desc_str = pango_font_description_to_string(selected_desc);
00477   tlog(2, "query_desc: '%s' Selected: 's'\n", query_desc, selected_desc_str);
00478 
00479   g_free(selected_desc_str);
00480   pango_font_description_free(selected_desc);
00481   pango_font_description_free(desc);
00482   return equal;
00483 }
00484 
// Returns true if |query| is exactly (case-sensitively) one of the family
// names "Sans", "Serif" or "Monospace", which are skipped when listing fonts.
static bool ShouldIgnoreFontFamilyName(const char* query) {
  static const char* const kIgnoredFamilyNames[] = {
    "Sans", "Serif", "Monospace"
  };
  const size_t num_ignored =
      sizeof(kIgnoredFamilyNames) / sizeof(kIgnoredFamilyNames[0]);
  for (size_t i = 0; i < num_ignored; ++i) {
    if (strcmp(kIgnoredFamilyNames[i], query) == 0) {
      return true;
    }
  }
  return false;
}
00495 
00496 // Outputs description names of available fonts.
00497 /* static */
00498 const vector<string>& FontUtils::ListAvailableFonts() {
00499   static vector<string> available_fonts_;  // cache list
00500   if (available_fonts_.size()) {
00501     return available_fonts_;
00502   }
00503 #ifndef USE_STD_NAMESPACE
00504   if (FLAGS_use_only_legacy_fonts) {
00505     // Restrict view to list of fonts in legacy_fonts.h
00506     tprintf("Using list of legacy fonts only\n");
00507     const int kNumFontLists = 4;
00508     for (int i = 0; i < kNumFontLists; ++i) {
00509       for (int j = 0; kFontlists[i][j] != NULL; ++j) {
00510         available_fonts_.push_back(kFontlists[i][j]);
00511       }
00512     }
00513     return available_fonts_;
00514   }
00515 #endif
00516 
00517   PangoFontFamily** families = 0;
00518   int n_families = 0;
00519   ListFontFamilies(&families, &n_families);
00520   for (int i = 0; i < n_families; ++i) {
00521     const char* family_name = pango_font_family_get_name(families[i]);
00522     tlog(2, "Listing family %s\n", family_name);
00523     if (ShouldIgnoreFontFamilyName(family_name))
00524       continue;
00525 
00526     int n_faces;
00527     PangoFontFace** faces = NULL;
00528     pango_font_family_list_faces(families[i], &faces, &n_faces);
00529     for (int j = 0; j < n_faces; ++j) {
00530       PangoFontDescription* desc = pango_font_face_describe(faces[j]);
00531       char* desc_str = pango_font_description_to_string(desc);
00532       if (IsAvailableFont(desc_str)) {
00533         available_fonts_.push_back(desc_str);
00534       }
00535       pango_font_description_free(desc);
00536       g_free(desc_str);
00537     }
00538     g_free(faces);
00539   }
00540   g_free(families);
00541   sort(available_fonts_.begin(), available_fonts_.end());
00542   return available_fonts_;
00543 }
00544 
00545 
00546 static void CharCoverageMapToBitmap(PangoCoverage* coverage,
00547                                     vector<bool>* unichar_bitmap) {
00548   const int kMinUnicodeValue = 33;
00549   const int kMaxUnicodeValue = 0x10FFFF;
00550   unichar_bitmap->resize(kMaxUnicodeValue + 1, false);
00551   // Mark off characters that the font can render.
00552   for (int i = kMinUnicodeValue; i <= kMaxUnicodeValue; ++i) {
00553     if (IsInterchangeValid(i)) {
00554       (*unichar_bitmap)[i]
00555           = (pango_coverage_get(coverage, i) == PANGO_COVERAGE_EXACT);
00556     }
00557   }
00558 }
00559 
00560 /* static */
00561 void FontUtils::GetAllRenderableCharacters(vector<bool>* unichar_bitmap) {
00562   const vector<string>& all_fonts = ListAvailableFonts();
00563   return GetAllRenderableCharacters(all_fonts, unichar_bitmap);
00564 }
00565 
00566 /* static */
00567 void FontUtils::GetAllRenderableCharacters(const string& font_name,
00568                                            vector<bool>* unichar_bitmap) {
00569   PangoFontInfo font_info(font_name);
00570   PangoCoverage* coverage = pango_font_get_coverage(
00571       font_info.ToPangoFont(), NULL);
00572   CharCoverageMapToBitmap(coverage, unichar_bitmap);
00573 }
00574 
00575 /* static */
00576 void FontUtils::GetAllRenderableCharacters(const vector<string>& fonts,
00577                                            vector<bool>* unichar_bitmap) {
00578   // Form the union of coverage maps from the fonts
00579   PangoCoverage* all_coverage = pango_coverage_new();
00580   tlog(1, "Processing %d fonts\n", fonts.size());
00581   for (int i = 0; i < fonts.size(); ++i) {
00582     PangoFontInfo font_info(fonts[i]);
00583     PangoCoverage* coverage = pango_font_get_coverage(
00584         font_info.ToPangoFont(), NULL);
00585     // Mark off characters that any font can render.
00586     pango_coverage_max(all_coverage, coverage);
00587   }
00588   CharCoverageMapToBitmap(all_coverage, unichar_bitmap);
00589   pango_coverage_unref(all_coverage);
00590 }
00591 
00592 
00593 // Utilities written to be backward compatible with StringRender
00594 
00595 /* static */
00596 int FontUtils::FontScore(const unordered_map<char32, inT64>& ch_map,
00597                          const string& fontname,
00598                          int* raw_score,
00599                          vector<bool>* ch_flags) {
00600   PangoFontInfo font_info;
00601   if (!font_info.ParseFontDescriptionName(fontname)) {
00602     tprintf("ERROR: Could not parse %s\n", fontname.c_str());
00603   }
00604   PangoFont* font = font_info.ToPangoFont();
00605   PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
00606 
00607   if (ch_flags) {
00608     ch_flags->clear();
00609     ch_flags->reserve(ch_map.size());
00610   }
00611   *raw_score = 0;
00612   int ok_chars = 0;
00613   for (unordered_map<char32, inT64>::const_iterator it = ch_map.begin();
00614        it != ch_map.end(); ++it) {
00615     bool covered = (IsWhitespace(it->first) ||
00616                     (pango_coverage_get(coverage, it->first)
00617                      == PANGO_COVERAGE_EXACT));
00618     if (covered) {
00619       ++(*raw_score);
00620       ok_chars += it->second;
00621     }
00622     if (ch_flags) {
00623       ch_flags->push_back(covered);
00624     }
00625   }
00626   return ok_chars;
00627 }
00628 
00629 
00630 /* static */
00631 string FontUtils::BestFonts(const unordered_map<char32, inT64>& ch_map,
00632                             vector<pair<const char*, vector<bool> > >* fonts) {
00633   const double kMinOKFraction = 0.99;
00634   // Weighted fraction of characters that must be renderable in a font to make
00635   // it OK even if the raw count is not good.
00636   const double kMinWeightedFraction = 0.99995;
00637 
00638   fonts->clear();
00639   vector<vector<bool> > font_flags;
00640   vector<int> font_scores;
00641   vector<int> raw_scores;
00642   int most_ok_chars = 0;
00643   int best_raw_score = 0;
00644   const vector<string>& font_names = FontUtils::ListAvailableFonts();
00645   for (int i = 0; i < font_names.size(); ++i) {
00646     vector<bool> ch_flags;
00647     int raw_score = 0;
00648     int ok_chars = FontScore(ch_map, font_names[i], &raw_score, &ch_flags);
00649     most_ok_chars = MAX(ok_chars, most_ok_chars);
00650     best_raw_score = MAX(raw_score, best_raw_score);
00651 
00652     font_flags.push_back(ch_flags);
00653     font_scores.push_back(ok_chars);
00654     raw_scores.push_back(raw_score);
00655   }
00656 
00657   // Now select the fonts with a score above a threshold fraction
00658   // of both the raw and weighted best scores. To prevent bogus fonts being
00659   // selected for CJK, we require a high fraction (kMinOKFraction = 0.99) of
00660   // BOTH weighted and raw scores.
00661   // In low character-count scripts, the issue is more getting enough fonts,
00662   // when only 1 or 2 might have all those rare dingbats etc in them, so we
00663   // allow a font with a very high weighted (coverage) score
00664   // (kMinWeightedFraction = 0.99995) to be used even if its raw score is poor.
00665   int least_good_enough = static_cast<int>(most_ok_chars * kMinOKFraction);
00666   int least_raw_enough = static_cast<int>(best_raw_score * kMinOKFraction);
00667   int override_enough = static_cast<int>(most_ok_chars * kMinWeightedFraction);
00668 
00669   string font_list;
00670   for (int i = 0; i < font_names.size(); ++i) {
00671     int score = font_scores[i];
00672     int raw_score = raw_scores[i];
00673     if ((score >= least_good_enough && raw_score >= least_raw_enough) ||
00674         score >= override_enough) {
00675       fonts->push_back(make_pair(font_names[i].c_str(), font_flags[i]));
00676       tlog(1, "OK font %s = %.4f%%, raw = %d = %.2f%%\n",
00677            font_names[i].c_str(),
00678            100.0 * score / most_ok_chars,
00679            raw_score, 100.0 * raw_score / best_raw_score);
00680       font_list += font_names[i];
00681       font_list += "\n";
00682     } else if (score >= least_good_enough || raw_score >= least_raw_enough) {
00683       tlog(1, "Runner-up font %s = %.4f%%, raw = %d = %.2f%%\n",
00684            font_names[i].c_str(),
00685            100.0 * score / most_ok_chars,
00686            raw_score, 100.0 * raw_score / best_raw_score);
00687     }
00688   }
00689   return font_list;
00690 }
00691 
00692 /* static */
00693 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
00694                            string* font_name, vector<string>* graphemes) {
00695   return SelectFont(utf8_word, utf8_len, ListAvailableFonts(), font_name,
00696                     graphemes);
00697 }
00698 
00699 /* static */
00700 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
00701                            const vector<string>& all_fonts,
00702                            string* font_name, vector<string>* graphemes) {
00703   if (font_name) font_name->clear();
00704   if (graphemes) graphemes->clear();
00705   for (int i = 0; i < all_fonts.size(); ++i) {
00706     PangoFontInfo font;
00707     vector<string> found_graphemes;
00708     ASSERT_HOST_MSG(font.ParseFontDescriptionName(all_fonts[i]),
00709                     "Could not parse font desc name %s\n",
00710                     all_fonts[i].c_str());
00711     if (font.CanRenderString(utf8_word, utf8_len, &found_graphemes)) {
00712       if (graphemes) graphemes->swap(found_graphemes);
00713       if (font_name) *font_name = all_fonts[i];
00714       return true;
00715     }
00716   }
00717   return false;
00718 }
00719 
00720 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines