tesseract
3.03
|
00001 /********************************************************************** 00002 * File: pango_font_info.cpp 00003 * Description: Font-related objects and helper functions 00004 * Author: Ranjith Unnikrishnan 00005 * Created: Mon Nov 18 2013 00006 * 00007 * (C) Copyright 2013, Google Inc. 00008 * Licensed under the Apache License, Version 2.0 (the "License"); 00009 * you may not use this file except in compliance with the License. 00010 * You may obtain a copy of the License at 00011 * http://www.apache.org/licenses/LICENSE-2.0 00012 * Unless required by applicable law or agreed to in writing, software 00013 * distributed under the License is distributed on an "AS IS" BASIS, 00014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 * See the License for the specific language governing permissions and 00016 * limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 // Include automatically generated configuration file if running autoconf. 00021 #ifdef HAVE_CONFIG_H 00022 #include "config_auto.h" 00023 #endif 00024 00025 #ifdef MINGW 00026 // workaround for stdlib.h and putenv 00027 #undef __STRICT_ANSI__ 00028 #include "strcasestr.h" 00029 #endif // MINGW 00030 #include <stdlib.h> 00031 #include <stdio.h> 00032 #include <string.h> 00033 #include <sys/param.h> 00034 #include <algorithm> 00035 00036 #include "pango_font_info.h" 00037 #include "commandlineflags.h" 00038 #include "fileio.h" 00039 #include "normstrngs.h" 00040 #include "tlog.h" 00041 #include "unichar.h" 00042 #include "util.h" 00043 #include "pango/pango-context.h" 00044 #include "pango/pango-font.h" 00045 #include "pango/pango-glyph-item.h" 00046 #include "pango/pango-glyph.h" 00047 #include "pango/pango-layout.h" 00048 #include "pango/pango-utils.h" 00049 #include "pango/pangocairo.h" 00050 #include "pango/pangofc-font.h" 00051 00052 STRING_PARAM_FLAG(fonts_dir, "/auto/ocr-data/tesstraining/fonts", 00053 "Overrides system default font location"); 00054 STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", 00055 "Overrides fontconfig default temporary dir"); 00056 BOOL_PARAM_FLAG(fontconfig_refresh_cache, false, 00057 "Does a one-time deletion of cache files from the " 00058 "fontconfig_tmpdir before initializing fontconfig."); 00059 00060 #ifndef USE_STD_NAMESPACE 00061 #include "ocr/trainingdata/typesetting/legacy_fonts.h" 00062 BOOL_PARAM_FLAG(use_only_legacy_fonts, false, 00063 "Overrides --fonts_dir and sets the known universe of fonts to" 00064 "the list in legacy_fonts.h"); 00065 // Compatability with pango 1.20. 00066 #include "pango/pango-glyph-item-private.h" 00067 #define pango_glyph_item_iter_init_start _pango_glyph_item_iter_init_start 00068 #define pango_glyph_item_iter_next_cluster _pango_glyph_item_iter_next_cluster 00069 #else 00070 using std::pair; 00071 #endif 00072 00073 namespace tesseract { 00074 00075 // Default assumed output resolution. Required only for providing font metrics 00076 // in pixels. 00077 const int kDefaultResolution = 300; 00078 00079 PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) { 00080 Clear(); 00081 } 00082 00083 PangoFontInfo::PangoFontInfo(const string& desc) 00084 : desc_(NULL), resolution_(kDefaultResolution) { 00085 if (!ParseFontDescriptionName(desc)) { 00086 tprintf("ERROR: Could not parse %s\n", desc.c_str()); 00087 Clear(); 00088 } 00089 } 00090 00091 void PangoFontInfo::Clear() { 00092 font_size_ = 0; 00093 is_bold_ = false; 00094 is_italic_ = false; 00095 is_smallcaps_ = false; 00096 is_monospace_ = false; 00097 family_name_.clear(); 00098 font_type_ = UNKNOWN; 00099 if (desc_) { 00100 pango_font_description_free(desc_); 00101 desc_ = NULL; 00102 } 00103 } 00104 00105 string PangoFontInfo::DescriptionName() const { 00106 if (!desc_) return ""; 00107 char* desc_str = pango_font_description_to_string(desc_); 00108 string desc_name(desc_str); 00109 g_free(desc_str); 00110 return desc_name; 00111 } 00112 00113 // Initializes Fontconfig for use by writing a fake fonts.conf file into the 00114 // FLAGS_fontconfigs_tmpdir directory, that points to the supplied 00115 // FLAGS_fonts_dir, and then overrides the FONTCONFIG_PATH environment variable 00116 // to point to this fonts.conf file. 00117 static void InitFontconfig() { 00118 static bool init_fontconfig = false; 00119 if (init_fontconfig || FLAGS_fonts_dir.empty()) { 00120 init_fontconfig = true; 00121 return; 00122 } 00123 if (FLAGS_fontconfig_refresh_cache) { 00124 tprintf("Deleting cache files from %s\n", FLAGS_fontconfig_tmpdir.c_str()); 00125 File::DeleteMatchingFiles(File::JoinPath( 00126 FLAGS_fontconfig_tmpdir.c_str(), "*cache-2").c_str()); 00127 } 00128 tprintf("Initializing fontconfig\n"); 00129 string fonts_dir = File::JoinPath( 00130 FLAGS_fonts_dir.c_str(), "google3/ocr/trainingdata/typesetting/testdata"); 00131 const int MAX_FONTCONF_FILESIZE = 1024; 00132 char fonts_conf_template[MAX_FONTCONF_FILESIZE]; 00133 snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE, 00134 "<?xml version=\"1.0\"?>\n" 00135 "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n" 00136 "<fontconfig>\n" 00137 "<dir>%s</dir>\n" 00138 "<cachedir>%s</cachedir>\n" 00139 "<config></config>\n" 00140 "</fontconfig>", FLAGS_fonts_dir.c_str(), 00141 FLAGS_fontconfig_tmpdir.c_str()); 00142 string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(), 00143 "fonts.conf"); 00144 File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file); 00145 #ifdef _WIN32 00146 std::string env("FONTCONFIG_PATH="); 00147 env.append(FLAGS_fontconfig_tmpdir.c_str()); 00148 putenv(env.c_str()); 00149 putenv("LANG=en_US.utf8"); 00150 #else 00151 setenv("FONTCONFIG_PATH", FLAGS_fontconfig_tmpdir.c_str(), true); 00152 // Fix the locale so that the reported font names are consistent. 00153 setenv("LANG", "en_US.utf8", true); 00154 #endif // _WIN32 00155 init_fontconfig = true; 00156 } 00157 00158 static void ListFontFamilies(PangoFontFamily*** families, 00159 int* n_families) { 00160 InitFontconfig(); 00161 PangoFontMap* font_map = pango_cairo_font_map_get_default(); 00162 DISABLE_HEAP_LEAK_CHECK; 00163 pango_font_map_list_families(font_map, families, n_families); 00164 } 00165 00166 // Inspects whether a given font family is monospace. If the font is not 00167 // available, it cannot make a decision and returns false by default. 00168 static bool IsMonospaceFontFamily(const char* family_name) { 00169 PangoFontFamily** families = 0; 00170 int n_families = 0; 00171 bool is_monospace = false; 00172 ListFontFamilies(&families, &n_families); 00173 ASSERT_HOST(n_families > 0); 00174 bool found = false; 00175 for (int i = 0; i < n_families; ++i) { 00176 if (!strcasecmp(family_name, pango_font_family_get_name(families[i]))) { 00177 is_monospace = pango_font_family_is_monospace(families[i]); 00178 found = true; 00179 break; 00180 } 00181 } 00182 if (!found) { 00183 tlog(1, "Could not find monospace property of family %s\n", family_name); 00184 } 00185 g_free(families); 00186 return is_monospace; 00187 } 00188 00189 bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) { 00190 Clear(); 00191 const char* family = pango_font_description_get_family(desc); 00192 if (!family) { 00193 char* desc_str = pango_font_description_to_string(desc); 00194 tprintf("WARNING: Could not parse family name from description: '%s'\n", 00195 desc_str); 00196 g_free(desc_str); 00197 return false; 00198 } 00199 family_name_ = string(family); 00200 desc_ = pango_font_description_copy(desc); 00201 is_monospace_ = IsMonospaceFontFamily(family); 00202 00203 // Set font size in points 00204 font_size_ = pango_font_description_get_size(desc); 00205 if (!pango_font_description_get_size_is_absolute(desc)) { 00206 font_size_ /= PANGO_SCALE; 00207 } 00208 00209 PangoStyle style = pango_font_description_get_style(desc); 00210 is_italic_ = (PANGO_STYLE_ITALIC == style || 00211 PANGO_STYLE_OBLIQUE == style); 00212 is_smallcaps_ = (pango_font_description_get_variant(desc) 00213 == PANGO_VARIANT_SMALL_CAPS); 00214 00215 is_bold_ = (pango_font_description_get_weight(desc) >= PANGO_WEIGHT_BOLD); 00216 // We dont have a way to detect whether a font is of type Fraktur. The fonts 00217 // we currently use all have "Fraktur" in their family name, so we do a 00218 // fragile but functional check for that here. 00219 is_fraktur_ = (strcasestr(family, "Fraktur") != NULL); 00220 return true; 00221 } 00222 00223 bool PangoFontInfo::ParseFontDescriptionName(const string& name) { 00224 PangoFontDescription *desc = pango_font_description_from_string(name.c_str()); 00225 bool success = ParseFontDescription(desc); 00226 pango_font_description_free(desc); 00227 return success; 00228 } 00229 00230 // Returns the PangoFont structure corresponding to the closest available font 00231 // in the font map. Note that if the font is wholly missing, this could 00232 // correspond to a completely different font family and face. 00233 PangoFont* PangoFontInfo::ToPangoFont() const { 00234 InitFontconfig(); 00235 PangoFontMap* font_map = pango_cairo_font_map_get_default(); 00236 PangoContext* context = pango_context_new(); 00237 pango_cairo_context_set_resolution(context, resolution_); 00238 pango_context_set_font_map(context, font_map); 00239 PangoFont* font = NULL; 00240 { 00241 DISABLE_HEAP_LEAK_CHECK; 00242 font = pango_font_map_load_font(font_map, context, desc_); 00243 } 00244 g_object_unref(context); 00245 return font; 00246 } 00247 00248 bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const { 00249 PangoFont* font = ToPangoFont(); 00250 PangoCoverage* coverage = pango_font_get_coverage(font, NULL); 00251 for (UNICHAR::const_iterator it = UNICHAR::begin(utf8_text, byte_length); 00252 it != UNICHAR::end(utf8_text, byte_length); 00253 ++it) { 00254 if (IsWhitespace(*it) || pango_is_zero_width(*it)) 00255 continue; 00256 if (pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) { 00257 char tmp[5]; 00258 int len = it.get_utf8(tmp); 00259 tmp[len] = '\0'; 00260 tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it); 00261 return false; 00262 } 00263 } 00264 return true; 00265 } 00266 00267 int PangoFontInfo::DropUncoveredChars(string* utf8_text) const { 00268 PangoFont* font = ToPangoFont(); 00269 PangoCoverage* coverage = pango_font_get_coverage(font, NULL); 00270 int num_dropped_chars = 0; 00271 // Maintain two iterators that point into the string. For space efficiency, we 00272 // will repeatedly copy one covered UTF8 character from one to the other, and 00273 // at the end resize the string to the right length. 00274 char* out = const_cast<char*>(utf8_text->c_str()); 00275 const UNICHAR::const_iterator it_begin = 00276 UNICHAR::begin(utf8_text->c_str(), utf8_text->length()); 00277 const UNICHAR::const_iterator it_end = 00278 UNICHAR::end(utf8_text->c_str(), utf8_text->length()); 00279 for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) { 00280 if (!IsWhitespace(*it) && !pango_is_zero_width(*it) && 00281 pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) { 00282 if (TLOG_IS_ON(2)) { 00283 char tmp[5]; 00284 int len = it.get_utf8(tmp); 00285 tmp[len] = '\0'; 00286 tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it); 00287 } 00288 ++num_dropped_chars; 00289 continue; 00290 } 00291 strncpy(out, it.utf8_data(), it.utf8_len()); 00292 out += it.utf8_len(); 00293 } 00294 utf8_text->resize(out - utf8_text->c_str()); 00295 return num_dropped_chars; 00296 } 00297 00298 bool PangoFontInfo::GetSpacingProperties(const string& utf8_char, 00299 int* x_bearing, int* x_advance) const { 00300 // Convert to equivalent PangoFont structure 00301 PangoFont* font = ToPangoFont(); 00302 // Find the glyph index in the font for the supplied utf8 character. 00303 int total_advance = 0; 00304 int min_bearing = 0; 00305 // Handle multi-unicode strings by reporting the left-most position of the 00306 // x-bearing, and right-most position of the x-advance if the string were to 00307 // be rendered. 00308 const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_char.c_str(), 00309 utf8_char.length()); 00310 const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_char.c_str(), 00311 utf8_char.length()); 00312 for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) { 00313 PangoGlyph glyph_index = pango_fc_font_get_glyph( 00314 reinterpret_cast<PangoFcFont*>(font), *it); 00315 if (!glyph_index) { 00316 // Glyph for given unicode character doesn't exist in font. 00317 return false; 00318 } 00319 // Find the ink glyph extents for the glyph 00320 PangoRectangle ink_rect, logical_rect; 00321 pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect); 00322 pango_extents_to_pixels(&ink_rect, NULL); 00323 pango_extents_to_pixels(&logical_rect, NULL); 00324 00325 int bearing = total_advance + PANGO_LBEARING(ink_rect); 00326 if (it == it_begin || bearing < min_bearing) { 00327 min_bearing = bearing; 00328 } 00329 total_advance += PANGO_RBEARING(logical_rect); 00330 } 00331 *x_bearing = min_bearing; 00332 *x_advance = total_advance; 00333 return true; 00334 } 00335 00336 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len) const { 00337 vector<string> graphemes; 00338 return CanRenderString(utf8_word, len, &graphemes); 00339 } 00340 00341 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len, 00342 vector<string>* graphemes) const { 00343 if (graphemes) graphemes->clear(); 00344 // We check for font coverage of the text first, as otherwise Pango could 00345 // (undesirably) fall back to another font that does have the required 00346 // coverage. 00347 if (!CoversUTF8Text(utf8_word, len)) { 00348 return false; 00349 } 00350 // U+25CC dotted circle character that often (but not always) gets rendered 00351 // when there is an illegal grapheme sequence. 00352 const char32 kDottedCircleGlyph = 9676; 00353 bool bad_glyph = false; 00354 PangoFontMap* font_map = pango_cairo_font_map_get_default(); 00355 PangoContext* context = pango_context_new(); 00356 pango_context_set_font_map(context, font_map); 00357 PangoLayout* layout = pango_layout_new(context); 00358 if (desc_) { 00359 pango_layout_set_font_description(layout, desc_); 00360 } else { 00361 PangoFontDescription *desc = pango_font_description_from_string( 00362 DescriptionName().c_str()); 00363 pango_layout_set_font_description(layout, desc); 00364 pango_font_description_free(desc); 00365 } 00366 pango_layout_set_text(layout, utf8_word, len); 00367 PangoLayoutIter* run_iter = NULL; 00368 { // Fontconfig caches some information here that is not freed before exit. 00369 DISABLE_HEAP_LEAK_CHECK; 00370 run_iter = pango_layout_get_iter(layout); 00371 } 00372 do { 00373 PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter); 00374 if (!run) { 00375 tlog(2, "Found end of line NULL run marker\n"); 00376 continue; 00377 } 00378 PangoGlyph dotted_circle_glyph; 00379 PangoFont* font = run->item->analysis.font; 00380 dotted_circle_glyph = pango_fc_font_get_glyph( 00381 reinterpret_cast<PangoFcFont*>(font), kDottedCircleGlyph); 00382 if (TLOG_IS_ON(2)) { 00383 PangoFontDescription* desc = pango_font_describe(font); 00384 char* desc_str = pango_font_description_to_string(desc); 00385 tlog(2, "Desc of font in run: %s\n", desc_str); 00386 g_free(desc_str); 00387 pango_font_description_free(desc); 00388 } 00389 00390 PangoGlyphItemIter cluster_iter; 00391 gboolean have_cluster; 00392 for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter, 00393 run, utf8_word); 00394 have_cluster && !bad_glyph; 00395 have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) { 00396 const int start_byte_index = cluster_iter.start_index; 00397 const int end_byte_index = cluster_iter.end_index; 00398 int start_glyph_index = cluster_iter.start_glyph; 00399 int end_glyph_index = cluster_iter.end_glyph; 00400 string cluster_text = string(utf8_word + start_byte_index, 00401 end_byte_index - start_byte_index); 00402 if (graphemes) graphemes->push_back(cluster_text); 00403 if (IsUTF8Whitespace(cluster_text.c_str())) { 00404 tlog(2, "Skipping whitespace\n"); 00405 continue; 00406 } 00407 if (TLOG_IS_ON(2)) { 00408 printf("start_byte=%d end_byte=%d start_glyph=%d end_glyph=%d ", 00409 start_byte_index, end_byte_index, 00410 start_glyph_index, end_glyph_index); 00411 } 00412 for (int i = start_glyph_index, 00413 step = (end_glyph_index > start_glyph_index) ? 1 : -1; 00414 !bad_glyph && i != end_glyph_index; i+= step) { 00415 const bool unknown_glyph = 00416 (cluster_iter.glyph_item->glyphs->glyphs[i].glyph & 00417 PANGO_GLYPH_UNKNOWN_FLAG); 00418 const bool illegal_glyph = 00419 (cluster_iter.glyph_item->glyphs->glyphs[i].glyph == 00420 dotted_circle_glyph); 00421 bad_glyph = unknown_glyph || illegal_glyph; 00422 if (TLOG_IS_ON(2)) { 00423 printf("(%d=%d)", cluster_iter.glyph_item->glyphs->glyphs[i].glyph, 00424 bad_glyph ? 1 : 0); 00425 } 00426 } 00427 if (TLOG_IS_ON(2)) { 00428 printf(" '%s'\n", cluster_text.c_str()); 00429 } 00430 if (bad_glyph) 00431 tlog(1, "Found illegal glyph!\n"); 00432 } 00433 } while (!bad_glyph && pango_layout_iter_next_run(run_iter)); 00434 00435 pango_layout_iter_free(run_iter); 00436 g_object_unref(context); 00437 g_object_unref(layout); 00438 if (bad_glyph && graphemes) graphemes->clear(); 00439 return !bad_glyph; 00440 } 00441 00442 00443 // ------------------------ FontUtils ------------------------------------ 00444 00445 // Returns whether the specified font description is available in the fonts 00446 // directory. 00447 // 00448 // The generated list of font families and faces includes "synthesized" font 00449 // faces that are not truly loadable. Pango versions >=1.18 have a 00450 // pango_font_face_is_synthesized method that can be used to prune the list. 00451 // Until then, we are restricted to using a hack where we try to load the font 00452 // from the font_map, and then check what we loaded to see if it has the 00453 // description we expected. If it is not, then the font is deemed unavailable. 00454 /* static */ 00455 bool FontUtils::IsAvailableFont(const char* query_desc) { 00456 PangoFontDescription *desc = pango_font_description_from_string(query_desc); 00457 PangoFont* selected_font = NULL; 00458 { 00459 InitFontconfig(); 00460 PangoFontMap* font_map = pango_cairo_font_map_get_default(); 00461 PangoContext* context = pango_context_new(); 00462 pango_context_set_font_map(context, font_map); 00463 { 00464 DISABLE_HEAP_LEAK_CHECK; 00465 selected_font = pango_font_map_load_font(font_map, context, desc); 00466 } 00467 g_object_unref(context); 00468 } 00469 PangoFontDescription* selected_desc = pango_font_describe(selected_font); 00470 00471 bool equal = pango_font_description_equal(desc, selected_desc); 00472 tlog(3, "query weight = %d \t selected weight =%d\n", 00473 pango_font_description_get_weight(desc), 00474 pango_font_description_get_weight(selected_desc)); 00475 00476 char* selected_desc_str = pango_font_description_to_string(selected_desc); 00477 tlog(2, "query_desc: '%s' Selected: 's'\n", query_desc, selected_desc_str); 00478 00479 g_free(selected_desc_str); 00480 pango_font_description_free(selected_desc); 00481 pango_font_description_free(desc); 00482 return equal; 00483 } 00484 00485 static bool ShouldIgnoreFontFamilyName(const char* query) { 00486 static const char* kIgnoredFamilyNames[] 00487 = { "Sans", "Serif", "Monospace", NULL }; 00488 const char** list = kIgnoredFamilyNames; 00489 for (; *list != NULL; ++list) { 00490 if (!strcmp(*list, query)) 00491 return true; 00492 } 00493 return false; 00494 } 00495 00496 // Outputs description names of available fonts. 00497 /* static */ 00498 const vector<string>& FontUtils::ListAvailableFonts() { 00499 static vector<string> available_fonts_; // cache list 00500 if (available_fonts_.size()) { 00501 return available_fonts_; 00502 } 00503 #ifndef USE_STD_NAMESPACE 00504 if (FLAGS_use_only_legacy_fonts) { 00505 // Restrict view to list of fonts in legacy_fonts.h 00506 tprintf("Using list of legacy fonts only\n"); 00507 const int kNumFontLists = 4; 00508 for (int i = 0; i < kNumFontLists; ++i) { 00509 for (int j = 0; kFontlists[i][j] != NULL; ++j) { 00510 available_fonts_.push_back(kFontlists[i][j]); 00511 } 00512 } 00513 return available_fonts_; 00514 } 00515 #endif 00516 00517 PangoFontFamily** families = 0; 00518 int n_families = 0; 00519 ListFontFamilies(&families, &n_families); 00520 for (int i = 0; i < n_families; ++i) { 00521 const char* family_name = pango_font_family_get_name(families[i]); 00522 tlog(2, "Listing family %s\n", family_name); 00523 if (ShouldIgnoreFontFamilyName(family_name)) 00524 continue; 00525 00526 int n_faces; 00527 PangoFontFace** faces = NULL; 00528 pango_font_family_list_faces(families[i], &faces, &n_faces); 00529 for (int j = 0; j < n_faces; ++j) { 00530 PangoFontDescription* desc = pango_font_face_describe(faces[j]); 00531 char* desc_str = pango_font_description_to_string(desc); 00532 if (IsAvailableFont(desc_str)) { 00533 available_fonts_.push_back(desc_str); 00534 } 00535 pango_font_description_free(desc); 00536 g_free(desc_str); 00537 } 00538 g_free(faces); 00539 } 00540 g_free(families); 00541 sort(available_fonts_.begin(), available_fonts_.end()); 00542 return available_fonts_; 00543 } 00544 00545 00546 static void CharCoverageMapToBitmap(PangoCoverage* coverage, 00547 vector<bool>* unichar_bitmap) { 00548 const int kMinUnicodeValue = 33; 00549 const int kMaxUnicodeValue = 0x10FFFF; 00550 unichar_bitmap->resize(kMaxUnicodeValue + 1, false); 00551 // Mark off characters that the font can render. 00552 for (int i = kMinUnicodeValue; i <= kMaxUnicodeValue; ++i) { 00553 if (IsInterchangeValid(i)) { 00554 (*unichar_bitmap)[i] 00555 = (pango_coverage_get(coverage, i) == PANGO_COVERAGE_EXACT); 00556 } 00557 } 00558 } 00559 00560 /* static */ 00561 void FontUtils::GetAllRenderableCharacters(vector<bool>* unichar_bitmap) { 00562 const vector<string>& all_fonts = ListAvailableFonts(); 00563 return GetAllRenderableCharacters(all_fonts, unichar_bitmap); 00564 } 00565 00566 /* static */ 00567 void FontUtils::GetAllRenderableCharacters(const string& font_name, 00568 vector<bool>* unichar_bitmap) { 00569 PangoFontInfo font_info(font_name); 00570 PangoCoverage* coverage = pango_font_get_coverage( 00571 font_info.ToPangoFont(), NULL); 00572 CharCoverageMapToBitmap(coverage, unichar_bitmap); 00573 } 00574 00575 /* static */ 00576 void FontUtils::GetAllRenderableCharacters(const vector<string>& fonts, 00577 vector<bool>* unichar_bitmap) { 00578 // Form the union of coverage maps from the fonts 00579 PangoCoverage* all_coverage = pango_coverage_new(); 00580 tlog(1, "Processing %d fonts\n", fonts.size()); 00581 for (int i = 0; i < fonts.size(); ++i) { 00582 PangoFontInfo font_info(fonts[i]); 00583 PangoCoverage* coverage = pango_font_get_coverage( 00584 font_info.ToPangoFont(), NULL); 00585 // Mark off characters that any font can render. 00586 pango_coverage_max(all_coverage, coverage); 00587 } 00588 CharCoverageMapToBitmap(all_coverage, unichar_bitmap); 00589 pango_coverage_unref(all_coverage); 00590 } 00591 00592 00593 // Utilities written to be backward compatible with StringRender 00594 00595 /* static */ 00596 int FontUtils::FontScore(const unordered_map<char32, inT64>& ch_map, 00597 const string& fontname, 00598 int* raw_score, 00599 vector<bool>* ch_flags) { 00600 PangoFontInfo font_info; 00601 if (!font_info.ParseFontDescriptionName(fontname)) { 00602 tprintf("ERROR: Could not parse %s\n", fontname.c_str()); 00603 } 00604 PangoFont* font = font_info.ToPangoFont(); 00605 PangoCoverage* coverage = pango_font_get_coverage(font, NULL); 00606 00607 if (ch_flags) { 00608 ch_flags->clear(); 00609 ch_flags->reserve(ch_map.size()); 00610 } 00611 *raw_score = 0; 00612 int ok_chars = 0; 00613 for (unordered_map<char32, inT64>::const_iterator it = ch_map.begin(); 00614 it != ch_map.end(); ++it) { 00615 bool covered = (IsWhitespace(it->first) || 00616 (pango_coverage_get(coverage, it->first) 00617 == PANGO_COVERAGE_EXACT)); 00618 if (covered) { 00619 ++(*raw_score); 00620 ok_chars += it->second; 00621 } 00622 if (ch_flags) { 00623 ch_flags->push_back(covered); 00624 } 00625 } 00626 return ok_chars; 00627 } 00628 00629 00630 /* static */ 00631 string FontUtils::BestFonts(const unordered_map<char32, inT64>& ch_map, 00632 vector<pair<const char*, vector<bool> > >* fonts) { 00633 const double kMinOKFraction = 0.99; 00634 // Weighted fraction of characters that must be renderable in a font to make 00635 // it OK even if the raw count is not good. 00636 const double kMinWeightedFraction = 0.99995; 00637 00638 fonts->clear(); 00639 vector<vector<bool> > font_flags; 00640 vector<int> font_scores; 00641 vector<int> raw_scores; 00642 int most_ok_chars = 0; 00643 int best_raw_score = 0; 00644 const vector<string>& font_names = FontUtils::ListAvailableFonts(); 00645 for (int i = 0; i < font_names.size(); ++i) { 00646 vector<bool> ch_flags; 00647 int raw_score = 0; 00648 int ok_chars = FontScore(ch_map, font_names[i], &raw_score, &ch_flags); 00649 most_ok_chars = MAX(ok_chars, most_ok_chars); 00650 best_raw_score = MAX(raw_score, best_raw_score); 00651 00652 font_flags.push_back(ch_flags); 00653 font_scores.push_back(ok_chars); 00654 raw_scores.push_back(raw_score); 00655 } 00656 00657 // Now select the fonts with a score above a threshold fraction 00658 // of both the raw and weighted best scores. To prevent bogus fonts being 00659 // selected for CJK, we require a high fraction (kMinOKFraction = 0.99) of 00660 // BOTH weighted and raw scores. 00661 // In low character-count scripts, the issue is more getting enough fonts, 00662 // when only 1 or 2 might have all those rare dingbats etc in them, so we 00663 // allow a font with a very high weighted (coverage) score 00664 // (kMinWeightedFraction = 0.99995) to be used even if its raw score is poor. 00665 int least_good_enough = static_cast<int>(most_ok_chars * kMinOKFraction); 00666 int least_raw_enough = static_cast<int>(best_raw_score * kMinOKFraction); 00667 int override_enough = static_cast<int>(most_ok_chars * kMinWeightedFraction); 00668 00669 string font_list; 00670 for (int i = 0; i < font_names.size(); ++i) { 00671 int score = font_scores[i]; 00672 int raw_score = raw_scores[i]; 00673 if ((score >= least_good_enough && raw_score >= least_raw_enough) || 00674 score >= override_enough) { 00675 fonts->push_back(make_pair(font_names[i].c_str(), font_flags[i])); 00676 tlog(1, "OK font %s = %.4f%%, raw = %d = %.2f%%\n", 00677 font_names[i].c_str(), 00678 100.0 * score / most_ok_chars, 00679 raw_score, 100.0 * raw_score / best_raw_score); 00680 font_list += font_names[i]; 00681 font_list += "\n"; 00682 } else if (score >= least_good_enough || raw_score >= least_raw_enough) { 00683 tlog(1, "Runner-up font %s = %.4f%%, raw = %d = %.2f%%\n", 00684 font_names[i].c_str(), 00685 100.0 * score / most_ok_chars, 00686 raw_score, 100.0 * raw_score / best_raw_score); 00687 } 00688 } 00689 return font_list; 00690 } 00691 00692 /* static */ 00693 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len, 00694 string* font_name, vector<string>* graphemes) { 00695 return SelectFont(utf8_word, utf8_len, ListAvailableFonts(), font_name, 00696 graphemes); 00697 } 00698 00699 /* static */ 00700 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len, 00701 const vector<string>& all_fonts, 00702 string* font_name, vector<string>* graphemes) { 00703 if (font_name) font_name->clear(); 00704 if (graphemes) graphemes->clear(); 00705 for (int i = 0; i < all_fonts.size(); ++i) { 00706 PangoFontInfo font; 00707 vector<string> found_graphemes; 00708 ASSERT_HOST_MSG(font.ParseFontDescriptionName(all_fonts[i]), 00709 "Could not parse font desc name %s\n", 00710 all_fonts[i].c_str()); 00711 if (font.CanRenderString(utf8_word, utf8_len, &found_graphemes)) { 00712 if (graphemes) graphemes->swap(found_graphemes); 00713 if (font_name) *font_name = all_fonts[i]; 00714 return true; 00715 } 00716 } 00717 return false; 00718 } 00719 00720 } // namespace tesseract