tesseract
3.03
|
#include <ligature_table.h>
Public Member Functions | |
string | AddLigatures (const string &str, const PangoFontInfo *font) const |
string | RemoveLigatures (const string &str) const |
string | RemoveCustomLigatures (const string &str) const |
const LigHash & | norm_to_lig_table () const |
const LigHash & | lig_to_norm_table () const |
Static Public Member Functions | |
static LigatureTable * | Get () |
Protected Member Functions | |
LigatureTable () | |
void | Init () |
Protected Attributes | |
LigHash | norm_to_lig_table_ |
LigHash | lig_to_norm_table_ |
int | min_lig_length_ |
int | max_lig_length_ |
int | min_norm_length_ |
int | max_norm_length_ |
Static Protected Attributes | |
static SmartPtr< LigatureTable > | instance_ |
Definition at line 37 of file ligature_table.h.
tesseract::LigatureTable::LigatureTable() [protected]
Definition at line 60 of file ligature_table.cpp.
// All four length bounds start at 0; Init() treats a 0 minimum as "not yet
// set" and updates the bounds as it adds entries to the tables.
: min_lig_length_(0), max_lig_length_(0), min_norm_length_(0), max_norm_length_(0) {}
string tesseract::LigatureTable::AddLigatures(const string &str, const PangoFontInfo *font) const
Definition at line 156 of file ligature_table.cpp.
{
  // Returns a copy of str in which normalized character sequences found in
  // norm_to_lig_table_ are replaced by their ligature form. At each position
  // the longest candidate (max_norm_length_ bytes) is tried first, down to
  // min_norm_length_ bytes.
  //
  //   str:  input text to scan.
  //   font: optional; when non-NULL a ligature is substituted only if
  //         font->CanRenderString() accepts it, otherwise shorter candidates
  //         at the same position are tried.

  // The length bounds stay at 0 until a table entry has been registered.
  // Scanning with a zero minimum would walk i up to len, append str[len]
  // (the terminating NUL) and then call substr() past the end of the string,
  // which throws std::out_of_range. Nothing can be substituted in that state,
  // so hand the input back unchanged.
  if (min_norm_length_ <= 0) return str;

  string result;
  const int len = static_cast<int>(str.size());
  int i = 0;
  int step = 0;
  for (i = 0; i < len - min_norm_length_ + 1; i += step) {
    step = 0;
    for (int liglen = max_norm_length_; liglen >= min_norm_length_; --liglen) {
      // Candidate would run past the end of the input; try a shorter one.
      if (i + liglen > len) continue;

      const string lig_cand = str.substr(i, liglen);
      LigHash::const_iterator it = norm_to_lig_table_.find(lig_cand);
      if (it == norm_to_lig_table_.end()) continue;

      tlog(3, "Considering %s -> %s\n", lig_cand.c_str(), it->second.c_str());
      if (font) {
        // Test for renderability.
        if (!font->CanRenderString(it->second.data(), it->second.length()))
          continue;  // Not renderable
      }
      // Found a match so convert it.
      step = liglen;
      result += it->second;
      tlog(2, "Substituted %s -> %s\n", lig_cand.c_str(), it->second.c_str());
      break;
    }
    if (step == 0) {
      // No ligature starts here: copy one byte through and move on.
      result += str[i];
      step = 1;
    }
  }
  // Copy the tail that is too short to hold even the shortest candidate.
  result += str.substr(i, len - i);
  return result;
}
LigatureTable * tesseract::LigatureTable::Get() [static]
Definition at line 52 of file ligature_table.cpp.
void tesseract::LigatureTable::Init() [protected]
Definition at line 63 of file ligature_table.cpp.
{
  // Fills norm_to_lig_table_ and lig_to_norm_table_ and records the byte
  // length bounds of their keys. Does nothing when norm_to_lig_table_ already
  // holds entries.
  if (!norm_to_lig_table_.empty()) return;

  for (char32 code = kMinLigature; code <= kMaxLigature; ++code) {
    // For each char in the range, convert to utf8, nfkc normalize, and if
    // the strings are different put both mappings in the hash_maps.
    const string lig_utf8 = EncodeAsUTF8(code);
    icu::UnicodeString lig_ustr(static_cast<UChar32>(code));
    icu::UnicodeString norm_ustr;
    icu::ErrorCode status;
    icu::Normalizer::normalize(lig_ustr, UNORM_NFKC, 0, norm_ustr, status);
    string norm_utf8;
    norm_ustr.toUTF8String(norm_utf8);

    // The icu::Normalizer maps the "LONG S T" ligature to "st". Correct that
    // here manually so that AddLigatures() will work as desired.
    if (lig_utf8 == "\uFB05") norm_utf8 = "ſt";

    const int lig_bytes = lig_utf8.length();
    const int norm_bytes = norm_utf8.size();

    // Only keep code points whose normalized form differs and where both
    // forms are longer than one byte.
    if (norm_utf8 == lig_utf8 || lig_bytes <= 1 || norm_bytes <= 1) continue;

    norm_to_lig_table_[norm_utf8] = lig_utf8;
    lig_to_norm_table_[lig_utf8] = norm_utf8;

    // A minimum of 0 means "not set yet", so the first entry always wins.
    if (min_lig_length_ == 0 || lig_bytes < min_lig_length_)
      min_lig_length_ = lig_bytes;
    if (max_lig_length_ < lig_bytes)
      max_lig_length_ = lig_bytes;
    if (min_norm_length_ == 0 || norm_bytes < min_norm_length_)
      min_norm_length_ = norm_bytes;
    if (max_norm_length_ < norm_bytes)
      max_norm_length_ = norm_bytes;
  }

  // Add custom extra ligatures. The list ends at the first entry whose
  // normalized form is NULL. Only the normalized-length bounds are updated
  // for these entries.
  for (int idx = 0; UNICHARSET::kCustomLigatures[idx][0] != NULL; ++idx) {
    const char* const norm_form = UNICHARSET::kCustomLigatures[idx][0];
    const char* const lig_form = UNICHARSET::kCustomLigatures[idx][1];

    norm_to_lig_table_[norm_form] = lig_form;
    lig_to_norm_table_[lig_form] = norm_form;

    const int norm_bytes = strlen(norm_form);
    if (min_norm_length_ == 0 || norm_bytes < min_norm_length_)
      min_norm_length_ = norm_bytes;
    if (max_norm_length_ < norm_bytes)
      max_norm_length_ = norm_bytes;
  }
}
const LigHash& tesseract::LigatureTable::lig_to_norm_table() const [inline]
Definition at line 54 of file ligature_table.h.
// Read-only access to the table that maps a ligature to its normalized form.
{ return lig_to_norm_table_; }
const LigHash& tesseract::LigatureTable::norm_to_lig_table() const [inline]
Definition at line 51 of file ligature_table.h.
// Read-only access to the table that maps a normalized form to its ligature.
{ return norm_to_lig_table_; }
string tesseract::LigatureTable::RemoveCustomLigatures(const string &str) const
Definition at line 130 of file ligature_table.cpp.
{
  // Returns a copy of str in which every character that equals a custom
  // ligature (UNICHARSET::kCustomLigatures[k][1]) is replaced by that entry's
  // normalized form (kCustomLigatures[k][0]). Other characters are copied
  // through unchanged.
  string output;
  UNICHAR::const_iterator first = UNICHAR::begin(str.c_str(), str.length());
  UNICHAR::const_iterator last = UNICHAR::end(str.c_str(), str.length());
  // Holds one character's UTF-8 bytes plus a terminating NUL.
  // NOTE(review): assumes get_utf8() writes at most 4 bytes - confirm.
  char utf8[5];
  for (UNICHAR::const_iterator ch = first; ch != last; ++ch) {
    const int num_bytes = ch.get_utf8(utf8);
    utf8[num_bytes] = '\0';

    // Default to the character itself; the first matching table entry, if
    // any, overrides it.
    const char* replacement = utf8;
    for (int k = 0; UNICHARSET::kCustomLigatures[k][0] != NULL; ++k) {
      if (strcmp(utf8, UNICHARSET::kCustomLigatures[k][1]) == 0) {
        replacement = UNICHARSET::kCustomLigatures[k][0];
        break;
      }
    }
    output += replacement;
  }
  return output;
}
string tesseract::LigatureTable::RemoveLigatures(const string &str) const
Definition at line 111 of file ligature_table.cpp.
{
  // Returns a copy of str in which every character that has an entry in
  // lig_to_norm_table_ is replaced by the mapped normalized string. Other
  // characters are copied through unchanged.
  string output;
  UNICHAR::const_iterator cur = UNICHAR::begin(str.c_str(), str.length());
  UNICHAR::const_iterator stop = UNICHAR::end(str.c_str(), str.length());
  // Holds one character's UTF-8 bytes plus a terminating NUL.
  // NOTE(review): assumes get_utf8() writes at most 4 bytes - confirm.
  char utf8[5];
  while (cur != stop) {
    const int num_bytes = cur.get_utf8(utf8);
    utf8[num_bytes] = '\0';

    LigHash::const_iterator entry = lig_to_norm_table_.find(utf8);
    if (entry == lig_to_norm_table_.end()) {
      output += utf8;
    } else {
      output += entry->second;
    }
    ++cur;
  }
  return output;
}
SmartPtr< LigatureTable > tesseract::LigatureTable::instance_ [static, protected] |
Definition at line 64 of file ligature_table.h.
LigHash tesseract::LigatureTable::lig_to_norm_table_ [protected] |
Definition at line 66 of file ligature_table.h.
int tesseract::LigatureTable::max_lig_length_ [protected] |
Definition at line 68 of file ligature_table.h.
int tesseract::LigatureTable::max_norm_length_ [protected] |
Definition at line 70 of file ligature_table.h.
int tesseract::LigatureTable::min_lig_length_ [protected] |
Definition at line 67 of file ligature_table.h.
int tesseract::LigatureTable::min_norm_length_ [protected] |
Definition at line 69 of file ligature_table.h.
LigHash tesseract::LigatureTable::norm_to_lig_table_ [protected] |
Definition at line 65 of file ligature_table.h.