tesseract
3.03
|
00001 /********************************************************************** 00002 * File: ligature_table.cpp 00003 * Description: Class for adding and removing optional latin ligatures, 00004 * conditional on codepoint support by a specified font 00005 * (if specified). 00006 * Author: Ranjith Unnikrishnan 00007 * Created: Mon Nov 18 2013 00008 * 00009 * (C) Copyright 2013, Google Inc. 00010 * Licensed under the Apache License, Version 2.0 (the "License"); 00011 * you may not use this file except in compliance with the License. 00012 * You may obtain a copy of the License at 00013 * http://www.apache.org/licenses/LICENSE-2.0 00014 * Unless required by applicable law or agreed to in writing, software 00015 * distributed under the License is distributed on an "AS IS" BASIS, 00016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00017 * See the License for the specific language governing permissions and 00018 * limitations under the License. 00019 * 00020 **********************************************************************/ 00021 00022 #include "ligature_table.h" 00023 00024 #include <utility> 00025 00026 #include "pango_font_info.h" 00027 #include "tlog.h" 00028 #include "unichar.h" 00029 #include "unicharset.h" 00030 #include "unicode/errorcode.h" // from libicu 00031 #include "unicode/normlzr.h" // from libicu 00032 #include "unicode/unistr.h" // from libicu 00033 #include "unicode/utypes.h" // from libicu 00034 00035 namespace tesseract { 00036 00037 static string EncodeAsUTF8(const char32 ch32) { 00038 UNICHAR uni_ch(ch32); 00039 return string(uni_ch.utf8(), uni_ch.utf8_len()); 00040 } 00041 00042 // Range of optional latin ligature characters in Unicode to build ligatures 00043 // from. Note that this range does not contain the custom ligatures that we 00044 // encode in the private use area. 00045 const int kMinLigature = 0xfb00; 00046 const int kMaxLigature = 0xfb4f; 00047 00048 /* static */ 00049 SmartPtr<LigatureTable> LigatureTable::instance_; 00050 00051 /* static */ 00052 LigatureTable* LigatureTable::Get() { 00053 if (instance_ == NULL) { 00054 instance_.reset(new LigatureTable()); 00055 instance_->Init(); 00056 } 00057 return instance_.get(); 00058 } 00059 00060 LigatureTable::LigatureTable() : min_lig_length_(0), max_lig_length_(0), 00061 min_norm_length_(0), max_norm_length_(0) {} 00062 00063 void LigatureTable::Init() { 00064 if (norm_to_lig_table_.empty()) { 00065 for (char32 lig = kMinLigature; lig <= kMaxLigature; ++lig) { 00066 // For each char in the range, convert to utf8, nfkc normalize, and if 00067 // the strings are different put the both mappings in the hash_maps. 00068 string lig8 = EncodeAsUTF8(lig); 00069 icu::UnicodeString unicode_lig8(static_cast<UChar32>(lig)); 00070 icu::UnicodeString normed8_result; 00071 icu::ErrorCode status; 00072 icu::Normalizer::normalize(unicode_lig8, UNORM_NFKC, 0, normed8_result, 00073 status); 00074 string normed8; 00075 normed8_result.toUTF8String(normed8); 00076 // The icu::Normalizer maps the "LONG S T" ligature to "st". Correct that 00077 // here manually so that AddLigatures() will work as desired. 00078 if (lig8 == "\uFB05") 00079 normed8 = "ſt"; 00080 int lig_length = lig8.length(); 00081 int norm_length = normed8.size(); 00082 if (normed8 != lig8 && lig_length > 1 && norm_length > 1) { 00083 norm_to_lig_table_[normed8] = lig8; 00084 lig_to_norm_table_[lig8] = normed8; 00085 if (min_lig_length_ == 0 || lig_length < min_lig_length_) 00086 min_lig_length_ = lig_length; 00087 if (lig_length > max_lig_length_) 00088 max_lig_length_ = lig_length; 00089 if (min_norm_length_ == 0 || norm_length < min_norm_length_) 00090 min_norm_length_ = norm_length; 00091 if (norm_length > max_norm_length_) 00092 max_norm_length_ = norm_length; 00093 } 00094 } 00095 // Add custom extra ligatures. 00096 for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) { 00097 norm_to_lig_table_[UNICHARSET::kCustomLigatures[i][0]] = 00098 UNICHARSET::kCustomLigatures[i][1]; 00099 int norm_length = strlen(UNICHARSET::kCustomLigatures[i][0]); 00100 if (min_norm_length_ == 0 || norm_length < min_norm_length_) 00101 min_norm_length_ = norm_length; 00102 if (norm_length > max_norm_length_) 00103 max_norm_length_ = norm_length; 00104 00105 lig_to_norm_table_[UNICHARSET::kCustomLigatures[i][1]] = 00106 UNICHARSET::kCustomLigatures[i][0]; 00107 } 00108 } 00109 } 00110 00111 string LigatureTable::RemoveLigatures(const string& str) const { 00112 string result; 00113 UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length()); 00114 UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length()); 00115 char tmp[5]; 00116 int len; 00117 for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) { 00118 len = it.get_utf8(tmp); 00119 tmp[len] = '\0'; 00120 LigHash::const_iterator lig_it = lig_to_norm_table_.find(tmp); 00121 if (lig_it != lig_to_norm_table_.end()) { 00122 result += lig_it->second; 00123 } else { 00124 result += tmp; 00125 } 00126 } 00127 return result; 00128 } 00129 00130 string LigatureTable::RemoveCustomLigatures(const string& str) const { 00131 string result; 00132 UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length()); 00133 UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length()); 00134 char tmp[5]; 00135 int len; 00136 int norm_ind; 00137 for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) { 00138 len = it.get_utf8(tmp); 00139 tmp[len] = '\0'; 00140 norm_ind = -1; 00141 for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL && norm_ind < 0; 00142 ++i) { 00143 if (!strcmp(tmp, UNICHARSET::kCustomLigatures[i][1])) { 00144 norm_ind = i; 00145 } 00146 } 00147 if (norm_ind >= 0) { 00148 result += UNICHARSET::kCustomLigatures[norm_ind][0]; 00149 } else { 00150 result += tmp; 00151 } 00152 } 00153 return result; 00154 } 00155 00156 string LigatureTable::AddLigatures(const string& str, 00157 const PangoFontInfo* font) const { 00158 string result; 00159 int len = str.size(); 00160 int step = 0; 00161 int i = 0; 00162 for (i = 0; i < len - min_norm_length_ + 1; i += step) { 00163 step = 0; 00164 for (int liglen = max_norm_length_; liglen >= min_norm_length_; --liglen) { 00165 if (i + liglen <= len) { 00166 string lig_cand = str.substr(i, liglen); 00167 LigHash::const_iterator it = norm_to_lig_table_.find(lig_cand); 00168 if (it != norm_to_lig_table_.end()) { 00169 tlog(3, "Considering %s -> %s\n", lig_cand.c_str(), 00170 it->second.c_str()); 00171 if (font) { 00172 // Test for renderability. 00173 if (!font->CanRenderString(it->second.data(), it->second.length())) 00174 continue; // Not renderable 00175 } 00176 // Found a match so convert it. 00177 step = liglen; 00178 result += it->second; 00179 tlog(2, "Substituted %s -> %s\n", lig_cand.c_str(), 00180 it->second.c_str()); 00181 break; 00182 } 00183 } 00184 } 00185 if (step == 0) { 00186 result += str[i]; 00187 step = 1; 00188 } 00189 } 00190 result += str.substr(i, len - i); 00191 return result; 00192 } 00193 00194 } // namespace tesseract