tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/training/ligature_table.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        ligature_table.cpp
00003  * Description: Class for adding and removing optional latin ligatures,
00004  *              conditional on codepoint support by a specified font
00005  *              (if specified).
00006  * Author:      Ranjith Unnikrishnan
00007  * Created:     Mon Nov 18 2013
00008  *
00009  * (C) Copyright 2013, Google Inc.
00010  * Licensed under the Apache License, Version 2.0 (the "License");
00011  * you may not use this file except in compliance with the License.
00012  * You may obtain a copy of the License at
00013  * http://www.apache.org/licenses/LICENSE-2.0
00014  * Unless required by applicable law or agreed to in writing, software
00015  * distributed under the License is distributed on an "AS IS" BASIS,
00016  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00017  * See the License for the specific language governing permissions and
00018  * limitations under the License.
00019  *
00020  **********************************************************************/
00021 
00022 #include "ligature_table.h"
00023 
00024 #include <utility>
00025 
00026 #include "pango_font_info.h"
00027 #include "tlog.h"
00028 #include "unichar.h"
00029 #include "unicharset.h"
00030 #include "unicode/errorcode.h"  // from libicu
00031 #include "unicode/normlzr.h"    // from libicu
00032 #include "unicode/unistr.h"     // from libicu
00033 #include "unicode/utypes.h"     // from libicu
00034 
00035 namespace tesseract {
00036 
00037 static string EncodeAsUTF8(const char32 ch32) {
00038   UNICHAR uni_ch(ch32);
00039   return string(uni_ch.utf8(), uni_ch.utf8_len());
00040 }
00041 
// Inclusive bounds of the Unicode code points that are scanned for optional
// ligatures when the lookup tables are built. The custom ligatures that we
// encode in the private use area lie outside this range.
// NOTE(review): U+FB00..U+FB4F appears to span the whole Alphabetic
// Presentation Forms block, not only the Latin ligatures - confirm that the
// wider range is intended.
const int kMinLigature = 0xFB00;
const int kMaxLigature = 0xFB4F;
00047 
00048 /* static */
00049 SmartPtr<LigatureTable> LigatureTable::instance_;
00050 
00051 /* static */
00052 LigatureTable* LigatureTable::Get() {
00053   if (instance_ == NULL) {
00054     instance_.reset(new LigatureTable());
00055     instance_->Init();
00056   }
00057   return instance_.get();
00058 }
00059 
00060 LigatureTable::LigatureTable() : min_lig_length_(0), max_lig_length_(0),
00061                                  min_norm_length_(0), max_norm_length_(0) {}
00062 
00063 void LigatureTable::Init() {
00064   if (norm_to_lig_table_.empty()) {
00065     for (char32 lig = kMinLigature; lig <= kMaxLigature; ++lig) {
00066       // For each char in the range, convert to utf8, nfkc normalize, and if
00067       // the strings are different put the both mappings in the hash_maps.
00068       string lig8 = EncodeAsUTF8(lig);
00069       icu::UnicodeString unicode_lig8(static_cast<UChar32>(lig));
00070       icu::UnicodeString normed8_result;
00071       icu::ErrorCode status;
00072       icu::Normalizer::normalize(unicode_lig8, UNORM_NFKC, 0, normed8_result,
00073                                  status);
00074       string normed8;
00075       normed8_result.toUTF8String(normed8);
00076       // The icu::Normalizer maps the "LONG S T" ligature to "st". Correct that
00077       // here manually so that AddLigatures() will work as desired.
00078       if (lig8 == "\uFB05")
00079         normed8 = "ſt";
00080       int lig_length = lig8.length();
00081       int norm_length = normed8.size();
00082       if (normed8 != lig8 && lig_length > 1 && norm_length > 1) {
00083         norm_to_lig_table_[normed8] = lig8;
00084         lig_to_norm_table_[lig8] = normed8;
00085         if (min_lig_length_ == 0 || lig_length < min_lig_length_)
00086           min_lig_length_ = lig_length;
00087         if (lig_length > max_lig_length_)
00088           max_lig_length_ = lig_length;
00089         if (min_norm_length_ == 0 || norm_length < min_norm_length_)
00090           min_norm_length_ = norm_length;
00091         if (norm_length > max_norm_length_)
00092           max_norm_length_ = norm_length;
00093       }
00094     }
00095     // Add custom extra ligatures.
00096     for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) {
00097       norm_to_lig_table_[UNICHARSET::kCustomLigatures[i][0]] =
00098           UNICHARSET::kCustomLigatures[i][1];
00099       int norm_length = strlen(UNICHARSET::kCustomLigatures[i][0]);
00100       if (min_norm_length_ == 0 || norm_length < min_norm_length_)
00101         min_norm_length_ = norm_length;
00102       if (norm_length > max_norm_length_)
00103         max_norm_length_ = norm_length;
00104 
00105       lig_to_norm_table_[UNICHARSET::kCustomLigatures[i][1]] =
00106           UNICHARSET::kCustomLigatures[i][0];
00107     }
00108   }
00109 }
00110 
00111 string LigatureTable::RemoveLigatures(const string& str) const {
00112   string result;
00113   UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length());
00114   UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
00115   char tmp[5];
00116   int len;
00117   for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
00118     len = it.get_utf8(tmp);
00119     tmp[len] = '\0';
00120     LigHash::const_iterator lig_it = lig_to_norm_table_.find(tmp);
00121     if (lig_it != lig_to_norm_table_.end()) {
00122       result += lig_it->second;
00123     } else {
00124       result += tmp;
00125     }
00126   }
00127   return result;
00128 }
00129 
00130 string LigatureTable::RemoveCustomLigatures(const string& str) const {
00131   string result;
00132   UNICHAR::const_iterator it_begin = UNICHAR::begin(str.c_str(), str.length());
00133   UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
00134   char tmp[5];
00135   int len;
00136   int norm_ind;
00137   for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
00138     len = it.get_utf8(tmp);
00139     tmp[len] = '\0';
00140     norm_ind = -1;
00141     for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL && norm_ind < 0;
00142          ++i) {
00143       if (!strcmp(tmp, UNICHARSET::kCustomLigatures[i][1])) {
00144         norm_ind = i;
00145       }
00146     }
00147     if (norm_ind >= 0) {
00148       result += UNICHARSET::kCustomLigatures[norm_ind][0];
00149     } else {
00150       result += tmp;
00151     }
00152   }
00153   return result;
00154 }
00155 
00156 string LigatureTable::AddLigatures(const string& str,
00157                                    const PangoFontInfo* font) const {
00158   string result;
00159   int len = str.size();
00160   int step = 0;
00161   int i = 0;
00162   for (i = 0; i < len - min_norm_length_ + 1; i += step) {
00163     step = 0;
00164     for (int liglen = max_norm_length_; liglen >= min_norm_length_; --liglen) {
00165       if (i + liglen <= len) {
00166         string lig_cand = str.substr(i, liglen);
00167         LigHash::const_iterator it = norm_to_lig_table_.find(lig_cand);
00168         if (it != norm_to_lig_table_.end()) {
00169           tlog(3, "Considering %s -> %s\n", lig_cand.c_str(),
00170                it->second.c_str());
00171           if (font) {
00172             // Test for renderability.
00173             if (!font->CanRenderString(it->second.data(), it->second.length()))
00174               continue;  // Not renderable
00175           }
00176           // Found a match so convert it.
00177           step = liglen;
00178           result += it->second;
00179           tlog(2, "Substituted %s -> %s\n", lig_cand.c_str(),
00180                it->second.c_str());
00181           break;
00182         }
00183       }
00184     }
00185     if (step == 0) {
00186       result += str[i];
00187       step = 1;
00188     }
00189   }
00190   result += str.substr(i, len - i);
00191   return result;
00192 }
00193 
00194 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines