tesseract
3.03
|
00001 /********************************************************************** 00002 * File: ligature_table.h 00003 * Description: Class for adding and removing optional latin ligatures, 00004 * conditional on codepoint support by a specified font 00005 * (if specified). 00006 * Author: Ranjith Unnikrishnan 00007 * Created: Mon Nov 18 2013 00008 * 00009 * (C) Copyright 2013, Google Inc. 00010 * Licensed under the Apache License, Version 2.0 (the "License"); 00011 * you may not use this file except in compliance with the License. 00012 * You may obtain a copy of the License at 00013 * http://www.apache.org/licenses/LICENSE-2.0 00014 * Unless required by applicable law or agreed to in writing, software 00015 * distributed under the License is distributed on an "AS IS" BASIS, 00016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00017 * See the License for the specific language governing permissions and 00018 * limitations under the License. 00019 * 00020 **********************************************************************/ 00021 00022 #ifndef TRAININGDATA_LIGATURE_TABLE_H_ 00023 #define TRAININGDATA_LIGATURE_TABLE_H_ 00024 00025 #include <string> 00026 00027 #include "hashfn.h" 00028 #include "util.h" 00029 00030 namespace tesseract { 00031 00032 class PangoFontInfo; // defined in pango_font_info.h 00033 00034 // Map to substitute strings for ligatures. 00035 typedef hash_map<string, string, StringHash> LigHash; 00036 00037 class LigatureTable { 00038 public: 00039 // Get a static instance of this class. 00040 static LigatureTable* Get(); 00041 00042 // Convert the utf8 string so that ligaturizable sequences, such as "fi" get 00043 // replaced by the (utf8 code for) appropriate ligature characters. Only do so 00044 // if the corresponding ligature character is renderable in the current font. 00045 string AddLigatures(const string& str, const PangoFontInfo* font) const; 00046 // Remove all ligatures. 00047 string RemoveLigatures(const string& str) const; 00048 // Remove only custom ligatures (eg. "ct") encoded in the private-use-area. 00049 string RemoveCustomLigatures(const string& str) const; 00050 00051 const LigHash& norm_to_lig_table() const { 00052 return norm_to_lig_table_; 00053 } 00054 const LigHash& lig_to_norm_table() const { 00055 return lig_to_norm_table_; 00056 } 00057 00058 protected: 00059 LigatureTable(); 00060 // Initialize the hash tables mapping between ligature strings and the 00061 // corresponding ligature characters. 00062 void Init(); 00063 00064 static SmartPtr<LigatureTable> instance_; 00065 LigHash norm_to_lig_table_; 00066 LigHash lig_to_norm_table_; 00067 int min_lig_length_; 00068 int max_lig_length_; 00069 int min_norm_length_; 00070 int max_norm_length_; 00071 00072 private: 00073 LigatureTable(const LigatureTable&); 00074 void operator=(const LigatureTable&); 00075 }; 00076 00077 } // namespace tesseract 00078 00079 #endif // OCR_TRAININGDATA_TYPESETTING_LIGATURE_TABLE_H_