tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/training/ligature_table.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        ligature_table.h
00003  * Description: Class for adding and removing optional latin ligatures,
00004  *              conditional on codepoint support by a specified font
00005  *              (if specified).
00006  * Author:      Ranjith Unnikrishnan
00007  * Created:     Mon Nov 18 2013
00008  *
00009  * (C) Copyright 2013, Google Inc.
00010  * Licensed under the Apache License, Version 2.0 (the "License");
00011  * you may not use this file except in compliance with the License.
00012  * You may obtain a copy of the License at
00013  * http://www.apache.org/licenses/LICENSE-2.0
00014  * Unless required by applicable law or agreed to in writing, software
00015  * distributed under the License is distributed on an "AS IS" BASIS,
00016  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00017  * See the License for the specific language governing permissions and
00018  * limitations under the License.
00019  *
00020  **********************************************************************/
00021 
00022 #ifndef TRAININGDATA_LIGATURE_TABLE_H_
00023 #define TRAININGDATA_LIGATURE_TABLE_H_
00024 
00025 #include <string>
00026 
00027 #include "hashfn.h"
00028 #include "util.h"
00029 
00030 namespace tesseract {
00031 
00032 class PangoFontInfo;  // Forward declaration (used only by pointer here); defined in pango_font_info.h
00033 
00034 // String-to-string hash map, hashed with StringHash, used to substitute strings for ligatures.
00035 typedef hash_map<string, string, StringHash> LigHash;
00036 // Adds and removes optional Latin ligatures in strings; obtain the instance through Get().
00037 class LigatureTable {
00038  public:
00039   // Get a static instance of this class (returned as a raw pointer).
00040   static LigatureTable* Get();
00041 
00042   // Convert the utf8 string so that ligaturizable sequences, such as "fi" get
00043   // replaced by the (utf8 code for) appropriate ligature characters. Only do so
00044   // if the corresponding ligature character is renderable in the current font.
00045   string AddLigatures(const string& str, const PangoFontInfo* font) const;
00046   // Remove all ligatures from str and return the resulting string.
00047   string RemoveLigatures(const string& str) const;
00048   // Remove only custom ligatures (eg. "ct") encoded in the private-use-area.
00049   string RemoveCustomLigatures(const string& str) const;
00050 
00051   const LigHash& norm_to_lig_table() const {  // Read-only view of norm_to_lig_table_.
00052     return norm_to_lig_table_;
00053   }
00054   const LigHash& lig_to_norm_table() const {  // Read-only view of lig_to_norm_table_.
00055     return lig_to_norm_table_;
00056   }
00057 
00058  protected:
00059   LigatureTable();  // Protected: outside code cannot construct instances directly.
00060   // Initialize the hash tables mapping between ligature strings and the
00061   // corresponding ligature characters.
00062   void Init();
00063 
00064   static SmartPtr<LigatureTable> instance_;  // NOTE(review): presumably the object Get() hands out - confirm.
00065   LigHash norm_to_lig_table_;  // NOTE(review): by name, normalized sequence -> ligature - confirm.
00066   LigHash lig_to_norm_table_;  // NOTE(review): by name, ligature -> normalized sequence - confirm.
00067   int min_lig_length_;  // NOTE(review): presumably the shortest ligature string length - confirm.
00068   int max_lig_length_;  // NOTE(review): presumably the longest ligature string length - confirm.
00069   int min_norm_length_;  // NOTE(review): presumably the shortest normalized string length - confirm.
00070   int max_norm_length_;  // NOTE(review): presumably the longest normalized string length - confirm.
00071 
00072  private:
00073   LigatureTable(const LigatureTable&);  // Copy construction is private: unavailable to outside code.
00074   void operator=(const LigatureTable&);  // Copy assignment is private as well.
00075 };
00076 
00077 }  // namespace tesseract
00078 
00079 #endif  // TRAININGDATA_LIGATURE_TABLE_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines