tesseract
3.03
|
00001 /********************************************************************** 00002 * File: normstrngs.cpp 00003 * Description: Utilities to normalize and manipulate UTF-32 and 00004 * UTF-8 strings. 00005 * Author: Ranjith Unnikrishnan 00006 * Created: Thu July 4 2013 00007 * 00008 * (C) Copyright 2013, Google Inc. 00009 * Licensed under the Apache License, Version 2.0 (the "License"); 00010 * you may not use this file except in compliance with the License. 00011 * You may obtain a copy of the License at 00012 * http://www.apache.org/licenses/LICENSE-2.0 00013 * Unless required by applicable law or agreed to in writing, software 00014 * distributed under the License is distributed on an "AS IS" BASIS, 00015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 * See the License for the specific language governing permissions and 00017 * limitations under the License. 00018 * 00019 **********************************************************************/ 00020 00021 #include "normstrngs.h" 00022 00023 #include "icuerrorcode.h" 00024 #include "unichar.h" 00025 #include "unicode/normalizer2.h" // From libicu 00026 #include "unicode/translit.h" // From libicu 00027 #include "unicode/unorm2.h" // From libicu 00028 00029 namespace tesseract { 00030 00031 void UTF8ToUTF32(const char* utf8_str, GenericVector<char32>* str32) { 00032 str32->clear(); 00033 str32->reserve(strlen(utf8_str)); 00034 int len = strlen(utf8_str); 00035 int step = 0; 00036 for (int ch = 0; ch < len; ch += step) { 00037 step = UNICHAR::utf8_step(utf8_str + ch); 00038 if (step > 0) { 00039 UNICHAR uni_ch(utf8_str + ch, step); 00040 (*str32) += uni_ch.first_uni(); 00041 } 00042 } 00043 } 00044 00045 void UTF32ToUTF8(const GenericVector<char32>& str32, STRING* utf8_str) { 00046 utf8_str->ensure(str32.length()); 00047 utf8_str->assign("", 0); 00048 for (int i = 0; i < str32.length(); ++i) { 00049 UNICHAR uni_ch(str32[i]); 00050 char *utf8 = uni_ch.utf8_str(); 00051 if (utf8 != NULL) { 00052 (*utf8_str) += utf8; 00053 delete[] utf8; 00054 } 00055 } 00056 } 00057 00058 bool is_hyphen_punc(const char32 ch) { 00059 static const int kNumHyphenPuncUnicodes = 13; 00060 static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = { 00061 '-', 00062 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, // hyphen..horizontal bar 00063 0x207b, // superscript minus 00064 0x208b, // subscript minus 00065 0x2212, // minus sign 00066 0xfe58, // small em dash 00067 0xfe63, // small hyphen-minus 00068 0xff0d, // fullwidth hyphen-minus 00069 }; 00070 for (int i = 0; i < kNumHyphenPuncUnicodes; ++i) { 00071 if (kHyphenPuncUnicodes[i] == ch) 00072 return true; 00073 } 00074 return false; 00075 } 00076 00077 bool is_single_quote(const char32 ch) { 00078 static const int kNumSingleQuoteUnicodes = 8; 00079 static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = { 00080 '\'', 00081 '`', 00082 0x2018, // left single quotation mark (English, others) 00083 0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.) 00084 // We may have to introduce a comma set with 0x201a 00085 0x201B, // single high-reveresed-9 quotation mark (PropList.txt) 00086 0x2032, // prime 00087 0x300C, // left corner bracket (East Asian languages) 00088 0xFF07, // fullwidth apostrophe 00089 }; 00090 for (int i = 0; i < kNumSingleQuoteUnicodes; ++i) { 00091 if (kSingleQuoteUnicodes[i] == ch) 00092 return true; 00093 } 00094 return false; 00095 } 00096 00097 bool is_double_quote(const char32 ch) { 00098 static const int kNumDoubleQuoteUnicodes = 8; 00099 static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = { 00100 '"', 00101 0x201C, // left double quotation mark (English, others) 00102 0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.) 00103 0x201F, // double high-reversed-9 quotation mark (PropList.txt) 00104 0x2033, // double prime 00105 0x301D, // reversed double prime quotation mark (East Asian langs, horiz.) 00106 0x301E, // close double prime (East Asian languages written horizontally) 00107 0xFF02, // fullwidth quotation mark 00108 }; 00109 for (int i = 0; i < kNumDoubleQuoteUnicodes; ++i) { 00110 if (kDoubleQuoteUnicodes[i] == ch) 00111 return true; 00112 } 00113 return false; 00114 } 00115 00116 STRING NormalizeUTF8String(const char* str8) { 00117 GenericVector<char32> str32, out_str32, norm_str; 00118 UTF8ToUTF32(str8, &str32); 00119 for (int i = 0; i < str32.length(); ++i) { 00120 norm_str.clear(); 00121 NormalizeChar32(str32[i], &norm_str); 00122 for (int j = 0; j < norm_str.length(); ++j) { 00123 out_str32.push_back(norm_str[j]); 00124 } 00125 } 00126 STRING out_str8; 00127 UTF32ToUTF8(out_str32, &out_str8); 00128 return out_str8; 00129 } 00130 00131 void NormalizeChar32(char32 ch, GenericVector<char32>* str) { 00132 IcuErrorCode error_code; 00133 const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance( 00134 NULL, "nfkc", UNORM2_COMPOSE, error_code); 00135 error_code.assertSuccess(); 00136 error_code.reset(); 00137 00138 icu::UnicodeString uch_str(static_cast<UChar32>(ch)); 00139 icu::UnicodeString norm_str = nfkc->normalize(uch_str, error_code); 00140 error_code.assertSuccess(); 00141 00142 str->clear(); 00143 for (int i = 0; i < norm_str.length(); ++i) { 00144 // If any spaces were added by NFKC, pretend normalization is a nop. 00145 if (norm_str[i] == ' ') { 00146 str->clear(); 00147 str->push_back(ch); 00148 break; 00149 } else { 00150 str->push_back(OCRNormalize(static_cast<char32>(norm_str[i]))); 00151 } 00152 } 00153 } 00154 00155 // Apply just the OCR-specific normalizations and return the normalized char. 00156 char32 OCRNormalize(char32 ch) { 00157 if (is_hyphen_punc(ch)) 00158 return '-'; 00159 else if (is_single_quote(ch)) 00160 return '\''; 00161 else if (is_double_quote(ch)) 00162 return '"'; 00163 return ch; 00164 } 00165 00166 bool IsOCREquivalent(char32 ch1, char32 ch2) { 00167 return OCRNormalize(ch1) == OCRNormalize(ch2); 00168 } 00169 00170 bool IsValidCodepoint(const char32 ch) { 00171 // In the range [0, 0xD800) or [0xE000, 0x10FFFF] 00172 return (static_cast<uinT32>(ch) < 0xD800) 00173 || (ch >= 0xE000 && ch <= 0x10FFFF); 00174 } 00175 00176 bool IsWhitespace(const char32 ch) { 00177 ASSERT_HOST_MSG(IsValidCodepoint(ch), 00178 "Invalid Unicode codepoint: 0x%x\n", ch); 00179 return u_isUWhiteSpace(static_cast<UChar32>(ch)); 00180 } 00181 00182 bool IsUTF8Whitespace(const char* text) { 00183 return SpanUTF8Whitespace(text) == strlen(text); 00184 } 00185 00186 int SpanUTF8Whitespace(const char* text) { 00187 int n_white = 0; 00188 for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text)); 00189 it != UNICHAR::end(text, strlen(text)); 00190 ++it) { 00191 if (!IsWhitespace(*it)) break; 00192 n_white += it.utf8_len(); 00193 } 00194 return n_white; 00195 } 00196 00197 int SpanUTF8NotWhitespace(const char* text) { 00198 int n_notwhite = 0; 00199 for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text)); 00200 it != UNICHAR::end(text, strlen(text)); 00201 ++it) { 00202 if (IsWhitespace(*it)) break; 00203 n_notwhite += it.utf8_len(); 00204 } 00205 return n_notwhite; 00206 } 00207 00208 bool IsInterchangeValid(const char32 ch) { 00209 return IsValidCodepoint(ch) && 00210 !(ch >= 0xFDD0 && ch <= 0xFDEF) && // Noncharacters. 00211 !(ch >= 0xFFFE && ch <= 0xFFFF) && 00212 !(ch >= 0x1FFFE && ch <= 0x1FFFF) && 00213 !(ch >= 0x2FFFE && ch <= 0x2FFFF) && 00214 !(ch >= 0x3FFFE && ch <= 0x3FFFF) && 00215 !(ch >= 0x4FFFE && ch <= 0x4FFFF) && 00216 !(ch >= 0x5FFFE && ch <= 0x5FFFF) && 00217 !(ch >= 0x6FFFE && ch <= 0x6FFFF) && 00218 !(ch >= 0x7FFFE && ch <= 0x7FFFF) && 00219 !(ch >= 0x8FFFE && ch <= 0x8FFFF) && 00220 !(ch >= 0x9FFFE && ch <= 0x9FFFF) && 00221 !(ch >= 0xAFFFE && ch <= 0xAFFFF) && 00222 !(ch >= 0xBFFFE && ch <= 0xBFFFF) && 00223 !(ch >= 0xCFFFE && ch <= 0xCFFFF) && 00224 !(ch >= 0xDFFFE && ch <= 0xDFFFF) && 00225 !(ch >= 0xEFFFE && ch <= 0xEFFFF) && 00226 !(ch >= 0xFFFFE && ch <= 0xFFFFF) && 00227 !(ch >= 0x10FFFE && ch <= 0x10FFFF) && 00228 (!u_isISOControl(static_cast<UChar32>(ch)) || 00229 ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r'); 00230 } 00231 00232 bool IsInterchangeValid7BitAscii(const char32 ch) { 00233 return IsValidCodepoint(ch) && 00234 ch <= 128 && 00235 (!u_isISOControl(static_cast<UChar32>(ch)) || 00236 ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r'); 00237 } 00238 00239 char32 FullwidthToHalfwidth(const char32 ch) { 00240 // Return unchanged if not in the fullwidth-halfwidth Unicode block. 00241 if (ch < 0xFF00 || ch > 0xFFEF || !IsValidCodepoint(ch)) { 00242 if (ch != 0x3000) return ch; 00243 } 00244 // Special case for fullwidth left and right "white parentheses". 00245 if (ch == 0xFF5F) return 0x2985; 00246 if (ch == 0xFF60) return 0x2986; 00247 // Construct a full-to-half width transliterator. 00248 IcuErrorCode error_code; 00249 icu::UnicodeString uch_str(static_cast<UChar32>(ch)); 00250 const icu::Transliterator* fulltohalf = icu::Transliterator::createInstance( 00251 "Fullwidth-Halfwidth", UTRANS_FORWARD, error_code); 00252 error_code.assertSuccess(); 00253 error_code.reset(); 00254 00255 fulltohalf->transliterate(uch_str); 00256 delete fulltohalf; 00257 ASSERT_HOST(uch_str.length() != 0); 00258 return uch_str[0]; 00259 } 00260 00261 } // namespace tesseract