tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/training/normstrngs.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        normstrngs.cpp
00003  * Description: Utilities to normalize and manipulate UTF-32 and
00004  *              UTF-8 strings.
00005  * Author:      Ranjith Unnikrishnan
00006  * Created:     Thu July 4 2013
00007  *
00008  * (C) Copyright 2013, Google Inc.
00009  * Licensed under the Apache License, Version 2.0 (the "License");
00010  * you may not use this file except in compliance with the License.
00011  * You may obtain a copy of the License at
00012  * http://www.apache.org/licenses/LICENSE-2.0
00013  * Unless required by applicable law or agreed to in writing, software
00014  * distributed under the License is distributed on an "AS IS" BASIS,
00015  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016  * See the License for the specific language governing permissions and
00017  * limitations under the License.
00018  *
00019  **********************************************************************/
00020 
00021 #include "normstrngs.h"
00022 
00023 #include "icuerrorcode.h"
00024 #include "unichar.h"
00025 #include "unicode/normalizer2.h"  // From libicu
00026 #include "unicode/translit.h"     // From libicu
00027 #include "unicode/unorm2.h"       // From libicu
00028 
00029 namespace tesseract {
00030 
00031 void UTF8ToUTF32(const char* utf8_str, GenericVector<char32>* str32) {
00032   str32->clear();
00033   str32->reserve(strlen(utf8_str));
00034   int len = strlen(utf8_str);
00035   int step = 0;
00036   for (int ch = 0; ch < len; ch += step) {
00037     step = UNICHAR::utf8_step(utf8_str + ch);
00038     if (step > 0) {
00039       UNICHAR uni_ch(utf8_str + ch, step);
00040       (*str32) += uni_ch.first_uni();
00041     }
00042   }
00043 }
00044 
00045 void UTF32ToUTF8(const GenericVector<char32>& str32, STRING* utf8_str) {
00046   utf8_str->ensure(str32.length());
00047   utf8_str->assign("", 0);
00048   for (int i = 0; i < str32.length(); ++i) {
00049     UNICHAR uni_ch(str32[i]);
00050     char *utf8 = uni_ch.utf8_str();
00051     if (utf8 != NULL) {
00052       (*utf8_str) += utf8;
00053       delete[] utf8;
00054     }
00055   }
00056 }
00057 
00058 bool is_hyphen_punc(const char32 ch) {
00059   static const int kNumHyphenPuncUnicodes = 13;
00060   static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
00061     '-',
00062     0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015,  // hyphen..horizontal bar
00063     0x207b,  // superscript minus
00064     0x208b,  // subscript minus
00065     0x2212,  // minus sign
00066     0xfe58,  // small em dash
00067     0xfe63,  // small hyphen-minus
00068     0xff0d,  // fullwidth hyphen-minus
00069   };
00070   for (int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
00071     if (kHyphenPuncUnicodes[i] == ch)
00072       return true;
00073   }
00074   return false;
00075 }
00076 
00077 bool is_single_quote(const char32 ch) {
00078   static const int kNumSingleQuoteUnicodes = 8;
00079   static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
00080     '\'',
00081     '`',
00082     0x2018,  // left single quotation mark (English, others)
00083     0x2019,  // right single quotation mark (Danish, Finnish, Swedish, Norw.)
00084              // We may have to introduce a comma set with 0x201a
00085     0x201B,  // single high-reveresed-9 quotation mark (PropList.txt)
00086     0x2032,  // prime
00087     0x300C,  // left corner bracket (East Asian languages)
00088     0xFF07,  // fullwidth apostrophe
00089   };
00090   for (int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
00091     if (kSingleQuoteUnicodes[i] == ch)
00092       return true;
00093   }
00094   return false;
00095 }
00096 
00097 bool is_double_quote(const char32 ch) {
00098   static const int kNumDoubleQuoteUnicodes = 8;
00099   static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
00100     '"',
00101     0x201C,  // left double quotation mark (English, others)
00102     0x201D,  // right double quotation mark (Danish, Finnish, Swedish, Norw.)
00103     0x201F,  // double high-reversed-9 quotation mark (PropList.txt)
00104     0x2033,  // double prime
00105     0x301D,  // reversed double prime quotation mark (East Asian langs, horiz.)
00106     0x301E,  // close double prime (East Asian languages written horizontally)
00107     0xFF02,  // fullwidth quotation mark
00108   };
00109   for (int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
00110     if (kDoubleQuoteUnicodes[i] == ch)
00111       return true;
00112   }
00113   return false;
00114 }
00115 
00116 STRING NormalizeUTF8String(const char* str8) {
00117   GenericVector<char32> str32, out_str32, norm_str;
00118   UTF8ToUTF32(str8, &str32);
00119   for (int i = 0; i < str32.length(); ++i) {
00120     norm_str.clear();
00121     NormalizeChar32(str32[i], &norm_str);
00122     for (int j = 0; j < norm_str.length(); ++j) {
00123       out_str32.push_back(norm_str[j]);
00124     }
00125   }
00126   STRING out_str8;
00127   UTF32ToUTF8(out_str32, &out_str8);
00128   return out_str8;
00129 }
00130 
00131 void NormalizeChar32(char32 ch, GenericVector<char32>* str) {
00132   IcuErrorCode error_code;
00133   const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance(
00134       NULL, "nfkc", UNORM2_COMPOSE, error_code);
00135   error_code.assertSuccess();
00136   error_code.reset();
00137 
00138   icu::UnicodeString uch_str(static_cast<UChar32>(ch));
00139   icu::UnicodeString norm_str = nfkc->normalize(uch_str, error_code);
00140   error_code.assertSuccess();
00141 
00142   str->clear();
00143   for (int i = 0; i < norm_str.length(); ++i) {
00144     // If any spaces were added by NFKC, pretend normalization is a nop.
00145     if (norm_str[i] == ' ') {
00146       str->clear();
00147       str->push_back(ch);
00148       break;
00149     } else {
00150       str->push_back(OCRNormalize(static_cast<char32>(norm_str[i])));
00151     }
00152   }
00153 }
00154 
00155 // Apply just the OCR-specific normalizations and return the normalized char.
00156 char32 OCRNormalize(char32 ch) {
00157   if (is_hyphen_punc(ch))
00158     return '-';
00159   else if (is_single_quote(ch))
00160     return '\'';
00161   else if (is_double_quote(ch))
00162     return '"';
00163   return ch;
00164 }
00165 
00166 bool IsOCREquivalent(char32 ch1, char32 ch2) {
00167   return OCRNormalize(ch1) == OCRNormalize(ch2);
00168 }
00169 
00170 bool IsValidCodepoint(const char32 ch) {
00171   // In the range [0, 0xD800) or [0xE000, 0x10FFFF]
00172   return (static_cast<uinT32>(ch) < 0xD800)
00173       || (ch >= 0xE000 && ch <= 0x10FFFF);
00174 }
00175 
00176 bool IsWhitespace(const char32 ch) {
00177   ASSERT_HOST_MSG(IsValidCodepoint(ch),
00178                   "Invalid Unicode codepoint: 0x%x\n", ch);
00179   return u_isUWhiteSpace(static_cast<UChar32>(ch));
00180 }
00181 
00182 bool IsUTF8Whitespace(const char* text) {
00183   return SpanUTF8Whitespace(text) == strlen(text);
00184 }
00185 
00186 int SpanUTF8Whitespace(const char* text) {
00187   int n_white = 0;
00188   for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
00189        it != UNICHAR::end(text, strlen(text));
00190        ++it) {
00191     if (!IsWhitespace(*it)) break;
00192     n_white += it.utf8_len();
00193   }
00194   return n_white;
00195 }
00196 
00197 int SpanUTF8NotWhitespace(const char* text) {
00198   int n_notwhite = 0;
00199   for (UNICHAR::const_iterator it = UNICHAR::begin(text, strlen(text));
00200        it != UNICHAR::end(text, strlen(text));
00201        ++it) {
00202     if (IsWhitespace(*it)) break;
00203     n_notwhite += it.utf8_len();
00204   }
00205   return n_notwhite;
00206 }
00207 
00208 bool IsInterchangeValid(const char32 ch) {
00209   return IsValidCodepoint(ch) &&
00210       !(ch >= 0xFDD0 && ch <= 0xFDEF) &&  // Noncharacters.
00211       !(ch >= 0xFFFE && ch <= 0xFFFF) &&
00212       !(ch >= 0x1FFFE && ch <= 0x1FFFF) &&
00213       !(ch >= 0x2FFFE && ch <= 0x2FFFF) &&
00214       !(ch >= 0x3FFFE && ch <= 0x3FFFF) &&
00215       !(ch >= 0x4FFFE && ch <= 0x4FFFF) &&
00216       !(ch >= 0x5FFFE && ch <= 0x5FFFF) &&
00217       !(ch >= 0x6FFFE && ch <= 0x6FFFF) &&
00218       !(ch >= 0x7FFFE && ch <= 0x7FFFF) &&
00219       !(ch >= 0x8FFFE && ch <= 0x8FFFF) &&
00220       !(ch >= 0x9FFFE && ch <= 0x9FFFF) &&
00221       !(ch >= 0xAFFFE && ch <= 0xAFFFF) &&
00222       !(ch >= 0xBFFFE && ch <= 0xBFFFF) &&
00223       !(ch >= 0xCFFFE && ch <= 0xCFFFF) &&
00224       !(ch >= 0xDFFFE && ch <= 0xDFFFF) &&
00225       !(ch >= 0xEFFFE && ch <= 0xEFFFF) &&
00226       !(ch >= 0xFFFFE && ch <= 0xFFFFF) &&
00227       !(ch >= 0x10FFFE && ch <= 0x10FFFF) &&
00228       (!u_isISOControl(static_cast<UChar32>(ch)) ||
00229        ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r');
00230 }
00231 
00232 bool IsInterchangeValid7BitAscii(const char32 ch) {
00233   return IsValidCodepoint(ch) &&
00234       ch <= 128 &&
00235       (!u_isISOControl(static_cast<UChar32>(ch)) ||
00236        ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r');
00237 }
00238 
00239 char32 FullwidthToHalfwidth(const char32 ch) {
00240   // Return unchanged if not in the fullwidth-halfwidth Unicode block.
00241   if (ch < 0xFF00 || ch > 0xFFEF || !IsValidCodepoint(ch)) {
00242     if (ch != 0x3000) return ch;
00243   }
00244   // Special case for fullwidth left and right "white parentheses".
00245   if (ch == 0xFF5F) return 0x2985;
00246   if (ch == 0xFF60) return 0x2986;
00247   // Construct a full-to-half width transliterator.
00248   IcuErrorCode error_code;
00249   icu::UnicodeString uch_str(static_cast<UChar32>(ch));
00250   const icu::Transliterator* fulltohalf = icu::Transliterator::createInstance(
00251       "Fullwidth-Halfwidth", UTRANS_FORWARD, error_code);
00252   error_code.assertSuccess();
00253   error_code.reset();
00254 
00255   fulltohalf->transliterate(uch_str);
00256   delete fulltohalf;
00257   ASSERT_HOST(uch_str.length() != 0);
00258   return uch_str[0];
00259 }
00260 
00261 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines