// tesseract 3.03
00001 /********************************************************************** 00002 * File: normstrngs.h 00003 * Description: Utilities to normalize and manipulate UTF-32 and 00004 * UTF-8 strings. 00005 * Author: Ranjith Unnikrishnan 00006 * Created: Thu July 4 2013 00007 * 00008 * (C) Copyright 2013, Google Inc. 00009 * Licensed under the Apache License, Version 2.0 (the "License"); 00010 * you may not use this file except in compliance with the License. 00011 * You may obtain a copy of the License at 00012 * http://www.apache.org/licenses/LICENSE-2.0 00013 * Unless required by applicable law or agreed to in writing, software 00014 * distributed under the License is distributed on an "AS IS" BASIS, 00015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 * See the License for the specific language governing permissions and 00017 * limitations under the License. 00018 * 00019 **********************************************************************/ 00020 00021 #ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_ 00022 #define TESSERACT_CCUTIL_NORMSTRNGS_H_ 00023 00024 #include "genericvector.h" 00025 #include "strngs.h" 00026 00027 typedef signed int char32; 00028 00029 namespace tesseract { 00030 00031 // UTF-8 to UTF-32 conversion function. 00032 void UTF8ToUTF32(const char* utf8_str, GenericVector<char32>* str32); 00033 00034 // UTF-32 to UTF-8 convesion function. 00035 void UTF32ToUTF8(const GenericVector<char32>& str32, STRING* utf8_str); 00036 00037 // Normalize a single char32 using NFKC + OCR-specific transformations. 00038 // NOTE that proper NFKC may require multiple characters as input. The 00039 // assumption of this function is that the input is already as fully composed 00040 // as it can be, but may require some compatibility normalizations or just 00041 // OCR evaluation related normalizations. 00042 void NormalizeChar32(char32 ch, GenericVector<char32>* str); 00043 00044 // Normalize a UTF8 string. 
Same as above, but for UTF8-encoded strings, that 00045 // can contain multiple UTF32 code points. 00046 STRING NormalizeUTF8String(const char* str8); 00047 00048 // Apply just the OCR-specific normalizations and return the normalized char. 00049 char32 OCRNormalize(char32 ch); 00050 00051 // Returns true if the OCRNormalized ch1 and ch2 are the same. 00052 bool IsOCREquivalent(char32 ch1, char32 ch2); 00053 00054 // Returns true if the value lies in the range of valid unicodes. 00055 bool IsValidCodepoint(const char32 ch); 00056 00057 // Returns true a code point has the White_Space Unicode property. 00058 bool IsWhitespace(const char32 ch); 00059 // Returns true if every char in the given (null-terminated) string has the 00060 // White_Space Unicode property. 00061 bool IsUTF8Whitespace(const char* text); 00062 00063 // Returns the length of bytes of the prefix of 'text' that have the White_Space 00064 // unicode property. 00065 int SpanUTF8Whitespace(const char* text); 00066 00067 // Returns the length of bytes of the prefix of 'text' that DO NOT have the 00068 // White_Space unicode property. 00069 int SpanUTF8NotWhitespace(const char* text); 00070 00071 // Returns true if the char is interchange valid i.e. no C0 or C1 control codes 00072 // (other than CR LF HT FF) and no non-characters. 00073 bool IsInterchangeValid(const char32 ch); 00074 // Same as above but restricted to 7-bit ASCII. 00075 bool IsInterchangeValid7BitAscii(const char32 ch); 00076 00077 // Convert a full-width UTF-8 string to half-width. 00078 char32 FullwidthToHalfwidth(const char32 ch); 00079 00080 } // namespace tesseract 00081 00082 #endif // TESSERACT_CCUTIL_NORMSTRNGS_H_