tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/training/normstrngs.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        normstrngs.h
00003  * Description: Utilities to normalize and manipulate UTF-32 and
00004  *              UTF-8 strings.
00005  * Author:      Ranjith Unnikrishnan
00006  * Created:     Thu July 4 2013
00007  *
00008  * (C) Copyright 2013, Google Inc.
00009  * Licensed under the Apache License, Version 2.0 (the "License");
00010  * you may not use this file except in compliance with the License.
00011  * You may obtain a copy of the License at
00012  * http://www.apache.org/licenses/LICENSE-2.0
00013  * Unless required by applicable law or agreed to in writing, software
00014  * distributed under the License is distributed on an "AS IS" BASIS,
00015  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016  * See the License for the specific language governing permissions and
00017  * limitations under the License.
00018  *
00019  **********************************************************************/
00020 
00021 #ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_
00022 #define TESSERACT_CCUTIL_NORMSTRNGS_H_
00023 
00024 #include "genericvector.h"
00025 #include "strngs.h"
00026 
00027 typedef signed int char32;  // Code point type used by the UTF-32 APIs below.
00028 
00029 namespace tesseract {
00030 
00031 // UTF-8 to UTF-32 conversion function.
00032 void UTF8ToUTF32(const char* utf8_str, GenericVector<char32>* str32);
00033 
00034 // UTF-32 to UTF-8 conversion function.
00035 void UTF32ToUTF8(const GenericVector<char32>& str32, STRING* utf8_str);
00036 
00037 // Normalize a single char32 using NFKC + OCR-specific transformations.
00038 // NOTE that proper NFKC may require multiple characters as input. The
00039 // assumption of this function is that the input is already as fully composed
00040 // as it can be, but may require some compatibility normalizations or just
00041 // OCR evaluation related normalizations.
00042 void NormalizeChar32(char32 ch, GenericVector<char32>* str);
00043 
00044 // Normalize a UTF8 string. Same as NormalizeChar32, but for UTF8-encoded
00045 // strings, which can contain multiple UTF32 code points.
00046 STRING NormalizeUTF8String(const char* str8);
00047 
00048 // Apply just the OCR-specific normalizations and return the normalized char.
00049 char32 OCRNormalize(char32 ch);
00050 
00051 // Returns true if the OCRNormalized ch1 and ch2 are the same.
00052 bool IsOCREquivalent(char32 ch1, char32 ch2);
00053 
00054 // Returns true if the value lies in the range of valid Unicode code points.
00055 bool IsValidCodepoint(const char32 ch);
00056 
00057 // Returns true if the code point has the White_Space Unicode property.
00058 bool IsWhitespace(const char32 ch);
00059 // Returns true if every char in the given (null-terminated) string has the
00060 // White_Space Unicode property.
00061 bool IsUTF8Whitespace(const char* text);
00062 
00063 // Returns the length in bytes of the prefix of 'text' whose characters have
00064 // the White_Space Unicode property.
00065 int SpanUTF8Whitespace(const char* text);
00066 
00067 // Returns the length in bytes of the prefix of 'text' whose characters DO NOT
00068 // have the White_Space Unicode property.
00069 int SpanUTF8NotWhitespace(const char* text);
00070 
00071 // Returns true if the char is interchange valid, i.e. no C0 or C1 control
00072 // codes (other than CR LF HT FF) and no non-characters.
00073 bool IsInterchangeValid(const char32 ch);
00074 // Same as IsInterchangeValid, but restricted to 7-bit ASCII.
00075 bool IsInterchangeValid7BitAscii(const char32 ch);
00076 
00077 // Convert a full-width char32 code point to half-width.
00078 char32 FullwidthToHalfwidth(const char32 ch);
00079 
00080 }  // namespace tesseract
00081 
00082 #endif  // TESSERACT_CCUTIL_NORMSTRNGS_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines