tesseract
3.03
|
00001 00002 // File: unichar.cpp 00003 // Description: Unicode character/ligature class. 00004 // Author: Ray Smith 00005 // Created: Wed Jun 28 17:05:01 PDT 2006 00006 // 00007 // (C) Copyright 2006, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #include "unichar.h" 00021 #include "errcode.h" 00022 #include "tprintf.h" 00023 00024 #define UNI_MAX_LEGAL_UTF32 0x0010FFFF 00025 00026 // Construct from a utf8 string. If len<0 then the string is null terminated. 00027 // If the string is too long to fit in the UNICHAR then it takes only what 00028 // will fit. Checks for illegal input and stops at an illegal sequence. 00029 // The resulting UNICHAR may be empty. 00030 UNICHAR::UNICHAR(const char* utf8_str, int len) { 00031 int total_len = 0; 00032 int step = 0; 00033 if (len < 0) { 00034 for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len); 00035 } 00036 for (total_len = 0; total_len < len; total_len += step) { 00037 step = utf8_step(utf8_str + total_len); 00038 if (total_len + step > UNICHAR_LEN) 00039 break; // Too long. 00040 if (step == 0) 00041 break; // Illegal first byte. 00042 int i; 00043 for (i = 1; i < step; ++i) 00044 if ((utf8_str[total_len + i] & 0xc0) != 0x80) 00045 break; 00046 if (i < step) 00047 break; // Illegal surrogate 00048 } 00049 memcpy(chars, utf8_str, total_len); 00050 if (total_len < UNICHAR_LEN) { 00051 chars[UNICHAR_LEN - 1] = total_len; 00052 while (total_len < UNICHAR_LEN - 1) 00053 chars[total_len++] = 0; 00054 } 00055 } 00056 00057 // Construct from a single UCS4 character. Illegal values are ignored, 00058 // resulting in an empty UNICHAR. 00059 UNICHAR::UNICHAR(int unicode) { 00060 const int bytemask = 0xBF; 00061 const int bytemark = 0x80; 00062 00063 if (unicode < 0x80) { 00064 chars[UNICHAR_LEN - 1] = 1; 00065 chars[2] = 0; 00066 chars[1] = 0; 00067 chars[0] = static_cast<char>(unicode); 00068 } else if (unicode < 0x800) { 00069 chars[UNICHAR_LEN - 1] = 2; 00070 chars[2] = 0; 00071 chars[1] = static_cast<char>((unicode | bytemark) & bytemask); 00072 unicode >>= 6; 00073 chars[0] = static_cast<char>(unicode | 0xc0); 00074 } else if (unicode < 0x10000) { 00075 chars[UNICHAR_LEN - 1] = 3; 00076 chars[2] = static_cast<char>((unicode | bytemark) & bytemask); 00077 unicode >>= 6; 00078 chars[1] = static_cast<char>((unicode | bytemark) & bytemask); 00079 unicode >>= 6; 00080 chars[0] = static_cast<char>(unicode | 0xe0); 00081 } else if (unicode <= UNI_MAX_LEGAL_UTF32) { 00082 chars[UNICHAR_LEN - 1] = 4; 00083 chars[3] = static_cast<char>((unicode | bytemark) & bytemask); 00084 unicode >>= 6; 00085 chars[2] = static_cast<char>((unicode | bytemark) & bytemask); 00086 unicode >>= 6; 00087 chars[1] = static_cast<char>((unicode | bytemark) & bytemask); 00088 unicode >>= 6; 00089 chars[0] = static_cast<char>(unicode | 0xf0); 00090 } else { 00091 memset(chars, 0, UNICHAR_LEN); 00092 } 00093 } 00094 00095 // Get the first character as UCS-4. 00096 int UNICHAR::first_uni() const { 00097 static const int utf8_offsets[5] = { 00098 0, 0, 0x3080, 0xE2080, 0x3C82080 00099 }; 00100 int uni = 0; 00101 int len = utf8_step(chars); 00102 const char* src = chars; 00103 00104 switch (len) { 00105 default: 00106 break; 00107 case 4: 00108 uni += static_cast<unsigned char>(*src++); 00109 uni <<= 6; 00110 case 3: 00111 uni += static_cast<unsigned char>(*src++); 00112 uni <<= 6; 00113 case 2: 00114 uni += static_cast<unsigned char>(*src++); 00115 uni <<= 6; 00116 case 1: 00117 uni += static_cast<unsigned char>(*src++); 00118 } 00119 uni -= utf8_offsets[len]; 00120 return uni; 00121 } 00122 00123 // Get a terminated UTF8 string: Must delete[] it after use. 00124 char* UNICHAR::utf8_str() const { 00125 int len = utf8_len(); 00126 char* str = new char[len + 1]; 00127 memcpy(str, chars, len); 00128 str[len] = 0; 00129 return str; 00130 } 00131 00132 // Get the number of bytes in the first character of the given utf8 string. 00133 int UNICHAR::utf8_step(const char* utf8_str) { 00134 static const char utf8_bytes[256] = { 00135 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00136 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00137 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00138 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00139 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00140 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00141 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 00142 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0 00143 }; 00144 00145 return utf8_bytes[static_cast<unsigned char>(*utf8_str)]; 00146 } 00147 00148 UNICHAR::const_iterator& UNICHAR::const_iterator::operator++() { 00149 ASSERT_HOST(it_ != NULL); 00150 int step = utf8_step(it_); 00151 if (step == 0) { 00152 tprintf("ERROR: Illegal UTF8 encountered.\n"); 00153 for (int i = 0; i < 5 && it_[i] != '\0'; ++i) { 00154 tprintf("Index %d char = 0x%x", i, it_[i]); 00155 } 00156 step = 1; 00157 } 00158 it_ += step; 00159 return *this; 00160 } 00161 00162 int UNICHAR::const_iterator::operator*() const { 00163 ASSERT_HOST(it_ != NULL); 00164 const int len = utf8_step(it_); 00165 if (len == 0) { 00166 tprintf("WARNING: Illegal UTF8 encountered\n"); 00167 return ' '; 00168 } 00169 UNICHAR uch(it_, len); 00170 return uch.first_uni(); 00171 } 00172 00173 int UNICHAR::const_iterator::get_utf8(char* utf8_output) const { 00174 ASSERT_HOST(it_ != NULL); 00175 const int len = utf8_step(it_); 00176 if (len == 0) { 00177 tprintf("WARNING: Illegal UTF8 encountered\n"); 00178 utf8_output[0] = ' '; 00179 return 1; 00180 } 00181 strncpy(utf8_output, it_, len); 00182 return len; 00183 } 00184 00185 int UNICHAR::const_iterator::utf8_len() const { 00186 ASSERT_HOST(it_ != NULL); 00187 const int len = utf8_step(it_); 00188 if (len == 0) { 00189 tprintf("WARNING: Illegal UTF8 encountered\n"); 00190 return 1; 00191 } 00192 return len; 00193 } 00194 00195 UNICHAR::const_iterator UNICHAR::begin(const char* utf8_str, const int len) { 00196 return UNICHAR::const_iterator(utf8_str); 00197 } 00198 00199 UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) { 00200 return UNICHAR::const_iterator(utf8_str + len); 00201 }