tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccutil/unichar.cpp
Go to the documentation of this file.
00001 
00002 // File:        unichar.cpp
00003 // Description: Unicode character/ligature class.
00004 // Author:      Ray Smith
00005 // Created:     Wed Jun 28 17:05:01 PDT 2006
00006 //
00007 // (C) Copyright 2006, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #include "unichar.h"
00021 #include "errcode.h"
00022 #include "tprintf.h"
00023 
00024 #define UNI_MAX_LEGAL_UTF32 0x0010FFFF
00025 
00026 // Construct from a utf8 string. If len<0 then the string is null terminated.
00027 // If the string is too long to fit in the UNICHAR then it takes only what
00028 // will fit. Checks for illegal input and stops at an illegal sequence.
00029 // The resulting UNICHAR may be empty.
00030 UNICHAR::UNICHAR(const char* utf8_str, int len) {
00031   int total_len = 0;
00032   int step = 0;
00033   if (len < 0) {
00034     for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len);
00035   }
00036   for (total_len = 0; total_len < len; total_len += step) {
00037     step = utf8_step(utf8_str + total_len);
00038     if (total_len + step > UNICHAR_LEN)
00039       break;  // Too long.
00040     if (step == 0)
00041       break;  // Illegal first byte.
00042     int i;
00043     for (i = 1; i < step; ++i)
00044       if ((utf8_str[total_len + i] & 0xc0) != 0x80)
00045         break;
00046     if (i < step)
00047       break;  // Illegal surrogate
00048   }
00049   memcpy(chars, utf8_str, total_len);
00050   if (total_len < UNICHAR_LEN) {
00051     chars[UNICHAR_LEN - 1] = total_len;
00052     while (total_len < UNICHAR_LEN - 1)
00053       chars[total_len++] = 0;
00054   }
00055 }
00056 
00057 // Construct from a single UCS4 character. Illegal values are ignored,
00058 // resulting in an empty UNICHAR.
00059 UNICHAR::UNICHAR(int unicode) {
00060   const int bytemask = 0xBF;
00061   const int bytemark = 0x80;
00062 
00063   if (unicode < 0x80) {
00064     chars[UNICHAR_LEN - 1] = 1;
00065     chars[2] = 0;
00066     chars[1] = 0;
00067     chars[0] = static_cast<char>(unicode);
00068   } else if (unicode < 0x800) {
00069     chars[UNICHAR_LEN - 1] = 2;
00070     chars[2] = 0;
00071     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
00072     unicode >>= 6;
00073     chars[0] = static_cast<char>(unicode | 0xc0);
00074   } else if (unicode < 0x10000) {
00075     chars[UNICHAR_LEN - 1] = 3;
00076     chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
00077     unicode >>= 6;
00078     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
00079     unicode >>= 6;
00080     chars[0] = static_cast<char>(unicode | 0xe0);
00081   } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
00082     chars[UNICHAR_LEN - 1] = 4;
00083     chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
00084     unicode >>= 6;
00085     chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
00086     unicode >>= 6;
00087     chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
00088     unicode >>= 6;
00089     chars[0] = static_cast<char>(unicode | 0xf0);
00090   } else {
00091     memset(chars, 0, UNICHAR_LEN);
00092   }
00093 }
00094 
00095 // Get the first character as UCS-4.
00096 int UNICHAR::first_uni() const {
00097   static const int utf8_offsets[5] = {
00098     0, 0, 0x3080, 0xE2080, 0x3C82080
00099   };
00100   int uni = 0;
00101   int len = utf8_step(chars);
00102   const char* src = chars;
00103 
00104   switch (len) {
00105   default:
00106     break;
00107   case 4:
00108     uni += static_cast<unsigned char>(*src++);
00109     uni <<= 6;
00110   case 3:
00111     uni += static_cast<unsigned char>(*src++);
00112     uni <<= 6;
00113   case 2:
00114     uni += static_cast<unsigned char>(*src++);
00115     uni <<= 6;
00116   case 1:
00117     uni += static_cast<unsigned char>(*src++);
00118   }
00119   uni -= utf8_offsets[len];
00120   return uni;
00121 }
00122 
00123 // Get a terminated UTF8 string: Must delete[] it after use.
00124 char* UNICHAR::utf8_str() const {
00125   int len = utf8_len();
00126   char* str = new char[len + 1];
00127   memcpy(str, chars, len);
00128   str[len] = 0;
00129   return str;
00130 }
00131 
00132 // Get the number of bytes in the first character of the given utf8 string.
00133 int UNICHAR::utf8_step(const char* utf8_str) {
00134   static const char utf8_bytes[256] = {
00135     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00136     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00137     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00138     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00139     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00140     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00141     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
00142     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
00143   };
00144 
00145   return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
00146 }
00147 
00148 UNICHAR::const_iterator& UNICHAR::const_iterator::operator++() {
00149   ASSERT_HOST(it_ != NULL);
00150   int step = utf8_step(it_);
00151   if (step == 0) {
00152     tprintf("ERROR: Illegal UTF8 encountered.\n");
00153     for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
00154       tprintf("Index %d char = 0x%x", i, it_[i]);
00155     }
00156     step = 1;
00157   }
00158   it_ += step;
00159   return *this;
00160 }
00161 
00162 int UNICHAR::const_iterator::operator*() const {
00163   ASSERT_HOST(it_ != NULL);
00164   const int len = utf8_step(it_);
00165   if (len == 0) {
00166     tprintf("WARNING: Illegal UTF8 encountered\n");
00167     return ' ';
00168   }
00169   UNICHAR uch(it_, len);
00170   return uch.first_uni();
00171 }
00172 
00173 int UNICHAR::const_iterator::get_utf8(char* utf8_output) const {
00174   ASSERT_HOST(it_ != NULL);
00175   const int len = utf8_step(it_);
00176   if (len == 0) {
00177     tprintf("WARNING: Illegal UTF8 encountered\n");
00178     utf8_output[0] = ' ';
00179     return 1;
00180   }
00181   strncpy(utf8_output, it_, len);
00182   return len;
00183 }
00184 
00185 int UNICHAR::const_iterator::utf8_len() const {
00186   ASSERT_HOST(it_ != NULL);
00187   const int len = utf8_step(it_);
00188   if (len == 0) {
00189     tprintf("WARNING: Illegal UTF8 encountered\n");
00190     return 1;
00191   }
00192   return len;
00193 }
00194 
00195 UNICHAR::const_iterator UNICHAR::begin(const char* utf8_str, const int len) {
00196   return UNICHAR::const_iterator(utf8_str);
00197 }
00198 
00199 UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) {
00200   return UNICHAR::const_iterator(utf8_str + len);
00201 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines