tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccutil/strngs.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        strngs.c  (Formerly strings.c)
00003  * Description: STRING class functions.
00004  * Author:                                      Ray Smith
00005  * Created:                                     Fri Feb 15 09:13:30 GMT 1991
00006  *
00007  * (C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include          "helpers.h"
00021 #include          "tprintf.h"
00022 #include          "strngs.h"
00023 #include          "genericvector.h"
00024 
00025 #include <assert.h>
// Size of buffer needed to host the decimal representation of the maximum
// possible length of an int (in 64 bits), being -<20 digits>.
const int kMaxIntSize = 22;
// Size of buffer needed to host the longest possible "%.8g" rendering of a
// double: sign, 8 significant digits, the decimal point and a three-digit
// exponent, e.g. -1.2345678e+300 (15 characters), plus the <nul> = 16.
const int kMaxDoubleSize = 16;

/**********************************************************************
 * STRING_HEADER provides metadata about the allocated buffer,
 * including total capacity and how much used (strlen with '\0').
 *
 * The implementation hides this header at the start of the data
 * buffer and appends the string on the end to keep sizeof(STRING)
 * unchanged from earlier versions so serialization is not affected.
 *
 * The collection of MACROS provide different implementations depending
 * on whether the string keeps track of its strlen or not so that this
 * feature can be added in later when consumers don't modify the string
 **********************************************************************/

// Smallest string to allocate by default
const int kMinCapacity = 16;
00048 
00049 char* STRING::AllocData(int used, int capacity) {
00050   data_ = (STRING_HEADER *)alloc_string(capacity + sizeof(STRING_HEADER));
00051 
00052   // header is the metadata for this memory block
00053   STRING_HEADER* header = GetHeader();
00054   header->capacity_ = capacity;
00055   header->used_ = used;
00056   return GetCStr();
00057 }
00058 
00059 void STRING::DiscardData() {
00060   free_string((char *)data_);
00061 }
00062 
00063 // This is a private method; ensure FixHeader is called (or used_ is well defined)
00064 // beforehand
00065 char* STRING::ensure_cstr(inT32 min_capacity) {
00066   STRING_HEADER* orig_header = GetHeader();
00067   if (min_capacity <= orig_header->capacity_)
00068     return ((char *)this->data_) + sizeof(STRING_HEADER);
00069 
00070   // if we are going to grow bigger, than double our existing
00071   // size, but if that still is not big enough then keep the
00072   // requested capacity
00073   if (min_capacity < 2 * orig_header->capacity_)
00074     min_capacity = 2 * orig_header->capacity_;
00075 
00076   int alloc = sizeof(STRING_HEADER) + min_capacity;
00077   STRING_HEADER* new_header = (STRING_HEADER*)(alloc_string(alloc));
00078 
00079   memcpy(&new_header[1], GetCStr(), orig_header->used_);
00080   new_header->capacity_ = min_capacity;
00081   new_header->used_ = orig_header->used_;
00082 
00083   // free old memory, then rebind to new memory
00084   DiscardData();
00085   data_ = new_header;
00086 
00087   assert(InvariantOk());
00088   return ((char *)data_) + sizeof(STRING_HEADER);
00089 }
00090 
00091 // This is const, but is modifying a mutable field
00092 // this way it can be used on const or non-const instances.
00093 void STRING::FixHeader() const {
00094   const STRING_HEADER* header = GetHeader();
00095   if (header->used_ < 0)
00096     header->used_ = strlen(GetCStr()) + 1;
00097 }
00098 
00099 
00100 STRING::STRING() {
00101   // Empty STRINGs contain just the "\0".
00102   memcpy(AllocData(1, kMinCapacity), "", 1);
00103 }
00104 
00105 STRING::STRING(const STRING& str) {
00106   str.FixHeader();
00107   const STRING_HEADER* str_header  = str.GetHeader();
00108   int   str_used  = str_header->used_;
00109   char *this_cstr = AllocData(str_used, str_used);
00110   memcpy(this_cstr, str.GetCStr(), str_used);
00111   assert(InvariantOk());
00112 }
00113 
00114 STRING::STRING(const char* cstr) {
00115   if (cstr == NULL) {
00116     // Empty STRINGs contain just the "\0".
00117     memcpy(AllocData(1, kMinCapacity), "", 1);
00118   } else {
00119     int len = strlen(cstr) + 1;
00120     char* this_cstr = AllocData(len, len);
00121     memcpy(this_cstr, cstr, len);
00122   }
00123   assert(InvariantOk());
00124 }
00125 
00126 STRING::~STRING() {
00127   DiscardData();
00128 }
00129 
00130 // Writes to the given file. Returns false in case of error.
00131 bool STRING::Serialize(FILE* fp) const {
00132   inT32 len = length();
00133   if (fwrite(&len, sizeof(len), 1, fp) != 1) return false;
00134   if (static_cast<int>(fwrite(GetCStr(), 1, len, fp)) != len) return false;
00135   return true;
00136 }
00137 // Reads from the given file. Returns false in case of error.
00138 // If swap is true, assumes a big/little-endian swap is needed.
00139 bool STRING::DeSerialize(bool swap, FILE* fp) {
00140   inT32 len;
00141   if (fread(&len, sizeof(len), 1, fp) != 1) return false;
00142   if (swap)
00143     ReverseN(&len, sizeof(len));
00144   truncate_at(len);
00145   if (static_cast<int>(fread(GetCStr(), 1, len, fp)) != len) return false;
00146   return true;
00147 }
00148 
00149 BOOL8 STRING::contains(const char c) const {
00150   return (c != '\0') && (strchr (GetCStr(), c) != NULL);
00151 }
00152 
00153 inT32 STRING::length() const {
00154   FixHeader();
00155   return GetHeader()->used_ - 1;
00156 }
00157 
00158 const char* STRING::string() const {
00159   const STRING_HEADER* header = GetHeader();
00160   if (header->used_ == 0)
00161     return NULL;
00162 
00163   // mark header length unreliable because tesseract might
00164   // cast away the const and mutate the string directly.
00165   header->used_ = -1;
00166   return GetCStr();
00167 }
00168 
00169 const char* STRING::c_str() const {
00170   return string();
00171 }
00172 
00173 /******
00174  * The STRING_IS_PROTECTED interface adds additional support to migrate
00175  * code that needs to modify the STRING in ways not otherwise supported
00176  * without violating encapsulation.
00177  *
00178  * Also makes the [] operator return a const so it is immutable
00179  */
00180 #if STRING_IS_PROTECTED
00181 const char& STRING::operator[](inT32 index) const {
00182   return GetCStr()[index];
00183 }
00184 
00185 void STRING::insert_range(inT32 index, const char* str, int len) {
00186   // if index is outside current range, then also grow size of string
00187   // to accmodate the requested range.
00188   STRING_HEADER* this_header = GetHeader();
00189   int used = this_header->used_;
00190   if (index > used)
00191     used = index;
00192 
00193   char* this_cstr = ensure_cstr(used + len + 1);
00194   if (index < used) {
00195     // move existing string from index to '\0' inclusive.
00196     memmove(this_cstr + index + len,
00197            this_cstr + index,
00198            this_header->used_ - index);
00199   } else if (len > 0) {
00200     // We are going to overwrite previous null terminator, so write the new one.
00201     this_cstr[this_header->used_ + len - 1] = '\0';
00202 
00203     // If the old header did not have the terminator,
00204     // then we need to account for it now that we've added it.
00205     // Otherwise it was already accounted for; we just moved it.
00206     if (this_header->used_ == 0)
00207       ++this_header->used_;
00208   }
00209 
00210   // Write new string to index.
00211   // The string is already terminated from the conditions above.
00212   memcpy(this_cstr + index, str, len);
00213   this_header->used_ += len;
00214 
00215   assert(InvariantOk());
00216 }
00217 
00218 void STRING::erase_range(inT32 index, int len) {
00219   char* this_cstr = GetCStr();
00220   STRING_HEADER* this_header = GetHeader();
00221 
00222   memcpy(this_cstr+index, this_cstr+index+len,
00223          this_header->used_ - index - len);
00224   this_header->used_ -= len;
00225   assert(InvariantOk());
00226 }
00227 
00228 #else
00229 void STRING::truncate_at(inT32 index) {
00230   ASSERT_HOST(index >= 0);
00231   FixHeader();
00232   char* this_cstr = ensure_cstr(index + 1);
00233   this_cstr[index] = '\0';
00234   GetHeader()->used_ = index + 1;
00235   assert(InvariantOk());
00236 }
00237 
00238 char& STRING::operator[](inT32 index) const {
00239   // Code is casting away this const and mutating the string,
00240   // so mark used_ as -1 to flag it unreliable.
00241   GetHeader()->used_ = -1;
00242   return ((char *)GetCStr())[index];
00243 }
00244 #endif
00245 
00246 void STRING::split(const char c, GenericVector<STRING> *splited) {
00247   int start_index = 0;
00248   for (int i = 0; i < length(); i++) {
00249     if ((*this)[i] == c) {
00250       if (i != start_index) {
00251         (*this)[i] = '\0';
00252         STRING tmp = GetCStr() + start_index;
00253         splited->push_back(tmp);
00254         (*this)[i] = c;
00255       }
00256       start_index = i + 1;
00257     }
00258   }
00259 
00260   if (length() != start_index) {
00261     STRING tmp = GetCStr() + start_index;
00262     splited->push_back(tmp);
00263   }
00264 }
00265 
00266 BOOL8 STRING::operator==(const STRING& str) const {
00267   FixHeader();
00268   str.FixHeader();
00269   const STRING_HEADER* str_header = str.GetHeader();
00270   const STRING_HEADER* this_header = GetHeader();
00271   int this_used = this_header->used_;
00272   int str_used  = str_header->used_;
00273 
00274   return (this_used == str_used)
00275           && (memcmp(GetCStr(), str.GetCStr(), this_used) == 0);
00276 }
00277 
00278 BOOL8 STRING::operator!=(const STRING& str) const {
00279   FixHeader();
00280   str.FixHeader();
00281   const STRING_HEADER* str_header = str.GetHeader();
00282   const STRING_HEADER* this_header = GetHeader();
00283   int this_used = this_header->used_;
00284   int str_used  = str_header->used_;
00285 
00286   return (this_used != str_used)
00287          || (memcmp(GetCStr(), str.GetCStr(), this_used) != 0);
00288 }
00289 
00290 BOOL8 STRING::operator!=(const char* cstr) const {
00291   FixHeader();
00292   const STRING_HEADER* this_header = GetHeader();
00293 
00294   if (cstr == NULL)
00295     return this_header->used_ > 1;  // either '\0' or NULL
00296   else {
00297     inT32 length = strlen(cstr) + 1;
00298     return (this_header->used_ != length)
00299             || (memcmp(GetCStr(), cstr, length) != 0);
00300   }
00301 }
00302 
00303 STRING& STRING::operator=(const STRING& str) {
00304   str.FixHeader();
00305   const STRING_HEADER* str_header = str.GetHeader();
00306   int   str_used = str_header->used_;
00307 
00308   GetHeader()->used_ = 0;  // clear since ensure doesnt need to copy data
00309   char* this_cstr = ensure_cstr(str_used);
00310   STRING_HEADER* this_header = GetHeader();
00311 
00312   memcpy(this_cstr, str.GetCStr(), str_used);
00313   this_header->used_ = str_used;
00314 
00315   assert(InvariantOk());
00316   return *this;
00317 }
00318 
00319 STRING & STRING::operator+=(const STRING& str) {
00320   FixHeader();
00321   str.FixHeader();
00322   const STRING_HEADER* str_header = str.GetHeader();
00323   const char* str_cstr = str.GetCStr();
00324   int  str_used  = str_header->used_;
00325   int  this_used = GetHeader()->used_;
00326   char* this_cstr = ensure_cstr(this_used + str_used);
00327 
00328   STRING_HEADER* this_header = GetHeader();  // after ensure for realloc
00329 
00330   if (this_used > 1) {
00331     memcpy(this_cstr + this_used - 1, str_cstr, str_used);
00332     this_header->used_ += str_used - 1;  // overwrite '\0'
00333   } else {
00334     memcpy(this_cstr, str_cstr, str_used);
00335     this_header->used_ = str_used;
00336   }
00337 
00338   assert(InvariantOk());
00339   return *this;
00340 }
00341 
00342 void STRING::add_str_int(const char* str, int number) {
00343   if (str != NULL)
00344     *this += str;
00345   // Allow space for the maximum possible length of inT64.
00346   char num_buffer[kMaxIntSize];
00347   snprintf(num_buffer, kMaxIntSize - 1, "%d", number);
00348   num_buffer[kMaxIntSize - 1] = '\0';
00349   *this += num_buffer;
00350 }
00351 // Appends the given string and double (as a %.8g) to this.
00352 void STRING::add_str_double(const char* str, double number) {
00353   if (str != NULL)
00354     *this += str;
00355   // Allow space for the maximum possible length of %8g.
00356   char num_buffer[kMaxDoubleSize];
00357   snprintf(num_buffer, kMaxDoubleSize - 1, "%.8g", number);
00358   num_buffer[kMaxDoubleSize - 1] = '\0';
00359   *this += num_buffer;
00360 }
00361 
00362 STRING & STRING::operator=(const char* cstr) {
00363   STRING_HEADER* this_header = GetHeader();
00364   if (cstr) {
00365     int len = strlen(cstr) + 1;
00366 
00367     this_header->used_ = 0;  // dont bother copying data if need to realloc
00368     char* this_cstr = ensure_cstr(len);
00369     this_header = GetHeader();  // for realloc
00370     memcpy(this_cstr, cstr, len);
00371     this_header->used_ = len;
00372   } else {
00373     // Reallocate to same state as default constructor.
00374     DiscardData();
00375     // Empty STRINGs contain just the "\0".
00376     memcpy(AllocData(1, kMinCapacity), "", 1);
00377   }
00378 
00379   assert(InvariantOk());
00380   return *this;
00381 }
00382 
00383 void STRING::assign(const char *cstr, int len) {
00384   STRING_HEADER* this_header = GetHeader();
00385   this_header->used_ = 0;  // dont bother copying data if need to realloc
00386   char* this_cstr = ensure_cstr(len + 1);  // +1 for '\0'
00387 
00388   this_header = GetHeader();  // for realloc
00389   memcpy(this_cstr, cstr, len);
00390   this_cstr[len] = '\0';
00391   this_header->used_ = len + 1;
00392 
00393   assert(InvariantOk());
00394 }
00395 
00396 STRING STRING::operator+(const STRING& str) const {
00397   STRING result(*this);
00398   result += str;
00399 
00400   assert(InvariantOk());
00401   return result;
00402 }
00403 
00404 
00405 STRING STRING::operator+(const char ch) const {
00406   STRING result;
00407   FixHeader();
00408   const STRING_HEADER* this_header = GetHeader();
00409   int this_used = this_header->used_;
00410   char* result_cstr = result.ensure_cstr(this_used + 1);
00411   STRING_HEADER* result_header = result.GetHeader();
00412   int result_used = result_header->used_;
00413 
00414   // copies '\0' but we'll overwrite that
00415   memcpy(result_cstr, GetCStr(), this_used);
00416   result_cstr[result_used] = ch;      // overwrite old '\0'
00417   result_cstr[result_used + 1] = '\0';  // append on '\0'
00418   ++result_header->used_;
00419 
00420   assert(InvariantOk());
00421   return result;
00422 }
00423 
00424 
00425 STRING&  STRING::operator+=(const char *str) {
00426   if (!str || !*str)  // empty string has no effect
00427     return *this;
00428 
00429   FixHeader();
00430   int len = strlen(str) + 1;
00431   int this_used = GetHeader()->used_;
00432   char* this_cstr = ensure_cstr(this_used + len);
00433   STRING_HEADER* this_header = GetHeader();  // after ensure for realloc
00434 
00435   // if we had non-empty string then append overwriting old '\0'
00436   // otherwise replace
00437   if (this_used > 0) {
00438     memcpy(this_cstr + this_used - 1, str, len);
00439     this_header->used_ += len - 1;
00440   } else {
00441     memcpy(this_cstr, str, len);
00442     this_header->used_ = len;
00443   }
00444 
00445   assert(InvariantOk());
00446   return *this;
00447 }
00448 
00449 
00450 STRING& STRING::operator+=(const char ch) {
00451   if (ch == '\0')
00452     return *this;
00453 
00454   FixHeader();
00455   int   this_used = GetHeader()->used_;
00456   char* this_cstr = ensure_cstr(this_used + 1);
00457   STRING_HEADER* this_header = GetHeader();
00458 
00459   if (this_used > 0)
00460     --this_used; // undo old empty null if there was one
00461 
00462   this_cstr[this_used++] = ch;   // append ch to end
00463   this_cstr[this_used++] = '\0'; // append '\0' after ch
00464   this_header->used_ = this_used;
00465 
00466   assert(InvariantOk());
00467   return *this;
00468 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines