tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccutil/bitvector.cpp
Go to the documentation of this file.
00001 // Copyright 2011 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00004 // File:        bitvector.cpp
00005 // Description: Class replacement for BITVECTOR.
00006 // Author:      Ray Smith
00007 // Created:     Mon Jan 10 17:45:01 PST 2011
00008 //
00009 // (C) Copyright 2011, Google Inc.
00010 // Licensed under the Apache License, Version 2.0 (the "License");
00011 // you may not use this file except in compliance with the License.
00012 // You may obtain a copy of the License at
00013 // http://www.apache.org/licenses/LICENSE-2.0
00014 // Unless required by applicable law or agreed to in writing, software
00015 // distributed under the License is distributed on an "AS IS" BASIS,
00016 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00017 // See the License for the specific language governing permissions and
00018 // limitations under the License.
00019 //
00021 
00022 #include "bitvector.h"
00023 #include <string.h>
00024 #include "helpers.h"
00025 #include "ndminx.h"
00026 
00027 namespace tesseract {
00028 
00029 // Fast lookup table to get the first least significant set bit in a byte.
00030 // For zero, the table has 255, but since it is a special case, most code
00031 // that uses this table will check for zero before looking up lsb_index_.
00032 const uinT8 BitVector::lsb_index_[256] = {
00033   255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00034   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00035   5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00036   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00037   6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00038   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00039   5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00040   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00041   7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00042   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00043   5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00044   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00045   6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00046   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00047   5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
00048   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
00049 };
00050 
00051 // Fast lookup table to get the residual bits after zeroing the first (lowest)
00052 // set bit in a byte.
00053 const uinT8 BitVector::lsb_eroded_[256] = {
00054   0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6,
00055   0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e,
00056   0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16,
00057   0x10, 0x18, 0x18, 0x1a, 0x18, 0x1c, 0x1c, 0x1e,
00058   0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26,
00059   0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e,
00060   0x20, 0x30, 0x30, 0x32, 0x30, 0x34, 0x34, 0x36,
00061   0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e,
00062   0, 0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46,
00063   0x40, 0x48, 0x48, 0x4a, 0x48, 0x4c, 0x4c, 0x4e,
00064   0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56,
00065   0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e,
00066   0x40, 0x60, 0x60, 0x62, 0x60, 0x64, 0x64, 0x66,
00067   0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e,
00068   0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76,
00069   0x70, 0x78, 0x78, 0x7a, 0x78, 0x7c, 0x7c, 0x7e,
00070   0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86,
00071   0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e,
00072   0x80, 0x90, 0x90, 0x92, 0x90, 0x94, 0x94, 0x96,
00073   0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e,
00074   0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6,
00075   0xa0, 0xa8, 0xa8, 0xaa, 0xa8, 0xac, 0xac, 0xae,
00076   0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6,
00077   0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe,
00078   0x80, 0xc0, 0xc0, 0xc2, 0xc0, 0xc4, 0xc4, 0xc6,
00079   0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce,
00080   0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6,
00081   0xd0, 0xd8, 0xd8, 0xda, 0xd8, 0xdc, 0xdc, 0xde,
00082   0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6,
00083   0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee,
00084   0xe0, 0xf0, 0xf0, 0xf2, 0xf0, 0xf4, 0xf4, 0xf6,
00085   0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 0xfc, 0xfe
00086 };
00087 
00088 // Fast lookup table to give the number of set bits in a byte.
00089 const int BitVector::hamming_table_[256] = {
00090     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
00091     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
00092     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
00093     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
00094     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
00095     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
00096     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
00097     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
00098     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
00099     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
00100     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
00101     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
00102     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
00103     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
00104     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
00105     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
00106 };
00107 
00108 
00109 BitVector::BitVector() : bit_size_(0), array_(NULL) {}
00110 
00111 BitVector::BitVector(int length) : bit_size_(length) {
00112   array_ = new uinT32[WordLength()];
00113   SetAllFalse();
00114 }
00115 
00116 BitVector::BitVector(const BitVector& src) : bit_size_(src.bit_size_) {
00117   array_ = new uinT32[WordLength()];
00118   memcpy(array_, src.array_, ByteLength());
00119 }
00120 
00121 BitVector& BitVector::operator=(const BitVector& src) {
00122   Alloc(src.bit_size_);
00123   memcpy(array_, src.array_, ByteLength());
00124   return *this;
00125 }
00126 
00127 BitVector::~BitVector() {
00128   delete [] array_;
00129 }
00130 
00131 // Initializes the array to length * false.
00132 void BitVector::Init(int length) {
00133   Alloc(length);
00134   SetAllFalse();
00135 }
00136 
00137 // Writes to the given file. Returns false in case of error.
00138 bool BitVector::Serialize(FILE* fp) const {
00139   if (fwrite(&bit_size_, sizeof(bit_size_), 1, fp) != 1) return false;
00140   int wordlen = WordLength();
00141   if (static_cast<int>(fwrite(array_, sizeof(*array_), wordlen, fp)) != wordlen)
00142       return false;
00143   return true;
00144 }
00145 
00146 // Reads from the given file. Returns false in case of error.
00147 // If swap is true, assumes a big/little-endian swap is needed.
00148 bool BitVector::DeSerialize(bool swap, FILE* fp) {
00149   uinT32 new_bit_size;
00150   if (fread(&new_bit_size, sizeof(new_bit_size), 1, fp) != 1) return false;
00151   if (swap) {
00152     ReverseN(&new_bit_size, sizeof(new_bit_size));
00153   }
00154   Alloc(new_bit_size);
00155   int wordlen = WordLength();
00156   if (static_cast<int>(fread(array_, sizeof(*array_), wordlen, fp)) != wordlen)
00157       return false;
00158   if (swap) {
00159     for (int i = 0; i < wordlen; ++i)
00160       ReverseN(&array_[i], sizeof(array_[i]));
00161   }
00162   return true;
00163 }
00164 
00165 void BitVector::SetAllFalse() {
00166   memset(array_, 0, ByteLength());
00167 }
00168 void BitVector::SetAllTrue() {
00169   memset(array_, ~0, ByteLength());
00170 }
00171 
00172 // Returns the index of the next set bit after the given index.
00173 // Useful for quickly iterating through the set bits in a sparse vector.
00174 int BitVector::NextSetBit(int prev_bit) const {
00175   // Move on to the next bit.
00176   int next_bit = prev_bit + 1;
00177   if (next_bit >= bit_size_) return -1;
00178   // Check the remains of the word containing the next_bit first.
00179   int next_word = WordIndex(next_bit);
00180   int bit_index = next_word * kBitFactor;
00181   int word_end = bit_index + kBitFactor;
00182   uinT32 word = array_[next_word];
00183   uinT8 byte = word & 0xff;
00184   while (bit_index < word_end) {
00185     if (bit_index + 8 > next_bit && byte != 0) {
00186       while (bit_index + lsb_index_[byte] < next_bit && byte != 0)
00187         byte = lsb_eroded_[byte];
00188       if (byte != 0)
00189         return bit_index + lsb_index_[byte];
00190     }
00191     word >>= 8;
00192     bit_index += 8;
00193     byte = word & 0xff;
00194   }
00195   // next_word didn't contain a 1, so find the next word with set bit.
00196   ++next_word;
00197   int wordlen = WordLength();
00198   while (next_word < wordlen && (word = array_[next_word]) == 0) {
00199     ++next_word;
00200     bit_index += kBitFactor;
00201   }
00202   if (bit_index >= bit_size_) return -1;
00203   // Find the first non-zero byte within the word.
00204   while ((word & 0xff) == 0) {
00205     word >>= 8;
00206     bit_index += 8;
00207   }
00208   return bit_index + lsb_index_[word & 0xff];
00209 }
00210 
00211 // Returns the number of set bits in the vector.
00212 int BitVector::NumSetBits() const {
00213   int wordlen = WordLength();
00214   int total_bits = 0;
00215   for (int w = 0; w < wordlen; ++w) {
00216     uinT32 word = array_[w];
00217     for (int i = 0; i < 4; ++i) {
00218       total_bits += hamming_table_[word & 0xff];
00219       word >>= 8;
00220     }
00221   }
00222   return total_bits;
00223 }
00224 
00225 // Logical in-place operations on whole bit vectors. Tries to do something
00226 // sensible if they aren't the same size, but they should be really.
00227 void BitVector::operator|=(const BitVector& other) {
00228   int length = MIN(WordLength(), other.WordLength());
00229   for (int w = 0; w < length; ++w)
00230     array_[w] |= other.array_[w];
00231 }
00232 void BitVector::operator&=(const BitVector& other) {
00233   int length = MIN(WordLength(), other.WordLength());
00234   for (int w = 0; w < length; ++w)
00235     array_[w] &= other.array_[w];
00236   for (int w = WordLength() - 1; w >= length; --w)
00237     array_[w] = 0;
00238 }
00239 void BitVector::operator^=(const BitVector& other) {
00240   int length = MIN(WordLength(), other.WordLength());
00241   for (int w = 0; w < length; ++w)
00242     array_[w] ^= other.array_[w];
00243 }
00244 // Set subtraction *this = v1 - v2.
00245 void BitVector::SetSubtract(const BitVector& v1, const BitVector& v2) {
00246   Alloc(v1.size());
00247   int length = MIN(v1.WordLength(), v2.WordLength());
00248   for (int w = 0; w < length; ++w)
00249     array_[w] = v1.array_[w] ^ (v1.array_[w] & v2.array_[w]);
00250   for (int w = WordLength() - 1; w >= length; --w)
00251     array_[w] = v1.array_[w];
00252 }
00253 
00254 // Allocates memory for a vector of the given length.
00255 // Reallocates if the array is a different size, larger or smaller.
00256 void BitVector::Alloc(int length) {
00257   int initial_wordlength = WordLength();
00258   bit_size_ = length;
00259   int new_wordlength = WordLength();
00260   if (new_wordlength != initial_wordlength) {
00261     delete [] array_;
00262     array_ = new uinT32[new_wordlength];
00263   }
00264 }
00265 
00266 
00267 }  // namespace tesseract.
00268 
00269 
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines