tesseract
3.03
|
00001 // Copyright 2011 Google Inc. All Rights Reserved. 00002 // Author: rays@google.com (Ray Smith) 00004 // File: bitvector.cpp 00005 // Description: Class replacement for BITVECTOR. 00006 // Author: Ray Smith 00007 // Created: Mon Jan 10 17:45:01 PST 2011 00008 // 00009 // (C) Copyright 2011, Google Inc. 00010 // Licensed under the Apache License, Version 2.0 (the "License"); 00011 // you may not use this file except in compliance with the License. 00012 // You may obtain a copy of the License at 00013 // http://www.apache.org/licenses/LICENSE-2.0 00014 // Unless required by applicable law or agreed to in writing, software 00015 // distributed under the License is distributed on an "AS IS" BASIS, 00016 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00017 // See the License for the specific language governing permissions and 00018 // limitations under the License. 00019 // 00021 00022 #include "bitvector.h" 00023 #include <string.h> 00024 #include "helpers.h" 00025 #include "ndminx.h" 00026 00027 namespace tesseract { 00028 00029 // Fast lookup table to get the first least significant set bit in a byte. 00030 // For zero, the table has 255, but since it is a special case, most code 00031 // that uses this table will check for zero before looking up lsb_index_. 00032 const uinT8 BitVector::lsb_index_[256] = { 00033 255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00034 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00035 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00036 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00037 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00038 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00039 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00040 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00041 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00042 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00043 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00044 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00045 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00046 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00047 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 00048 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 00049 }; 00050 00051 // Fast lookup table to get the residual bits after zeroing the first (lowest) 00052 // set bit in a byte. 00053 const uinT8 BitVector::lsb_eroded_[256] = { 00054 0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6, 00055 0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e, 00056 0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16, 00057 0x10, 0x18, 0x18, 0x1a, 0x18, 0x1c, 0x1c, 0x1e, 00058 0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26, 00059 0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e, 00060 0x20, 0x30, 0x30, 0x32, 0x30, 0x34, 0x34, 0x36, 00061 0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e, 00062 0, 0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46, 00063 0x40, 0x48, 0x48, 0x4a, 0x48, 0x4c, 0x4c, 0x4e, 00064 0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56, 00065 0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e, 00066 0x40, 0x60, 0x60, 0x62, 0x60, 0x64, 0x64, 0x66, 00067 0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e, 00068 0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76, 00069 0x70, 0x78, 0x78, 0x7a, 0x78, 0x7c, 0x7c, 0x7e, 00070 0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86, 00071 0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e, 00072 0x80, 0x90, 0x90, 0x92, 0x90, 0x94, 0x94, 0x96, 00073 0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e, 00074 0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6, 00075 0xa0, 0xa8, 0xa8, 0xaa, 0xa8, 0xac, 0xac, 0xae, 00076 0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6, 00077 0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe, 00078 0x80, 0xc0, 0xc0, 0xc2, 0xc0, 0xc4, 0xc4, 0xc6, 00079 0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce, 00080 0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6, 00081 0xd0, 0xd8, 0xd8, 0xda, 0xd8, 0xdc, 0xdc, 0xde, 00082 0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6, 00083 0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee, 00084 0xe0, 0xf0, 0xf0, 0xf2, 0xf0, 0xf4, 0xf4, 0xf6, 00085 0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 0xfc, 0xfe 00086 }; 00087 00088 // Fast lookup table to give the number of set bits in a byte. 00089 const int BitVector::hamming_table_[256] = { 00090 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 00091 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 00092 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 00093 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 00094 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 00095 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 00096 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 00097 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 00098 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 00099 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 00100 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 00101 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 00102 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 00103 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 00104 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 00105 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 00106 }; 00107 00108 00109 BitVector::BitVector() : bit_size_(0), array_(NULL) {} 00110 00111 BitVector::BitVector(int length) : bit_size_(length) { 00112 array_ = new uinT32[WordLength()]; 00113 SetAllFalse(); 00114 } 00115 00116 BitVector::BitVector(const BitVector& src) : bit_size_(src.bit_size_) { 00117 array_ = new uinT32[WordLength()]; 00118 memcpy(array_, src.array_, ByteLength()); 00119 } 00120 00121 BitVector& BitVector::operator=(const BitVector& src) { 00122 Alloc(src.bit_size_); 00123 memcpy(array_, src.array_, ByteLength()); 00124 return *this; 00125 } 00126 00127 BitVector::~BitVector() { 00128 delete [] array_; 00129 } 00130 00131 // Initializes the array to length * false. 00132 void BitVector::Init(int length) { 00133 Alloc(length); 00134 SetAllFalse(); 00135 } 00136 00137 // Writes to the given file. Returns false in case of error. 00138 bool BitVector::Serialize(FILE* fp) const { 00139 if (fwrite(&bit_size_, sizeof(bit_size_), 1, fp) != 1) return false; 00140 int wordlen = WordLength(); 00141 if (static_cast<int>(fwrite(array_, sizeof(*array_), wordlen, fp)) != wordlen) 00142 return false; 00143 return true; 00144 } 00145 00146 // Reads from the given file. Returns false in case of error. 00147 // If swap is true, assumes a big/little-endian swap is needed. 00148 bool BitVector::DeSerialize(bool swap, FILE* fp) { 00149 uinT32 new_bit_size; 00150 if (fread(&new_bit_size, sizeof(new_bit_size), 1, fp) != 1) return false; 00151 if (swap) { 00152 ReverseN(&new_bit_size, sizeof(new_bit_size)); 00153 } 00154 Alloc(new_bit_size); 00155 int wordlen = WordLength(); 00156 if (static_cast<int>(fread(array_, sizeof(*array_), wordlen, fp)) != wordlen) 00157 return false; 00158 if (swap) { 00159 for (int i = 0; i < wordlen; ++i) 00160 ReverseN(&array_[i], sizeof(array_[i])); 00161 } 00162 return true; 00163 } 00164 00165 void BitVector::SetAllFalse() { 00166 memset(array_, 0, ByteLength()); 00167 } 00168 void BitVector::SetAllTrue() { 00169 memset(array_, ~0, ByteLength()); 00170 } 00171 00172 // Returns the index of the next set bit after the given index. 00173 // Useful for quickly iterating through the set bits in a sparse vector. 00174 int BitVector::NextSetBit(int prev_bit) const { 00175 // Move on to the next bit. 00176 int next_bit = prev_bit + 1; 00177 if (next_bit >= bit_size_) return -1; 00178 // Check the remains of the word containing the next_bit first. 00179 int next_word = WordIndex(next_bit); 00180 int bit_index = next_word * kBitFactor; 00181 int word_end = bit_index + kBitFactor; 00182 uinT32 word = array_[next_word]; 00183 uinT8 byte = word & 0xff; 00184 while (bit_index < word_end) { 00185 if (bit_index + 8 > next_bit && byte != 0) { 00186 while (bit_index + lsb_index_[byte] < next_bit && byte != 0) 00187 byte = lsb_eroded_[byte]; 00188 if (byte != 0) 00189 return bit_index + lsb_index_[byte]; 00190 } 00191 word >>= 8; 00192 bit_index += 8; 00193 byte = word & 0xff; 00194 } 00195 // next_word didn't contain a 1, so find the next word with set bit. 00196 ++next_word; 00197 int wordlen = WordLength(); 00198 while (next_word < wordlen && (word = array_[next_word]) == 0) { 00199 ++next_word; 00200 bit_index += kBitFactor; 00201 } 00202 if (bit_index >= bit_size_) return -1; 00203 // Find the first non-zero byte within the word. 00204 while ((word & 0xff) == 0) { 00205 word >>= 8; 00206 bit_index += 8; 00207 } 00208 return bit_index + lsb_index_[word & 0xff]; 00209 } 00210 00211 // Returns the number of set bits in the vector. 00212 int BitVector::NumSetBits() const { 00213 int wordlen = WordLength(); 00214 int total_bits = 0; 00215 for (int w = 0; w < wordlen; ++w) { 00216 uinT32 word = array_[w]; 00217 for (int i = 0; i < 4; ++i) { 00218 total_bits += hamming_table_[word & 0xff]; 00219 word >>= 8; 00220 } 00221 } 00222 return total_bits; 00223 } 00224 00225 // Logical in-place operations on whole bit vectors. Tries to do something 00226 // sensible if they aren't the same size, but they should be really. 00227 void BitVector::operator|=(const BitVector& other) { 00228 int length = MIN(WordLength(), other.WordLength()); 00229 for (int w = 0; w < length; ++w) 00230 array_[w] |= other.array_[w]; 00231 } 00232 void BitVector::operator&=(const BitVector& other) { 00233 int length = MIN(WordLength(), other.WordLength()); 00234 for (int w = 0; w < length; ++w) 00235 array_[w] &= other.array_[w]; 00236 for (int w = WordLength() - 1; w >= length; --w) 00237 array_[w] = 0; 00238 } 00239 void BitVector::operator^=(const BitVector& other) { 00240 int length = MIN(WordLength(), other.WordLength()); 00241 for (int w = 0; w < length; ++w) 00242 array_[w] ^= other.array_[w]; 00243 } 00244 // Set subtraction *this = v1 - v2. 00245 void BitVector::SetSubtract(const BitVector& v1, const BitVector& v2) { 00246 Alloc(v1.size()); 00247 int length = MIN(v1.WordLength(), v2.WordLength()); 00248 for (int w = 0; w < length; ++w) 00249 array_[w] = v1.array_[w] ^ (v1.array_[w] & v2.array_[w]); 00250 for (int w = WordLength() - 1; w >= length; --w) 00251 array_[w] = v1.array_[w]; 00252 } 00253 00254 // Allocates memory for a vector of the given length. 00255 // Reallocates if the array is a different size, larger or smaller. 00256 void BitVector::Alloc(int length) { 00257 int initial_wordlength = WordLength(); 00258 bit_size_ = length; 00259 int new_wordlength = WordLength(); 00260 if (new_wordlength != initial_wordlength) { 00261 delete [] array_; 00262 array_ = new uinT32[new_wordlength]; 00263 } 00264 } 00265 00266 00267 } // namespace tesseract. 00268 00269