tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccutil/genericheap.h
Go to the documentation of this file.
00001 // Copyright 2012 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00004 // File:        genericheap.h
00005 // Description: Template heap class.
00006 // Author:      Ray Smith, based on Dan Johnson's original code.
00007 // Created:     Wed Mar 14 08:13:00 PDT 2012
00008 //
00009 // (C) Copyright 2012, Google Inc.
00010 // Licensed under the Apache License, Version 2.0 (the "License");
00011 // you may not use this file except in compliance with the License.
00012 // You may obtain a copy of the License at
00013 // http://www.apache.org/licenses/LICENSE-2.0
00014 // Unless required by applicable law or agreed to in writing, software
00015 // distributed under the License is distributed on an "AS IS" BASIS,
00016 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00017 // See the License for the specific language governing permissions and
00018 // limitations under the License.
00019 //
00021 
00022 #include "errcode.h"
00023 #include "genericvector.h"
00024 
00025 #ifndef TESSERACT_CCUTIL_GENERICHEAP_H_
00026 #define TESSERACT_CCUTIL_GENERICHEAP_H_
00027 
00028 namespace tesseract {
00029 
00030 // GenericHeap requires 1 template argument:
00031 // Pair will normally be either KDPairInc<Key, Data> or KDPairDec<Key, Data>
00032 // for some arbitrary Key and scalar, smart pointer, or non-ownership pointer
00033 // Data type, according to whether a MIN heap or a MAX heap is desired,
00034 // respectively. Using KDPtrPairInc<Key, Data> or KDPtrPairDec<Key, Data>,
00035 // GenericHeap can also handle simple Data pointers and own them.
00036 // If no additional data is required, Pair can also be a scalar, since
00037 // GenericHeap doesn't look inside it except for operator<.
00038 //
00039 // The heap is stored as a packed binary tree in an array hosted by a
00040 // GenericVector<Pair>, with the invariant that the children of each node are
00041 // both NOT Pair::operator< the parent node. KDPairInc defines Pair::operator<
00042 // to use Key::operator< to generate a MIN heap and KDPairDec defines
00043 // Pair::operator< to use Key::operator> to generate a MAX heap by reversing
00044 // all the comparisons.
00045 // See http://en.wikipedia.org/wiki/Heap_(data_structure) for more detail on
00046 // the basic heap implementation.
00047 //
00048 // Insertion and removal are both O(log n) and, unlike the STL heap, an
00049 // explicit Reshuffle function allows a node to be repositioned in time O(log n)
00050 // after changing its value.
00051 //
00052 // Accessing the element for revaluation is a more complex matter, since the
00053 // index and pointer can be changed arbitrarily by heap operations.
00054 // Revaluation can be done by making the Data type in the Pair derived from or
00055 // contain a DoublePtr as its first data element, making it possible to convert
00056 // the pointer to a Pair using KDPairInc::RecastDataPointer.
00057 template <typename Pair>
00058 class GenericHeap {
00059  public:
00060   GenericHeap() {}
00061   // The initial size is only a GenericVector::reserve. It is not enforced as
00062   // the size limit of the heap. Caller must implement their own enforcement.
00063   explicit GenericHeap(int initial_size) {
00064     heap_.reserve(initial_size);
00065   }
00066 
00067   // Simple accessors.
00068   bool empty() const {
00069     return heap_.empty();
00070   }
00071   int size() const {
00072     return heap_.size();
00073   }
00074   int size_reserved() const {
00075     return heap_.size_reserved();
00076   }
00077   void clear() {
00078     // Clear truncates to 0 to keep the number reserved in tact.
00079     heap_.truncate(0);
00080   }
00081   // Provides access to the underlying vector.
00082   // Caution! any changes that modify the keys will invalidate the heap!
00083   GenericVector<Pair>* heap() {
00084     return &heap_;
00085   }
00086   // Provides read-only access to an element of the underlying vector.
00087   const Pair& get(int index) const {
00088     return heap_[index];
00089   }
00090 
00091   // Add entry to the heap, keeping the smallest item at the top, by operator<.
00092   // Note that *entry is used as the source of operator=, but it is non-const
00093   // to allow for a smart pointer to be contained within.
00094   // Time = O(log n).
00095   void Push(Pair* entry) {
00096     int hole_index = heap_.size();
00097     // Make a hole in the end of heap_ and sift it up to be the correct
00098     // location for the new *entry. To avoid needing a default constructor
00099     // for primitive types, and to allow for use of DoublePtr in the Pair
00100     // somewhere, we have to incur a double copy here.
00101     heap_.push_back(*entry);
00102     *entry = heap_.back();
00103     hole_index = SiftUp(hole_index, *entry);
00104     heap_[hole_index] = *entry;
00105   }
00106 
00107   // Get the value of the top (smallest, defined by operator< ) element.
00108   const Pair& PeekTop() const {
00109     return heap_[0];
00110   }
00111 
00112   // Removes the top element of the heap. If entry is not NULL, the element
00113   // is copied into *entry, otherwise it is discarded.
00114   // Returns false if the heap was already empty.
00115   // Time = O(log n).
00116   bool Pop(Pair* entry) {
00117     int new_size = heap_.size() - 1;
00118     if (new_size < 0)
00119       return false;  // Already empty.
00120     if (entry != NULL)
00121       *entry = heap_[0];
00122     if (new_size > 0) {
00123       // Sift the hole at the start of the heap_ downwards to match the last
00124       // element.
00125       Pair hole_pair = heap_[new_size];
00126       heap_.truncate(new_size);
00127       int hole_index = SiftDown(0, hole_pair);
00128       heap_[hole_index] = hole_pair;
00129     } else {
00130       heap_.truncate(new_size);
00131     }
00132     return true;
00133   }
00134 
00135   // Removes the MAXIMUM element of the heap. (MIN from a MAX heap.) If entry is
00136   // not NULL, the element is copied into *entry, otherwise it is discarded.
00137   // Time = O(n). Returns false if the heap was already empty.
00138   bool PopWorst(Pair* entry) {
00139     int heap_size = heap_.size();
00140     if (heap_size == 0) return false;  // It cannot be empty!
00141 
00142     // Find the maximum element. Its index is guaranteed to be greater than
00143     // the index of the parent of the last element, since by the heap invariant
00144     // the parent must be less than or equal to the children.
00145     int worst_index = heap_size - 1;
00146     int end_parent = ParentNode(worst_index);
00147     for (int i = worst_index - 1; i > end_parent; --i) {
00148       if (heap_[worst_index] < heap_[i])
00149         worst_index = i;
00150     }
00151     // Extract the worst element from the heap, leaving a hole at worst_index.
00152     if (entry != NULL)
00153       *entry = heap_[worst_index];
00154     --heap_size;
00155     if (heap_size > 0) {
00156       // Sift the hole upwards to match the last element of the heap_
00157       Pair hole_pair = heap_[heap_size];
00158       int hole_index = SiftUp(worst_index, hole_pair);
00159       heap_[hole_index] = hole_pair;
00160     }
00161     heap_.truncate(heap_size);
00162     return true;
00163   }
00164 
00165   // The pointed-to Pair has changed its key value, so the location of pair
00166   // is reshuffled to maintain the heap invariant.
00167   // Must be a valid pointer to an element of the heap_!
00168   // Caution! Since GenericHeap is based on GenericVector, reallocs may occur
00169   // whenever the vector is extended and elements may get shuffled by any
00170   // Push or Pop operation. Therefore use this function only if Data in Pair is
00171   // of type DoublePtr, derived (first) from DoublePtr, or has a DoublePtr as
00172   // its first element. Reshuffles the heap to maintain the invariant.
00173   // Time = O(log n).
00174   void Reshuffle(Pair* pair) {
00175     int index = pair - &heap_[0];
00176     Pair hole_pair = heap_[index];
00177     index = SiftDown(index, hole_pair);
00178     index = SiftUp(index, hole_pair);
00179     heap_[index] = hole_pair;
00180   }
00181 
00182  private:
00183   // A hole in the heap exists at hole_index, and we want to fill it with the
00184   // given pair. SiftUp sifts the hole upward to the correct position and
00185   // returns the destination index without actually putting pair there.
00186   int SiftUp(int hole_index, const Pair& pair) {
00187     int parent;
00188     while (hole_index > 0 && pair < heap_[parent = ParentNode(hole_index)]) {
00189       heap_[hole_index] = heap_[parent];
00190       hole_index = parent;
00191     }
00192     return hole_index;
00193   }
00194 
00195   // A hole in the heap exists at hole_index, and we want to fill it with the
00196   // given pair. SiftDown sifts the hole downward to the correct position and
00197   // returns the destination index without actually putting pair there.
00198   int SiftDown(int hole_index, const Pair& pair) {
00199     int heap_size = heap_.size();
00200     int child;
00201     while ((child = LeftChild(hole_index)) < heap_size) {
00202       if (child + 1 < heap_size && heap_[child + 1] < heap_[child])
00203         ++child;
00204       if (heap_[child] < pair) {
00205         heap_[hole_index] = heap_[child];
00206         hole_index = child;
00207       } else {
00208         break;
00209       }
00210     }
00211     return hole_index;
00212   }
00213 
00214   // Functions to navigate the tree. Unlike the original implementation, we
00215   // store the root at index 0.
00216   int ParentNode(int index) const {
00217     return (index + 1) / 2 - 1;
00218   }
00219   int LeftChild(int index) const {
00220     return index * 2 + 1;
00221   }
00222 
00223  private:
00224   GenericVector<Pair> heap_;
00225 };
00226 
00227 }  // namespace tesseract
00228 
00229 #endif  // TESSERACT_CCUTIL_GENERICHEAP_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines