tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccstruct/statistc.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        statistc.h  (Formerly stats.h)
00003  * Description: Class description for STATS class.
00004  * Author:                                      Ray Smith
00005  * Created:                                     Mon Feb 04 16:19:07 GMT 1991
00006  *
00007  * (C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifndef TESSERACT_CCSTRUCT_STATISTC_H_
00021 #define TESSERACT_CCSTRUCT_STATISTC_H_
00022 
00023 #include <stdio.h>
00024 #include "host.h"
00025 #include "kdpair.h"
00026 #include "scrollview.h"
00027 
00028 template <typename T> class GenericVector;
00029 
00030 
00031 // Simple histogram-based statistics for integer values in a known
00032 // range, such that the range is small compared to the number of samples.
00033 class STATS {
00034  public:
00035   // The histogram buckets are in the range
00036   // [min_bucket_value, max_bucket_value_plus_1 - 1] i.e.
00037   // [min_bucket_value, max_bucket_value].
00038   // Any data under min_bucket_value is silently mapped to min_bucket_value,
00039   // and likewise, any data over max_bucket_value is silently mapped to
00040   // max_bucket_value.
00041   // In the internal array, min_bucket_value maps to index 0 and
00042   // max_bucket_value_plus_1 - min_bucket_value is the array size.
00043   // TODO(rays) This is ugly. Convert the second argument to
00044   // max_bucket_value and all the code that uses it.
00045   STATS(inT32 min_bucket_value, inT32 max_bucket_value_plus_1);
00046   STATS();  // empty for arrays
00047 
00048   ~STATS();  // NOTE(review): no copy ctor/assignment is declared - confirm STATS is never copied
00049 
00050   // (Re)Sets the range and clears the counts. See the constructor for info on
00051   // max and min values. NOTE(review): the bool result is undocumented - confirm.
00052   bool set_range(inT32 min_bucket_value, inT32 max_bucket_value_plus_1);
00053 
00054   void clear();  // empty buckets
00055 
00056   void add(inT32 value, inT32 count);  // presumably adds count samples of value - confirm
00057 
00058   // "Accessors" return various statistics on the data.
00059   inT32 mode() const;  // get mode of samples
00060   double mean() const;  // get mean of samples
00061   double sd() const;  // standard deviation
00062   // Returns the fractile value such that frac fraction (in [0,1]) of samples
00063   // has a value less than the return value.
00064   double ile(double frac) const;
00065   // Returns the minimum used entry in the histogram (ie the minimum of the
00066   // data, NOT the minimum of the supplied range, nor is it an index.)
00067   // Would normally be called min(), but that is a reserved word in VC++.
00068   inT32 min_bucket() const;  // Find min
00069   // Returns the maximum used entry in the histogram (ie the maximum of the
00070   // data, NOT the maximum of the supplied range, nor is it an index.)
00071   inT32 max_bucket() const;  // Find max
00072   // Finds a more useful estimate of median than ile(0.5).
00073   // Overcomes a problem with ile() - if the samples are, for example,
00074   // 6,6,13,14 ile(0.5) returns 7.0 - when a more useful value would be midway
00075   // between 6 and 13 = 9.5
00076   double median() const;  // get median of samples
00077   // Returns the count of the given value; out-of-range values use the end buckets.
00078   inT32 pile_count(inT32 value ) const {
00079     if (value <= rangemin_)  // at or below the range minimum: first bucket
00080       return buckets_[0];
00081     if (value >= rangemax_ - 1)  // rangemax_ is one past the max: last bucket
00082       return buckets_[rangemax_ - rangemin_ - 1];
00083     return buckets_[value - rangemin_];  // NOTE(review): buckets_ is not NULL-checked here - confirm it is always set
00084   }
00085   // Returns the total count of all buckets.
00086   inT32 get_total() const {
00087     return total_count_;        // total of all piles
00088   }
00089   // Returns true if x is a local min.
00090   bool local_min(inT32 x) const;
00091 
00092   // Apply a triangular smoothing filter to the stats.
00093   // This makes the modes a bit more useful.
00094   // The factor gives the height of the triangle, i.e. the weight of the
00095   // centre.
00096   void smooth(inT32 factor);
00097 
00098   // Cluster the samples into max_clusters clusters.
00099   // Each call runs one iteration. The array of clusters must be
00100   // max_clusters+1 in size as cluster 0 is used to indicate which samples
00101   // have been used.
00102   // The return value is the current number of clusters.
00103   inT32 cluster(float lower,         // thresholds
00104                 float upper,
00105                 float multiple,      // distance threshold
00106                 inT32 max_clusters,  // max no to make
00107                 STATS *clusters);    // array of clusters
00108 
00109 // Finds (at most) the top max_modes modes, well actually the whole peak around
00110 // each mode, returning them in the given modes vector as a <mean of peak,
00111 // total count of peak> pair in order of decreasing total count.
00112 // Since the mean is the key and the count the data in the pair, a single call
00113 // to sort on the output will re-sort by increasing mean of peak if that is
00114 // more useful than decreasing total count.
00115 // Returns the actual number of modes found.
00116   int top_n_modes(
00117       int max_modes,
00118       GenericVector<tesseract::KDPairInc<float, int> >* modes) const;
00119 
00120   // Prints a summary and table of the histogram.
00121   void print() const;
00122   // Prints summary stats only of the histogram.
00123   void print_summary() const;
00124 
00125   #ifndef GRAPHICS_DISABLED
00126   // Draws the histogram as a series of rectangles.
00127   void plot(ScrollView* window,   // window to draw in
00128             float xorigin,   // origin of histo
00129             float yorigin,   // gram
00130             float xscale,    // size of one unit
00131             float yscale,    // size of one unit
00132             ScrollView::Color colour) const;  // colour to draw in
00133 
00134   // Draws a line graph of the histogram.
00135   void plotline(ScrollView* window,   // window to draw in
00136                 float xorigin,   // origin of histo
00137                 float yorigin,   // gram
00138                 float xscale,    // size of one unit
00139                 float yscale,    // size of one unit
00140                 ScrollView::Color colour) const;  // colour to draw in
00141   #endif  // GRAPHICS_DISABLED
00142 
00143  private:
00144   inT32 rangemin_;                // min of range
00145   // rangemax_ is not well named as it is really one past the max.
00146   inT32 rangemax_;                // one past the max of range
00147   inT32 total_count_;             // no of samples
00148   inT32* buckets_;                // array of cells
00149 };
00150 
00151 // Returns the nth ordered item from the array, as if they were ordered, but
00152 // without ordering them, in linear time. The array does get shuffled!
00153 // NOTE(review): returns inT32 for a float array, presumably an index - confirm.
00154 inT32 choose_nth_item(inT32 index,   // index to choose
00155                       float *array,  // array of items
00156                       inT32 count);  // no of items
00157 // Generic version: compar is a caller-supplied comparator (with qsort semantics).
00158 inT32 choose_nth_item(inT32 index,   // index to choose
00159                       void *array,   // array of items
00160                       inT32 count,   // no of items
00161                       size_t size,   // element size
00162                       int (*compar)(const void*, const void*));  // comparator
00163 // Swaps the 2 entries at index1 and index2 of an array in-place.
00164 void swap_entries(void *array,   // array of entries
00165                   size_t size,   // size of one entry
00166                   inT32 index1,  // entries to swap
00167                   inT32 index2);
00168 
00169 #endif  // TESSERACT_CCSTRUCT_STATISTC_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines