tesseract
3.03
|
00001 /********************************************************************** 00002 * File: statistc.h (Formerly stats.h) 00003 * Description: Class description for STATS class. 00004 * Author: Ray Smith 00005 * Created: Mon Feb 04 16:19:07 GMT 1991 00006 * 00007 * (C) Copyright 1991, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifndef TESSERACT_CCSTRUCT_STATISTC_H_ 00021 #define TESSERACT_CCSTRUCT_STATISTC_H_ 00022 00023 #include <stdio.h> 00024 #include "host.h" 00025 #include "kdpair.h" 00026 #include "scrollview.h" 00027 00028 template <typename T> class GenericVector; 00029 00030 00031 // Simple histogram-based statistics for integer values in a known 00032 // range, such that the range is small compared to the number of samples. 00033 class STATS { 00034 public: 00035 // The histogram buckets are in the range 00036 // [min_bucket_value, max_bucket_value_plus_1 - 1] i.e. 00037 // [min_bucket_value, max_bucket_value]. 00038 // Any data under min_bucket value is silently mapped to min_bucket_value, 00039 // and likewise, any data over max_bucket_value is silently mapped to 00040 // max_bucket_value. 00041 // In the internal array, min_bucket_value maps to 0 and 00042 // max_bucket_value_plus_1 - min_bucket_value to the array size. 00043 // TODO(rays) This is ugly. Convert the second argument to 00044 // max_bucket_value and all the code that uses it. 00045 STATS(inT32 min_bucket_value, inT32 max_bucket_value_plus_1); 00046 STATS(); // empty for arrays 00047 00048 ~STATS(); 00049 00050 // (Re)Sets the range and clears the counts. 00051 // See the constructor for info on max and min values. 00052 bool set_range(inT32 min_bucket_value, inT32 max_bucket_value_plus_1); 00053 00054 void clear(); // empty buckets 00055 00056 void add(inT32 value, inT32 count); 00057 00058 // "Accessors" return various statistics on the data. 00059 inT32 mode() const; // get mode of samples 00060 double mean() const; // get mean of samples 00061 double sd() const; // standard deviation 00062 // Returns the fractile value such that frac fraction (in [0,1]) of samples 00063 // has a value less than the return value. 00064 double ile(double frac) const; 00065 // Returns the minimum used entry in the histogram (ie the minimum of the 00066 // data, NOT the minimum of the supplied range, nor is it an index.) 00067 // Would normally be called min(), but that is a reserved word in VC++. 00068 inT32 min_bucket() const; // Find min 00069 // Returns the maximum used entry in the histogram (ie the maximum of the 00070 // data, NOT the maximum of the supplied range, nor is it an index.) 00071 inT32 max_bucket() const; // Find max 00072 // Finds a more useful estimate of median than ile(0.5). 00073 // Overcomes a problem with ile() - if the samples are, for example, 00074 // 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway 00075 // between 6 and 13 = 9.5 00076 double median() const; // get median of samples 00077 // Returns the count of the given value. 00078 inT32 pile_count(inT32 value ) const { 00079 if (value <= rangemin_) 00080 return buckets_[0]; 00081 if (value >= rangemax_ - 1) 00082 return buckets_[rangemax_ - rangemin_ - 1]; 00083 return buckets_[value - rangemin_]; 00084 } 00085 // Returns the total count of all buckets. 00086 inT32 get_total() const { 00087 return total_count_; // total of all piles 00088 } 00089 // Returns true if x is a local min. 00090 bool local_min(inT32 x) const; 00091 00092 // Apply a triangular smoothing filter to the stats. 00093 // This makes the modes a bit more useful. 00094 // The factor gives the height of the triangle, i.e. the weight of the 00095 // centre. 00096 void smooth(inT32 factor); 00097 00098 // Cluster the samples into max_cluster clusters. 00099 // Each call runs one iteration. The array of clusters must be 00100 // max_clusters+1 in size as cluster 0 is used to indicate which samples 00101 // have been used. 00102 // The return value is the current number of clusters. 00103 inT32 cluster(float lower, // thresholds 00104 float upper, 00105 float multiple, // distance threshold 00106 inT32 max_clusters, // max no to make 00107 STATS *clusters); // array of clusters 00108 00109 // Finds (at most) the top max_modes modes, well actually the whole peak around 00110 // each mode, returning them in the given modes vector as a <mean of peak, 00111 // total count of peak> pair in order of decreasing total count. 00112 // Since the mean is the key and the count the data in the pair, a single call 00113 // to sort on the output will re-sort by increasing mean of peak if that is 00114 // more useful than decreasing total count. 00115 // Returns the actual number of modes found. 00116 int top_n_modes( 00117 int max_modes, 00118 GenericVector<tesseract::KDPairInc<float, int> >* modes) const; 00119 00120 // Prints a summary and table of the histogram. 00121 void print() const; 00122 // Prints summary stats only of the histogram. 00123 void print_summary() const; 00124 00125 #ifndef GRAPHICS_DISABLED 00126 // Draws the histogram as a series of rectangles. 00127 void plot(ScrollView* window, // window to draw in 00128 float xorigin, // origin of histo 00129 float yorigin, // gram 00130 float xscale, // size of one unit 00131 float yscale, // size of one uint 00132 ScrollView::Color colour) const; // colour to draw in 00133 00134 // Draws a line graph of the histogram. 00135 void plotline(ScrollView* window, // window to draw in 00136 float xorigin, // origin of histo 00137 float yorigin, // gram 00138 float xscale, // size of one unit 00139 float yscale, // size of one uint 00140 ScrollView::Color colour) const; // colour to draw in 00141 #endif // GRAPHICS_DISABLED 00142 00143 private: 00144 inT32 rangemin_; // min of range 00145 // rangemax_ is not well named as it is really one past the max. 00146 inT32 rangemax_; // max of range 00147 inT32 total_count_; // no of samples 00148 inT32* buckets_; // array of cells 00149 }; 00150 00151 // Returns the nth ordered item from the array, as if they were 00152 // ordered, but without ordering them, in linear time. 00153 // The array does get shuffled! 00154 inT32 choose_nth_item(inT32 index, // index to choose 00155 float *array, // array of items 00156 inT32 count); // no of items 00157 // Generic version uses a defined comparator (with qsort semantics). 00158 inT32 choose_nth_item(inT32 index, // index to choose 00159 void *array, // array of items 00160 inT32 count, // no of items 00161 size_t size, // element size 00162 int (*compar)(const void*, const void*)); // comparator 00163 // Swaps 2 entries in an array in-place. 00164 void swap_entries(void *array, // array of entries 00165 size_t size, // size of entry 00166 inT32 index1, // entries to swap 00167 inT32 index2); 00168 00169 #endif // TESSERACT_CCSTRUCT_STATISTC_H_