// tesseract 3.03 — doxygen source listing of cluster.cpp
00001 /****************************************************************************** 00002 ** Filename: cluster.c 00003 ** Purpose: Routines for clustering points in N-D space 00004 ** Author: Dan Johnson 00005 ** History: 5/29/89, DSJ, Created. 00006 ** 00007 ** (c) Copyright Hewlett-Packard Company, 1988. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 ******************************************************************************/ 00018 #include "const.h" 00019 #include "cluster.h" 00020 #include "emalloc.h" 00021 #include "genericheap.h" 00022 #include "helpers.h" 00023 #include "kdpair.h" 00024 #include "matrix.h" 00025 #include "tprintf.h" 00026 #include "danerror.h" 00027 #include "freelist.h" 00028 #include <math.h> 00029 00030 #define HOTELLING 1 // If true use Hotelling's test to decide where to split. 00031 #define FTABLE_X 10 // Size of FTable. 00032 #define FTABLE_Y 100 // Size of FTable. 00033 00034 // Table of values approximating the cumulative F-distribution for a confidence of 1%. 
00035 const double FTable[FTABLE_Y][FTABLE_X] = { 00036 {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,}, 00037 {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399,}, 00038 {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229,}, 00039 {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546,}, 00040 {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051,}, 00041 {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874,}, 00042 {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620,}, 00043 {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814,}, 00044 {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257,}, 00045 {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849,}, 00046 { 9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539,}, 00047 { 9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296,}, 00048 { 9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100,}, 00049 { 8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939,}, 00050 { 8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805,}, 00051 { 8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691,}, 00052 { 8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593,}, 00053 { 8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508,}, 00054 { 8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434,}, 00055 { 8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368,}, 00056 { 8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310,}, 00057 { 7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258,}, 00058 { 7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211,}, 00059 { 7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 
3.496, 3.363, 3.256, 3.168,}, 00060 { 7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129,}, 00061 { 7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094,}, 00062 { 7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062,}, 00063 { 7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032,}, 00064 { 7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005,}, 00065 { 7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979,}, 00066 { 7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955,}, 00067 { 7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934,}, 00068 { 7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913,}, 00069 { 7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894,}, 00070 { 7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876,}, 00071 { 7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859,}, 00072 { 7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843,}, 00073 { 7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828,}, 00074 { 7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814,}, 00075 { 7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801,}, 00076 { 7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788,}, 00077 { 7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776,}, 00078 { 7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764,}, 00079 { 7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754,}, 00080 { 7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743,}, 00081 { 7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733,}, 00082 { 7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724,}, 00083 { 7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715,}, 00084 { 7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 
3.028, 2.898, 2.793, 2.706,}, 00085 { 7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698,}, 00086 { 7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690,}, 00087 { 7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683,}, 00088 { 7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675,}, 00089 { 7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668,}, 00090 { 7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662,}, 00091 { 7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655,}, 00092 { 7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649,}, 00093 { 7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643,}, 00094 { 7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637,}, 00095 { 7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632,}, 00096 { 7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626,}, 00097 { 7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621,}, 00098 { 7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616,}, 00099 { 7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611,}, 00100 { 7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607,}, 00101 { 7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602,}, 00102 { 7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598,}, 00103 { 7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593,}, 00104 { 7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589,}, 00105 { 7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585,}, 00106 { 7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581,}, 00107 { 7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578,}, 00108 { 6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574,}, 00109 { 6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 
2.891, 2.762, 2.657, 2.570,}, 00110 { 6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567,}, 00111 { 6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563,}, 00112 { 6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560,}, 00113 { 6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557,}, 00114 { 6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554,}, 00115 { 6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551,}, 00116 { 6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548,}, 00117 { 6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545,}, 00118 { 6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542,}, 00119 { 6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539,}, 00120 { 6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537,}, 00121 { 6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534,}, 00122 { 6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532,}, 00123 { 6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529,}, 00124 { 6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527,}, 00125 { 6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524,}, 00126 { 6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522,}, 00127 { 6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520,}, 00128 { 6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518,}, 00129 { 6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515,}, 00130 { 6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513,}, 00131 { 6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511,}, 00132 { 6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509,}, 00133 { 6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507,}, 00134 { 6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 
2.825, 2.696, 2.592, 2.505,}, 00135 { 6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503} 00136 }; 00137 00138 /* define the variance which will be used as a minimum variance for any 00139 dimension of any feature. Since most features are calculated from numbers 00140 with a precision no better than 1 in 128, the variance should never be 00141 less than the square of this number for parameters whose range is 1. */ 00142 #define MINVARIANCE 0.0004 00143 00144 /* define the absolute minimum number of samples which must be present in 00145 order to accurately test hypotheses about underlying probability 00146 distributions. Define separately the minimum samples that are needed 00147 before a statistical analysis is attempted; this number should be 00148 equal to MINSAMPLES but can be set to a lower number for early testing 00149 when very few samples are available. */ 00150 #define MINSAMPLESPERBUCKET 5 00151 #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET) 00152 #define MINSAMPLESNEEDED 1 00153 00154 /* define the size of the table which maps normalized samples to 00155 histogram buckets. Also define the number of standard deviations 00156 in a normal distribution which are considered to be significant. 00157 The mapping table will be defined in such a way that it covers 00158 the specified number of standard deviations on either side of 00159 the mean. BUCKETTABLESIZE should always be even. 
*/ 00160 #define BUCKETTABLESIZE 1024 00161 #define NORMALEXTENT 3.0 00162 00163 struct TEMPCLUSTER { 00164 CLUSTER *Cluster; 00165 CLUSTER *Neighbor; 00166 }; 00167 00168 typedef tesseract::KDPairInc<float, TEMPCLUSTER*> ClusterPair; 00169 typedef tesseract::GenericHeap<ClusterPair> ClusterHeap; 00170 00171 struct STATISTICS { 00172 FLOAT32 AvgVariance; 00173 FLOAT32 *CoVariance; 00174 FLOAT32 *Min; // largest negative distance from the mean 00175 FLOAT32 *Max; // largest positive distance from the mean 00176 }; 00177 00178 struct BUCKETS { 00179 DISTRIBUTION Distribution; // distribution being tested for 00180 uinT32 SampleCount; // # of samples in histogram 00181 FLOAT64 Confidence; // confidence level of test 00182 FLOAT64 ChiSquared; // test threshold 00183 uinT16 NumberOfBuckets; // number of cells in histogram 00184 uinT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets 00185 uinT32 *Count; // frequency of occurence histogram 00186 FLOAT32 *ExpectedCount; // expected histogram 00187 }; 00188 00189 struct CHISTRUCT{ 00190 uinT16 DegreesOfFreedom; 00191 FLOAT64 Alpha; 00192 FLOAT64 ChiSquared; 00193 }; 00194 00195 // For use with KDWalk / MakePotentialClusters 00196 struct ClusteringContext { 00197 ClusterHeap *heap; // heap used to hold temp clusters, "best" on top 00198 TEMPCLUSTER *candidates; // array of potential clusters 00199 KDTREE *tree; // kd-tree to be searched for neighbors 00200 inT32 next; // next candidate to be used 00201 }; 00202 00203 typedef FLOAT64 (*DENSITYFUNC) (inT32); 00204 typedef FLOAT64 (*SOLVEFUNC) (CHISTRUCT *, double); 00205 00206 #define Odd(N) ((N)%2) 00207 #define Mirror(N,R) ((R) - (N) - 1) 00208 #define Abs(N) ( ( (N) < 0 ) ? ( -(N) ) : (N) ) 00209 00210 //--------------Global Data Definitions and Declarations---------------------- 00211 /* the following variables describe a discrete normal distribution 00212 which is used by NormalDensity() and NormalBucket(). 
The 00213 constant NORMALEXTENT determines how many standard 00214 deviations of the distribution are mapped onto the fixed 00215 discrete range of x. x=0 is mapped to -NORMALEXTENT standard 00216 deviations and x=BUCKETTABLESIZE is mapped to 00217 +NORMALEXTENT standard deviations. */ 00218 #define SqrtOf2Pi 2.506628275 00219 static const FLOAT64 kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT); 00220 static const FLOAT64 kNormalVariance = 00221 (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT); 00222 static const FLOAT64 kNormalMagnitude = 00223 (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE); 00224 static const FLOAT64 kNormalMean = BUCKETTABLESIZE / 2; 00225 00226 /* define lookup tables used to compute the number of histogram buckets 00227 that should be used for a given number of samples. */ 00228 #define LOOKUPTABLESIZE 8 00229 #define MAXDEGREESOFFREEDOM MAXBUCKETS 00230 00231 static const uinT32 kCountTable[LOOKUPTABLESIZE] = { 00232 MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000 00233 }; // number of samples 00234 00235 static const uinT16 kBucketsTable[LOOKUPTABLESIZE] = { 00236 MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS 00237 }; // number of buckets 00238 00239 /*------------------------------------------------------------------------- 00240 Private Function Prototypes 00241 --------------------------------------------------------------------------*/ 00242 void CreateClusterTree(CLUSTERER *Clusterer); 00243 00244 void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, 00245 inT32 Level); 00246 00247 CLUSTER *FindNearestNeighbor(KDTREE *Tree, 00248 CLUSTER *Cluster, 00249 FLOAT32 *Distance); 00250 00251 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster); 00252 00253 inT32 MergeClusters (inT16 N, 00254 register PARAM_DESC ParamDesc[], 00255 register inT32 n1, 00256 register inT32 n2, 00257 register FLOAT32 m[], 00258 register FLOAT32 m1[], register FLOAT32 m2[]); 00259 00260 
void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config); 00261 00262 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, 00263 CLUSTERCONFIG *Config, 00264 CLUSTER *Cluster); 00265 00266 PROTOTYPE *MakeDegenerateProto(uinT16 N, 00267 CLUSTER *Cluster, 00268 STATISTICS *Statistics, 00269 PROTOSTYLE Style, 00270 inT32 MinSamples); 00271 00272 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, 00273 CLUSTERCONFIG *Config, 00274 CLUSTER *Cluster, 00275 STATISTICS *Statistics); 00276 00277 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, 00278 CLUSTER *Cluster, 00279 STATISTICS *Statistics, 00280 BUCKETS *Buckets); 00281 00282 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, 00283 CLUSTER *Cluster, 00284 STATISTICS *Statistics, 00285 BUCKETS *Buckets); 00286 00287 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, 00288 CLUSTER *Cluster, 00289 STATISTICS *Statistics, 00290 BUCKETS *NormalBuckets, 00291 FLOAT64 Confidence); 00292 00293 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc); 00294 00295 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics); 00296 00297 STATISTICS *ComputeStatistics (inT16 N, 00298 PARAM_DESC ParamDesc[], CLUSTER * Cluster); 00299 00300 PROTOTYPE *NewSphericalProto(uinT16 N, 00301 CLUSTER *Cluster, 00302 STATISTICS *Statistics); 00303 00304 PROTOTYPE *NewEllipticalProto(inT16 N, 00305 CLUSTER *Cluster, 00306 STATISTICS *Statistics); 00307 00308 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics); 00309 00310 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster); 00311 00312 BOOL8 Independent (PARAM_DESC ParamDesc[], 00313 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence); 00314 00315 BUCKETS *GetBuckets(CLUSTERER* clusterer, 00316 DISTRIBUTION Distribution, 00317 uinT32 SampleCount, 00318 FLOAT64 Confidence); 00319 00320 BUCKETS *MakeBuckets(DISTRIBUTION Distribution, 00321 uinT32 SampleCount, 00322 FLOAT64 Confidence); 00323 00324 uinT16 OptimumNumberOfBuckets(uinT32 
SampleCount); 00325 00326 FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha); 00327 00328 FLOAT64 NormalDensity(inT32 x); 00329 00330 FLOAT64 UniformDensity(inT32 x); 00331 00332 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx); 00333 00334 void FillBuckets(BUCKETS *Buckets, 00335 CLUSTER *Cluster, 00336 uinT16 Dim, 00337 PARAM_DESC *ParamDesc, 00338 FLOAT32 Mean, 00339 FLOAT32 StdDev); 00340 00341 uinT16 NormalBucket(PARAM_DESC *ParamDesc, 00342 FLOAT32 x, 00343 FLOAT32 Mean, 00344 FLOAT32 StdDev); 00345 00346 uinT16 UniformBucket(PARAM_DESC *ParamDesc, 00347 FLOAT32 x, 00348 FLOAT32 Mean, 00349 FLOAT32 StdDev); 00350 00351 BOOL8 DistributionOK(BUCKETS *Buckets); 00352 00353 void FreeStatistics(STATISTICS *Statistics); 00354 00355 void FreeBuckets(BUCKETS *Buckets); 00356 00357 void FreeCluster(CLUSTER *Cluster); 00358 00359 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets); 00360 00361 int NumBucketsMatch(void *arg1, // BUCKETS *Histogram, 00362 void *arg2); // uinT16 *DesiredNumberOfBuckets); 00363 00364 int ListEntryMatch(void *arg1, void *arg2); 00365 00366 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount); 00367 00368 void InitBuckets(BUCKETS *Buckets); 00369 00370 int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct, 00371 void *arg2); // CHISTRUCT *SearchKey); 00372 00373 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha); 00374 00375 FLOAT64 Solve(SOLVEFUNC Function, 00376 void *FunctionParams, 00377 FLOAT64 InitialGuess, 00378 FLOAT64 Accuracy); 00379 00380 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x); 00381 00382 BOOL8 MultipleCharSamples(CLUSTERER *Clusterer, 00383 CLUSTER *Cluster, 00384 FLOAT32 MaxIllegal); 00385 00386 double InvertMatrix(const float* input, int size, float* inv); 00387 00388 //--------------------------Public Code-------------------------------------- 00398 CLUSTERER * 00399 MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]) { 00400 CLUSTERER 
*Clusterer; 00401 int i; 00402 00403 // allocate main clusterer data structure and init simple fields 00404 Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER)); 00405 Clusterer->SampleSize = SampleSize; 00406 Clusterer->NumberOfSamples = 0; 00407 Clusterer->NumChar = 0; 00408 00409 // init fields which will not be used initially 00410 Clusterer->Root = NULL; 00411 Clusterer->ProtoList = NIL_LIST; 00412 00413 // maintain a copy of param descriptors in the clusterer data structure 00414 Clusterer->ParamDesc = 00415 (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC)); 00416 for (i = 0; i < SampleSize; i++) { 00417 Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular; 00418 Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential; 00419 Clusterer->ParamDesc[i].Min = ParamDesc[i].Min; 00420 Clusterer->ParamDesc[i].Max = ParamDesc[i].Max; 00421 Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min; 00422 Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2; 00423 Clusterer->ParamDesc[i].MidRange = 00424 (ParamDesc[i].Max + ParamDesc[i].Min) / 2; 00425 } 00426 00427 // allocate a kd tree to hold the samples 00428 Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc); 00429 00430 // Initialize cache of histogram buckets to minimize recomputing them. 
00431 for (int d = 0; d < DISTRIBUTION_COUNT; ++d) { 00432 for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c) 00433 Clusterer->bucket_cache[d][c] = NULL; 00434 } 00435 00436 return Clusterer; 00437 } // MakeClusterer 00438 00439 00454 SAMPLE* MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature, 00455 inT32 CharID) { 00456 SAMPLE *Sample; 00457 int i; 00458 00459 // see if the samples have already been clustered - if so trap an error 00460 if (Clusterer->Root != NULL) 00461 DoError (ALREADYCLUSTERED, 00462 "Can't add samples after they have been clustered"); 00463 00464 // allocate the new sample and initialize it 00465 Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) + 00466 (Clusterer->SampleSize - 00467 1) * sizeof (FLOAT32)); 00468 Sample->Clustered = FALSE; 00469 Sample->Prototype = FALSE; 00470 Sample->SampleCount = 1; 00471 Sample->Left = NULL; 00472 Sample->Right = NULL; 00473 Sample->CharID = CharID; 00474 00475 for (i = 0; i < Clusterer->SampleSize; i++) 00476 Sample->Mean[i] = Feature[i]; 00477 00478 // add the sample to the KD tree - keep track of the total # of samples 00479 Clusterer->NumberOfSamples++; 00480 KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample); 00481 if (CharID >= Clusterer->NumChar) 00482 Clusterer->NumChar = CharID + 1; 00483 00484 // execute hook for monitoring clustering operation 00485 // (*SampleCreationHook)( Sample ); 00486 00487 return (Sample); 00488 } // MakeSample 00489 00490 00508 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { 00509 //only create cluster tree if samples have never been clustered before 00510 if (Clusterer->Root == NULL) 00511 CreateClusterTree(Clusterer); 00512 00513 //deallocate the old prototype list if one exists 00514 FreeProtoList (&Clusterer->ProtoList); 00515 Clusterer->ProtoList = NIL_LIST; 00516 00517 //compute prototypes starting at the root node in the tree 00518 ComputePrototypes(Clusterer, Config); 00519 return (Clusterer->ProtoList); 00520 } // ClusterSamples 
00521 00522 00536 void FreeClusterer(CLUSTERER *Clusterer) { 00537 if (Clusterer != NULL) { 00538 memfree (Clusterer->ParamDesc); 00539 if (Clusterer->KDTree != NULL) 00540 FreeKDTree (Clusterer->KDTree); 00541 if (Clusterer->Root != NULL) 00542 FreeCluster (Clusterer->Root); 00543 // Free up all used buckets structures. 00544 for (int d = 0; d < DISTRIBUTION_COUNT; ++d) { 00545 for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c) 00546 if (Clusterer->bucket_cache[d][c] != NULL) 00547 FreeBuckets(Clusterer->bucket_cache[d][c]); 00548 } 00549 00550 memfree(Clusterer); 00551 } 00552 } // FreeClusterer 00553 00554 00564 void FreeProtoList(LIST *ProtoList) { 00565 destroy_nodes(*ProtoList, FreePrototype); 00566 } // FreeProtoList 00567 00568 00579 void FreePrototype(void *arg) { //PROTOTYPE *Prototype) 00580 PROTOTYPE *Prototype = (PROTOTYPE *) arg; 00581 00582 // unmark the corresponding cluster (if there is one 00583 if (Prototype->Cluster != NULL) 00584 Prototype->Cluster->Prototype = FALSE; 00585 00586 // deallocate the prototype statistics and then the prototype itself 00587 if (Prototype->Distrib != NULL) 00588 memfree (Prototype->Distrib); 00589 if (Prototype->Mean != NULL) 00590 memfree (Prototype->Mean); 00591 if (Prototype->Style != spherical) { 00592 if (Prototype->Variance.Elliptical != NULL) 00593 memfree (Prototype->Variance.Elliptical); 00594 if (Prototype->Magnitude.Elliptical != NULL) 00595 memfree (Prototype->Magnitude.Elliptical); 00596 if (Prototype->Weight.Elliptical != NULL) 00597 memfree (Prototype->Weight.Elliptical); 00598 } 00599 memfree(Prototype); 00600 } // FreePrototype 00601 00602 00618 CLUSTER *NextSample(LIST *SearchState) { 00619 CLUSTER *Cluster; 00620 00621 if (*SearchState == NIL_LIST) 00622 return (NULL); 00623 Cluster = (CLUSTER *) first_node (*SearchState); 00624 *SearchState = pop (*SearchState); 00625 while (TRUE) { 00626 if (Cluster->Left == NULL) 00627 return (Cluster); 00628 *SearchState = push (*SearchState, 
Cluster->Right); 00629 Cluster = Cluster->Left; 00630 } 00631 } // NextSample 00632 00633 00643 FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension) { 00644 return (Proto->Mean[Dimension]); 00645 } // Mean 00646 00647 00657 FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension) { 00658 switch (Proto->Style) { 00659 case spherical: 00660 return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical)); 00661 case elliptical: 00662 return ((FLOAT32) 00663 sqrt ((double) Proto->Variance.Elliptical[Dimension])); 00664 case mixed: 00665 switch (Proto->Distrib[Dimension]) { 00666 case normal: 00667 return ((FLOAT32) 00668 sqrt ((double) Proto->Variance.Elliptical[Dimension])); 00669 case uniform: 00670 case D_random: 00671 return (Proto->Variance.Elliptical[Dimension]); 00672 case DISTRIBUTION_COUNT: 00673 ASSERT_HOST(!"Distribution count not allowed!"); 00674 } 00675 } 00676 return 0.0f; 00677 } // StandardDeviation 00678 00679 00680 /*--------------------------------------------------------------------------- 00681 Private Code 00682 ----------------------------------------------------------------------------*/ 00698 void CreateClusterTree(CLUSTERER *Clusterer) { 00699 ClusteringContext context; 00700 ClusterPair HeapEntry; 00701 TEMPCLUSTER *PotentialCluster; 00702 00703 // each sample and its nearest neighbor form a "potential" cluster 00704 // save these in a heap with the "best" potential clusters on top 00705 context.tree = Clusterer->KDTree; 00706 context.candidates = (TEMPCLUSTER *) 00707 Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER)); 00708 context.next = 0; 00709 context.heap = new ClusterHeap(Clusterer->NumberOfSamples); 00710 KDWalk(context.tree, (void_proc)MakePotentialClusters, &context); 00711 00712 // form potential clusters into actual clusters - always do "best" first 00713 while (context.heap->Pop(&HeapEntry)) { 00714 PotentialCluster = HeapEntry.data; 00715 00716 // if main cluster of potential cluster is already in another cluster 00717 // 
then we don't need to worry about it 00718 if (PotentialCluster->Cluster->Clustered) { 00719 continue; 00720 } 00721 00722 // if main cluster is not yet clustered, but its nearest neighbor is 00723 // then we must find a new nearest neighbor 00724 else if (PotentialCluster->Neighbor->Clustered) { 00725 PotentialCluster->Neighbor = 00726 FindNearestNeighbor(context.tree, PotentialCluster->Cluster, 00727 &HeapEntry.key); 00728 if (PotentialCluster->Neighbor != NULL) { 00729 context.heap->Push(&HeapEntry); 00730 } 00731 } 00732 00733 // if neither cluster is already clustered, form permanent cluster 00734 else { 00735 PotentialCluster->Cluster = 00736 MakeNewCluster(Clusterer, PotentialCluster); 00737 PotentialCluster->Neighbor = 00738 FindNearestNeighbor(context.tree, PotentialCluster->Cluster, 00739 &HeapEntry.key); 00740 if (PotentialCluster->Neighbor != NULL) { 00741 context.heap->Push(&HeapEntry); 00742 } 00743 } 00744 } 00745 00746 // the root node in the cluster tree is now the only node in the kd-tree 00747 Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree); 00748 00749 // free up the memory used by the K-D tree, heap, and temp clusters 00750 FreeKDTree(context.tree); 00751 Clusterer->KDTree = NULL; 00752 delete context.heap; 00753 memfree(context.candidates); 00754 } // CreateClusterTree 00755 00756 00768 void MakePotentialClusters(ClusteringContext *context, 00769 CLUSTER *Cluster, inT32 Level) { 00770 ClusterPair HeapEntry; 00771 int next = context->next; 00772 context->candidates[next].Cluster = Cluster; 00773 HeapEntry.data = &(context->candidates[next]); 00774 context->candidates[next].Neighbor = 00775 FindNearestNeighbor(context->tree, 00776 context->candidates[next].Cluster, 00777 &HeapEntry.key); 00778 if (context->candidates[next].Neighbor != NULL) { 00779 context->heap->Push(&HeapEntry); 00780 context->next++; 00781 } 00782 } // MakePotentialClusters 00783 00784 00801 CLUSTER * 00802 FindNearestNeighbor(KDTREE * Tree, CLUSTER * Cluster, FLOAT32 
* Distance) 00803 #define MAXNEIGHBORS 2 00804 #define MAXDISTANCE MAX_FLOAT32 00805 { 00806 CLUSTER *Neighbor[MAXNEIGHBORS]; 00807 FLOAT32 Dist[MAXNEIGHBORS]; 00808 int NumberOfNeighbors; 00809 inT32 i; 00810 CLUSTER *BestNeighbor; 00811 00812 // find the 2 nearest neighbors of the cluster 00813 KDNearestNeighborSearch(Tree, Cluster->Mean, MAXNEIGHBORS, MAXDISTANCE, 00814 &NumberOfNeighbors, (void **)Neighbor, Dist); 00815 00816 // search for the nearest neighbor that is not the cluster itself 00817 *Distance = MAXDISTANCE; 00818 BestNeighbor = NULL; 00819 for (i = 0; i < NumberOfNeighbors; i++) { 00820 if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) { 00821 *Distance = Dist[i]; 00822 BestNeighbor = Neighbor[i]; 00823 } 00824 } 00825 return BestNeighbor; 00826 } // FindNearestNeighbor 00827 00828 00841 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) { 00842 CLUSTER *Cluster; 00843 00844 // allocate the new cluster and initialize it 00845 Cluster = (CLUSTER *) Emalloc( 00846 sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(FLOAT32)); 00847 Cluster->Clustered = FALSE; 00848 Cluster->Prototype = FALSE; 00849 Cluster->Left = TempCluster->Cluster; 00850 Cluster->Right = TempCluster->Neighbor; 00851 Cluster->CharID = -1; 00852 00853 // mark the old clusters as "clustered" and delete them from the kd-tree 00854 Cluster->Left->Clustered = TRUE; 00855 Cluster->Right->Clustered = TRUE; 00856 KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left); 00857 KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right); 00858 00859 // compute the mean and sample count for the new cluster 00860 Cluster->SampleCount = 00861 MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc, 00862 Cluster->Left->SampleCount, Cluster->Right->SampleCount, 00863 Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean); 00864 00865 // add the new cluster to the KD tree 00866 KDStore(Clusterer->KDTree, Cluster->Mean, Cluster); 00867 return 
Cluster; 00868 } // MakeNewCluster 00869 00870 00886 inT32 MergeClusters(inT16 N, 00887 PARAM_DESC ParamDesc[], 00888 inT32 n1, 00889 inT32 n2, 00890 FLOAT32 m[], 00891 FLOAT32 m1[], FLOAT32 m2[]) { 00892 inT32 i, n; 00893 00894 n = n1 + n2; 00895 for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) { 00896 if (ParamDesc->Circular) { 00897 // if distance between means is greater than allowed 00898 // reduce upper point by one "rotation" to compute mean 00899 // then normalize the mean back into the accepted range 00900 if ((*m2 - *m1) > ParamDesc->HalfRange) { 00901 *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n; 00902 if (*m < ParamDesc->Min) 00903 *m += ParamDesc->Range; 00904 } 00905 else if ((*m1 - *m2) > ParamDesc->HalfRange) { 00906 *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n; 00907 if (*m < ParamDesc->Min) 00908 *m += ParamDesc->Range; 00909 } 00910 else 00911 *m = (n1 * *m1 + n2 * *m2) / n; 00912 } 00913 else 00914 *m = (n1 * *m1 + n2 * *m2) / n; 00915 } 00916 return n; 00917 } // MergeClusters 00918 00919 00931 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { 00932 LIST ClusterStack = NIL_LIST; 00933 CLUSTER *Cluster; 00934 PROTOTYPE *Prototype; 00935 00936 // use a stack to keep track of clusters waiting to be processed 00937 // initially the only cluster on the stack is the root cluster 00938 if (Clusterer->Root != NULL) 00939 ClusterStack = push (NIL_LIST, Clusterer->Root); 00940 00941 // loop until we have analyzed all clusters which are potential prototypes 00942 while (ClusterStack != NIL_LIST) { 00943 // remove the next cluster to be analyzed from the stack 00944 // try to make a prototype from the cluster 00945 // if successful, put it on the proto list, else split the cluster 00946 Cluster = (CLUSTER *) first_node (ClusterStack); 00947 ClusterStack = pop (ClusterStack); 00948 Prototype = MakePrototype(Clusterer, Config, Cluster); 00949 if (Prototype != NULL) { 00950 Clusterer->ProtoList = push 
(Clusterer->ProtoList, Prototype); 00951 } 00952 else { 00953 ClusterStack = push (ClusterStack, Cluster->Right); 00954 ClusterStack = push (ClusterStack, Cluster->Left); 00955 } 00956 } 00957 } // ComputePrototypes 00958 00959 00978 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, 00979 CLUSTERCONFIG *Config, 00980 CLUSTER *Cluster) { 00981 STATISTICS *Statistics; 00982 PROTOTYPE *Proto; 00983 BUCKETS *Buckets; 00984 00985 // filter out clusters which contain samples from the same character 00986 if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal)) 00987 return NULL; 00988 00989 // compute the covariance matrix and ranges for the cluster 00990 Statistics = 00991 ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster); 00992 00993 // check for degenerate clusters which need not be analyzed further 00994 // note that the MinSamples test assumes that all clusters with multiple 00995 // character samples have been removed (as above) 00996 Proto = MakeDegenerateProto( 00997 Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle, 00998 (inT32) (Config->MinSamples * Clusterer->NumChar)); 00999 if (Proto != NULL) { 01000 FreeStatistics(Statistics); 01001 return Proto; 01002 } 01003 // check to ensure that all dimensions are independent 01004 if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize, 01005 Statistics->CoVariance, Config->Independence)) { 01006 FreeStatistics(Statistics); 01007 return NULL; 01008 } 01009 01010 if (HOTELLING && Config->ProtoStyle == elliptical) { 01011 Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics); 01012 if (Proto != NULL) { 01013 FreeStatistics(Statistics); 01014 return Proto; 01015 } 01016 } 01017 01018 // create a histogram data structure used to evaluate distributions 01019 Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount, 01020 Config->Confidence); 01021 01022 // create a prototype based on the statistics and test it 01023 switch (Config->ProtoStyle) { 01024 case 
spherical: 01025 Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets); 01026 break; 01027 case elliptical: 01028 Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets); 01029 break; 01030 case mixed: 01031 Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, 01032 Config->Confidence); 01033 break; 01034 case automatic: 01035 Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets); 01036 if (Proto != NULL) 01037 break; 01038 Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets); 01039 if (Proto != NULL) 01040 break; 01041 Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, 01042 Config->Confidence); 01043 break; 01044 } 01045 FreeStatistics(Statistics); 01046 return Proto; 01047 } // MakePrototype 01048 01049 01071 PROTOTYPE *MakeDegenerateProto( //this was MinSample 01072 uinT16 N, 01073 CLUSTER *Cluster, 01074 STATISTICS *Statistics, 01075 PROTOSTYLE Style, 01076 inT32 MinSamples) { 01077 PROTOTYPE *Proto = NULL; 01078 01079 if (MinSamples < MINSAMPLESNEEDED) 01080 MinSamples = MINSAMPLESNEEDED; 01081 01082 if (Cluster->SampleCount < MinSamples) { 01083 switch (Style) { 01084 case spherical: 01085 Proto = NewSphericalProto (N, Cluster, Statistics); 01086 break; 01087 case elliptical: 01088 case automatic: 01089 Proto = NewEllipticalProto (N, Cluster, Statistics); 01090 break; 01091 case mixed: 01092 Proto = NewMixedProto (N, Cluster, Statistics); 01093 break; 01094 } 01095 Proto->Significant = FALSE; 01096 } 01097 return (Proto); 01098 } // MakeDegenerateProto 01099 01113 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, 01114 CLUSTERCONFIG *Config, 01115 CLUSTER *Cluster, 01116 STATISTICS *Statistics) { 01117 // Fraction of the number of samples used as a range around 1 within 01118 // which a cluster has the magic size that allows a boost to the 01119 // FTable by kFTableBoostMargin, thus allowing clusters near the 01120 // magic size (equal to the number of sample characters) to 
be more 01121 // likely to stay together. 01122 const double kMagicSampleMargin = 0.0625; 01123 const double kFTableBoostMargin = 2.0; 01124 01125 int N = Clusterer->SampleSize; 01126 CLUSTER* Left = Cluster->Left; 01127 CLUSTER* Right = Cluster->Right; 01128 if (Left == NULL || Right == NULL) 01129 return NULL; 01130 int TotalDims = Left->SampleCount + Right->SampleCount; 01131 if (TotalDims < N + 1 || TotalDims < 2) 01132 return NULL; 01133 const int kMatrixSize = N * N * sizeof(FLOAT32); 01134 FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize)); 01135 FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize)); 01136 FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32))); 01137 // Compute a new covariance matrix that only uses essential features. 01138 for (int i = 0; i < N; ++i) { 01139 int row_offset = i * N; 01140 if (!Clusterer->ParamDesc[i].NonEssential) { 01141 for (int j = 0; j < N; ++j) { 01142 if (!Clusterer->ParamDesc[j].NonEssential) 01143 Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset]; 01144 else 01145 Covariance[j + row_offset] = 0.0f; 01146 } 01147 } else { 01148 for (int j = 0; j < N; ++j) { 01149 if (i == j) 01150 Covariance[j + row_offset] = 1.0f; 01151 else 01152 Covariance[j + row_offset] = 0.0f; 01153 } 01154 } 01155 } 01156 double err = InvertMatrix(Covariance, N, Inverse); 01157 if (err > 1) { 01158 tprintf("Clustering error: Matrix inverse failed with error %g\n", err); 01159 } 01160 int EssentialN = 0; 01161 for (int dim = 0; dim < N; ++dim) { 01162 if (!Clusterer->ParamDesc[dim].NonEssential) { 01163 Delta[dim] = Left->Mean[dim] - Right->Mean[dim]; 01164 ++EssentialN; 01165 } else { 01166 Delta[dim] = 0.0f; 01167 } 01168 } 01169 // Compute Hotelling's T-squared. 
01170 double Tsq = 0.0; 01171 for (int x = 0; x < N; ++x) { 01172 double temp = 0.0; 01173 for (int y = 0; y < N; ++y) { 01174 temp += Inverse[y + N*x] * Delta[y]; 01175 } 01176 Tsq += Delta[x] * temp; 01177 } 01178 memfree(Covariance); 01179 memfree(Inverse); 01180 memfree(Delta); 01181 // Changed this function to match the formula in 01182 // Statistical Methods in Medical Research p 473 01183 // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews. 01184 // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims; 01185 double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN); 01186 int Fx = EssentialN; 01187 if (Fx > FTABLE_X) 01188 Fx = FTABLE_X; 01189 --Fx; 01190 int Fy = TotalDims - EssentialN - 1; 01191 if (Fy > FTABLE_Y) 01192 Fy = FTABLE_Y; 01193 --Fy; 01194 double FTarget = FTable[Fy][Fx]; 01195 if (Config->MagicSamples > 0 && 01196 TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) && 01197 TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) { 01198 // Give magic-sized clusters a magic FTable boost. 01199 FTarget += kFTableBoostMargin; 01200 } 01201 if (F < FTarget) { 01202 return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics); 01203 } 01204 return NULL; 01205 } 01206 01207 /* MakeSphericalProto ******************************************************* 01208 Parameters: Clusterer data struct containing samples being clustered 01209 Cluster cluster to be made into a spherical prototype 01210 Statistics statistical info about cluster 01211 Buckets histogram struct used to analyze distribution 01212 Operation: This routine tests the specified cluster to see if it can 01213 be approximated by a spherical normal distribution. If it 01214 can be, then a new prototype is formed and returned to the 01215 caller. If it can't be, then NULL is returned to the caller. 01216 Return: Pointer to new spherical prototype or NULL. 01217 Exceptions: None 01218 History: 6/1/89, DSJ, Created. 
01219 ******************************************************************************/ 01220 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, 01221 CLUSTER *Cluster, 01222 STATISTICS *Statistics, 01223 BUCKETS *Buckets) { 01224 PROTOTYPE *Proto = NULL; 01225 int i; 01226 01227 // check that each dimension is a normal distribution 01228 for (i = 0; i < Clusterer->SampleSize; i++) { 01229 if (Clusterer->ParamDesc[i].NonEssential) 01230 continue; 01231 01232 FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01233 Cluster->Mean[i], 01234 sqrt ((FLOAT64) (Statistics->AvgVariance))); 01235 if (!DistributionOK (Buckets)) 01236 break; 01237 } 01238 // if all dimensions matched a normal distribution, make a proto 01239 if (i >= Clusterer->SampleSize) 01240 Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics); 01241 return (Proto); 01242 } // MakeSphericalProto 01243 01244 01258 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, 01259 CLUSTER *Cluster, 01260 STATISTICS *Statistics, 01261 BUCKETS *Buckets) { 01262 PROTOTYPE *Proto = NULL; 01263 int i; 01264 01265 // check that each dimension is a normal distribution 01266 for (i = 0; i < Clusterer->SampleSize; i++) { 01267 if (Clusterer->ParamDesc[i].NonEssential) 01268 continue; 01269 01270 FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01271 Cluster->Mean[i], 01272 sqrt ((FLOAT64) Statistics-> 01273 CoVariance[i * (Clusterer->SampleSize + 1)])); 01274 if (!DistributionOK (Buckets)) 01275 break; 01276 } 01277 // if all dimensions matched a normal distribution, make a proto 01278 if (i >= Clusterer->SampleSize) 01279 Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics); 01280 return (Proto); 01281 } // MakeEllipticalProto 01282 01283 01302 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, 01303 CLUSTER *Cluster, 01304 STATISTICS *Statistics, 01305 BUCKETS *NormalBuckets, 01306 FLOAT64 Confidence) { 01307 PROTOTYPE *Proto; 01308 int i; 01309 BUCKETS 
*UniformBuckets = NULL; 01310 BUCKETS *RandomBuckets = NULL; 01311 01312 // create a mixed proto to work on - initially assume all dimensions normal*/ 01313 Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics); 01314 01315 // find the proper distribution for each dimension 01316 for (i = 0; i < Clusterer->SampleSize; i++) { 01317 if (Clusterer->ParamDesc[i].NonEssential) 01318 continue; 01319 01320 FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01321 Proto->Mean[i], 01322 sqrt ((FLOAT64) Proto->Variance.Elliptical[i])); 01323 if (DistributionOK (NormalBuckets)) 01324 continue; 01325 01326 if (RandomBuckets == NULL) 01327 RandomBuckets = 01328 GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence); 01329 MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i])); 01330 FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01331 Proto->Mean[i], Proto->Variance.Elliptical[i]); 01332 if (DistributionOK (RandomBuckets)) 01333 continue; 01334 01335 if (UniformBuckets == NULL) 01336 UniformBuckets = 01337 GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence); 01338 MakeDimUniform(i, Proto, Statistics); 01339 FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01340 Proto->Mean[i], Proto->Variance.Elliptical[i]); 01341 if (DistributionOK (UniformBuckets)) 01342 continue; 01343 break; 01344 } 01345 // if any dimension failed to match a distribution, discard the proto 01346 if (i < Clusterer->SampleSize) { 01347 FreePrototype(Proto); 01348 Proto = NULL; 01349 } 01350 return (Proto); 01351 } // MakeMixedProto 01352 01353 01354 /* MakeDimRandom ************************************************************* 01355 Parameters: i index of dimension to be changed 01356 Proto prototype whose dimension is to be altered 01357 ParamDesc description of specified dimension 01358 Operation: This routine alters the ith dimension of the specified 01359 mixed prototype to be D_random. 
01360 Return: None 01361 Exceptions: None 01362 History: 6/20/89, DSJ, Created. 01363 ******************************************************************************/ 01364 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) { 01365 Proto->Distrib[i] = D_random; 01366 Proto->Mean[i] = ParamDesc->MidRange; 01367 Proto->Variance.Elliptical[i] = ParamDesc->HalfRange; 01368 01369 // subtract out the previous magnitude of this dimension from the total 01370 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i]; 01371 Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range; 01372 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; 01373 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01374 01375 // note that the proto Weight is irrelevant for D_random protos 01376 } // MakeDimRandom 01377 01378 01389 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) { 01390 Proto->Distrib[i] = uniform; 01391 Proto->Mean[i] = Proto->Cluster->Mean[i] + 01392 (Statistics->Min[i] + Statistics->Max[i]) / 2; 01393 Proto->Variance.Elliptical[i] = 01394 (Statistics->Max[i] - Statistics->Min[i]) / 2; 01395 if (Proto->Variance.Elliptical[i] < MINVARIANCE) 01396 Proto->Variance.Elliptical[i] = MINVARIANCE; 01397 01398 // subtract out the previous magnitude of this dimension from the total 01399 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i]; 01400 Proto->Magnitude.Elliptical[i] = 01401 1.0 / (2.0 * Proto->Variance.Elliptical[i]); 01402 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; 01403 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01404 01405 // note that the proto Weight is irrelevant for uniform protos 01406 } // MakeDimUniform 01407 01408 01425 STATISTICS * 01426 ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) { 01427 STATISTICS *Statistics; 01428 int i, j; 01429 FLOAT32 *CoVariance; 01430 FLOAT32 *Distance; 01431 LIST SearchState; 01432 SAMPLE *Sample; 01433 uinT32 
SampleCountAdjustedForBias; 01434 01435 // allocate memory to hold the statistics results 01436 Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS)); 01437 Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32)); 01438 Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01439 Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01440 01441 // allocate temporary memory to hold the sample to mean distances 01442 Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01443 01444 // initialize the statistics 01445 Statistics->AvgVariance = 1.0; 01446 CoVariance = Statistics->CoVariance; 01447 for (i = 0; i < N; i++) { 01448 Statistics->Min[i] = 0.0; 01449 Statistics->Max[i] = 0.0; 01450 for (j = 0; j < N; j++, CoVariance++) 01451 *CoVariance = 0; 01452 } 01453 // find each sample in the cluster and merge it into the statistics 01454 InitSampleSearch(SearchState, Cluster); 01455 while ((Sample = NextSample (&SearchState)) != NULL) { 01456 for (i = 0; i < N; i++) { 01457 Distance[i] = Sample->Mean[i] - Cluster->Mean[i]; 01458 if (ParamDesc[i].Circular) { 01459 if (Distance[i] > ParamDesc[i].HalfRange) 01460 Distance[i] -= ParamDesc[i].Range; 01461 if (Distance[i] < -ParamDesc[i].HalfRange) 01462 Distance[i] += ParamDesc[i].Range; 01463 } 01464 if (Distance[i] < Statistics->Min[i]) 01465 Statistics->Min[i] = Distance[i]; 01466 if (Distance[i] > Statistics->Max[i]) 01467 Statistics->Max[i] = Distance[i]; 01468 } 01469 CoVariance = Statistics->CoVariance; 01470 for (i = 0; i < N; i++) 01471 for (j = 0; j < N; j++, CoVariance++) 01472 *CoVariance += Distance[i] * Distance[j]; 01473 } 01474 // normalize the variances by the total number of samples 01475 // use SampleCount-1 instead of SampleCount to get an unbiased estimate 01476 // also compute the geometic mean of the diagonal variances 01477 // ensure that clusters with only 1 sample are handled correctly 01478 if (Cluster->SampleCount > 1) 01479 SampleCountAdjustedForBias = 
Cluster->SampleCount - 1; 01480 else 01481 SampleCountAdjustedForBias = 1; 01482 CoVariance = Statistics->CoVariance; 01483 for (i = 0; i < N; i++) 01484 for (j = 0; j < N; j++, CoVariance++) { 01485 *CoVariance /= SampleCountAdjustedForBias; 01486 if (j == i) { 01487 if (*CoVariance < MINVARIANCE) 01488 *CoVariance = MINVARIANCE; 01489 Statistics->AvgVariance *= *CoVariance; 01490 } 01491 } 01492 Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance, 01493 1.0 / N); 01494 01495 // release temporary memory and return 01496 memfree(Distance); 01497 return (Statistics); 01498 } // ComputeStatistics 01499 01500 01514 PROTOTYPE *NewSphericalProto(uinT16 N, 01515 CLUSTER *Cluster, 01516 STATISTICS *Statistics) { 01517 PROTOTYPE *Proto; 01518 01519 Proto = NewSimpleProto (N, Cluster); 01520 01521 Proto->Variance.Spherical = Statistics->AvgVariance; 01522 if (Proto->Variance.Spherical < MINVARIANCE) 01523 Proto->Variance.Spherical = MINVARIANCE; 01524 01525 Proto->Magnitude.Spherical = 01526 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical)); 01527 Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical, 01528 (double) N); 01529 Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical; 01530 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01531 01532 return (Proto); 01533 } // NewSphericalProto 01534 01535 01548 PROTOTYPE *NewEllipticalProto(inT16 N, 01549 CLUSTER *Cluster, 01550 STATISTICS *Statistics) { 01551 PROTOTYPE *Proto; 01552 FLOAT32 *CoVariance; 01553 int i; 01554 01555 Proto = NewSimpleProto (N, Cluster); 01556 Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01557 Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01558 Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01559 01560 CoVariance = Statistics->CoVariance; 01561 Proto->TotalMagnitude = 1.0; 01562 for (i = 0; i < N; i++, CoVariance += N + 1) { 01563 Proto->Variance.Elliptical[i] = 
*CoVariance; 01564 if (Proto->Variance.Elliptical[i] < MINVARIANCE) 01565 Proto->Variance.Elliptical[i] = MINVARIANCE; 01566 01567 Proto->Magnitude.Elliptical[i] = 01568 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i])); 01569 Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i]; 01570 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; 01571 } 01572 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01573 Proto->Style = elliptical; 01574 return (Proto); 01575 } // NewEllipticalProto 01576 01577 01593 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) { 01594 PROTOTYPE *Proto; 01595 int i; 01596 01597 Proto = NewEllipticalProto (N, Cluster, Statistics); 01598 Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION)); 01599 01600 for (i = 0; i < N; i++) { 01601 Proto->Distrib[i] = normal; 01602 } 01603 Proto->Style = mixed; 01604 return (Proto); 01605 } // NewMixedProto 01606 01607 01618 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) { 01619 PROTOTYPE *Proto; 01620 int i; 01621 01622 Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE)); 01623 Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01624 01625 for (i = 0; i < N; i++) 01626 Proto->Mean[i] = Cluster->Mean[i]; 01627 Proto->Distrib = NULL; 01628 01629 Proto->Significant = TRUE; 01630 Proto->Merged = FALSE; 01631 Proto->Style = spherical; 01632 Proto->NumSamples = Cluster->SampleCount; 01633 Proto->Cluster = Cluster; 01634 Proto->Cluster->Prototype = TRUE; 01635 return (Proto); 01636 } // NewSimpleProto 01637 01638 01659 BOOL8 01660 Independent (PARAM_DESC ParamDesc[], 01661 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) { 01662 int i, j; 01663 FLOAT32 *VARii; // points to ith on-diagonal element 01664 FLOAT32 *VARjj; // points to jth on-diagonal element 01665 FLOAT32 CorrelationCoeff; 01666 01667 VARii = CoVariance; 01668 for (i = 0; i < N; i++, VARii += N + 1) { 01669 if (ParamDesc[i].NonEssential) 01670 continue; 
01671 01672 VARjj = VARii + N + 1; 01673 CoVariance = VARii + 1; 01674 for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) { 01675 if (ParamDesc[j].NonEssential) 01676 continue; 01677 01678 if ((*VARii == 0.0) || (*VARjj == 0.0)) 01679 CorrelationCoeff = 0.0; 01680 else 01681 CorrelationCoeff = 01682 sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj))); 01683 if (CorrelationCoeff > Independence) 01684 return (FALSE); 01685 } 01686 } 01687 return (TRUE); 01688 } // Independent 01689 01690 01709 BUCKETS *GetBuckets(CLUSTERER* clusterer, 01710 DISTRIBUTION Distribution, 01711 uinT32 SampleCount, 01712 FLOAT64 Confidence) { 01713 // Get an old bucket structure with the same number of buckets. 01714 uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount); 01715 BUCKETS *Buckets = 01716 clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS]; 01717 01718 // If a matching bucket structure is not found, make one and save it. 01719 if (Buckets == NULL) { 01720 Buckets = MakeBuckets(Distribution, SampleCount, Confidence); 01721 clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] = 01722 Buckets; 01723 } else { 01724 // Just adjust the existing buckets. 
01725 if (SampleCount != Buckets->SampleCount) 01726 AdjustBuckets(Buckets, SampleCount); 01727 if (Confidence != Buckets->Confidence) { 01728 Buckets->Confidence = Confidence; 01729 Buckets->ChiSquared = ComputeChiSquared( 01730 DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), 01731 Confidence); 01732 } 01733 InitBuckets(Buckets); 01734 } 01735 return Buckets; 01736 } // GetBuckets 01737 01738 01759 BUCKETS *MakeBuckets(DISTRIBUTION Distribution, 01760 uinT32 SampleCount, 01761 FLOAT64 Confidence) { 01762 const DENSITYFUNC DensityFunction[] = 01763 { NormalDensity, UniformDensity, UniformDensity }; 01764 int i, j; 01765 BUCKETS *Buckets; 01766 FLOAT64 BucketProbability; 01767 FLOAT64 NextBucketBoundary; 01768 FLOAT64 Probability; 01769 FLOAT64 ProbabilityDelta; 01770 FLOAT64 LastProbDensity; 01771 FLOAT64 ProbDensity; 01772 uinT16 CurrentBucket; 01773 BOOL8 Symmetrical; 01774 01775 // allocate memory needed for data structure 01776 Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS))); 01777 Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount); 01778 Buckets->SampleCount = SampleCount; 01779 Buckets->Confidence = Confidence; 01780 Buckets->Count = reinterpret_cast<uinT32*>( 01781 Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32))); 01782 Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>( 01783 Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32))); 01784 01785 // initialize simple fields 01786 Buckets->Distribution = Distribution; 01787 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 01788 Buckets->Count[i] = 0; 01789 Buckets->ExpectedCount[i] = 0.0; 01790 } 01791 01792 // all currently defined distributions are symmetrical 01793 Symmetrical = TRUE; 01794 Buckets->ChiSquared = ComputeChiSquared( 01795 DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence); 01796 01797 if (Symmetrical) { 01798 // allocate buckets so that all have approx. 
equal probability 01799 BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets); 01800 01801 // distribution is symmetric so fill in upper half then copy 01802 CurrentBucket = Buckets->NumberOfBuckets / 2; 01803 if (Odd (Buckets->NumberOfBuckets)) 01804 NextBucketBoundary = BucketProbability / 2; 01805 else 01806 NextBucketBoundary = BucketProbability; 01807 01808 Probability = 0.0; 01809 LastProbDensity = 01810 (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2); 01811 for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) { 01812 ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1); 01813 ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0); 01814 Probability += ProbabilityDelta; 01815 if (Probability > NextBucketBoundary) { 01816 if (CurrentBucket < Buckets->NumberOfBuckets - 1) 01817 CurrentBucket++; 01818 NextBucketBoundary += BucketProbability; 01819 } 01820 Buckets->Bucket[i] = CurrentBucket; 01821 Buckets->ExpectedCount[CurrentBucket] += 01822 (FLOAT32) (ProbabilityDelta * SampleCount); 01823 LastProbDensity = ProbDensity; 01824 } 01825 // place any leftover probability into the last bucket 01826 Buckets->ExpectedCount[CurrentBucket] += 01827 (FLOAT32) ((0.5 - Probability) * SampleCount); 01828 01829 // copy upper half of distribution to lower half 01830 for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--) 01831 Buckets->Bucket[i] = 01832 Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets); 01833 01834 // copy upper half of expected counts to lower half 01835 for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) 01836 Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j]; 01837 } 01838 return Buckets; 01839 } // MakeBuckets 01840 01841 01842 //--------------------------------------------------------------------------- 01843 uinT16 OptimumNumberOfBuckets(uinT32 SampleCount) { 01844 /* 01845 ** Parameters: 01846 ** SampleCount number of samples to be tested 01847 ** Operation: 01848 ** This routine computes the 
optimum number of histogram 01849 ** buckets that should be used in a chi-squared goodness of 01850 ** fit test for the specified number of samples. The optimum 01851 ** number is computed based on Table 4.1 on pg. 147 of 01852 ** "Measurement and Analysis of Random Data" by Bendat & Piersol. 01853 ** Linear interpolation is used to interpolate between table 01854 ** values. The table is intended for a 0.05 level of 01855 ** significance (alpha). This routine assumes that it is 01856 ** equally valid for other alpha's, which may not be true. 01857 ** Return: 01858 ** Optimum number of histogram buckets 01859 ** Exceptions: 01860 ** None 01861 ** History: 01862 ** 6/5/89, DSJ, Created. 01863 */ 01864 uinT8 Last, Next; 01865 FLOAT32 Slope; 01866 01867 if (SampleCount < kCountTable[0]) 01868 return kBucketsTable[0]; 01869 01870 for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) { 01871 if (SampleCount <= kCountTable[Next]) { 01872 Slope = (FLOAT32) (kBucketsTable[Next] - kBucketsTable[Last]) / 01873 (FLOAT32) (kCountTable[Next] - kCountTable[Last]); 01874 return ((uinT16) (kBucketsTable[Last] + 01875 Slope * (SampleCount - kCountTable[Last]))); 01876 } 01877 } 01878 return kBucketsTable[Last]; 01879 } // OptimumNumberOfBuckets 01880 01881 01882 //--------------------------------------------------------------------------- 01883 FLOAT64 01884 ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha) 01885 /* 01886 ** Parameters: 01887 ** DegreesOfFreedom determines shape of distribution 01888 ** Alpha probability of right tail 01889 ** Operation: 01890 ** This routine computes the chi-squared value which will 01891 ** leave a cumulative probability of Alpha in the right tail 01892 ** of a chi-squared distribution with the specified number of 01893 ** degrees of freedom. Alpha must be between 0 and 1. 01894 ** DegreesOfFreedom must be even. The routine maintains an 01895 ** array of lists. 
Each list corresponds to a different 01896 ** number of degrees of freedom. Each entry in the list 01897 ** corresponds to a different alpha value and its corresponding 01898 ** chi-squared value. Therefore, once a particular chi-squared 01899 ** value is computed, it is stored in the list and never 01900 ** needs to be computed again. 01901 ** Return: Desired chi-squared value 01902 ** Exceptions: none 01903 ** History: 6/5/89, DSJ, Created. 01904 */ 01905 #define CHIACCURACY 0.01 01906 #define MINALPHA (1e-200) 01907 { 01908 static LIST ChiWith[MAXDEGREESOFFREEDOM + 1]; 01909 01910 CHISTRUCT *OldChiSquared; 01911 CHISTRUCT SearchKey; 01912 01913 // limit the minimum alpha that can be used - if alpha is too small 01914 // it may not be possible to compute chi-squared. 01915 Alpha = ClipToRange(Alpha, MINALPHA, 1.0); 01916 if (Odd (DegreesOfFreedom)) 01917 DegreesOfFreedom++; 01918 01919 /* find the list of chi-squared values which have already been computed 01920 for the specified number of degrees of freedom. Search the list for 01921 the desired chi-squared. 
*/ 01922 SearchKey.Alpha = Alpha; 01923 OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom], 01924 &SearchKey, AlphaMatch)); 01925 01926 if (OldChiSquared == NULL) { 01927 OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha); 01928 OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared, 01929 (FLOAT64) DegreesOfFreedom, 01930 (FLOAT64) CHIACCURACY); 01931 ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom], 01932 OldChiSquared); 01933 } 01934 else { 01935 // further optimization might move OldChiSquared to front of list 01936 } 01937 01938 return (OldChiSquared->ChiSquared); 01939 01940 } // ComputeChiSquared 01941 01942 01943 //--------------------------------------------------------------------------- 01944 FLOAT64 NormalDensity(inT32 x) { 01945 /* 01946 ** Parameters: 01947 ** x number to compute the normal probability density for 01948 ** Globals: 01949 ** kNormalMean mean of a discrete normal distribution 01950 ** kNormalVariance variance of a discrete normal distribution 01951 ** kNormalMagnitude magnitude of a discrete normal distribution 01952 ** Operation: 01953 ** This routine computes the probability density function 01954 ** of a discrete normal distribution defined by the global 01955 ** variables kNormalMean, kNormalVariance, and kNormalMagnitude. 01956 ** Normal magnitude could, of course, be computed in terms of 01957 ** the normal variance but it is precomputed for efficiency. 01958 ** Return: 01959 ** The value of the normal distribution at x. 01960 ** Exceptions: 01961 ** None 01962 ** History: 01963 ** 6/4/89, DSJ, Created. 
01964 */ 01965 FLOAT64 Distance; 01966 01967 Distance = x - kNormalMean; 01968 return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance); 01969 } // NormalDensity 01970 01971 01972 //--------------------------------------------------------------------------- 01973 FLOAT64 UniformDensity(inT32 x) { 01974 /* 01975 ** Parameters: 01976 ** x number to compute the uniform probability density for 01977 ** Operation: 01978 ** This routine computes the probability density function 01979 ** of a uniform distribution at the specified point. The 01980 ** range of the distribution is from 0 to BUCKETTABLESIZE. 01981 ** Return: 01982 ** The value of the uniform distribution at x. 01983 ** Exceptions: 01984 ** None 01985 ** History: 01986 ** 6/5/89, DSJ, Created. 01987 */ 01988 static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE; 01989 01990 if ((x >= 0.0) && (x <= BUCKETTABLESIZE)) 01991 return UniformDistributionDensity; 01992 else 01993 return (FLOAT64) 0.0; 01994 } // UniformDensity 01995 01996 01997 //--------------------------------------------------------------------------- 01998 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx) { 01999 /* 02000 ** Parameters: 02001 ** f1 value of function at x1 02002 ** f2 value of function at x2 02003 ** Dx x2 - x1 (should always be positive) 02004 ** Operation: 02005 ** This routine computes a trapezoidal approximation to the 02006 ** integral of a function over a small delta in x. 02007 ** Return: 02008 ** Approximation of the integral of the function from x1 to x2. 02009 ** Exceptions: 02010 ** None 02011 ** History: 02012 ** 6/5/89, DSJ, Created. 
02013 */ 02014 return (f1 + f2) * Dx / 2.0; 02015 } // Integral 02016 02017 02018 //--------------------------------------------------------------------------- 02019 void FillBuckets(BUCKETS *Buckets, 02020 CLUSTER *Cluster, 02021 uinT16 Dim, 02022 PARAM_DESC *ParamDesc, 02023 FLOAT32 Mean, 02024 FLOAT32 StdDev) { 02025 /* 02026 ** Parameters: 02027 ** Buckets histogram buckets to count samples 02028 ** Cluster cluster whose samples are being analyzed 02029 ** Dim dimension of samples which is being analyzed 02030 ** ParamDesc description of the dimension 02031 ** Mean "mean" of the distribution 02032 ** StdDev "standard deviation" of the distribution 02033 ** Operation: 02034 ** This routine counts the number of cluster samples which 02035 ** fall within the various histogram buckets in Buckets. Only 02036 ** one dimension of each sample is examined. The exact meaning 02037 ** of the Mean and StdDev parameters depends on the 02038 ** distribution which is being analyzed (this info is in the 02039 ** Buckets data structure). For normal distributions, Mean 02040 ** and StdDev have the expected meanings. For uniform and 02041 ** random distributions the Mean is the center point of the 02042 ** range and the StdDev is 1/2 the range. A dimension with 02043 ** zero standard deviation cannot be statistically analyzed. 02044 ** In this case, a pseudo-analysis is used. 02045 ** Return: 02046 ** None (the Buckets data structure is filled in) 02047 ** Exceptions: 02048 ** None 02049 ** History: 02050 ** 6/5/89, DSJ, Created. 02051 */ 02052 uinT16 BucketID; 02053 int i; 02054 LIST SearchState; 02055 SAMPLE *Sample; 02056 02057 // initialize the histogram bucket counts to 0 02058 for (i = 0; i < Buckets->NumberOfBuckets; i++) 02059 Buckets->Count[i] = 0; 02060 02061 if (StdDev == 0.0) { 02062 /* if the standard deviation is zero, then we can't statistically 02063 analyze the cluster. 
Use a pseudo-analysis: samples exactly on 02064 the mean are distributed evenly across all buckets. Samples greater 02065 than the mean are placed in the last bucket; samples less than the 02066 mean are placed in the first bucket. */ 02067 02068 InitSampleSearch(SearchState, Cluster); 02069 i = 0; 02070 while ((Sample = NextSample (&SearchState)) != NULL) { 02071 if (Sample->Mean[Dim] > Mean) 02072 BucketID = Buckets->NumberOfBuckets - 1; 02073 else if (Sample->Mean[Dim] < Mean) 02074 BucketID = 0; 02075 else 02076 BucketID = i; 02077 Buckets->Count[BucketID] += 1; 02078 i++; 02079 if (i >= Buckets->NumberOfBuckets) 02080 i = 0; 02081 } 02082 } 02083 else { 02084 // search for all samples in the cluster and add to histogram buckets 02085 InitSampleSearch(SearchState, Cluster); 02086 while ((Sample = NextSample (&SearchState)) != NULL) { 02087 switch (Buckets->Distribution) { 02088 case normal: 02089 BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim], 02090 Mean, StdDev); 02091 break; 02092 case D_random: 02093 case uniform: 02094 BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim], 02095 Mean, StdDev); 02096 break; 02097 default: 02098 BucketID = 0; 02099 } 02100 Buckets->Count[Buckets->Bucket[BucketID]] += 1; 02101 } 02102 } 02103 } // FillBuckets 02104 02105 02106 //---------------------------------------------------------------------------*/ 02107 uinT16 NormalBucket(PARAM_DESC *ParamDesc, 02108 FLOAT32 x, 02109 FLOAT32 Mean, 02110 FLOAT32 StdDev) { 02111 /* 02112 ** Parameters: 02113 ** ParamDesc used to identify circular dimensions 02114 ** x value to be normalized 02115 ** Mean mean of normal distribution 02116 ** StdDev standard deviation of normal distribution 02117 ** Operation: 02118 ** This routine determines which bucket x falls into in the 02119 ** discrete normal distribution defined by kNormalMean 02120 ** and kNormalStdDev. x values which exceed the range of 02121 ** the discrete distribution are clipped. 
02122 ** Return: 02123 ** Bucket number into which x falls 02124 ** Exceptions: 02125 ** None 02126 ** History: 02127 ** 6/5/89, DSJ, Created. 02128 */ 02129 FLOAT32 X; 02130 02131 // wraparound circular parameters if necessary 02132 if (ParamDesc->Circular) { 02133 if (x - Mean > ParamDesc->HalfRange) 02134 x -= ParamDesc->Range; 02135 else if (x - Mean < -ParamDesc->HalfRange) 02136 x += ParamDesc->Range; 02137 } 02138 02139 X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean; 02140 if (X < 0) 02141 return 0; 02142 if (X > BUCKETTABLESIZE - 1) 02143 return ((uinT16) (BUCKETTABLESIZE - 1)); 02144 return (uinT16) floor((FLOAT64) X); 02145 } // NormalBucket 02146 02147 02148 //--------------------------------------------------------------------------- 02149 uinT16 UniformBucket(PARAM_DESC *ParamDesc, 02150 FLOAT32 x, 02151 FLOAT32 Mean, 02152 FLOAT32 StdDev) { 02153 /* 02154 ** Parameters: 02155 ** ParamDesc used to identify circular dimensions 02156 ** x value to be normalized 02157 ** Mean center of range of uniform distribution 02158 ** StdDev 1/2 the range of the uniform distribution 02159 ** Operation: 02160 ** This routine determines which bucket x falls into in the 02161 ** discrete uniform distribution defined by 02162 ** BUCKETTABLESIZE. x values which exceed the range of 02163 ** the discrete distribution are clipped. 02164 ** Return: 02165 ** Bucket number into which x falls 02166 ** Exceptions: 02167 ** None 02168 ** History: 02169 ** 6/5/89, DSJ, Created. 
02170 */ 02171 FLOAT32 X; 02172 02173 // wraparound circular parameters if necessary 02174 if (ParamDesc->Circular) { 02175 if (x - Mean > ParamDesc->HalfRange) 02176 x -= ParamDesc->Range; 02177 else if (x - Mean < -ParamDesc->HalfRange) 02178 x += ParamDesc->Range; 02179 } 02180 02181 X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0); 02182 if (X < 0) 02183 return 0; 02184 if (X > BUCKETTABLESIZE - 1) 02185 return (uinT16) (BUCKETTABLESIZE - 1); 02186 return (uinT16) floor((FLOAT64) X); 02187 } // UniformBucket 02188 02189 02190 //--------------------------------------------------------------------------- 02191 BOOL8 DistributionOK(BUCKETS *Buckets) { 02192 /* 02193 ** Parameters: 02194 ** Buckets histogram data to perform chi-square test on 02195 ** Operation: 02196 ** This routine performs a chi-square goodness of fit test 02197 ** on the histogram data in the Buckets data structure. TRUE 02198 ** is returned if the histogram matches the probability 02199 ** distribution which was specified when the Buckets 02200 ** structure was originally created. Otherwise FALSE is 02201 ** returned. 02202 ** Return: 02203 ** TRUE if samples match distribution, FALSE otherwise 02204 ** Exceptions: 02205 ** None 02206 ** History: 02207 ** 6/5/89, DSJ, Created. 
02208 */ 02209 FLOAT32 FrequencyDifference; 02210 FLOAT32 TotalDifference; 02211 int i; 02212 02213 // compute how well the histogram matches the expected histogram 02214 TotalDifference = 0.0; 02215 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 02216 FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i]; 02217 TotalDifference += (FrequencyDifference * FrequencyDifference) / 02218 Buckets->ExpectedCount[i]; 02219 } 02220 02221 // test to see if the difference is more than expected 02222 if (TotalDifference > Buckets->ChiSquared) 02223 return FALSE; 02224 else 02225 return TRUE; 02226 } // DistributionOK 02227 02228 02229 //--------------------------------------------------------------------------- 02230 void FreeStatistics(STATISTICS *Statistics) { 02231 /* 02232 ** Parameters: 02233 ** Statistics pointer to data structure to be freed 02234 ** Operation: 02235 ** This routine frees the memory used by the statistics 02236 ** data structure. 02237 ** Return: 02238 ** None 02239 ** Exceptions: 02240 ** None 02241 ** History: 02242 ** 6/5/89, DSJ, Created. 02243 */ 02244 memfree (Statistics->CoVariance); 02245 memfree (Statistics->Min); 02246 memfree (Statistics->Max); 02247 memfree(Statistics); 02248 } // FreeStatistics 02249 02250 02251 //--------------------------------------------------------------------------- 02252 void FreeBuckets(BUCKETS *buckets) { 02253 /* 02254 ** Parameters: 02255 ** buckets pointer to data structure to be freed 02256 ** Operation: 02257 ** This routine properly frees the memory used by a BUCKETS. 
02258 */ 02259 Efree(buckets->Count); 02260 Efree(buckets->ExpectedCount); 02261 Efree(buckets); 02262 } // FreeBuckets 02263 02264 02265 //--------------------------------------------------------------------------- 02266 void FreeCluster(CLUSTER *Cluster) { 02267 /* 02268 ** Parameters: 02269 ** Cluster pointer to cluster to be freed 02270 ** Operation: 02271 ** This routine frees the memory consumed by the specified 02272 ** cluster and all of its subclusters. This is done by 02273 ** recursive calls to FreeCluster(). 02274 ** Return: 02275 ** None 02276 ** Exceptions: 02277 ** None 02278 ** History: 02279 ** 6/6/89, DSJ, Created. 02280 */ 02281 if (Cluster != NULL) { 02282 FreeCluster (Cluster->Left); 02283 FreeCluster (Cluster->Right); 02284 memfree(Cluster); 02285 } 02286 } // FreeCluster 02287 02288 02289 //--------------------------------------------------------------------------- 02290 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets) { 02291 /* 02292 ** Parameters: 02293 ** Distribution distribution being tested for 02294 ** HistogramBuckets number of buckets in chi-square test 02295 ** Operation: 02296 ** This routine computes the degrees of freedom that should 02297 ** be used in a chi-squared test with the specified number of 02298 ** histogram buckets. The result is always rounded up to 02299 ** the next even number so that the value of chi-squared can be 02300 ** computed more easily. This will cause the value of 02301 ** chi-squared to be higher than the optimum value, resulting 02302 ** in the chi-square test being more lenient than optimum. 02303 ** Return: The number of degrees of freedom for a chi-square test 02304 ** Exceptions: none 02305 ** History: Thu Aug 3 14:04:18 1989, DSJ, Created. 
02306 */ 02307 static uinT8 DegreeOffsets[] = { 3, 3, 1 }; 02308 02309 uinT16 AdjustedNumBuckets; 02310 02311 AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution]; 02312 if (Odd (AdjustedNumBuckets)) 02313 AdjustedNumBuckets++; 02314 return (AdjustedNumBuckets); 02315 02316 } // DegreesOfFreedom 02317 02318 02319 //--------------------------------------------------------------------------- 02320 int NumBucketsMatch(void *arg1, // BUCKETS *Histogram, 02321 void *arg2) { // uinT16 *DesiredNumberOfBuckets) 02322 /* 02323 ** Parameters: 02324 ** Histogram current histogram being tested for a match 02325 ** DesiredNumberOfBuckets match key 02326 ** Operation: 02327 ** This routine is used to search a list of histogram data 02328 ** structures to find one with the specified number of 02329 ** buckets. It is called by the list search routines. 02330 ** Return: TRUE if Histogram matches DesiredNumberOfBuckets 02331 ** Exceptions: none 02332 ** History: Thu Aug 3 14:17:33 1989, DSJ, Created. 02333 */ 02334 BUCKETS *Histogram = (BUCKETS *) arg1; 02335 uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2; 02336 02337 return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets); 02338 02339 } // NumBucketsMatch 02340 02341 02342 //--------------------------------------------------------------------------- 02343 int ListEntryMatch(void *arg1, //ListNode 02344 void *arg2) { //Key 02345 /* 02346 ** Parameters: none 02347 ** Operation: 02348 ** This routine is used to search a list for a list node 02349 ** whose contents match Key. It is called by the list 02350 ** delete_d routine. 02351 ** Return: TRUE if ListNode matches Key 02352 ** Exceptions: none 02353 ** History: Thu Aug 3 14:23:58 1989, DSJ, Created. 
02354 */ 02355 return (arg1 == arg2); 02356 02357 } // ListEntryMatch 02358 02359 02360 //--------------------------------------------------------------------------- 02361 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount) { 02362 /* 02363 ** Parameters: 02364 ** Buckets histogram data structure to adjust 02365 ** NewSampleCount new sample count to adjust to 02366 ** Operation: 02367 ** This routine multiplies each ExpectedCount histogram entry 02368 ** by NewSampleCount/OldSampleCount so that the histogram 02369 ** is now adjusted to the new sample count. 02370 ** Return: none 02371 ** Exceptions: none 02372 ** History: Thu Aug 3 14:31:14 1989, DSJ, Created. 02373 */ 02374 int i; 02375 FLOAT64 AdjustFactor; 02376 02377 AdjustFactor = (((FLOAT64) NewSampleCount) / 02378 ((FLOAT64) Buckets->SampleCount)); 02379 02380 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 02381 Buckets->ExpectedCount[i] *= AdjustFactor; 02382 } 02383 02384 Buckets->SampleCount = NewSampleCount; 02385 02386 } // AdjustBuckets 02387 02388 02389 //--------------------------------------------------------------------------- 02390 void InitBuckets(BUCKETS *Buckets) { 02391 /* 02392 ** Parameters: 02393 ** Buckets histogram data structure to init 02394 ** Operation: 02395 ** This routine sets the bucket counts in the specified histogram 02396 ** to zero. 02397 ** Return: none 02398 ** Exceptions: none 02399 ** History: Thu Aug 3 14:31:14 1989, DSJ, Created. 
02400 */ 02401 int i; 02402 02403 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 02404 Buckets->Count[i] = 0; 02405 } 02406 02407 } // InitBuckets 02408 02409 02410 //--------------------------------------------------------------------------- 02411 int AlphaMatch(void *arg1, //CHISTRUCT *ChiStruct, 02412 void *arg2) { //CHISTRUCT *SearchKey) 02413 /* 02414 ** Parameters: 02415 ** ChiStruct chi-squared struct being tested for a match 02416 ** SearchKey chi-squared struct that is the search key 02417 ** Operation: 02418 ** This routine is used to search a list of structures which 02419 ** hold pre-computed chi-squared values for a chi-squared 02420 ** value whose corresponding alpha field matches the alpha 02421 ** field of SearchKey. 02422 ** It is called by the list search routines. 02423 ** Return: TRUE if ChiStruct's Alpha matches SearchKey's Alpha 02424 ** Exceptions: none 02425 ** History: Thu Aug 3 14:17:33 1989, DSJ, Created. 02426 */ 02427 CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1; 02428 CHISTRUCT *SearchKey = (CHISTRUCT *) arg2; 02429 02430 return (ChiStruct->Alpha == SearchKey->Alpha); 02431 02432 } // AlphaMatch 02433 02434 02435 //--------------------------------------------------------------------------- 02436 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) { 02437 /* 02438 ** Parameters: 02439 ** DegreesOfFreedom degrees of freedom for new chi value 02440 ** Alpha confidence level for new chi value 02441 ** Operation: 02442 ** This routine allocates a new data structure which is used 02443 ** to hold a chi-squared value along with its associated 02444 ** number of degrees of freedom and alpha value. 02445 ** Return: none 02446 ** Exceptions: none 02447 ** History: Fri Aug 4 11:04:59 1989, DSJ, Created. 
02448 */ 02449 CHISTRUCT *NewChiStruct; 02450 02451 NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT)); 02452 NewChiStruct->DegreesOfFreedom = DegreesOfFreedom; 02453 NewChiStruct->Alpha = Alpha; 02454 return (NewChiStruct); 02455 02456 } // NewChiStruct 02457 02458 02459 //--------------------------------------------------------------------------- 02460 FLOAT64 02461 Solve (SOLVEFUNC Function, 02462 void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy) 02463 /* 02464 ** Parameters: 02465 ** Function function whose zero is to be found 02466 ** FunctionParams arbitrary data to pass to function 02467 ** InitialGuess point to start solution search at 02468 ** Accuracy maximum allowed error 02469 ** Operation: 02470 ** This routine attempts to find an x value at which Function 02471 ** goes to zero (i.e. a root of the function ). It will only 02472 ** work correctly if a solution actually exists and there 02473 ** are no extrema between the solution and the InitialGuess. 02474 ** The algorithms used are extremely primitive. 02475 ** Return: Solution of function ( x for which f(x) = 0 ). 02476 ** Exceptions: none 02477 ** History: Fri Aug 4 11:08:59 1989, DSJ, Created. 02478 */ 02479 #define INITIALDELTA 0.1 02480 #define DELTARATIO 0.1 02481 { 02482 FLOAT64 x; 02483 FLOAT64 f; 02484 FLOAT64 Slope; 02485 FLOAT64 Delta; 02486 FLOAT64 NewDelta; 02487 FLOAT64 xDelta; 02488 FLOAT64 LastPosX, LastNegX; 02489 02490 x = InitialGuess; 02491 Delta = INITIALDELTA; 02492 LastPosX = MAX_FLOAT32; 02493 LastNegX = -MAX_FLOAT32; 02494 f = (*Function) ((CHISTRUCT *) FunctionParams, x); 02495 while (Abs (LastPosX - LastNegX) > Accuracy) { 02496 // keep track of outer bounds of current estimate 02497 if (f < 0) 02498 LastNegX = x; 02499 else 02500 LastPosX = x; 02501 02502 // compute the approx. 
slope of f(x) at the current point 02503 Slope = 02504 ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta; 02505 02506 // compute the next solution guess */ 02507 xDelta = f / Slope; 02508 x -= xDelta; 02509 02510 // reduce the delta used for computing slope to be a fraction of 02511 //the amount moved to get to the new guess 02512 NewDelta = Abs (xDelta) * DELTARATIO; 02513 if (NewDelta < Delta) 02514 Delta = NewDelta; 02515 02516 // compute the value of the function at the new guess 02517 f = (*Function) ((CHISTRUCT *) FunctionParams, x); 02518 } 02519 return (x); 02520 02521 } // Solve 02522 02523 02524 //--------------------------------------------------------------------------- 02525 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x) { 02526 /* 02527 ** Parameters: 02528 ** ChiParams contains degrees of freedom and alpha 02529 ** x value of chi-squared to evaluate 02530 ** Operation: 02531 ** This routine computes the area under a chi density curve 02532 ** from 0 to x, minus the desired area under the curve. The 02533 ** number of degrees of freedom of the chi curve is specified 02534 ** in the ChiParams structure. The desired area is also 02535 ** specified in the ChiParams structure as Alpha ( or 1 minus 02536 ** the desired area ). This routine is intended to be passed 02537 ** to the Solve() function to find the value of chi-squared 02538 ** which will yield a desired area under the right tail of 02539 ** the chi density curve. The function will only work for 02540 ** even degrees of freedom. The equations are based on 02541 ** integrating the chi density curve in parts to obtain 02542 ** a series that can be used to compute the area under the 02543 ** curve. 02544 ** Return: Error between actual and desired area under the chi curve. 02545 ** Exceptions: none 02546 ** History: Fri Aug 4 12:48:41 1989, DSJ, Created. 
02547 */ 02548 int i, N; 02549 FLOAT64 SeriesTotal; 02550 FLOAT64 Denominator; 02551 FLOAT64 PowerOfx; 02552 02553 N = ChiParams->DegreesOfFreedom / 2 - 1; 02554 SeriesTotal = 1; 02555 Denominator = 1; 02556 PowerOfx = 1; 02557 for (i = 1; i <= N; i++) { 02558 Denominator *= 2 * i; 02559 PowerOfx *= x; 02560 SeriesTotal += PowerOfx / Denominator; 02561 } 02562 return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha); 02563 02564 } // ChiArea 02565 02566 02567 //--------------------------------------------------------------------------- 02568 BOOL8 02569 MultipleCharSamples (CLUSTERER * Clusterer, 02570 CLUSTER * Cluster, FLOAT32 MaxIllegal) 02571 /* 02572 ** Parameters: 02573 ** Clusterer data structure holding cluster tree 02574 ** Cluster cluster containing samples to be tested 02575 ** MaxIllegal max percentage of samples allowed to have 02576 ** more than 1 feature in the cluster 02577 ** Operation: 02578 ** This routine looks at all samples in the specified cluster. 02579 ** It computes a running estimate of the percentage of the 02580 ** charaters which have more than 1 sample in the cluster. 02581 ** When this percentage exceeds MaxIllegal, TRUE is returned. 02582 ** Otherwise FALSE is returned. The CharID 02583 ** fields must contain integers which identify the training 02584 ** characters which were used to generate the sample. One 02585 ** integer is used for each sample. The NumChar field in 02586 ** the Clusterer must contain the number of characters in the 02587 ** training set. All CharID fields must be between 0 and 02588 ** NumChar-1. The main function of this routine is to help 02589 ** identify clusters which need to be split further, i.e. if 02590 ** numerous training characters have 2 or more features which are 02591 ** contained in the same cluster, then the cluster should be 02592 ** split. 02593 ** Return: TRUE if the cluster should be split, FALSE otherwise. 
02594 ** Exceptions: none 02595 ** History: Wed Aug 30 11:13:05 1989, DSJ, Created. 02596 ** 2/22/90, DSJ, Added MaxIllegal control rather than always 02597 ** splitting illegal clusters. 02598 */ 02599 #define ILLEGAL_CHAR 2 02600 { 02601 static BOOL8 *CharFlags = NULL; 02602 static inT32 NumFlags = 0; 02603 int i; 02604 LIST SearchState; 02605 SAMPLE *Sample; 02606 inT32 CharID; 02607 inT32 NumCharInCluster; 02608 inT32 NumIllegalInCluster; 02609 FLOAT32 PercentIllegal; 02610 02611 // initial estimate assumes that no illegal chars exist in the cluster 02612 NumCharInCluster = Cluster->SampleCount; 02613 NumIllegalInCluster = 0; 02614 02615 if (Clusterer->NumChar > NumFlags) { 02616 if (CharFlags != NULL) 02617 memfree(CharFlags); 02618 NumFlags = Clusterer->NumChar; 02619 CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8)); 02620 } 02621 02622 for (i = 0; i < NumFlags; i++) 02623 CharFlags[i] = FALSE; 02624 02625 // find each sample in the cluster and check if we have seen it before 02626 InitSampleSearch(SearchState, Cluster); 02627 while ((Sample = NextSample (&SearchState)) != NULL) { 02628 CharID = Sample->CharID; 02629 if (CharFlags[CharID] == FALSE) { 02630 CharFlags[CharID] = TRUE; 02631 } 02632 else { 02633 if (CharFlags[CharID] == TRUE) { 02634 NumIllegalInCluster++; 02635 CharFlags[CharID] = ILLEGAL_CHAR; 02636 } 02637 NumCharInCluster--; 02638 PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster; 02639 if (PercentIllegal > MaxIllegal) { 02640 destroy(SearchState); 02641 return (TRUE); 02642 } 02643 } 02644 } 02645 return (FALSE); 02646 02647 } // MultipleCharSamples 02648 02649 // Compute the inverse of a matrix using LU decomposition with partial pivoting. 02650 // The return value is the sum of norms of the off-diagonal terms of the 02651 // product of a and inv. (A measure of the error.) 02652 double InvertMatrix(const float* input, int size, float* inv) { 02653 // Allocate memory for the 2D arrays. 
02654 GENERIC_2D_ARRAY<double> U(size, size, 0.0); 02655 GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0); 02656 GENERIC_2D_ARRAY<double> L(size, size, 0.0); 02657 02658 // Initialize the working matrices. U starts as input, L as I and U_inv as O. 02659 int row; 02660 int col; 02661 for (row = 0; row < size; row++) { 02662 for (col = 0; col < size; col++) { 02663 U[row][col] = input[row*size + col]; 02664 L[row][col] = row == col ? 1.0 : 0.0; 02665 U_inv[row][col] = 0.0; 02666 } 02667 } 02668 02669 // Compute forward matrix by inversion by LU decomposition of input. 02670 for (col = 0; col < size; ++col) { 02671 // Find best pivot 02672 int best_row = 0; 02673 double best_pivot = -1.0; 02674 for (row = col; row < size; ++row) { 02675 if (Abs(U[row][col]) > best_pivot) { 02676 best_pivot = Abs(U[row][col]); 02677 best_row = row; 02678 } 02679 } 02680 // Exchange pivot rows. 02681 if (best_row != col) { 02682 for (int k = 0; k < size; ++k) { 02683 double tmp = U[best_row][k]; 02684 U[best_row][k] = U[col][k]; 02685 U[col][k] = tmp; 02686 tmp = L[best_row][k]; 02687 L[best_row][k] = L[col][k]; 02688 L[col][k] = tmp; 02689 } 02690 } 02691 // Now do the pivot itself. 02692 for (row = col + 1; row < size; ++row) { 02693 double ratio = -U[row][col] / U[col][col]; 02694 for (int j = col; j < size; ++j) { 02695 U[row][j] += U[col][j] * ratio; 02696 } 02697 for (int k = 0; k < size; ++k) { 02698 L[row][k] += L[col][k] * ratio; 02699 } 02700 } 02701 } 02702 // Next invert U. 02703 for (col = 0; col < size; ++col) { 02704 U_inv[col][col] = 1.0 / U[col][col]; 02705 for (row = col - 1; row >= 0; --row) { 02706 double total = 0.0; 02707 for (int k = col; k > row; --k) { 02708 total += U[row][k] * U_inv[k][col]; 02709 } 02710 U_inv[row][col] = -total / U[row][row]; 02711 } 02712 } 02713 // Now the answer is U_inv.L. 
02714 for (row = 0; row < size; row++) { 02715 for (col = 0; col < size; col++) { 02716 double sum = 0.0; 02717 for (int k = row; k < size; ++k) { 02718 sum += U_inv[row][k] * L[k][col]; 02719 } 02720 inv[row*size + col] = sum; 02721 } 02722 } 02723 // Check matrix product. 02724 double error_sum = 0.0; 02725 for (row = 0; row < size; row++) { 02726 for (col = 0; col < size; col++) { 02727 double sum = 0.0; 02728 for (int k = 0; k < size; ++k) { 02729 sum += input[row*size + k] * inv[k *size + col]; 02730 } 02731 if (row != col) { 02732 error_sum += Abs(sum); 02733 } 02734 } 02735 } 02736 return error_sum; 02737 }