tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/classify/cluster.cpp
Go to the documentation of this file.
00001 /******************************************************************************
00002  **     Filename:       cluster.c
00003  **     Purpose:        Routines for clustering points in N-D space
00004  **     Author:         Dan Johnson
00005  **     History:        5/29/89, DSJ, Created.
00006  **
00007  **     (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00018 #include "const.h"
00019 #include "cluster.h"
00020 #include "emalloc.h"
00021 #include "genericheap.h"
00022 #include "helpers.h"
00023 #include "kdpair.h"
00024 #include "matrix.h"
00025 #include "tprintf.h"
00026 #include "danerror.h"
00027 #include "freelist.h"
00028 #include <math.h>
00029 
#define HOTELLING 1  // If true use Hotelling's test to decide where to split.
#define FTABLE_X 10  // Size of FTable.
#define FTABLE_Y 100  // Size of FTable.

// Table of values approximating the cumulative F-distribution for a confidence of 1%.
// NOTE(review): presumably FTable[row][col] gives the 1% critical F value for
// col+1 numerator and row+1 denominator degrees of freedom -- confirm against
// the Hotelling's-test caller before relying on the indexing convention.
const double FTable[FTABLE_Y][FTABLE_X] = {
 {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
  {98.502,  99.000,  99.166,  99.249,  99.300,  99.333,  99.356,  99.374,  99.388,  99.399,},
  {34.116,  30.816,  29.457,  28.710,  28.237,  27.911,  27.672,  27.489,  27.345,  27.229,},
  {21.198,  18.000,  16.694,  15.977,  15.522,  15.207,  14.976,  14.799,  14.659,  14.546,},
  {16.258,  13.274,  12.060,  11.392,  10.967,  10.672,  10.456,  10.289,  10.158,  10.051,},
  {13.745,  10.925,   9.780,   9.148,   8.746,   8.466,   8.260,   8.102,   7.976,   7.874,},
  {12.246,   9.547,   8.451,   7.847,   7.460,   7.191,   6.993,   6.840,   6.719,   6.620,},
  {11.259,   8.649,   7.591,   7.006,   6.632,   6.371,   6.178,   6.029,   5.911,   5.814,},
  {10.561,   8.022,   6.992,   6.422,   6.057,   5.802,   5.613,   5.467,   5.351,   5.257,},
  {10.044,   7.559,   6.552,   5.994,   5.636,   5.386,   5.200,   5.057,   4.942,   4.849,},
  { 9.646,   7.206,   6.217,   5.668,   5.316,   5.069,   4.886,   4.744,   4.632,   4.539,},
  { 9.330,   6.927,   5.953,   5.412,   5.064,   4.821,   4.640,   4.499,   4.388,   4.296,},
  { 9.074,   6.701,   5.739,   5.205,   4.862,   4.620,   4.441,   4.302,   4.191,   4.100,},
  { 8.862,   6.515,   5.564,   5.035,   4.695,   4.456,   4.278,   4.140,   4.030,   3.939,},
  { 8.683,   6.359,   5.417,   4.893,   4.556,   4.318,   4.142,   4.004,   3.895,   3.805,},
  { 8.531,   6.226,   5.292,   4.773,   4.437,   4.202,   4.026,   3.890,   3.780,   3.691,},
  { 8.400,   6.112,   5.185,   4.669,   4.336,   4.102,   3.927,   3.791,   3.682,   3.593,},
  { 8.285,   6.013,   5.092,   4.579,   4.248,   4.015,   3.841,   3.705,   3.597,   3.508,},
  { 8.185,   5.926,   5.010,   4.500,   4.171,   3.939,   3.765,   3.631,   3.523,   3.434,},
  { 8.096,   5.849,   4.938,   4.431,   4.103,   3.871,   3.699,   3.564,   3.457,   3.368,},
  { 8.017,   5.780,   4.874,   4.369,   4.042,   3.812,   3.640,   3.506,   3.398,   3.310,},
  { 7.945,   5.719,   4.817,   4.313,   3.988,   3.758,   3.587,   3.453,   3.346,   3.258,},
  { 7.881,   5.664,   4.765,   4.264,   3.939,   3.710,   3.539,   3.406,   3.299,   3.211,},
  { 7.823,   5.614,   4.718,   4.218,   3.895,   3.667,   3.496,   3.363,   3.256,   3.168,},
  { 7.770,   5.568,   4.675,   4.177,   3.855,   3.627,   3.457,   3.324,   3.217,   3.129,},
  { 7.721,   5.526,   4.637,   4.140,   3.818,   3.591,   3.421,   3.288,   3.182,   3.094,},
  { 7.677,   5.488,   4.601,   4.106,   3.785,   3.558,   3.388,   3.256,   3.149,   3.062,},
  { 7.636,   5.453,   4.568,   4.074,   3.754,   3.528,   3.358,   3.226,   3.120,   3.032,},
  { 7.598,   5.420,   4.538,   4.045,   3.725,   3.499,   3.330,   3.198,   3.092,   3.005,},
  { 7.562,   5.390,   4.510,   4.018,   3.699,   3.473,   3.305,   3.173,   3.067,   2.979,},
  { 7.530,   5.362,   4.484,   3.993,   3.675,   3.449,   3.281,   3.149,   3.043,   2.955,},
  { 7.499,   5.336,   4.459,   3.969,   3.652,   3.427,   3.258,   3.127,   3.021,   2.934,},
  { 7.471,   5.312,   4.437,   3.948,   3.630,   3.406,   3.238,   3.106,   3.000,   2.913,},
  { 7.444,   5.289,   4.416,   3.927,   3.611,   3.386,   3.218,   3.087,   2.981,   2.894,},
  { 7.419,   5.268,   4.396,   3.908,   3.592,   3.368,   3.200,   3.069,   2.963,   2.876,},
  { 7.396,   5.248,   4.377,   3.890,   3.574,   3.351,   3.183,   3.052,   2.946,   2.859,},
  { 7.373,   5.229,   4.360,   3.873,   3.558,   3.334,   3.167,   3.036,   2.930,   2.843,},
  { 7.353,   5.211,   4.343,   3.858,   3.542,   3.319,   3.152,   3.021,   2.915,   2.828,},
  { 7.333,   5.194,   4.327,   3.843,   3.528,   3.305,   3.137,   3.006,   2.901,   2.814,},
  { 7.314,   5.179,   4.313,   3.828,   3.514,   3.291,   3.124,   2.993,   2.888,   2.801,},
  { 7.296,   5.163,   4.299,   3.815,   3.501,   3.278,   3.111,   2.980,   2.875,   2.788,},
  { 7.280,   5.149,   4.285,   3.802,   3.488,   3.266,   3.099,   2.968,   2.863,   2.776,},
  { 7.264,   5.136,   4.273,   3.790,   3.476,   3.254,   3.087,   2.957,   2.851,   2.764,},
  { 7.248,   5.123,   4.261,   3.778,   3.465,   3.243,   3.076,   2.946,   2.840,   2.754,},
  { 7.234,   5.110,   4.249,   3.767,   3.454,   3.232,   3.066,   2.935,   2.830,   2.743,},
  { 7.220,   5.099,   4.238,   3.757,   3.444,   3.222,   3.056,   2.925,   2.820,   2.733,},
  { 7.207,   5.087,   4.228,   3.747,   3.434,   3.213,   3.046,   2.916,   2.811,   2.724,},
  { 7.194,   5.077,   4.218,   3.737,   3.425,   3.204,   3.037,   2.907,   2.802,   2.715,},
  { 7.182,   5.066,   4.208,   3.728,   3.416,   3.195,   3.028,   2.898,   2.793,   2.706,},
  { 7.171,   5.057,   4.199,   3.720,   3.408,   3.186,   3.020,   2.890,   2.785,   2.698,},
  { 7.159,   5.047,   4.191,   3.711,   3.400,   3.178,   3.012,   2.882,   2.777,   2.690,},
  { 7.149,   5.038,   4.182,   3.703,   3.392,   3.171,   3.005,   2.874,   2.769,   2.683,},
  { 7.139,   5.030,   4.174,   3.695,   3.384,   3.163,   2.997,   2.867,   2.762,   2.675,},
  { 7.129,   5.021,   4.167,   3.688,   3.377,   3.156,   2.990,   2.860,   2.755,   2.668,},
  { 7.119,   5.013,   4.159,   3.681,   3.370,   3.149,   2.983,   2.853,   2.748,   2.662,},
  { 7.110,   5.006,   4.152,   3.674,   3.363,   3.143,   2.977,   2.847,   2.742,   2.655,},
  { 7.102,   4.998,   4.145,   3.667,   3.357,   3.136,   2.971,   2.841,   2.736,   2.649,},
  { 7.093,   4.991,   4.138,   3.661,   3.351,   3.130,   2.965,   2.835,   2.730,   2.643,},
  { 7.085,   4.984,   4.132,   3.655,   3.345,   3.124,   2.959,   2.829,   2.724,   2.637,},
  { 7.077,   4.977,   4.126,   3.649,   3.339,   3.119,   2.953,   2.823,   2.718,   2.632,},
  { 7.070,   4.971,   4.120,   3.643,   3.333,   3.113,   2.948,   2.818,   2.713,   2.626,},
  { 7.062,   4.965,   4.114,   3.638,   3.328,   3.108,   2.942,   2.813,   2.708,   2.621,},
  { 7.055,   4.959,   4.109,   3.632,   3.323,   3.103,   2.937,   2.808,   2.703,   2.616,},
  { 7.048,   4.953,   4.103,   3.627,   3.318,   3.098,   2.932,   2.803,   2.698,   2.611,},
  { 7.042,   4.947,   4.098,   3.622,   3.313,   3.093,   2.928,   2.798,   2.693,   2.607,},
  { 7.035,   4.942,   4.093,   3.618,   3.308,   3.088,   2.923,   2.793,   2.689,   2.602,},
  { 7.029,   4.937,   4.088,   3.613,   3.304,   3.084,   2.919,   2.789,   2.684,   2.598,},
  { 7.023,   4.932,   4.083,   3.608,   3.299,   3.080,   2.914,   2.785,   2.680,   2.593,},
  { 7.017,   4.927,   4.079,   3.604,   3.295,   3.075,   2.910,   2.781,   2.676,   2.589,},
  { 7.011,   4.922,   4.074,   3.600,   3.291,   3.071,   2.906,   2.777,   2.672,   2.585,},
  { 7.006,   4.917,   4.070,   3.596,   3.287,   3.067,   2.902,   2.773,   2.668,   2.581,},
  { 7.001,   4.913,   4.066,   3.591,   3.283,   3.063,   2.898,   2.769,   2.664,   2.578,},
  { 6.995,   4.908,   4.062,   3.588,   3.279,   3.060,   2.895,   2.765,   2.660,   2.574,},
  { 6.990,   4.904,   4.058,   3.584,   3.275,   3.056,   2.891,   2.762,   2.657,   2.570,},
  { 6.985,   4.900,   4.054,   3.580,   3.272,   3.052,   2.887,   2.758,   2.653,   2.567,},
  { 6.981,   4.896,   4.050,   3.577,   3.268,   3.049,   2.884,   2.755,   2.650,   2.563,},
  { 6.976,   4.892,   4.047,   3.573,   3.265,   3.046,   2.881,   2.751,   2.647,   2.560,},
  { 6.971,   4.888,   4.043,   3.570,   3.261,   3.042,   2.877,   2.748,   2.644,   2.557,},
  { 6.967,   4.884,   4.040,   3.566,   3.258,   3.039,   2.874,   2.745,   2.640,   2.554,},
  { 6.963,   4.881,   4.036,   3.563,   3.255,   3.036,   2.871,   2.742,   2.637,   2.551,},
  { 6.958,   4.877,   4.033,   3.560,   3.252,   3.033,   2.868,   2.739,   2.634,   2.548,},
  { 6.954,   4.874,   4.030,   3.557,   3.249,   3.030,   2.865,   2.736,   2.632,   2.545,},
  { 6.950,   4.870,   4.027,   3.554,   3.246,   3.027,   2.863,   2.733,   2.629,   2.542,},
  { 6.947,   4.867,   4.024,   3.551,   3.243,   3.025,   2.860,   2.731,   2.626,   2.539,},
  { 6.943,   4.864,   4.021,   3.548,   3.240,   3.022,   2.857,   2.728,   2.623,   2.537,},
  { 6.939,   4.861,   4.018,   3.545,   3.238,   3.019,   2.854,   2.725,   2.621,   2.534,},
  { 6.935,   4.858,   4.015,   3.543,   3.235,   3.017,   2.852,   2.723,   2.618,   2.532,},
  { 6.932,   4.855,   4.012,   3.540,   3.233,   3.014,   2.849,   2.720,   2.616,   2.529,},
  { 6.928,   4.852,   4.010,   3.538,   3.230,   3.012,   2.847,   2.718,   2.613,   2.527,},
  { 6.925,   4.849,   4.007,   3.535,   3.228,   3.009,   2.845,   2.715,   2.611,   2.524,},
  { 6.922,   4.846,   4.004,   3.533,   3.225,   3.007,   2.842,   2.713,   2.609,   2.522,},
  { 6.919,   4.844,   4.002,   3.530,   3.223,   3.004,   2.840,   2.711,   2.606,   2.520,},
  { 6.915,   4.841,   3.999,   3.528,   3.221,   3.002,   2.838,   2.709,   2.604,   2.518,},
  { 6.912,   4.838,   3.997,   3.525,   3.218,   3.000,   2.835,   2.706,   2.602,   2.515,},
  { 6.909,   4.836,   3.995,   3.523,   3.216,   2.998,   2.833,   2.704,   2.600,   2.513,},
  { 6.906,   4.833,   3.992,   3.521,   3.214,   2.996,   2.831,   2.702,   2.598,   2.511,},
  { 6.904,   4.831,   3.990,   3.519,   3.212,   2.994,   2.829,   2.700,   2.596,   2.509,},
  { 6.901,   4.829,   3.988,   3.517,   3.210,   2.992,   2.827,   2.698,   2.594,   2.507,},
  { 6.898,   4.826,   3.986,   3.515,   3.208,   2.990,   2.825,   2.696,   2.592,   2.505,},
  { 6.895,   4.824,   3.984,   3.513,   3.206,   2.988,   2.823,   2.694,   2.590,   2.503}
};
00137 
00138 /* define the variance which will be used as a minimum variance for any
00139   dimension of any feature. Since most features are calculated from numbers
00140   with a precision no better than 1 in 128, the variance should never be
00141   less than the square of this number for parameters whose range is 1. */
00142 #define MINVARIANCE     0.0004
00143 
00144 /* define the absolute minimum number of samples which must be present in
00145   order to accurately test hypotheses about underlying probability
00146   distributions.  Define separately the minimum samples that are needed
00147   before a statistical analysis is attempted; this number should be
00148   equal to MINSAMPLES but can be set to a lower number for early testing
00149   when very few samples are available. */
00150 #define MINSAMPLESPERBUCKET 5
00151 #define MINSAMPLES    (MINBUCKETS * MINSAMPLESPERBUCKET)
00152 #define MINSAMPLESNEEDED  1
00153 
00154 /* define the size of the table which maps normalized samples to
00155   histogram buckets.  Also define the number of standard deviations
00156   in a normal distribution which are considered to be significant.
00157   The mapping table will be defined in such a way that it covers
00158   the specified number of standard deviations on either side of
00159   the mean.  BUCKETTABLESIZE should always be even. */
00160 #define BUCKETTABLESIZE   1024
00161 #define NORMALEXTENT    3.0
00162 
// A candidate merge formed during cluster-tree construction: a cluster
// together with its current nearest neighbor in the kd-tree.
struct TEMPCLUSTER {
  CLUSTER *Cluster;              // cluster being considered for merging
  CLUSTER *Neighbor;             // nearest neighbor of Cluster
};

// Heap entry pairing the neighbor distance (key) with the candidate
// merge (data).  KDPairInc orders by increasing key, so the closest
// pair sits on top of the heap (see ClusteringContext below).
typedef tesseract::KDPairInc<float, TEMPCLUSTER*> ClusterPair;
typedef tesseract::GenericHeap<ClusterPair> ClusterHeap;
00170 
// Summary statistics for one cluster, produced by ComputeStatistics.
struct STATISTICS {
  FLOAT32 AvgVariance;           // average variance (by name; see ComputeStatistics)
  FLOAT32 *CoVariance;           // covariance matrix of the cluster's samples
  FLOAT32 *Min;                  // largest negative distance from the mean
  FLOAT32 *Max;                  // largest positive distance from the mean
};
00177 
// Histogram used to test whether the samples in one dimension of a
// cluster fit a hypothesized distribution (see MakeBuckets/FillBuckets
// and DistributionOK).
struct BUCKETS {
  DISTRIBUTION Distribution;     // distribution being tested for
  uinT32 SampleCount;            // # of samples in histogram
  FLOAT64 Confidence;            // confidence level of test
  FLOAT64 ChiSquared;            // test threshold
  uinT16 NumberOfBuckets;        // number of cells in histogram
  uinT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets
  uinT32 *Count;                 // frequency of occurrence histogram
  FLOAT32 *ExpectedCount;        // expected histogram
};
00188 
// Cache entry mapping (DegreesOfFreedom, Alpha) to a chi-squared value
// (see NewChiStruct, AlphaMatch, and ComputeChiSquared).
struct CHISTRUCT{
  uinT16 DegreesOfFreedom;       // degrees of freedom of the distribution
  FLOAT64 Alpha;                 // significance level the value was solved for
  FLOAT64 ChiSquared;            // the solved chi-squared value
};
00194 
// For use with KDWalk / MakePotentialClusters
struct ClusteringContext {
  ClusterHeap *heap;  // heap used to hold temp clusters, "best" on top
  TEMPCLUSTER *candidates;  // array of potential clusters
  KDTREE *tree;  // kd-tree to be searched for neighbors
  inT32 next;  // next candidate to be used
};

// Function-pointer types: a density function over bucket indices
// (NormalDensity/UniformDensity) and the function Solve() iterates on
// (ChiArea).
typedef FLOAT64 (*DENSITYFUNC) (inT32);
typedef FLOAT64 (*SOLVEFUNC) (CHISTRUCT *, double);
00205 
// Odd(N): nonzero iff N is odd.
// Mirror(N,R): index N reflected about the end of a table of size R.
// Abs(N): absolute value.  NOTE: N is evaluated twice; do not pass an
// expression with side effects.
#define Odd(N) ((N)%2)
#define Mirror(N,R) ((R) - (N) - 1)
#define Abs(N) ( ( (N) < 0 ) ? ( -(N) ) : (N) )
00209 
00210 //--------------Global Data Definitions and Declarations----------------------
00211 /* the following variables describe a discrete normal distribution
00212   which is used by NormalDensity() and NormalBucket().  The
00213   constant NORMALEXTENT determines how many standard
00214   deviations of the distribution are mapped onto the fixed
00215   discrete range of x.  x=0 is mapped to -NORMALEXTENT standard
00216   deviations and x=BUCKETTABLESIZE is mapped to
00217   +NORMALEXTENT standard deviations. */
00218 #define SqrtOf2Pi     2.506628275
00219 static const FLOAT64 kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
00220 static const FLOAT64 kNormalVariance =
00221     (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
00222 static const FLOAT64 kNormalMagnitude =
00223     (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
00224 static const FLOAT64 kNormalMean = BUCKETTABLESIZE / 2;
00225 
00226 /* define lookup tables used to compute the number of histogram buckets
00227   that should be used for a given number of samples. */
00228 #define LOOKUPTABLESIZE   8
00229 #define MAXDEGREESOFFREEDOM MAXBUCKETS
00230 
00231 static const uinT32 kCountTable[LOOKUPTABLESIZE] = {
00232   MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
00233 };  // number of samples
00234 
00235 static const uinT16 kBucketsTable[LOOKUPTABLESIZE] = {
00236   MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS
00237 };  // number of buckets
00238 
00239 /*-------------------------------------------------------------------------
00240           Private Function Prototypes
00241 --------------------------------------------------------------------------*/
00242 void CreateClusterTree(CLUSTERER *Clusterer);
00243 
00244 void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster,
00245                            inT32 Level);
00246 
00247 CLUSTER *FindNearestNeighbor(KDTREE *Tree,
00248                              CLUSTER *Cluster,
00249                              FLOAT32 *Distance);
00250 
00251 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
00252 
00253 inT32 MergeClusters (inT16 N,
00254 register PARAM_DESC ParamDesc[],
00255 register inT32 n1,
00256 register inT32 n2,
00257 register FLOAT32 m[],
00258 register FLOAT32 m1[], register FLOAT32 m2[]);
00259 
00260 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
00261 
00262 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
00263                          CLUSTERCONFIG *Config,
00264                          CLUSTER *Cluster);
00265 
00266 PROTOTYPE *MakeDegenerateProto(uinT16 N,
00267                                CLUSTER *Cluster,
00268                                STATISTICS *Statistics,
00269                                PROTOSTYLE Style,
00270                                inT32 MinSamples);
00271 
00272 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
00273                                CLUSTERCONFIG *Config,
00274                                CLUSTER *Cluster,
00275                                STATISTICS *Statistics);
00276 
00277 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
00278                               CLUSTER *Cluster,
00279                               STATISTICS *Statistics,
00280                               BUCKETS *Buckets);
00281 
00282 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
00283                                CLUSTER *Cluster,
00284                                STATISTICS *Statistics,
00285                                BUCKETS *Buckets);
00286 
00287 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
00288                           CLUSTER *Cluster,
00289                           STATISTICS *Statistics,
00290                           BUCKETS *NormalBuckets,
00291                           FLOAT64 Confidence);
00292 
00293 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
00294 
00295 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics);
00296 
00297 STATISTICS *ComputeStatistics (inT16 N,
00298 PARAM_DESC ParamDesc[], CLUSTER * Cluster);
00299 
00300 PROTOTYPE *NewSphericalProto(uinT16 N,
00301                              CLUSTER *Cluster,
00302                              STATISTICS *Statistics);
00303 
00304 PROTOTYPE *NewEllipticalProto(inT16 N,
00305                               CLUSTER *Cluster,
00306                               STATISTICS *Statistics);
00307 
00308 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics);
00309 
00310 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster);
00311 
00312 BOOL8 Independent (PARAM_DESC ParamDesc[],
00313 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence);
00314 
00315 BUCKETS *GetBuckets(CLUSTERER* clusterer,
00316                     DISTRIBUTION Distribution,
00317                     uinT32 SampleCount,
00318                     FLOAT64 Confidence);
00319 
00320 BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
00321                      uinT32 SampleCount,
00322                      FLOAT64 Confidence);
00323 
00324 uinT16 OptimumNumberOfBuckets(uinT32 SampleCount);
00325 
00326 FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
00327 
00328 FLOAT64 NormalDensity(inT32 x);
00329 
00330 FLOAT64 UniformDensity(inT32 x);
00331 
00332 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx);
00333 
00334 void FillBuckets(BUCKETS *Buckets,
00335                  CLUSTER *Cluster,
00336                  uinT16 Dim,
00337                  PARAM_DESC *ParamDesc,
00338                  FLOAT32 Mean,
00339                  FLOAT32 StdDev);
00340 
00341 uinT16 NormalBucket(PARAM_DESC *ParamDesc,
00342                     FLOAT32 x,
00343                     FLOAT32 Mean,
00344                     FLOAT32 StdDev);
00345 
00346 uinT16 UniformBucket(PARAM_DESC *ParamDesc,
00347                      FLOAT32 x,
00348                      FLOAT32 Mean,
00349                      FLOAT32 StdDev);
00350 
00351 BOOL8 DistributionOK(BUCKETS *Buckets);
00352 
00353 void FreeStatistics(STATISTICS *Statistics);
00354 
00355 void FreeBuckets(BUCKETS *Buckets);
00356 
00357 void FreeCluster(CLUSTER *Cluster);
00358 
00359 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets);
00360 
00361 int NumBucketsMatch(void *arg1,   // BUCKETS *Histogram,
00362                     void *arg2);  // uinT16 *DesiredNumberOfBuckets);
00363 
00364 int ListEntryMatch(void *arg1, void *arg2);
00365 
00366 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount);
00367 
00368 void InitBuckets(BUCKETS *Buckets);
00369 
00370 int AlphaMatch(void *arg1,   // CHISTRUCT *ChiStruct,
00371                void *arg2);  // CHISTRUCT *SearchKey);
00372 
00373 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha);
00374 
00375 FLOAT64 Solve(SOLVEFUNC Function,
00376               void *FunctionParams,
00377               FLOAT64 InitialGuess,
00378               FLOAT64 Accuracy);
00379 
00380 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x);
00381 
00382 BOOL8 MultipleCharSamples(CLUSTERER *Clusterer,
00383                           CLUSTER *Cluster,
00384                           FLOAT32 MaxIllegal);
00385 
00386 double InvertMatrix(const float* input, int size, float* inv);
00387 
00388 //--------------------------Public Code--------------------------------------
00398 CLUSTERER *
00399 MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]) {
00400   CLUSTERER *Clusterer;
00401   int i;
00402 
00403   // allocate main clusterer data structure and init simple fields
00404   Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
00405   Clusterer->SampleSize = SampleSize;
00406   Clusterer->NumberOfSamples = 0;
00407   Clusterer->NumChar = 0;
00408 
00409   // init fields which will not be used initially
00410   Clusterer->Root = NULL;
00411   Clusterer->ProtoList = NIL_LIST;
00412 
00413   // maintain a copy of param descriptors in the clusterer data structure
00414   Clusterer->ParamDesc =
00415     (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
00416   for (i = 0; i < SampleSize; i++) {
00417     Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
00418     Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
00419     Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
00420     Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
00421     Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
00422     Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
00423     Clusterer->ParamDesc[i].MidRange =
00424       (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
00425   }
00426 
00427   // allocate a kd tree to hold the samples
00428   Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
00429 
00430   // Initialize cache of histogram buckets to minimize recomputing them.
00431   for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
00432     for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
00433       Clusterer->bucket_cache[d][c] = NULL;
00434   }
00435 
00436   return Clusterer;
00437 }                                // MakeClusterer
00438 
00439 
00454 SAMPLE* MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature,
00455                    inT32 CharID) {
00456   SAMPLE *Sample;
00457   int i;
00458 
00459   // see if the samples have already been clustered - if so trap an error
00460   if (Clusterer->Root != NULL)
00461     DoError (ALREADYCLUSTERED,
00462       "Can't add samples after they have been clustered");
00463 
00464   // allocate the new sample and initialize it
00465   Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
00466     (Clusterer->SampleSize -
00467     1) * sizeof (FLOAT32));
00468   Sample->Clustered = FALSE;
00469   Sample->Prototype = FALSE;
00470   Sample->SampleCount = 1;
00471   Sample->Left = NULL;
00472   Sample->Right = NULL;
00473   Sample->CharID = CharID;
00474 
00475   for (i = 0; i < Clusterer->SampleSize; i++)
00476     Sample->Mean[i] = Feature[i];
00477 
00478   // add the sample to the KD tree - keep track of the total # of samples
00479   Clusterer->NumberOfSamples++;
00480   KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
00481   if (CharID >= Clusterer->NumChar)
00482     Clusterer->NumChar = CharID + 1;
00483 
00484   // execute hook for monitoring clustering operation
00485   // (*SampleCreationHook)( Sample );
00486 
00487   return (Sample);
00488 }                                // MakeSample
00489 
00490 
00508 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
00509   //only create cluster tree if samples have never been clustered before
00510   if (Clusterer->Root == NULL)
00511     CreateClusterTree(Clusterer);
00512 
00513   //deallocate the old prototype list if one exists
00514   FreeProtoList (&Clusterer->ProtoList);
00515   Clusterer->ProtoList = NIL_LIST;
00516 
00517   //compute prototypes starting at the root node in the tree
00518   ComputePrototypes(Clusterer, Config);
00519   return (Clusterer->ProtoList);
00520 }                                // ClusterSamples
00521 
00522 
00536 void FreeClusterer(CLUSTERER *Clusterer) {
00537   if (Clusterer != NULL) {
00538     memfree (Clusterer->ParamDesc);
00539     if (Clusterer->KDTree != NULL)
00540       FreeKDTree (Clusterer->KDTree);
00541     if (Clusterer->Root != NULL)
00542       FreeCluster (Clusterer->Root);
00543     // Free up all used buckets structures.
00544     for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
00545       for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
00546         if (Clusterer->bucket_cache[d][c] != NULL)
00547           FreeBuckets(Clusterer->bucket_cache[d][c]);
00548     }
00549 
00550     memfree(Clusterer);
00551   }
00552 }                                // FreeClusterer
00553 
00554 
/**
 * Frees every prototype in the list (via FreePrototype) along with the
 * list nodes themselves.
 * @param ProtoList  pointer to the prototype list to destroy
 */
void FreeProtoList(LIST *ProtoList) {
  destroy_nodes(*ProtoList, FreePrototype);
}                                // FreeProtoList
00567 
00568 
/**
 * Deallocates the memory consumed by a prototype and unmarks the
 * corresponding cluster so it is no longer flagged as a prototype.
 * The cluster itself is NOT freed.
 * @param arg  the PROTOTYPE to free (void* so this can be used as a
 *             destroy_nodes callback)
 */
void FreePrototype(void *arg) {  //PROTOTYPE     *Prototype)
  PROTOTYPE *Prototype = (PROTOTYPE *) arg;

  // unmark the corresponding cluster (if there is one)
  if (Prototype->Cluster != NULL)
    Prototype->Cluster->Prototype = FALSE;

  // deallocate the prototype statistics and then the prototype itself;
  // non-spherical styles own per-dimension arrays in the unions
  // (spherical presumably stores scalars inline -- hence the guard)
  if (Prototype->Distrib != NULL)
    memfree (Prototype->Distrib);
  if (Prototype->Mean != NULL)
    memfree (Prototype->Mean);
  if (Prototype->Style != spherical) {
    if (Prototype->Variance.Elliptical != NULL)
      memfree (Prototype->Variance.Elliptical);
    if (Prototype->Magnitude.Elliptical != NULL)
      memfree (Prototype->Magnitude.Elliptical);
    if (Prototype->Weight.Elliptical != NULL)
      memfree (Prototype->Weight.Elliptical);
  }
  memfree(Prototype);
}                                // FreePrototype
00601 
00602 
00618 CLUSTER *NextSample(LIST *SearchState) {
00619   CLUSTER *Cluster;
00620 
00621   if (*SearchState == NIL_LIST)
00622     return (NULL);
00623   Cluster = (CLUSTER *) first_node (*SearchState);
00624   *SearchState = pop (*SearchState);
00625   while (TRUE) {
00626     if (Cluster->Left == NULL)
00627       return (Cluster);
00628     *SearchState = push (*SearchState, Cluster->Right);
00629     Cluster = Cluster->Left;
00630   }
00631 }                                // NextSample
00632 
00633 
/**
 * Returns the mean of the specified dimension of the prototype.
 * @param Proto      prototype to examine
 * @param Dimension  dimension index
 * @return mean of Proto in Dimension
 */
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension) {
  return (Proto->Mean[Dimension]);
}                                // Mean
00646 
00647 
00657 FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension) {
00658   switch (Proto->Style) {
00659     case spherical:
00660       return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
00661     case elliptical:
00662       return ((FLOAT32)
00663         sqrt ((double) Proto->Variance.Elliptical[Dimension]));
00664     case mixed:
00665       switch (Proto->Distrib[Dimension]) {
00666         case normal:
00667           return ((FLOAT32)
00668             sqrt ((double) Proto->Variance.Elliptical[Dimension]));
00669         case uniform:
00670         case D_random:
00671           return (Proto->Variance.Elliptical[Dimension]);
00672         case DISTRIBUTION_COUNT:
00673           ASSERT_HOST(!"Distribution count not allowed!");
00674       }
00675   }
00676   return 0.0f;
00677 }                                // StandardDeviation
00678 
00679 
00680 /*---------------------------------------------------------------------------
00681             Private Code
00682 ----------------------------------------------------------------------------*/
/**
 * Builds a binary cluster tree by repeatedly merging each cluster with
 * its nearest neighbor until a single root cluster remains.  On return
 * Clusterer->Root points at that root and the kd-tree (along with the
 * heap and candidate array) has been freed.
 * @param Clusterer  clusterer whose samples are formed into a tree
 */
void CreateClusterTree(CLUSTERER *Clusterer) {
  ClusteringContext context;
  ClusterPair HeapEntry;
  TEMPCLUSTER *PotentialCluster;

  // each sample and its nearest neighbor form a "potential" cluster
  // save these in a heap with the "best" potential clusters on top
  context.tree = Clusterer->KDTree;
  context.candidates = (TEMPCLUSTER *)
    Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER));
  context.next = 0;
  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
  KDWalk(context.tree, (void_proc)MakePotentialClusters, &context);

  // form potential clusters into actual clusters - always do "best" first
  while (context.heap->Pop(&HeapEntry)) {
    PotentialCluster = HeapEntry.data;

    // if main cluster of potential cluster is already in another cluster
    // then we don't need to worry about it
    if (PotentialCluster->Cluster->Clustered) {
      continue;
    }

    // if main cluster is not yet clustered, but its nearest neighbor is
    // then we must find a new nearest neighbor
    // (HeapEntry.key is updated in place and the entry re-pushed)
    else if (PotentialCluster->Neighbor->Clustered) {
      PotentialCluster->Neighbor =
        FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
                            &HeapEntry.key);
      if (PotentialCluster->Neighbor != NULL) {
        context.heap->Push(&HeapEntry);
      }
    }

    // if neither cluster is already clustered, form permanent cluster
    // and re-enter it as a new candidate with its own nearest neighbor
    else {
      PotentialCluster->Cluster =
          MakeNewCluster(Clusterer, PotentialCluster);
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
                              &HeapEntry.key);
      if (PotentialCluster->Neighbor != NULL) {
        context.heap->Push(&HeapEntry);
      }
    }
  }

  // the root node in the cluster tree is now the only node in the kd-tree
  Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree);

  // free up the memory used by the K-D tree, heap, and temp clusters
  FreeKDTree(context.tree);
  Clusterer->KDTree = NULL;
  delete context.heap;
  memfree(context.candidates);
}                                // CreateClusterTree
00755 
00756 
00768 void MakePotentialClusters(ClusteringContext *context,
00769                            CLUSTER *Cluster, inT32 Level) {
00770   ClusterPair HeapEntry;
00771   int next = context->next;
00772   context->candidates[next].Cluster = Cluster;
00773   HeapEntry.data = &(context->candidates[next]);
00774   context->candidates[next].Neighbor =
00775       FindNearestNeighbor(context->tree,
00776                           context->candidates[next].Cluster,
00777                           &HeapEntry.key);
00778   if (context->candidates[next].Neighbor != NULL) {
00779     context->heap->Push(&HeapEntry);
00780     context->next++;
00781   }
00782 }                                // MakePotentialClusters
00783 
00784 
// Searches Tree for the cluster nearest to Cluster (excluding Cluster
// itself).  On return *Distance holds the distance to that neighbor, or
// MAXDISTANCE if no other cluster exists.  Returns the neighbor or NULL.
// NOTE(review): the macros below stay defined for the rest of the file.
CLUSTER *
FindNearestNeighbor(KDTREE * Tree, CLUSTER * Cluster, FLOAT32 * Distance)
#define MAXNEIGHBORS  2
#define MAXDISTANCE   MAX_FLOAT32
{
  CLUSTER *Neighbor[MAXNEIGHBORS];
  FLOAT32 Dist[MAXNEIGHBORS];
  int NumberOfNeighbors;
  inT32 i;
  CLUSTER *BestNeighbor;

  // find the 2 nearest neighbors of the cluster
  // (2 because the nearest may be the cluster itself)
  KDNearestNeighborSearch(Tree, Cluster->Mean, MAXNEIGHBORS, MAXDISTANCE,
                          &NumberOfNeighbors, (void **)Neighbor, Dist);

  // search for the nearest neighbor that is not the cluster itself
  *Distance = MAXDISTANCE;
  BestNeighbor = NULL;
  for (i = 0; i < NumberOfNeighbors; i++) {
    if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
      *Distance = Dist[i];
      BestNeighbor = Neighbor[i];
    }
  }
  return BestNeighbor;
}                                // FindNearestNeighbor
00827 
00828 
00841 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
00842   CLUSTER *Cluster;
00843 
00844   // allocate the new cluster and initialize it
00845   Cluster = (CLUSTER *) Emalloc(
00846       sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(FLOAT32));
00847   Cluster->Clustered = FALSE;
00848   Cluster->Prototype = FALSE;
00849   Cluster->Left = TempCluster->Cluster;
00850   Cluster->Right = TempCluster->Neighbor;
00851   Cluster->CharID = -1;
00852 
00853   // mark the old clusters as "clustered" and delete them from the kd-tree
00854   Cluster->Left->Clustered = TRUE;
00855   Cluster->Right->Clustered = TRUE;
00856   KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
00857   KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
00858 
00859   // compute the mean and sample count for the new cluster
00860   Cluster->SampleCount =
00861       MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
00862                     Cluster->Left->SampleCount, Cluster->Right->SampleCount,
00863                     Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
00864 
00865   // add the new cluster to the KD tree
00866   KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);
00867   return Cluster;
00868 }                                // MakeNewCluster
00869 
00870 
00886 inT32 MergeClusters(inT16 N,
00887                     PARAM_DESC ParamDesc[],
00888                     inT32 n1,
00889                     inT32 n2,
00890                     FLOAT32 m[],
00891                     FLOAT32 m1[], FLOAT32 m2[]) {
00892   inT32 i, n;
00893 
00894   n = n1 + n2;
00895   for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
00896     if (ParamDesc->Circular) {
00897       // if distance between means is greater than allowed
00898       // reduce upper point by one "rotation" to compute mean
00899       // then normalize the mean back into the accepted range
00900       if ((*m2 - *m1) > ParamDesc->HalfRange) {
00901         *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
00902         if (*m < ParamDesc->Min)
00903           *m += ParamDesc->Range;
00904       }
00905       else if ((*m1 - *m2) > ParamDesc->HalfRange) {
00906         *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
00907         if (*m < ParamDesc->Min)
00908           *m += ParamDesc->Range;
00909       }
00910       else
00911         *m = (n1 * *m1 + n2 * *m2) / n;
00912     }
00913     else
00914       *m = (n1 * *m1 + n2 * *m2) / n;
00915   }
00916   return n;
00917 }                                // MergeClusters
00918 
00919 
00931 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
00932   LIST ClusterStack = NIL_LIST;
00933   CLUSTER *Cluster;
00934   PROTOTYPE *Prototype;
00935 
00936   // use a stack to keep track of clusters waiting to be processed
00937   // initially the only cluster on the stack is the root cluster
00938   if (Clusterer->Root != NULL)
00939     ClusterStack = push (NIL_LIST, Clusterer->Root);
00940 
00941   // loop until we have analyzed all clusters which are potential prototypes
00942   while (ClusterStack != NIL_LIST) {
00943     // remove the next cluster to be analyzed from the stack
00944     // try to make a prototype from the cluster
00945     // if successful, put it on the proto list, else split the cluster
00946     Cluster = (CLUSTER *) first_node (ClusterStack);
00947     ClusterStack = pop (ClusterStack);
00948     Prototype = MakePrototype(Clusterer, Config, Cluster);
00949     if (Prototype != NULL) {
00950       Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
00951     }
00952     else {
00953       ClusterStack = push (ClusterStack, Cluster->Right);
00954       ClusterStack = push (ClusterStack, Cluster->Left);
00955     }
00956   }
00957 }                                // ComputePrototypes
00958 
00959 
// Attempts to build a prototype describing Cluster, in the style requested
// by Config.  Returns NULL if the cluster has too many samples from the
// same character, its dimensions are not statistically independent, or no
// distribution of the requested style fits.  The returned prototype (if
// any) is heap-allocated and owned by the caller; the temporary STATISTICS
// are always freed before returning.
PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
                         CLUSTERCONFIG *Config,
                         CLUSTER *Cluster) {
  STATISTICS *Statistics;
  PROTOTYPE *Proto;
  BUCKETS *Buckets;

  // filter out clusters which contain samples from the same character
  if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
    return NULL;

  // compute the covariance matrix and ranges for the cluster
  Statistics =
      ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);

  // check for degenerate clusters which need not be analyzed further
  // note that the MinSamples test assumes that all clusters with multiple
  // character samples have been removed (as above)
  Proto = MakeDegenerateProto(
      Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
      (inT32) (Config->MinSamples * Clusterer->NumChar));
  if (Proto != NULL) {
    FreeStatistics(Statistics);
    return Proto;
  }
  // check to ensure that all dimensions are independent
  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,
                   Statistics->CoVariance, Config->Independence)) {
    FreeStatistics(Statistics);
    return NULL;
  }

  // optionally apply Hotelling's T-squared test (see TestEllipticalProto)
  // to decide whether this elliptical cluster should stay together
  if (HOTELLING && Config->ProtoStyle == elliptical) {
    Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
    if (Proto != NULL) {
      FreeStatistics(Statistics);
      return Proto;
    }
  }

  // create a histogram data structure used to evaluate distributions
  // (cached inside the clusterer; do not free)
  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,
                       Config->Confidence);

  // create a prototype based on the statistics and test it
  switch (Config->ProtoStyle) {
    case spherical:
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case elliptical:
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case mixed:
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
                             Config->Confidence);
      break;
    case automatic:
      // try progressively more general models until one fits
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != NULL)
        break;
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != NULL)
        break;
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
                             Config->Confidence);
      break;
  }
  FreeStatistics(Statistics);
  return Proto;
}                                // MakePrototype
01048 
01049 
01071 PROTOTYPE *MakeDegenerateProto(  //this was MinSample
01072                                uinT16 N,
01073                                CLUSTER *Cluster,
01074                                STATISTICS *Statistics,
01075                                PROTOSTYLE Style,
01076                                inT32 MinSamples) {
01077   PROTOTYPE *Proto = NULL;
01078 
01079   if (MinSamples < MINSAMPLESNEEDED)
01080     MinSamples = MINSAMPLESNEEDED;
01081 
01082   if (Cluster->SampleCount < MinSamples) {
01083     switch (Style) {
01084       case spherical:
01085         Proto = NewSphericalProto (N, Cluster, Statistics);
01086         break;
01087       case elliptical:
01088       case automatic:
01089         Proto = NewEllipticalProto (N, Cluster, Statistics);
01090         break;
01091       case mixed:
01092         Proto = NewMixedProto (N, Cluster, Statistics);
01093         break;
01094     }
01095     Proto->Significant = FALSE;
01096   }
01097   return (Proto);
01098 }                                // MakeDegenerateProto
01099 
// Applies Hotelling's T-squared two-sample test to the two children of
// Cluster, using only the essential dimensions.  If the F statistic
// derived from T-squared is below the 1%-confidence FTable threshold, the
// children's means are not significantly different and the cluster is
// kept together by returning a new elliptical prototype; otherwise
// returns NULL so the caller splits the cluster.
PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
                               CLUSTERCONFIG *Config,
                               CLUSTER *Cluster,
                               STATISTICS *Statistics) {
  // Fraction of the number of samples used as a range around 1 within
  // which a cluster has the magic size that allows a boost to the
  // FTable by kFTableBoostMargin, thus allowing clusters near the
  // magic size (equal to the number of sample characters) to be more
  // likely to stay together.
  const double kMagicSampleMargin = 0.0625;
  const double kFTableBoostMargin = 2.0;

  int N = Clusterer->SampleSize;
  CLUSTER* Left = Cluster->Left;
  CLUSTER* Right = Cluster->Right;
  // leaf clusters have no children to compare
  if (Left == NULL || Right == NULL)
    return NULL;
  int TotalDims = Left->SampleCount + Right->SampleCount;
  // the test needs more samples than dimensions to be meaningful
  if (TotalDims < N + 1 || TotalDims < 2)
    return NULL;
  const int kMatrixSize = N * N * sizeof(FLOAT32);
  FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
  FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
  FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
  // Compute a new covariance matrix that only uses essential features.
  // Non-essential rows/columns are replaced by identity so the matrix
  // stays invertible without affecting the essential dimensions.
  for (int i = 0; i < N; ++i) {
    int row_offset = i * N;
    if (!Clusterer->ParamDesc[i].NonEssential) {
      for (int j = 0; j < N; ++j) {
        if (!Clusterer->ParamDesc[j].NonEssential)
          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
        else
          Covariance[j + row_offset] = 0.0f;
      }
    } else {
      for (int j = 0; j < N; ++j) {
        if (i == j)
          Covariance[j + row_offset] = 1.0f;
        else
          Covariance[j + row_offset] = 0.0f;
      }
    }
  }
  double err = InvertMatrix(Covariance, N, Inverse);
  if (err > 1) {
    // NOTE(review): inversion failure is only reported; the computation
    // continues with a possibly inaccurate Inverse — confirm intended
    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
  }
  // Delta is the difference of the two child means over essential dims
  int EssentialN = 0;
  for (int dim = 0; dim < N; ++dim) {
    if (!Clusterer->ParamDesc[dim].NonEssential) {
      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
      ++EssentialN;
    } else {
      Delta[dim] = 0.0f;
    }
  }
  // Compute Hotelling's T-squared:  Tsq = Delta' * Inverse * Delta.
  double Tsq = 0.0;
  for (int x = 0; x < N; ++x) {
    double temp = 0.0;
    for (int y = 0; y < N; ++y) {
      temp += Inverse[y + N*x] * Delta[y];
    }
    Tsq += Delta[x] * temp;
  }
  memfree(Covariance);
  memfree(Inverse);
  memfree(Delta);
  // Changed this function to match the formula in
  // Statistical Methods in Medical Research p 473
  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
  // clamp the FTable indices to the table bounds (1-based -> 0-based)
  int Fx = EssentialN;
  if (Fx > FTABLE_X)
    Fx = FTABLE_X;
  --Fx;
  int Fy = TotalDims - EssentialN - 1;
  if (Fy > FTABLE_Y)
    Fy = FTABLE_Y;
  --Fy;
  double FTarget = FTable[Fy][Fx];
  if (Config->MagicSamples > 0 &&
      TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
    // Give magic-sized clusters a magic FTable boost.
    FTarget += kFTableBoostMargin;
  }
  if (F < FTarget) {
    // means are not significantly different: keep the cluster together
    return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
  }
  return NULL;
}
01206 
01207 /* MakeSphericalProto *******************************************************
01208 Parameters:     Clusterer       data struct containing samples being clustered
01209       Cluster           cluster to be made into a spherical prototype
01210       Statistics        statistical info about cluster
01211       Buckets           histogram struct used to analyze distribution
01212 Operation:      This routine tests the specified cluster to see if it can
01213       be approximated by a spherical normal distribution.  If it
01214       can be, then a new prototype is formed and returned to the
01215       caller.  If it can't be, then NULL is returned to the caller.
01216 Return:         Pointer to new spherical prototype or NULL.
01217 Exceptions:     None
01218 History:        6/1/89, DSJ, Created.
01219 ******************************************************************************/
01220 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
01221                               CLUSTER *Cluster,
01222                               STATISTICS *Statistics,
01223                               BUCKETS *Buckets) {
01224   PROTOTYPE *Proto = NULL;
01225   int i;
01226 
01227   // check that each dimension is a normal distribution
01228   for (i = 0; i < Clusterer->SampleSize; i++) {
01229     if (Clusterer->ParamDesc[i].NonEssential)
01230       continue;
01231 
01232     FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01233       Cluster->Mean[i],
01234       sqrt ((FLOAT64) (Statistics->AvgVariance)));
01235     if (!DistributionOK (Buckets))
01236       break;
01237   }
01238   // if all dimensions matched a normal distribution, make a proto
01239   if (i >= Clusterer->SampleSize)
01240     Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
01241   return (Proto);
01242 }                                // MakeSphericalProto
01243 
01244 
01258 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
01259                                CLUSTER *Cluster,
01260                                STATISTICS *Statistics,
01261                                BUCKETS *Buckets) {
01262   PROTOTYPE *Proto = NULL;
01263   int i;
01264 
01265   // check that each dimension is a normal distribution
01266   for (i = 0; i < Clusterer->SampleSize; i++) {
01267     if (Clusterer->ParamDesc[i].NonEssential)
01268       continue;
01269 
01270     FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01271       Cluster->Mean[i],
01272       sqrt ((FLOAT64) Statistics->
01273       CoVariance[i * (Clusterer->SampleSize + 1)]));
01274     if (!DistributionOK (Buckets))
01275       break;
01276   }
01277   // if all dimensions matched a normal distribution, make a proto
01278   if (i >= Clusterer->SampleSize)
01279     Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
01280   return (Proto);
01281 }                                // MakeEllipticalProto
01282 
01283 
// Tries to fit each essential dimension of the cluster to a normal,
// random, or uniform distribution (in that order), building a mixed
// prototype whose Distrib[] records the distribution chosen per dimension.
// MakeDimRandom/MakeDimUniform mutate the proto in place before each
// re-test, so the order of these attempts matters.  If any dimension fits
// none of the three distributions, the proto is discarded and NULL is
// returned.  Bucket structures come from the clusterer's cache (not freed
// here).
PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
                          CLUSTER *Cluster,
                          STATISTICS *Statistics,
                          BUCKETS *NormalBuckets,
                          FLOAT64 Confidence) {
  PROTOTYPE *Proto;
  int i;
  BUCKETS *UniformBuckets = NULL;
  BUCKETS *RandomBuckets = NULL;

  // create a mixed proto to work on - initially assume all dimensions normal*/
  Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);

  // find the proper distribution for each dimension
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential)
      continue;

    // first choice: normal distribution
    FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
      Proto->Mean[i],
      sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
    if (DistributionOK (NormalBuckets))
      continue;

    // second choice: random distribution (buckets created lazily)
    if (RandomBuckets == NULL)
      RandomBuckets =
        GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
    MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
    FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
      Proto->Mean[i], Proto->Variance.Elliptical[i]);
    if (DistributionOK (RandomBuckets))
      continue;

    // last choice: uniform distribution (buckets created lazily)
    if (UniformBuckets == NULL)
      UniformBuckets =
        GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
    MakeDimUniform(i, Proto, Statistics);
    FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
      Proto->Mean[i], Proto->Variance.Elliptical[i]);
    if (DistributionOK (UniformBuckets))
      continue;
    break;
  }
  // if any dimension failed to match a distribution, discard the proto
  if (i < Clusterer->SampleSize) {
    FreePrototype(Proto);
    Proto = NULL;
  }
  return (Proto);
}                                // MakeMixedProto
01352 
01353 
01354 /* MakeDimRandom *************************************************************
01355 Parameters:     i               index of dimension to be changed
01356       Proto             prototype whose dimension is to be altered
01357       ParamDesc description of specified dimension
01358 Operation:      This routine alters the ith dimension of the specified
01359       mixed prototype to be D_random.
01360 Return:         None
01361 Exceptions:     None
01362 History:        6/20/89, DSJ, Created.
01363 ******************************************************************************/
01364 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
01365   Proto->Distrib[i] = D_random;
01366   Proto->Mean[i] = ParamDesc->MidRange;
01367   Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
01368 
01369   // subtract out the previous magnitude of this dimension from the total
01370   Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
01371   Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
01372   Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
01373   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01374 
01375   // note that the proto Weight is irrelevant for D_random protos
01376 }                                // MakeDimRandom
01377 
01378 
01389 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) {
01390   Proto->Distrib[i] = uniform;
01391   Proto->Mean[i] = Proto->Cluster->Mean[i] +
01392     (Statistics->Min[i] + Statistics->Max[i]) / 2;
01393   Proto->Variance.Elliptical[i] =
01394     (Statistics->Max[i] - Statistics->Min[i]) / 2;
01395   if (Proto->Variance.Elliptical[i] < MINVARIANCE)
01396     Proto->Variance.Elliptical[i] = MINVARIANCE;
01397 
01398   // subtract out the previous magnitude of this dimension from the total
01399   Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
01400   Proto->Magnitude.Elliptical[i] =
01401     1.0 / (2.0 * Proto->Variance.Elliptical[i]);
01402   Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
01403   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01404 
01405   // note that the proto Weight is irrelevant for uniform protos
01406 }                                // MakeDimUniform
01407 
01408 
// Computes the covariance matrix (N x N), per-dimension min/max offsets
// from the cluster mean, and the geometric mean of the diagonal variances
// (AvgVariance) over all samples in Cluster.  Circular dimensions are
// wrapped into [-HalfRange, HalfRange] before accumulation.  The returned
// STATISTICS and its arrays are heap-allocated; the caller frees them via
// FreeStatistics.
STATISTICS *
ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) {
  STATISTICS *Statistics;
  int i, j;
  FLOAT32 *CoVariance;
  FLOAT32 *Distance;
  LIST SearchState;
  SAMPLE *Sample;
  uinT32 SampleCountAdjustedForBias;

  // allocate memory to hold the statistics results
  Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
  Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
  Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
  Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));

  // allocate temporary memory to hold the sample to mean distances
  Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));

  // initialize the statistics
  Statistics->AvgVariance = 1.0;
  CoVariance = Statistics->CoVariance;
  for (i = 0; i < N; i++) {
    Statistics->Min[i] = 0.0;
    Statistics->Max[i] = 0.0;
    for (j = 0; j < N; j++, CoVariance++)
      *CoVariance = 0;
  }
  // find each sample in the cluster and merge it into the statistics
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample (&SearchState)) != NULL) {
    // per-dimension offset from the cluster mean, wrapped for circular dims
    for (i = 0; i < N; i++) {
      Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
      if (ParamDesc[i].Circular) {
        if (Distance[i] > ParamDesc[i].HalfRange)
          Distance[i] -= ParamDesc[i].Range;
        if (Distance[i] < -ParamDesc[i].HalfRange)
          Distance[i] += ParamDesc[i].Range;
      }
      if (Distance[i] < Statistics->Min[i])
        Statistics->Min[i] = Distance[i];
      if (Distance[i] > Statistics->Max[i])
        Statistics->Max[i] = Distance[i];
    }
    // accumulate the outer product Distance * Distance' into CoVariance
    CoVariance = Statistics->CoVariance;
    for (i = 0; i < N; i++)
      for (j = 0; j < N; j++, CoVariance++)
        *CoVariance += Distance[i] * Distance[j];
  }
  // normalize the variances by the total number of samples
  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
  // also compute the geometic mean of the diagonal variances
  // ensure that clusters with only 1 sample are handled correctly
  if (Cluster->SampleCount > 1)
    SampleCountAdjustedForBias = Cluster->SampleCount - 1;
  else
    SampleCountAdjustedForBias = 1;
  CoVariance = Statistics->CoVariance;
  for (i = 0; i < N; i++)
  for (j = 0; j < N; j++, CoVariance++) {
    *CoVariance /= SampleCountAdjustedForBias;
    if (j == i) {
      // clamp diagonal variances away from zero before taking the product
      if (*CoVariance < MINVARIANCE)
        *CoVariance = MINVARIANCE;
      Statistics->AvgVariance *= *CoVariance;
    }
  }
  // AvgVariance = Nth root of the product of the diagonal variances
  Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
                                       1.0 / N);

  // release temporary memory and return
  memfree(Distance);
  return (Statistics);
}                                // ComputeStatistics
01499 
01500 
01514 PROTOTYPE *NewSphericalProto(uinT16 N,
01515                              CLUSTER *Cluster,
01516                              STATISTICS *Statistics) {
01517   PROTOTYPE *Proto;
01518 
01519   Proto = NewSimpleProto (N, Cluster);
01520 
01521   Proto->Variance.Spherical = Statistics->AvgVariance;
01522   if (Proto->Variance.Spherical < MINVARIANCE)
01523     Proto->Variance.Spherical = MINVARIANCE;
01524 
01525   Proto->Magnitude.Spherical =
01526     1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
01527   Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
01528                                      (double) N);
01529   Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
01530   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01531 
01532   return (Proto);
01533 }                                // NewSphericalProto
01534 
01535 
01548 PROTOTYPE *NewEllipticalProto(inT16 N,
01549                               CLUSTER *Cluster,
01550                               STATISTICS *Statistics) {
01551   PROTOTYPE *Proto;
01552   FLOAT32 *CoVariance;
01553   int i;
01554 
01555   Proto = NewSimpleProto (N, Cluster);
01556   Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01557   Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01558   Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01559 
01560   CoVariance = Statistics->CoVariance;
01561   Proto->TotalMagnitude = 1.0;
01562   for (i = 0; i < N; i++, CoVariance += N + 1) {
01563     Proto->Variance.Elliptical[i] = *CoVariance;
01564     if (Proto->Variance.Elliptical[i] < MINVARIANCE)
01565       Proto->Variance.Elliptical[i] = MINVARIANCE;
01566 
01567     Proto->Magnitude.Elliptical[i] =
01568       1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
01569     Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
01570     Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
01571   }
01572   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01573   Proto->Style = elliptical;
01574   return (Proto);
01575 }                                // NewEllipticalProto
01576 
01577 
01593 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) {
01594   PROTOTYPE *Proto;
01595   int i;
01596 
01597   Proto = NewEllipticalProto (N, Cluster, Statistics);
01598   Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
01599 
01600   for (i = 0; i < N; i++) {
01601     Proto->Distrib[i] = normal;
01602   }
01603   Proto->Style = mixed;
01604   return (Proto);
01605 }                                // NewMixedProto
01606 
01607 
01618 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) {
01619   PROTOTYPE *Proto;
01620   int i;
01621 
01622   Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
01623   Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01624 
01625   for (i = 0; i < N; i++)
01626     Proto->Mean[i] = Cluster->Mean[i];
01627   Proto->Distrib = NULL;
01628 
01629   Proto->Significant = TRUE;
01630   Proto->Merged = FALSE;
01631   Proto->Style = spherical;
01632   Proto->NumSamples = Cluster->SampleCount;
01633   Proto->Cluster = Cluster;
01634   Proto->Cluster->Prototype = TRUE;
01635   return (Proto);
01636 }                                // NewSimpleProto
01637 
01638 
01659 BOOL8
01660 Independent (PARAM_DESC ParamDesc[],
01661 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) {
01662   int i, j;
01663   FLOAT32 *VARii;                // points to ith on-diagonal element
01664   FLOAT32 *VARjj;                // points to jth on-diagonal element
01665   FLOAT32 CorrelationCoeff;
01666 
01667   VARii = CoVariance;
01668   for (i = 0; i < N; i++, VARii += N + 1) {
01669     if (ParamDesc[i].NonEssential)
01670       continue;
01671 
01672     VARjj = VARii + N + 1;
01673     CoVariance = VARii + 1;
01674     for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
01675       if (ParamDesc[j].NonEssential)
01676         continue;
01677 
01678       if ((*VARii == 0.0) || (*VARjj == 0.0))
01679         CorrelationCoeff = 0.0;
01680       else
01681         CorrelationCoeff =
01682           sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
01683       if (CorrelationCoeff > Independence)
01684         return (FALSE);
01685     }
01686   }
01687   return (TRUE);
01688 }                                // Independent
01689 
01690 
01709 BUCKETS *GetBuckets(CLUSTERER* clusterer,
01710                     DISTRIBUTION Distribution,
01711                     uinT32 SampleCount,
01712                     FLOAT64 Confidence) {
01713   // Get an old bucket structure with the same number of buckets.
01714   uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
01715   BUCKETS *Buckets =
01716       clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
01717 
01718   // If a matching bucket structure is not found, make one and save it.
01719   if (Buckets == NULL) {
01720     Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
01721     clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
01722         Buckets;
01723   } else {
01724     // Just adjust the existing buckets.
01725     if (SampleCount != Buckets->SampleCount)
01726       AdjustBuckets(Buckets, SampleCount);
01727     if (Confidence != Buckets->Confidence) {
01728       Buckets->Confidence = Confidence;
01729       Buckets->ChiSquared = ComputeChiSquared(
01730           DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets),
01731           Confidence);
01732     }
01733     InitBuckets(Buckets);
01734   }
01735   return Buckets;
01736 }                                // GetBuckets
01737 
01738 
// Builds a BUCKETS structure for a chi-squared goodness-of-fit test of
// SampleCount samples against the given Distribution at the given
// Confidence.  The distribution's density is numerically integrated over
// BUCKETTABLESIZE table cells and the cells are assigned to buckets of
// approximately equal probability; since all supported distributions are
// symmetrical, only the upper half is computed and then mirrored.  The
// returned structure is heap-allocated (caller/cache owns it).
BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
                     uinT32 SampleCount,
                     FLOAT64 Confidence) {
  // density functions indexed by DISTRIBUTION (normal, uniform, D_random)
  const DENSITYFUNC DensityFunction[] =
    { NormalDensity, UniformDensity, UniformDensity };
  int i, j;
  BUCKETS *Buckets;
  FLOAT64 BucketProbability;
  FLOAT64 NextBucketBoundary;
  FLOAT64 Probability;
  FLOAT64 ProbabilityDelta;
  FLOAT64 LastProbDensity;
  FLOAT64 ProbDensity;
  uinT16 CurrentBucket;
  BOOL8 Symmetrical;

  // allocate memory needed for data structure
  Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS)));
  Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
  Buckets->SampleCount = SampleCount;
  Buckets->Confidence = Confidence;
  Buckets->Count = reinterpret_cast<uinT32*>(
      Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32)));
  Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>(
      Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32)));

  // initialize simple fields
  Buckets->Distribution = Distribution;
  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
    Buckets->Count[i] = 0;
    Buckets->ExpectedCount[i] = 0.0;
  }

  // all currently defined distributions are symmetrical
  Symmetrical = TRUE;
  Buckets->ChiSquared = ComputeChiSquared(
      DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);

  if (Symmetrical) {
    // allocate buckets so that all have approx. equal probability
    BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);

    // distribution is symmetric so fill in upper half then copy
    CurrentBucket = Buckets->NumberOfBuckets / 2;
    // an odd bucket count means the middle bucket straddles the center,
    // so only half its probability lies in the upper half
    if (Odd (Buckets->NumberOfBuckets))
      NextBucketBoundary = BucketProbability / 2;
    else
      NextBucketBoundary = BucketProbability;

    // integrate the density with the trapezoid rule, advancing to the
    // next bucket each time the accumulated probability crosses a boundary
    Probability = 0.0;
    LastProbDensity =
      (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
    for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
      ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
      ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
      Probability += ProbabilityDelta;
      if (Probability > NextBucketBoundary) {
        if (CurrentBucket < Buckets->NumberOfBuckets - 1)
          CurrentBucket++;
        NextBucketBoundary += BucketProbability;
      }
      Buckets->Bucket[i] = CurrentBucket;
      Buckets->ExpectedCount[CurrentBucket] +=
        (FLOAT32) (ProbabilityDelta * SampleCount);
      LastProbDensity = ProbDensity;
    }
    // place any leftover probability into the last bucket
    // (the upper half of a symmetrical distribution integrates to 0.5)
    Buckets->ExpectedCount[CurrentBucket] +=
      (FLOAT32) ((0.5 - Probability) * SampleCount);

    // copy upper half of distribution to lower half
    for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
      Buckets->Bucket[i] =
        Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);

    // copy upper half of expected counts to lower half
    for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
      Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
  }
  return Buckets;
}                                // MakeBuckets
01840 
01841 
01842 //---------------------------------------------------------------------------
01843 uinT16 OptimumNumberOfBuckets(uinT32 SampleCount) {
01844 /*
01845  **     Parameters:
01846  **             SampleCount     number of samples to be tested
01847   **    Operation:
01848  **             This routine computes the optimum number of histogram
01849  **             buckets that should be used in a chi-squared goodness of
01850  **             fit test for the specified number of samples.  The optimum
01851  **             number is computed based on Table 4.1 on pg. 147 of
01852  **             "Measurement and Analysis of Random Data" by Bendat & Piersol.
01853  **             Linear interpolation is used to interpolate between table
01854  **             values.  The table is intended for a 0.05 level of
01855  **             significance (alpha).  This routine assumes that it is
01856  **             equally valid for other alpha's, which may not be true.
01857  **     Return:
01858  **             Optimum number of histogram buckets
01859  **     Exceptions:
01860  **             None
01861  **     History:
01862  **             6/5/89, DSJ, Created.
01863  */
01864   uinT8 Last, Next;
01865   FLOAT32 Slope;
01866 
01867   if (SampleCount < kCountTable[0])
01868     return kBucketsTable[0];
01869 
01870   for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
01871     if (SampleCount <= kCountTable[Next]) {
01872       Slope = (FLOAT32) (kBucketsTable[Next] - kBucketsTable[Last]) /
01873           (FLOAT32) (kCountTable[Next] - kCountTable[Last]);
01874       return ((uinT16) (kBucketsTable[Last] +
01875           Slope * (SampleCount - kCountTable[Last])));
01876     }
01877   }
01878   return kBucketsTable[Last];
01879 }                                // OptimumNumberOfBuckets
01880 
01881 
01882 //---------------------------------------------------------------------------
FLOAT64
ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
/*
 **     Parameters:
 **             DegreesOfFreedom        determines shape of distribution
 **             Alpha                   probability of right tail
 **     Operation:
 **             This routine computes the chi-squared value which will
 **             leave a cumulative probability of Alpha in the right tail
 **             of a chi-squared distribution with the specified number of
 **             degrees of freedom.  Alpha must be between 0 and 1.
 **             DegreesOfFreedom must be even.  The routine maintains an
 **             array of lists.  Each list corresponds to a different
 **             number of degrees of freedom.  Each entry in the list
 **             corresponds to a different alpha value and its corresponding
 **             chi-squared value.  Therefore, once a particular chi-squared
 **             value is computed, it is stored in the list and never
 **             needs to be computed again.
 **     Return: Desired chi-squared value
 **     Exceptions: none
 **     History: 6/5/89, DSJ, Created.
 */
#define CHIACCURACY     0.01
#define MINALPHA  (1e-200)
{
  // cache of previously solved chi-squared values, indexed by (even)
  // degrees of freedom.  NOTE(review): DegreesOfFreedom is assumed to be
  // <= MAXDEGREESOFFREEDOM; no bound check is done before indexing ChiWith.
  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];

  CHISTRUCT *OldChiSquared;
  CHISTRUCT SearchKey;

  // limit the minimum alpha that can be used - if alpha is too small
  //      it may not be possible to compute chi-squared.
  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
  // round odd degrees of freedom up to even: the series expansion used by
  // ChiArea only works for even degrees of freedom
  if (Odd (DegreesOfFreedom))
    DegreesOfFreedom++;

  /* find the list of chi-squared values which have already been computed
     for the specified number of degrees of freedom.  Search the list for
     the desired chi-squared. */
  SearchKey.Alpha = Alpha;
  OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
    &SearchKey, AlphaMatch));

  if (OldChiSquared == NULL) {
    // cache miss: solve ChiArea == 0 numerically and remember the result
    OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
    OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
      (FLOAT64) DegreesOfFreedom,
      (FLOAT64) CHIACCURACY);
    ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
      OldChiSquared);
  }
  else {
    // further optimization might move OldChiSquared to front of list
  }

  return (OldChiSquared->ChiSquared);

}                                // ComputeChiSquared
01941 
01942 
01943 //---------------------------------------------------------------------------
01944 FLOAT64 NormalDensity(inT32 x) {
01945 /*
01946  **     Parameters:
01947  **             x       number to compute the normal probability density for
01948  **     Globals:
01949  **             kNormalMean     mean of a discrete normal distribution
01950  **             kNormalVariance variance of a discrete normal distribution
01951  **             kNormalMagnitude        magnitude of a discrete normal distribution
01952  **     Operation:
01953  **             This routine computes the probability density function
01954  **             of a discrete normal distribution defined by the global
01955  **             variables kNormalMean, kNormalVariance, and kNormalMagnitude.
01956  **             Normal magnitude could, of course, be computed in terms of
01957  **             the normal variance but it is precomputed for efficiency.
01958  **     Return:
01959  **             The value of the normal distribution at x.
01960  **     Exceptions:
01961  **             None
01962  **     History:
01963  **             6/4/89, DSJ, Created.
01964  */
01965   FLOAT64 Distance;
01966 
01967   Distance = x - kNormalMean;
01968   return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
01969 }                                // NormalDensity
01970 
01971 
01972 //---------------------------------------------------------------------------
01973 FLOAT64 UniformDensity(inT32 x) {
01974 /*
01975  **     Parameters:
01976  **             x       number to compute the uniform probability density for
01977  **     Operation:
01978  **             This routine computes the probability density function
01979  **             of a uniform distribution at the specified point.  The
01980  **             range of the distribution is from 0 to BUCKETTABLESIZE.
01981  **     Return:
01982  **             The value of the uniform distribution at x.
01983  **     Exceptions:
01984  **             None
01985  **     History:
01986  **             6/5/89, DSJ, Created.
01987  */
01988   static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;
01989 
01990   if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
01991     return UniformDistributionDensity;
01992   else
01993     return (FLOAT64) 0.0;
01994 }                                // UniformDensity
01995 
01996 
01997 //---------------------------------------------------------------------------
01998 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx) {
01999 /*
02000  **     Parameters:
02001  **             f1      value of function at x1
02002  **             f2      value of function at x2
02003  **             Dx      x2 - x1 (should always be positive)
02004  **     Operation:
02005  **             This routine computes a trapezoidal approximation to the
02006  **             integral of a function over a small delta in x.
02007  **     Return:
02008  **             Approximation of the integral of the function from x1 to x2.
02009  **     Exceptions:
02010  **             None
02011  **     History:
02012  **             6/5/89, DSJ, Created.
02013  */
02014   return (f1 + f2) * Dx / 2.0;
02015 }                                // Integral
02016 
02017 
02018 //---------------------------------------------------------------------------
void FillBuckets(BUCKETS *Buckets,
                 CLUSTER *Cluster,
                 uinT16 Dim,
                 PARAM_DESC *ParamDesc,
                 FLOAT32 Mean,
                 FLOAT32 StdDev) {
/*
 **     Parameters:
 **             Buckets         histogram buckets to count samples
 **             Cluster         cluster whose samples are being analyzed
 **             Dim             dimension of samples which is being analyzed
 **             ParamDesc       description of the dimension
 **             Mean            "mean" of the distribution
 **             StdDev          "standard deviation" of the distribution
 **     Operation:
 **             This routine counts the number of cluster samples which
 **             fall within the various histogram buckets in Buckets.  Only
 **             one dimension of each sample is examined.  The exact meaning
 **             of the Mean and StdDev parameters depends on the
 **             distribution which is being analyzed (this info is in the
 **             Buckets data structure).  For normal distributions, Mean
 **             and StdDev have the expected meanings.  For uniform and
 **             random distributions the Mean is the center point of the
 **             range and the StdDev is 1/2 the range.  A dimension with
 **             zero standard deviation cannot be statistically analyzed.
 **             In this case, a pseudo-analysis is used.
 **     Return:
 **             None (the Buckets data structure is filled in)
 **     Exceptions:
 **             None
 **     History:
 **             6/5/89, DSJ, Created.
 */
  uinT16 BucketID;
  int i;
  LIST SearchState;
  SAMPLE *Sample;

  // initialize the histogram bucket counts to 0
  for (i = 0; i < Buckets->NumberOfBuckets; i++)
    Buckets->Count[i] = 0;

  if (StdDev == 0.0) {
    /* if the standard deviation is zero, then we can't statistically
       analyze the cluster.  Use a pseudo-analysis: samples exactly on
       the mean are distributed evenly across all buckets.  Samples greater
       than the mean are placed in the last bucket; samples less than the
       mean are placed in the first bucket. */

    InitSampleSearch(SearchState, Cluster);
    i = 0;                       // round-robin index for samples equal to the mean
    while ((Sample = NextSample (&SearchState)) != NULL) {
      if (Sample->Mean[Dim] > Mean)
        BucketID = Buckets->NumberOfBuckets - 1;
      else if (Sample->Mean[Dim] < Mean)
        BucketID = 0;
      else
        BucketID = i;
      Buckets->Count[BucketID] += 1;
      i++;
      if (i >= Buckets->NumberOfBuckets)
        i = 0;
    }
  }
  else {
    // search for all samples in the cluster and add to histogram buckets
    InitSampleSearch(SearchState, Cluster);
    while ((Sample = NextSample (&SearchState)) != NULL) {
      switch (Buckets->Distribution) {
        case normal:
          BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
            Mean, StdDev);
          break;
        case D_random:
        case uniform:
          BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
            Mean, StdDev);
          break;
        default:
          // unexpected distribution type: fall back to the first table cell
          BucketID = 0;
      }
      // BucketID indexes the density lookup table; Bucket[] maps that
      // table cell to its actual histogram bucket
      Buckets->Count[Buckets->Bucket[BucketID]] += 1;
    }
  }
}                                // FillBuckets
02104 
02105 
02106 //---------------------------------------------------------------------------*/
02107 uinT16 NormalBucket(PARAM_DESC *ParamDesc,
02108                     FLOAT32 x,
02109                     FLOAT32 Mean,
02110                     FLOAT32 StdDev) {
02111 /*
02112  **     Parameters:
02113  **             ParamDesc       used to identify circular dimensions
02114  **             x               value to be normalized
02115  **             Mean            mean of normal distribution
02116  **             StdDev          standard deviation of normal distribution
02117  **     Operation:
02118  **             This routine determines which bucket x falls into in the
02119  **             discrete normal distribution defined by kNormalMean
02120  **             and kNormalStdDev.  x values which exceed the range of
02121  **             the discrete distribution are clipped.
02122  **     Return:
02123  **             Bucket number into which x falls
02124  **     Exceptions:
02125  **             None
02126  **     History:
02127  **             6/5/89, DSJ, Created.
02128  */
02129   FLOAT32 X;
02130 
02131   // wraparound circular parameters if necessary
02132   if (ParamDesc->Circular) {
02133     if (x - Mean > ParamDesc->HalfRange)
02134       x -= ParamDesc->Range;
02135     else if (x - Mean < -ParamDesc->HalfRange)
02136       x += ParamDesc->Range;
02137   }
02138 
02139   X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
02140   if (X < 0)
02141     return 0;
02142   if (X > BUCKETTABLESIZE - 1)
02143     return ((uinT16) (BUCKETTABLESIZE - 1));
02144   return (uinT16) floor((FLOAT64) X);
02145 }                                // NormalBucket
02146 
02147 
02148 //---------------------------------------------------------------------------
02149 uinT16 UniformBucket(PARAM_DESC *ParamDesc,
02150                      FLOAT32 x,
02151                      FLOAT32 Mean,
02152                      FLOAT32 StdDev) {
02153 /*
02154  **     Parameters:
02155  **             ParamDesc       used to identify circular dimensions
02156  **             x               value to be normalized
02157  **             Mean            center of range of uniform distribution
02158  **             StdDev          1/2 the range of the uniform distribution
02159  **     Operation:
02160  **             This routine determines which bucket x falls into in the
02161  **             discrete uniform distribution defined by
02162  **             BUCKETTABLESIZE.  x values which exceed the range of
02163  **             the discrete distribution are clipped.
02164  **     Return:
02165  **             Bucket number into which x falls
02166  **     Exceptions:
02167  **             None
02168  **     History:
02169  **             6/5/89, DSJ, Created.
02170  */
02171   FLOAT32 X;
02172 
02173   // wraparound circular parameters if necessary
02174   if (ParamDesc->Circular) {
02175     if (x - Mean > ParamDesc->HalfRange)
02176       x -= ParamDesc->Range;
02177     else if (x - Mean < -ParamDesc->HalfRange)
02178       x += ParamDesc->Range;
02179   }
02180 
02181   X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
02182   if (X < 0)
02183     return 0;
02184   if (X > BUCKETTABLESIZE - 1)
02185     return (uinT16) (BUCKETTABLESIZE - 1);
02186   return (uinT16) floor((FLOAT64) X);
02187 }                                // UniformBucket
02188 
02189 
02190 //---------------------------------------------------------------------------
02191 BOOL8 DistributionOK(BUCKETS *Buckets) {
02192 /*
02193  **     Parameters:
02194  **             Buckets         histogram data to perform chi-square test on
02195  **     Operation:
02196  **             This routine performs a chi-square goodness of fit test
02197  **             on the histogram data in the Buckets data structure.  TRUE
02198  **             is returned if the histogram matches the probability
02199  **             distribution which was specified when the Buckets
02200  **             structure was originally created.  Otherwise FALSE is
02201  **             returned.
02202  **     Return:
02203  **             TRUE if samples match distribution, FALSE otherwise
02204  **     Exceptions:
02205  **             None
02206  **     History:
02207  **             6/5/89, DSJ, Created.
02208  */
02209   FLOAT32 FrequencyDifference;
02210   FLOAT32 TotalDifference;
02211   int i;
02212 
02213   // compute how well the histogram matches the expected histogram
02214   TotalDifference = 0.0;
02215   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
02216     FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
02217     TotalDifference += (FrequencyDifference * FrequencyDifference) /
02218       Buckets->ExpectedCount[i];
02219   }
02220 
02221   // test to see if the difference is more than expected
02222   if (TotalDifference > Buckets->ChiSquared)
02223     return FALSE;
02224   else
02225     return TRUE;
02226 }                                // DistributionOK
02227 
02228 
02229 //---------------------------------------------------------------------------
02230 void FreeStatistics(STATISTICS *Statistics) {
02231 /*
02232  **     Parameters:
02233  **             Statistics      pointer to data structure to be freed
02234  **     Operation:
02235  **             This routine frees the memory used by the statistics
02236  **             data structure.
02237  **     Return:
02238  **             None
02239  **     Exceptions:
02240  **             None
02241  **     History:
02242  **             6/5/89, DSJ, Created.
02243  */
02244   memfree (Statistics->CoVariance);
02245   memfree (Statistics->Min);
02246   memfree (Statistics->Max);
02247   memfree(Statistics);
02248 }                                // FreeStatistics
02249 
02250 
02251 //---------------------------------------------------------------------------
02252 void FreeBuckets(BUCKETS *buckets) {
02253 /*
02254  **  Parameters:
02255  **      buckets  pointer to data structure to be freed
02256  **  Operation:
02257  **      This routine properly frees the memory used by a BUCKETS.
02258  */
02259   Efree(buckets->Count);
02260   Efree(buckets->ExpectedCount);
02261   Efree(buckets);
02262 }                                // FreeBuckets
02263 
02264 
02265 //---------------------------------------------------------------------------
02266 void FreeCluster(CLUSTER *Cluster) {
02267 /*
02268  **     Parameters:
02269  **             Cluster         pointer to cluster to be freed
02270  **     Operation:
02271  **             This routine frees the memory consumed by the specified
02272  **             cluster and all of its subclusters.  This is done by
02273  **             recursive calls to FreeCluster().
02274  **     Return:
02275  **             None
02276  **     Exceptions:
02277  **             None
02278  **     History:
02279  **             6/6/89, DSJ, Created.
02280  */
02281   if (Cluster != NULL) {
02282     FreeCluster (Cluster->Left);
02283     FreeCluster (Cluster->Right);
02284     memfree(Cluster);
02285   }
02286 }                                // FreeCluster
02287 
02288 
02289 //---------------------------------------------------------------------------
02290 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets) {
02291 /*
02292  **     Parameters:
02293  **             Distribution            distribution being tested for
02294  **             HistogramBuckets        number of buckets in chi-square test
02295  **     Operation:
02296  **             This routine computes the degrees of freedom that should
02297  **             be used in a chi-squared test with the specified number of
02298  **             histogram buckets.  The result is always rounded up to
02299  **             the next even number so that the value of chi-squared can be
02300  **             computed more easily.  This will cause the value of
02301  **             chi-squared to be higher than the optimum value, resulting
02302  **             in the chi-square test being more lenient than optimum.
02303  **     Return: The number of degrees of freedom for a chi-square test
02304  **     Exceptions: none
02305  **     History: Thu Aug  3 14:04:18 1989, DSJ, Created.
02306  */
02307   static uinT8 DegreeOffsets[] = { 3, 3, 1 };
02308 
02309   uinT16 AdjustedNumBuckets;
02310 
02311   AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
02312   if (Odd (AdjustedNumBuckets))
02313     AdjustedNumBuckets++;
02314   return (AdjustedNumBuckets);
02315 
02316 }                                // DegreesOfFreedom
02317 
02318 
02319 //---------------------------------------------------------------------------
02320 int NumBucketsMatch(void *arg1,    // BUCKETS *Histogram,
02321                     void *arg2) {  // uinT16 *DesiredNumberOfBuckets)
02322 /*
02323  **     Parameters:
02324  **             Histogram       current histogram being tested for a match
02325  **             DesiredNumberOfBuckets  match key
02326  **     Operation:
02327  **             This routine is used to search a list of histogram data
02328  **             structures to find one with the specified number of
02329  **             buckets.  It is called by the list search routines.
02330  **     Return: TRUE if Histogram matches DesiredNumberOfBuckets
02331  **     Exceptions: none
02332  **     History: Thu Aug  3 14:17:33 1989, DSJ, Created.
02333  */
02334   BUCKETS *Histogram = (BUCKETS *) arg1;
02335   uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
02336 
02337   return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
02338 
02339 }                                // NumBucketsMatch
02340 
02341 
02342 //---------------------------------------------------------------------------
int ListEntryMatch(void *arg1,    //ListNode
                   void *arg2) {  //Key
/*
 **     Parameters:
 **             arg1    list node contents being examined
 **             arg2    key to compare against
 **     Operation:
 **             Pointer-identity predicate used by the list delete_d
 **             routine: a node matches when its contents are the exact
 **             same pointer as the key.
 **     Return: TRUE if the node contents equal the key
 **     Exceptions: none
 **     History: Thu Aug  3 14:23:58 1989, DSJ, Created.
 */
  return arg1 == arg2;
}                                // ListEntryMatch
02358 
02359 
02360 //---------------------------------------------------------------------------
02361 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount) {
02362 /*
02363  **     Parameters:
02364  **             Buckets         histogram data structure to adjust
02365  **             NewSampleCount  new sample count to adjust to
02366  **     Operation:
02367  **             This routine multiplies each ExpectedCount histogram entry
02368  **             by NewSampleCount/OldSampleCount so that the histogram
02369  **             is now adjusted to the new sample count.
02370  **     Return: none
02371  **     Exceptions: none
02372  **     History: Thu Aug  3 14:31:14 1989, DSJ, Created.
02373  */
02374   int i;
02375   FLOAT64 AdjustFactor;
02376 
02377   AdjustFactor = (((FLOAT64) NewSampleCount) /
02378     ((FLOAT64) Buckets->SampleCount));
02379 
02380   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
02381     Buckets->ExpectedCount[i] *= AdjustFactor;
02382   }
02383 
02384   Buckets->SampleCount = NewSampleCount;
02385 
02386 }                                // AdjustBuckets
02387 
02388 
02389 //---------------------------------------------------------------------------
02390 void InitBuckets(BUCKETS *Buckets) {
02391 /*
02392  **     Parameters:
02393  **             Buckets         histogram data structure to init
02394  **     Operation:
02395  **             This routine sets the bucket counts in the specified histogram
02396  **             to zero.
02397  **     Return: none
02398  **     Exceptions: none
02399  **     History: Thu Aug  3 14:31:14 1989, DSJ, Created.
02400  */
02401   int i;
02402 
02403   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
02404     Buckets->Count[i] = 0;
02405   }
02406 
02407 }                                // InitBuckets
02408 
02409 
02410 //---------------------------------------------------------------------------
02411 int AlphaMatch(void *arg1,    //CHISTRUCT                             *ChiStruct,
02412                void *arg2) {  //CHISTRUCT                             *SearchKey)
02413 /*
02414  **     Parameters:
02415  **             ChiStruct       chi-squared struct being tested for a match
02416  **             SearchKey       chi-squared struct that is the search key
02417  **     Operation:
02418  **             This routine is used to search a list of structures which
02419  **             hold pre-computed chi-squared values for a chi-squared
02420  **             value whose corresponding alpha field matches the alpha
02421  **             field of SearchKey.
02422  **             It is called by the list search routines.
02423  **     Return: TRUE if ChiStruct's Alpha matches SearchKey's Alpha
02424  **     Exceptions: none
02425  **     History: Thu Aug  3 14:17:33 1989, DSJ, Created.
02426  */
02427   CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
02428   CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
02429 
02430   return (ChiStruct->Alpha == SearchKey->Alpha);
02431 
02432 }                                // AlphaMatch
02433 
02434 
02435 //---------------------------------------------------------------------------
02436 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) {
02437 /*
02438  **     Parameters:
02439  **             DegreesOfFreedom        degrees of freedom for new chi value
02440  **             Alpha                   confidence level for new chi value
02441  **     Operation:
02442  **             This routine allocates a new data structure which is used
02443  **             to hold a chi-squared value along with its associated
02444  **             number of degrees of freedom and alpha value.
02445  **     Return: none
02446  **     Exceptions: none
02447  **     History: Fri Aug  4 11:04:59 1989, DSJ, Created.
02448  */
02449   CHISTRUCT *NewChiStruct;
02450 
02451   NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
02452   NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
02453   NewChiStruct->Alpha = Alpha;
02454   return (NewChiStruct);
02455 
02456 }                                // NewChiStruct
02457 
02458 
02459 //---------------------------------------------------------------------------
FLOAT64
Solve (SOLVEFUNC Function,
void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
/*
 **     Parameters:
 **             Function        function whose zero is to be found
 **             FunctionParams  arbitrary data to pass to function
 **             InitialGuess    point to start solution search at
 **             Accuracy        maximum allowed error
 **     Operation:
 **             This routine attempts to find an x value at which Function
 **             goes to zero (i.e. a root of the function ).  It will only
 **             work correctly if a solution actually exists and there
 **             are no extrema between the solution and the InitialGuess.
 **             The algorithms used are extremely primitive.
 **     Return: Solution of function ( x for which f(x) = 0 ).
 **     Exceptions: none
 **     History: Fri Aug  4 11:08:59 1989, DSJ, Created.
 */
#define INITIALDELTA    0.1
#define  DELTARATIO     0.1
{
  FLOAT64 x;
  FLOAT64 f;
  FLOAT64 Slope;                 // numerically estimated derivative of f at x
  FLOAT64 Delta;                 // step used for the slope estimate
  FLOAT64 NewDelta;
  FLOAT64 xDelta;                // Newton step: f(x) / f'(x)
  FLOAT64 LastPosX, LastNegX;    // tightest x seen with f >= 0 / f < 0

  // Newton's method with a forward-difference slope; iterate until the
  // bracketing positive and negative x values are within Accuracy.
  // NOTE(review): FunctionParams is cast to CHISTRUCT* at every call, so
  // this solver is effectively specialized for ChiArea; there is also no
  // iteration limit - convergence is assumed.
  x = InitialGuess;
  Delta = INITIALDELTA;
  LastPosX = MAX_FLOAT32;
  LastNegX = -MAX_FLOAT32;
  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
  while (Abs (LastPosX - LastNegX) > Accuracy) {
    // keep track of outer bounds of current estimate
    if (f < 0)
      LastNegX = x;
    else
      LastPosX = x;

    // compute the approx. slope of f(x) at the current point
    Slope =
      ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;

    // compute the next solution guess */
    xDelta = f / Slope;
    x -= xDelta;

    // reduce the delta used for computing slope to be a fraction of
    //the amount moved to get to the new guess
    NewDelta = Abs (xDelta) * DELTARATIO;
    if (NewDelta < Delta)
      Delta = NewDelta;

    // compute the value of the function at the new guess
    f = (*Function) ((CHISTRUCT *) FunctionParams, x);
  }
  return (x);

}                                // Solve
02522 
02523 
02524 //---------------------------------------------------------------------------
02525 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x) {
02526 /*
02527  **     Parameters:
02528  **             ChiParams       contains degrees of freedom and alpha
02529  **             x               value of chi-squared to evaluate
02530  **     Operation:
02531  **             This routine computes the area under a chi density curve
02532  **             from 0 to x, minus the desired area under the curve.  The
02533  **             number of degrees of freedom of the chi curve is specified
02534  **             in the ChiParams structure.  The desired area is also
02535  **             specified in the ChiParams structure as Alpha ( or 1 minus
02536  **             the desired area ).  This routine is intended to be passed
02537  **             to the Solve() function to find the value of chi-squared
02538  **             which will yield a desired area under the right tail of
02539  **             the chi density curve.  The function will only work for
02540  **             even degrees of freedom.  The equations are based on
02541  **             integrating the chi density curve in parts to obtain
02542  **             a series that can be used to compute the area under the
02543  **             curve.
02544  **     Return: Error between actual and desired area under the chi curve.
02545  **     Exceptions: none
02546  **     History: Fri Aug  4 12:48:41 1989, DSJ, Created.
02547  */
02548   int i, N;
02549   FLOAT64 SeriesTotal;
02550   FLOAT64 Denominator;
02551   FLOAT64 PowerOfx;
02552 
02553   N = ChiParams->DegreesOfFreedom / 2 - 1;
02554   SeriesTotal = 1;
02555   Denominator = 1;
02556   PowerOfx = 1;
02557   for (i = 1; i <= N; i++) {
02558     Denominator *= 2 * i;
02559     PowerOfx *= x;
02560     SeriesTotal += PowerOfx / Denominator;
02561   }
02562   return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
02563 
02564 }                                // ChiArea
02565 
02566 
02567 //---------------------------------------------------------------------------
BOOL8
MultipleCharSamples (CLUSTERER * Clusterer,
CLUSTER * Cluster, FLOAT32 MaxIllegal)
/*
 **     Parameters:
 **             Clusterer       data structure holding cluster tree
 **             Cluster         cluster containing samples to be tested
 **             MaxIllegal      max percentage of samples allowed to have
 **                             more than 1 feature in the cluster
 **     Operation:
 **             This routine looks at all samples in the specified cluster.
 **             It computes a running estimate of the percentage of the
 **             characters which have more than 1 sample in the cluster.
 **             When this percentage exceeds MaxIllegal, TRUE is returned.
 **             Otherwise FALSE is returned.  The CharID
 **             fields must contain integers which identify the training
 **             characters which were used to generate the sample.  One
 **             integer is used for each sample.  The NumChar field in
 **             the Clusterer must contain the number of characters in the
 **             training set.  All CharID fields must be between 0 and
 **             NumChar-1.  The main function of this routine is to help
 **             identify clusters which need to be split further, i.e. if
 **             numerous training characters have 2 or more features which are
 **             contained in the same cluster, then the cluster should be
 **             split.
 **     Return: TRUE if the cluster should be split, FALSE otherwise.
 **     Exceptions: none
 **     History: Wed Aug 30 11:13:05 1989, DSJ, Created.
 **             2/22/90, DSJ, Added MaxIllegal control rather than always
 **                             splitting illegal clusters.
 */
// Flag value meaning "this character has already been counted as illegal"
// (distinct from FALSE = unseen and TRUE = seen once).
#define ILLEGAL_CHAR    2
{
  // Flag buffer is cached across calls and grown on demand; it is never
  // freed (intentional cache).  NOTE(review): this static state makes the
  // routine non-reentrant and not thread-safe.
  static BOOL8 *CharFlags = NULL;
  static inT32 NumFlags = 0;
  int i;
  LIST SearchState;
  SAMPLE *Sample;
  inT32 CharID;
  inT32 NumCharInCluster;     // running count of distinct characters seen
  inT32 NumIllegalInCluster;  // characters with >1 sample in this cluster
  FLOAT32 PercentIllegal;

  // initial estimate assumes that no illegal chars exist in the cluster
  NumCharInCluster = Cluster->SampleCount;
  NumIllegalInCluster = 0;

  // Grow the cached flag buffer if the training set got bigger.
  if (Clusterer->NumChar > NumFlags) {
    if (CharFlags != NULL)
      memfree(CharFlags);
    NumFlags = Clusterer->NumChar;
    CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
  }

  // Reset all per-character flags to "unseen".
  for (i = 0; i < NumFlags; i++)
    CharFlags[i] = FALSE;

  // find each sample in the cluster and check if we have seen it before
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample (&SearchState)) != NULL) {
    CharID = Sample->CharID;
    if (CharFlags[CharID] == FALSE) {
      // First sample from this character: just mark it as seen.
      CharFlags[CharID] = TRUE;
    }
    else {
      if (CharFlags[CharID] == TRUE) {
        // Second sample from this character: count the character as
        // illegal exactly once (further duplicates see ILLEGAL_CHAR).
        NumIllegalInCluster++;
        CharFlags[CharID] = ILLEGAL_CHAR;
      }
      // Each duplicate sample reduces the distinct-character count, so
      // NumCharInCluster converges on the number of unique characters
      // (always >= 1, so the division below is safe).
      NumCharInCluster--;
      PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
      if (PercentIllegal > MaxIllegal) {
        // Early exit: release the list-search state before returning.
        destroy(SearchState);
        return (TRUE);
      }
    }
  }
  // Search ran to completion (NextSample returned NULL); threshold never
  // exceeded, so the cluster does not need to be split.
  return (FALSE);

}                                // MultipleCharSamples
02648 
02649 // Compute the inverse of a matrix using LU decomposition with partial pivoting.
02650 // The return value is the sum of norms of the off-diagonal terms of the
02651 // product of a and inv. (A measure of the error.)
02652 double InvertMatrix(const float* input, int size, float* inv) {
02653   // Allocate memory for the 2D arrays.
02654   GENERIC_2D_ARRAY<double> U(size, size, 0.0);
02655   GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
02656   GENERIC_2D_ARRAY<double> L(size, size, 0.0);
02657 
02658   // Initialize the working matrices. U starts as input, L as I and U_inv as O.
02659   int row;
02660   int col;
02661   for (row = 0; row < size; row++) {
02662     for (col = 0; col < size; col++) {
02663       U[row][col] = input[row*size + col];
02664       L[row][col] = row == col ? 1.0 : 0.0;
02665       U_inv[row][col] = 0.0;
02666     }
02667   }
02668 
02669   // Compute forward matrix by inversion by LU decomposition of input.
02670   for (col = 0; col < size; ++col) {
02671     // Find best pivot
02672     int best_row = 0;
02673     double best_pivot = -1.0;
02674     for (row = col; row < size; ++row) {
02675       if (Abs(U[row][col]) > best_pivot) {
02676         best_pivot = Abs(U[row][col]);
02677         best_row = row;
02678       }
02679     }
02680     // Exchange pivot rows.
02681     if (best_row != col) {
02682       for (int k = 0; k < size; ++k) {
02683         double tmp = U[best_row][k];
02684         U[best_row][k] = U[col][k];
02685         U[col][k] = tmp;
02686         tmp = L[best_row][k];
02687         L[best_row][k] = L[col][k];
02688         L[col][k] = tmp;
02689       }
02690     }
02691     // Now do the pivot itself.
02692     for (row = col + 1; row < size; ++row) {
02693       double ratio = -U[row][col] / U[col][col];
02694       for (int j = col; j < size; ++j) {
02695         U[row][j] += U[col][j] * ratio;
02696       }
02697       for (int k = 0; k < size; ++k) {
02698         L[row][k] += L[col][k] * ratio;
02699       }
02700     }
02701   }
02702   // Next invert U.
02703   for (col = 0; col < size; ++col) {
02704     U_inv[col][col] = 1.0 / U[col][col];
02705     for (row = col - 1; row >= 0; --row) {
02706       double total = 0.0;
02707       for (int k = col; k > row; --k) {
02708         total += U[row][k] * U_inv[k][col];
02709       }
02710       U_inv[row][col] = -total / U[row][row];
02711     }
02712   }
02713   // Now the answer is U_inv.L.
02714   for (row = 0; row < size; row++) {
02715     for (col = 0; col < size; col++) {
02716       double sum = 0.0;
02717       for (int k = row; k < size; ++k) {
02718         sum += U_inv[row][k] * L[k][col];
02719       }
02720       inv[row*size + col] = sum;
02721     }
02722   }
02723   // Check matrix product.
02724   double error_sum = 0.0;
02725   for (row = 0; row < size; row++) {
02726     for (col = 0; col < size; col++) {
02727       double sum = 0.0;
02728       for (int k = 0; k < size; ++k) {
02729         sum += input[row*size + k] * inv[k *size + col];
02730       }
02731       if (row != col) {
02732         error_sum += Abs(sum);
02733       }
02734     }
02735   }
02736   return error_sum;
02737 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines