tesseract 3.03
/**********************************************************************
 * File:        linlsq.cpp  (Formerly llsq.c)
 * Description: Linear Least squares fitting code.
 * Author:      Ray Smith
 * Created:     Thu Sep 12 08:44:51 BST 1991
 *
 * (C) Copyright 1991, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include <stdio.h>
#include <math.h>
#include "errcode.h"
#include "linlsq.h"

const ERRCODE EMPTY_LLSQ = "Can't delete from an empty LLSQ";

/**********************************************************************
 * LLSQ::clear
 *
 * Function to initialize a LLSQ.
 **********************************************************************/

void LLSQ::clear() {            // initialize
  total_weight = 0.0;           // no elements
  sigx = 0.0;                   // update accumulators
  sigy = 0.0;
  sigxx = 0.0;
  sigxy = 0.0;
  sigyy = 0.0;
}


/**********************************************************************
 * LLSQ::add
 *
 * Add an element to the accumulator.
 **********************************************************************/

void LLSQ::add(double x, double y) {          // add an element
  total_weight++;               // count elements
  sigx += x;                    // update accumulators
  sigy += y;
  sigxx += x * x;
  sigxy += x * y;
  sigyy += y * y;
}
// Adds an element with a specified weight.
void LLSQ::add(double x, double y, double weight) {
  total_weight += weight;
  sigx += x * weight;           // update accumulators
  sigy += y * weight;
  sigxx += x * x * weight;
  sigxy += x * y * weight;
  sigyy += y * y * weight;
}
// Adds a whole LLSQ.
void LLSQ::add(const LLSQ& other) {
  total_weight += other.total_weight;
  sigx += other.sigx;           // update accumulators
  sigy += other.sigy;
  sigxx += other.sigxx;
  sigxy += other.sigxy;
  sigyy += other.sigyy;
}


/**********************************************************************
 * LLSQ::remove
 *
 * Delete an element from the accumulator.
 **********************************************************************/

void LLSQ::remove(double x, double y) {       // delete an element
  if (total_weight <= 0.0)      // illegal
    EMPTY_LLSQ.error("LLSQ::remove", ABORT, NULL);
  total_weight--;               // count elements
  sigx -= x;                    // update accumulators
  sigy -= y;
  sigxx -= x * x;
  sigxy -= x * y;
  sigyy -= y * y;
}
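// --- Illustrative sketch, not part of the original file ---
// The members above are running raw moments: with W = total_weight, the x
// mean is sigx / W and Var(x) = sigxx / W - (sigx / W) * (sigx / W), so the
// fit statistics never need a second pass over the points. The hypothetical
// helper below assumes only the LLSQ interface declared in linlsq.h, and
// shows that a weight of 2.0 behaves exactly like adding the same point twice.
static void llsq_accumulator_sketch() {
  LLSQ weighted;
  weighted.clear();
  weighted.add(1.0, 2.0);                // unit weight
  weighted.add(3.0, 5.0, 2.0);           // weighted element
  LLSQ repeated;
  repeated.clear();
  repeated.add(1.0, 2.0);
  repeated.add(3.0, 5.0);                // same point entered twice
  repeated.add(3.0, 5.0);
  // Both accumulators now hold identical sums, so their means agree:
  printf("means: (%g, %g) vs (%g, %g)\n",
         weighted.mean_point().x(), weighted.mean_point().y(),
         repeated.mean_point().x(), repeated.mean_point().y());
}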
/**********************************************************************
 * LLSQ::m
 *
 * Return the gradient of the line fit.
 **********************************************************************/

double LLSQ::m() const {        // get gradient
  double covar = covariance();
  double x_var = x_variance();
  if (x_var != 0.0)
    return covar / x_var;
  else
    return 0.0;                 // too little
}


/**********************************************************************
 * LLSQ::c
 *
 * Return the constant of the line fit.
 **********************************************************************/

double LLSQ::c(double m) const {          // get constant
  if (total_weight > 0.0)
    return (sigy - m * sigx) / total_weight;
  else
    return 0;                   // too little
}


/**********************************************************************
 * LLSQ::rms
 *
 * Return the rms error of the fit.
 **********************************************************************/

double LLSQ::rms(double m, double c) const {  // get error
  double error;                 // total error

  if (total_weight > 0) {
    error = sigyy + m * (m * sigxx + 2 * (c * sigx - sigxy)) + c *
            (total_weight * c - 2 * sigy);
    if (error >= 0)
      error = sqrt(error / total_weight);  // sqrt of mean
    else
      error = 0;
  } else {
    error = 0;                  // too little
  }
  return error;
}


/**********************************************************************
 * LLSQ::pearson
 *
 * Return the Pearson product moment correlation coefficient.
 **********************************************************************/

double LLSQ::pearson() const {  // get correlation
  double r = 0.0;               // Correlation is 0 if insufficient data.

  double covar = covariance();
  if (covar != 0.0) {
    double var_product = x_variance() * y_variance();
    if (var_product > 0.0)
      r = covar / sqrt(var_product);
  }
  return r;
}

// Returns the x,y means as an FCOORD.
FCOORD LLSQ::mean_point() const {
  if (total_weight > 0.0) {
    return FCOORD(sigx / total_weight, sigy / total_weight);
  } else {
    return FCOORD(0.0f, 0.0f);
  }
}
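// --- Illustrative sketch, not part of the original file ---
// rms(m, c) above avoids revisiting the data: expanding
// sum(w * (y - m*x - c)^2) term by term gives
//   sigyy + m * (m * sigxx + 2 * (c * sigx - sigxy)) + c * (W * c - 2 * sigy)
// with W = total_weight, which is exactly the expression in the code. The
// hypothetical helper below, on made-up points, compares that closed form
// against an explicit residual loop.
static void llsq_rms_sketch() {
  const double xs[] = { 0.0, 1.0, 2.0, 3.0 };
  const double ys[] = { 1.1, 2.9, 5.2, 6.8 };
  const int n = 4;
  LLSQ fit;
  fit.clear();
  for (int i = 0; i < n; ++i)
    fit.add(xs[i], ys[i]);
  double m = fit.m();           // gradient of the fitted line
  double c = fit.c(m);          // intercept given that gradient
  double direct = 0.0;          // explicit sum of squared residuals
  for (int i = 0; i < n; ++i) {
    double r = ys[i] - m * xs[i] - c;
    direct += r * r;
  }
  printf("rms: closed form %g, direct %g\n",
         fit.rms(m, c), sqrt(direct / n));
}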
// Returns the sqrt of the mean squared error measured perpendicular from the
// line through mean_point() in the direction dir.
//
// Derivation:
//   Lemma:  Let v and x_i (i=1..N) be k-dimensional vectors (1xk matrices).
//     Let % be dot product and ' be transpose.  Note that:
//       Sum[i=1..N] (v % x_i)^2
//         = v * [x_1' x_2' ... x_N'] * [x_1' x_2' ... x_N']' * v'
//     If the x_i have average 0 we have:
//         = v * (N * COVARIANCE_MATRIX(X)) * v'
//     Expanded for the case that k = 2, where we treat the dimensions
//     as x_i and y_i, this is:
//         = v * (N * [VAR(X), COV(X,Y); COV(X,Y), VAR(Y)]) * v'
//   Now, we are trying to calculate the mean squared error, where v is
//   perpendicular to our line of interest:
//     Mean squared error
//       = E [ (v % (x_i - x_avg))^2 ]
//       = Sum (v % (x_i - x_avg))^2 / N
//       = v * N * [VAR(X), COV(X,Y); COV(X,Y), VAR(Y)] / N * v'
//       = v * [VAR(X), COV(X,Y); COV(X,Y), VAR(Y)] * v'
//       = code below
double LLSQ::rms_orth(const FCOORD &dir) const {
  FCOORD v = !dir;
  v.normalise();
  return sqrt(v.x() * v.x() * x_variance() +
              2 * v.x() * v.y() * covariance() +
              v.y() * v.y() * y_variance());
}
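// --- Illustrative sketch, not part of the original file ---
// rms_orth(dir) is therefore the root mean squared perpendicular distance
// from the line through mean_point() along dir, obtained from the variances
// alone. The hypothetical helper below, on made-up points, measures the same
// quantity explicitly with the scalar cross product (Pi - mean) x dir.
static void llsq_rms_orth_sketch() {
  const double xs[] = { 0.0, 1.0, 2.0, 3.0 };
  const double ys[] = { 0.5, 2.4, 4.6, 6.4 };
  const int n = 4;
  LLSQ fit;
  fit.clear();
  for (int i = 0; i < n; ++i)
    fit.add(xs[i], ys[i]);
  FCOORD dir(1.0f, 2.0f);       // any direction of interest
  dir.normalise();
  FCOORD mean = fit.mean_point();
  double sum_sq = 0.0;          // explicit squared perpendicular distances
  for (int i = 0; i < n; ++i) {
    double cross = (xs[i] - mean.x()) * dir.y() - (ys[i] - mean.y()) * dir.x();
    sum_sq += cross * cross;
  }
  printf("orth rms: %g vs %g\n", fit.rms_orth(dir), sqrt(sum_sq / n));
}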
// Returns the direction of the fitted line as a unit vector, using the
// least mean squared perpendicular distance. The line runs through the
// mean_point, i.e. a point p on the line is given by:
// p = mean_point() + lambda * vector_fit() for some real number lambda.
// Note that the result (0<=x<=1, -1<=y<=1) is directionally ambiguous
// and may be negated without changing its meaning.
// Fitting a line m + λv to a set of N points Pi = (xi, yi), where
// m is the mean point (x̄, ȳ) and
// v is the direction vector (cosθ, sinθ).
// The perpendicular distance of each Pi from the line is:
// (Pi - m) x v, where x is the scalar cross product.
// Total squared error is thus:
// E = ∑((xi - x̄)sinθ - (yi - ȳ)cosθ)²
//   = ∑(xi - x̄)²sin²θ - 2∑(xi - x̄)(yi - ȳ)sinθ cosθ + ∑(yi - ȳ)²cos²θ
//   = N Var(xi) sin²θ - 2N Covar(xi, yi) sinθ cosθ + N Var(yi) cos²θ     (Eq 1)
// where Var(xi) is the variance of xi,
// and Covar(xi, yi) is the covariance of xi, yi.
// Taking the derivative wrt θ and setting it to 0 to obtain the min/max:
// 0 = 2N Var(xi) sinθ cosθ - 2N Covar(xi, yi)(cos²θ - sin²θ) - 2N Var(yi) sinθ cosθ
// => Covar(xi, yi)(cos²θ - sin²θ) = (Var(xi) - Var(yi)) sinθ cosθ
// Using double angles:
// 2 Covar(xi, yi) cos2θ = (Var(xi) - Var(yi)) sin2θ     (Eq 2)
// So θ = 0.5 atan2(2 Covar(xi, yi), Var(xi) - Var(yi))     (Eq 3)
//
// Because it involves 2θ, Eq 2 has 2 solutions 90 degrees apart, but which
// is the min and which is the max? From Eq 1:
// E/N = Var(xi)sin²θ - 2Covar(xi, yi)sinθ cosθ + Var(yi)cos²θ
// and 90 degrees away, using sin/cos equivalences:
// E'/N = Var(xi)cos²θ + 2Covar(xi, yi)sinθ cosθ + Var(yi)sin²θ
// The second error is smaller (making it the minimum) iff
// E'/N < E/N, i.e.:
// (Var(xi) - Var(yi))(cos²θ - sin²θ) < -4 Covar(xi, yi) sinθ cosθ
// Using double angles:
// (Var(xi) - Var(yi)) cos2θ < -2 Covar(xi, yi) sin2θ     (InEq 1)
// But atan2(2 Covar(xi, yi), Var(xi) - Var(yi)) picks 2θ such that:
// sgn(cos2θ) = sgn(Var(xi) - Var(yi)) and sgn(sin2θ) = sgn(Covar(xi, yi)),
// so InEq 1 can *never* be true, making the atan2 result *always* the min!
// In the degenerate case, where Covar(xi, yi) = 0 AND Var(xi) = Var(yi),
// the 2 solutions have equal error and the inequality is still false.
// Therefore the solution really is as trivial as Eq 3.
//
// This is equivalent to returning the Principal Component in PCA, or the
// eigenvector corresponding to the largest eigenvalue in the covariance
// matrix. However, atan2 is much simpler! The one reference I found that
// uses this formula is http://web.mit.edu/18.06/www/Essays/tlsfit.pdf but
// that is still a much more complex derivation. It seems Pearson had already
// found this simple solution in 1901.
// http://books.google.com/books?id=WXwvAQAAIAAJ&pg=PA559
FCOORD LLSQ::vector_fit() const {
  double x_var = x_variance();
  double y_var = y_variance();
  double covar = covariance();
  double theta = 0.5 * atan2(2.0 * covar, x_var - y_var);
  FCOORD result(cos(theta), sin(theta));
  return result;
}
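// --- Illustrative sketch, not part of the original file ---
// vector_fit() returns the total-least-squares direction of Eq 3: among all
// lines through mean_point(), the one along this vector has the smallest
// perpendicular rms error. The hypothetical helper below, on made-up points,
// spot-checks that against a sweep of arbitrary directions.
static void llsq_vector_fit_sketch() {
  const double kPi = 3.14159265358979323846;
  LLSQ fit;
  fit.clear();
  fit.add(0.0, 0.2);
  fit.add(1.0, 1.1);
  fit.add(2.0, 1.8);
  fit.add(3.0, 3.1);
  FCOORD best = fit.vector_fit();
  printf("fitted direction (%g, %g), orth rms %g\n",
         best.x(), best.y(), fit.rms_orth(best));
  for (int deg = 0; deg < 180; deg += 30) {
    double theta = deg * kPi / 180.0;
    FCOORD trial(cos(theta), sin(theta));
    // No trial direction should beat the fitted one:
    printf("  %3d deg: orth rms %g\n", deg, fit.rms_orth(trial));
  }
}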