tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccstruct/linlsq.cpp
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        linlsq.cpp  (Formerly llsq.c)
00003  * Description: Linear Least squares fitting code.
00004  * Author:              Ray Smith
00005  * Created:             Thu Sep 12 08:44:51 BST 1991
00006  *
00007  * (C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #include          <stdio.h>
00021 #include          <math.h>
00022 #include          "errcode.h"
00023 #include          "linlsq.h"
00024 
00025 const ERRCODE EMPTY_LLSQ = "Can't delete from an empty LLSQ";
00026 
00027 /**********************************************************************
00028  * LLSQ::clear
00029  *
00030  * Function to initialize a LLSQ.
00031  **********************************************************************/
00032 
00033 void LLSQ::clear() {  // initialize
00034   total_weight = 0.0;                         // no elements
00035   sigx = 0.0;                      // update accumulators
00036   sigy = 0.0;
00037   sigxx = 0.0;
00038   sigxy = 0.0;
00039   sigyy = 0.0;
00040 }
00041 
00042 
00043 /**********************************************************************
00044  * LLSQ::add
00045  *
00046  * Add an element to the accumulator.
00047  **********************************************************************/
00048 
00049 void LLSQ::add(double x, double y) {          // add an element
00050   total_weight++;                           // count elements
00051   sigx += x;                     // update accumulators
00052   sigy += y;
00053   sigxx += x * x;
00054   sigxy += x * y;
00055   sigyy += y * y;
00056 }
00057 // Adds an element with a specified weight.
00058 void LLSQ::add(double x, double y, double weight) {
00059   total_weight += weight;
00060   sigx += x * weight;                     // update accumulators
00061   sigy += y * weight;
00062   sigxx += x * x * weight;
00063   sigxy += x * y * weight;
00064   sigyy += y * y * weight;
00065 }
00066 // Adds a whole LLSQ.
00067 void LLSQ::add(const LLSQ& other) {
00068   total_weight += other.total_weight;
00069   sigx += other.sigx;                     // update accumulators
00070   sigy += other.sigy;
00071   sigxx += other.sigxx;
00072   sigxy += other.sigxy;
00073   sigyy += other.sigyy;
00074 }
00075 
00076 
00077 /**********************************************************************
00078  * LLSQ::remove
00079  *
00080  * Delete an element from the accumulator.
00081  **********************************************************************/
00082 
00083 void LLSQ::remove(double x, double y) {          // delete an element
00084   if (total_weight <= 0.0)                       // illegal
00085     EMPTY_LLSQ.error("LLSQ::remove", ABORT, NULL);
00086   total_weight--;                           // count elements
00087   sigx -= x;                     // update accumulators
00088   sigy -= y;
00089   sigxx -= x * x;
00090   sigxy -= x * y;
00091   sigyy -= y * y;
00092 }
00093 
00094 
00095 /**********************************************************************
00096  * LLSQ::m
00097  *
00098  * Return the gradient of the line fit.
00099  **********************************************************************/
00100 
00101 double LLSQ::m() const {  // get gradient
00102   double covar = covariance();
00103   double x_var = x_variance();
00104   if (x_var != 0.0)
00105     return covar / x_var;
00106   else
00107     return 0.0;                    // too little
00108 }
00109 
00110 
00111 /**********************************************************************
00112  * LLSQ::c
00113  *
00114  * Return the constant of the line fit.
00115  **********************************************************************/
00116 
00117 double LLSQ::c(double m) const {          // get constant
00118   if (total_weight > 0.0)
00119     return (sigy - m * sigx) / total_weight;
00120   else
00121     return 0;                    // too little
00122 }
00123 
00124 
00125 /**********************************************************************
00126  * LLSQ::rms
00127  *
00128  * Return the rms error of the fit.
00129  **********************************************************************/
00130 
00131 double LLSQ::rms(double m,  double c) const {          // get error
00132   double error;                  // total error
00133 
00134   if (total_weight > 0) {
00135     error = sigyy + m * (m * sigxx + 2 * (c * sigx - sigxy)) + c *
00136             (total_weight * c - 2 * sigy);
00137     if (error >= 0)
00138       error = sqrt(error / total_weight);  // sqrt of mean
00139     else
00140       error = 0;
00141   } else {
00142     error = 0;                   // too little
00143   }
00144   return error;
00145 }
00146 
00147 
00148 /**********************************************************************
00149  * LLSQ::pearson
00150  *
00151  * Return the pearson product moment correlation coefficient.
00152  **********************************************************************/
00153 
00154 double LLSQ::pearson() const {  // get correlation
00155   double r = 0.0;                  // Correlation is 0 if insufficent data.
00156 
00157   double covar = covariance();
00158   if (covar != 0.0) {
00159     double var_product = x_variance()  * y_variance();
00160     if (var_product > 0.0)
00161       r = covar / sqrt(var_product);
00162   }
00163   return r;
00164 }
00165 
00166 // Returns the x,y means as an FCOORD.
00167 FCOORD LLSQ::mean_point() const {
00168   if (total_weight > 0.0) {
00169     return FCOORD(sigx / total_weight, sigy / total_weight);
00170   } else {
00171     return FCOORD(0.0f, 0.0f);
00172   }
00173 }
00174 
00175 // Returns the sqrt of the mean squared error measured perpendicular from the
00176 // line through mean_point() in the direction dir.
00177 //
00178 // Derivation:
00179 //   Lemma:  Let v and x_i (i=1..N) be a k-dimensional vectors (1xk matrices).
00180 //     Let % be dot product and ' be transpose.  Note that:
00181 //      Sum[i=1..N] (v % x_i)^2
00182 //         = v * [x_1' x_2' ... x_N'] * [x_1' x_2' .. x_N']' * v'
00183 //     If x_i have average 0 we have:
00184 //       = v * (N * COVARIANCE_MATRIX(X)) * v'
00185 //     Expanded for the case that k = 2, where we treat the dimensions
00186 //     as x_i and y_i, this is:
00187 //       = v * (N * [VAR(X), COV(X,Y); COV(X,Y) VAR(Y)]) * v'
00188 //  Now, we are trying to calculate the mean squared error, where v is
00189 //  perpendicular to our line of interest:
00190 //    Mean squared error
00191 //      = E [ (v % (x_i - x_avg))) ^2 ]
00192 //      = Sum (v % (x_i - x_avg))^2 / N
00193 //      = v * N * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] / N * v'
00194 //      = v * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] * v'
00195 //      = code below
00196 double LLSQ::rms_orth(const FCOORD &dir) const {
00197   FCOORD v = !dir;
00198   v.normalise();
00199   return sqrt(v.x() * v.x() * x_variance() +
00200               2 * v.x() * v.y() * covariance() +
00201               v.y() * v.y() * y_variance());
00202 }
00203 
00204 // Returns the direction of the fitted line as a unit vector, using the
00205 // least mean squared perpendicular distance. The line runs through the
00206 // mean_point, i.e. a point p on the line is given by:
00207 // p = mean_point() + lambda * vector_fit() for some real number lambda.
00208 // Note that the result (0<=x<=1, -1<=y<=1) is directionally ambiguous
00209 // and may be negated without changing its meaning.
00210 // Fitting a line m + λv to a set of N points Pi = (xi, yi), where
00211 // m is the mean point (μ, ν) and
00212 // v is the direction vector (cosθ, sinθ)
00213 // The perpendicular distance of each Pi from the line is:
00214 // (Pi - m) x v, where x is the scalar cross product.
00215 // Total squared error is thus:
00216 // E = ∑((xi - μ)sinθ - (yi - ν)cosθ)²
00217 //   = ∑(xi - μ)²sin²θ - 2∑(xi - μ)(yi - ν)sinθ cosθ + ∑(yi - ν)²cos²θ
00218 //   = NVar(xi)sin²θ - 2NCovar(xi, yi)sinθ cosθ + NVar(yi)cos²θ   (Eq 1)
00219 // where Var(xi) is the variance of xi,
00220 // and Covar(xi, yi) is the covariance of xi, yi.
00221 // Taking the derivative wrt θ and setting to 0 to obtain the min/max:
00222 // 0 = 2NVar(xi)sinθ cosθ - 2NCovar(xi, yi)(cos²θ - sin²θ) - 2NVar(yi)sinθ cosθ
00223 // => Covar(xi, yi)(cos²θ - sin²θ) = (Var(xi) - Var(yi))sinθ cosθ
00224 // Using double angles:
00225 // 2Covar(xi, yi)cos2θ = (Var(xi) - Var(yi))sin2θ   (Eq 2)
00226 // So θ = 0.5 atan2(2Covar(xi, yi), Var(xi) - Var(yi)) (Eq 3)
00227 
00228 // Because it involves 2θ, Eq 2 has 2 solutions 90 degrees apart, but which
00229 // is the min and which is the max? From Eq1:
00230 // E/N = Var(xi)sin²θ - 2Covar(xi, yi)sinθ cosθ + Var(yi)cos²θ
00231 // and 90 degrees away, using sin/cos equivalences:
00232 // E'/N = Var(xi)cos²θ + 2Covar(xi, yi)sinθ cosθ + Var(yi)sin²θ
00233 // The second error is smaller (making it the minimum) iff
00234 // E'/N < E/N ie:
00235 // (Var(xi) - Var(yi))(cos²θ - sin²θ) < -4Covar(xi, yi)sinθ cosθ
00236 // Using double angles:
00237 // (Var(xi) - Var(yi))cos2θ < -2Covar(xi, yi)sin2θ  (InEq 1)
00238 // But atan2(2Covar(xi, yi), Var(xi) - Var(yi)) picks 2θ such that:
00239 // sgn(cos2θ) = sgn(Var(xi) - Var(yi)) and sgn(sin2θ) = sgn(Covar(xi, yi))
00240 // so InEq1 can *never* be true, making the atan2 result *always* the min!
00241 // In the degenerate case, where Covar(xi, yi) = 0 AND Var(xi) = Var(yi),
00242 // the 2 solutions have equal error and the inequality is still false.
00243 // Therefore the solution really is as trivial as Eq 3.
00244 
00245 // This is equivalent to returning the Principal Component in PCA, or the
00246 // eigenvector corresponding to the largest eigenvalue in the covariance
00247 // matrix.  However, atan2 is much simpler! The one reference I found that
00248 // uses this formula is http://web.mit.edu/18.06/www/Essays/tlsfit.pdf but
00249 // that is still a much more complex derivation. It seems Pearson had already
00250 // found this simple solution in 1901.
00251 // http://books.google.com/books?id=WXwvAQAAIAAJ&pg=PA559
00252 FCOORD LLSQ::vector_fit() const {
00253   double x_var = x_variance();
00254   double y_var = y_variance();
00255   double covar = covariance();
00256   double theta = 0.5 * atan2(2.0 * covar, x_var - y_var);
00257   FCOORD result(cos(theta), sin(theta));
00258   return result;
00259 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines