tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccstruct/linlsq.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        linlsq.h  (Formerly llsq.h)
00003  * Description: Linear Least squares fitting code.
00004  * Author:              Ray Smith
00005  * Created:             Thu Sep 12 08:44:51 BST 1991
00006  *
00007  * (C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifndef TESSERACT_CCSTRUCT_LINLSQ_H_
00021 #define TESSERACT_CCSTRUCT_LINLSQ_H_
00022 
00023 #include "points.h"
00024 #include "params.h"
00025 
00026 class LLSQ {
00027  public:
00028   LLSQ() {  // constructor
00029     clear();  // set to zeros
00030   }
00031   void clear();  // initialize
00032 
00033   // Adds an element with a weight of 1.
00034   void add(double x, double y);
00035   // Adds an element with a specified weight.
00036   void add(double x, double y, double weight);
00037   // Adds a whole LLSQ.
00038   void add(const LLSQ& other);
00039   // Deletes an element with a weight of 1.
00040   void remove(double x, double y);
00041   inT32 count() const {  // no of elements
00042     return static_cast<int>(total_weight + 0.5);
00043   }
00044 
00045   double m() const;  // get gradient
00046   double c(double m) const;            // get constant
00047   double rms(double m, double c) const;            // get error
00048   double pearson() const;  // get correlation coefficient.
00049 
00050   // Returns the x,y means as an FCOORD.
00051   FCOORD mean_point() const;
00052 
00053   // Returns the average sum of squared perpendicular error from a line
00054   // through mean_point() in the direction dir.
00055   double rms_orth(const FCOORD &dir) const;
00056 
00057   // Returns the direction of the fitted line as a unit vector, using the
00058   // least mean squared perpendicular distance. The line runs through the
00059   // mean_point, i.e. a point p on the line is given by:
00060   // p = mean_point() + lambda * vector_fit() for some real number lambda.
00061   // Note that the result (0<=x<=1, -1<=y<=-1) is directionally ambiguous
00062   // and may be negated without changing its meaning, since a line is only
00063   // unique to a range of pi radians.
00064   // Modernists prefer to think of this as an Eigenvalue problem, but
00065   // Pearson had the simple solution in 1901.
00066   //
00067   // Note that this is equivalent to returning the Principal Component in PCA,
00068   // or the eigenvector corresponding to the largest eigenvalue in the
00069   // covariance matrix.
00070   FCOORD vector_fit() const;
00071 
00072   // Returns the covariance.
00073   double covariance() const {
00074     if (total_weight > 0.0)
00075       return (sigxy - sigx * sigy / total_weight) / total_weight;
00076     else
00077       return 0.0;
00078   }
00079   double x_variance() const {
00080     if (total_weight > 0.0)
00081       return (sigxx - sigx * sigx / total_weight) / total_weight;
00082     else
00083       return 0.0;
00084   }
00085   double y_variance() const {
00086     if (total_weight > 0.0)
00087       return (sigyy - sigy * sigy / total_weight) / total_weight;
00088     else
00089       return 0.0;
00090   }
00091 
00092  private:
00093   double total_weight;         // no of elements or sum of weights.
00094   double sigx;                 // sum of x
00095   double sigy;                 // sum of y
00096   double sigxx;                // sum x squared
00097   double sigxy;                // sum of xy
00098   double sigyy;                // sum y squared
00099 };
00100 
00101 
00102 // Returns the median value of the vector, given that the values are
00103 // circular, with the given modulus. Values may be signed or unsigned,
00104 // eg range from -pi to pi (modulus 2pi) or from 0 to 2pi (modulus 2pi).
00105 // NOTE that the array is shuffled, but the time taken is linear.
00106 // An assumption is made that most of the values are spread over no more than
00107 // half the range, but wrap-around is accounted for if the median is near
00108 // the wrap-around point.
00109 // Cannot be a member of GenericVector, as it makes heavy used of LLSQ.
00110 // T must be an integer or float/double type.
00111 template<typename T> T MedianOfCircularValues(T modulus, GenericVector<T>* v) {
00112   LLSQ stats;
00113   T halfrange = static_cast<T>(modulus / 2);
00114   int num_elements = v->size();
00115   for (int i = 0; i < num_elements; ++i) {
00116     stats.add((*v)[i], (*v)[i] + halfrange);
00117   }
00118   bool offset_needed = stats.y_variance() < stats.x_variance();
00119   if (offset_needed) {
00120     for (int i = 0; i < num_elements; ++i) {
00121       (*v)[i] += halfrange;
00122     }
00123   }
00124   int median_index = v->choose_nth_item(num_elements / 2);
00125   if (offset_needed) {
00126     for (int i = 0; i < num_elements; ++i) {
00127       (*v)[i] -= halfrange;
00128     }
00129   }
00130   return (*v)[median_index];
00131 }
00132 
00133 
00134 #endif  // TESSERACT_CCSTRUCT_LINLSQ_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines