tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/pageiterator.h
Go to the documentation of this file.
00001 
00002 // File:        pageiterator.h
00003 // Description: Iterator for tesseract page structure that avoids using
00004 //              tesseract internal data structures.
00005 // Author:      Ray Smith
00006 // Created:     Fri Feb 26 11:01:06 PST 2010
00007 //
00008 // (C) Copyright 2010, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #ifndef TESSERACT_CCMAIN_PAGEITERATOR_H__
00022 #define TESSERACT_CCMAIN_PAGEITERATOR_H__
00023 
00024 #include "publictypes.h"
00025 #include "platform.h"
00026 
// Forward declarations of types defined elsewhere. This header only ever uses
// them through pointers (see the PageIterator parameters, return types and
// members), so the full definitions are not needed here.
// NOTE(review): Pix and Pta are presumably the Leptonica image and
// point-array types -- confirm against the Leptonica headers.
00027 struct BlamerBundle;
00028 class C_BLOB_IT;
00029 class PAGE_RES;
00030 class PAGE_RES_IT;
00031 class WERD;
00032 struct Pix;
00033 struct Pta;
00034 
00035 namespace tesseract {
00036 
// Forward declaration: PageIterator only stores and accepts a Tesseract*
// (constructor argument and tesseract_ member), so no definition is required.
00037 class Tesseract;
00038 
// PageIterator: iterator over the tesseract page structure. Per this file's
// header comment it is meant to avoid using tesseract internal data
// structures; accordingly every internal type used below (PAGE_RES,
// PAGE_RES_IT, WERD, C_BLOB_IT, Tesseract, Pix, Pta, BlamerBundle) appears
// only as a pointer to a forward-declared type.
// The destructor and most of the navigation methods are declared virtual, so
// the class can be used as a polymorphic base.
// Only declarations are visible in this header. Wherever a comment below says
// "presumably", the behavior is inferred from names alone and must be
// confirmed against the implementation file.
00052 class TESS_API PageIterator {
00053  public:
        // Constructs an iterator from a PAGE_RES and a Tesseract pointer, plus a
        // scale, a scaled y-resolution and a rectangle (left, top, width,
        // height). The eight arguments correspond one-to-one to the protected
        // members page_res_, tesseract_, scale_, scaled_yres_ and rect_*_ below.
        // NOTE(review): ownership of page_res and tesseract is not visible here;
        // presumably they are borrowed, not owned -- confirm.
        // NOTE(review): the rectangle is presumably the image rectangle set in
        // the API (see the coordinate-system comment further down) -- confirm.
00068   PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
00069                int scale, int scaled_yres,
00070                int rect_left, int rect_top,
00071                int rect_width, int rect_height);
        // Virtual destructor: allows deletion through a PageIterator pointer.
00072   virtual ~PageIterator();
00073 
        // User-declared copy constructor and copy assignment.
        // NOTE(review): it_ and cblob_it_ are raw pointer members, so these
        // presumably make their own copies of the pointed-to iterators -- confirm.
        // NOTE(review): operator= returns a const reference rather than the
        // conventional non-const PageIterator&; changing it would alter the
        // public interface, so it is only flagged here.
00080   PageIterator(const PageIterator& src);
00081   const PageIterator& operator=(const PageIterator& src);
00082 
        // Takes a PAGE_RES_IT pointer and returns a bool; presumably true when
        // this iterator and *other are positioned at the same word -- confirm.
00084   bool PositionedAtSameWord(const PAGE_RES_IT* other) const;
00085 
00086   // ============= Moving around within the page ============.
00087 
        // Presumably moves the iterator to the start of the page -- confirm.
        // Virtual, so subclasses may override.
00092   virtual void Begin();
00093 
        // Presumably moves the iterator back to the beginning of the current
        // paragraph -- confirm. Virtual, so subclasses may override.
00099   virtual void RestartParagraph();
00100 
        // Const query returning a bool; presumably true when the iterator points
        // somewhere in the first text line of a paragraph -- confirm.
00105   bool IsWithinFirstTextlineOfParagraph() const;
00106 
        // Presumably moves the iterator back to the beginning of the current
        // row (text line) -- confirm. Virtual, so subclasses may override.
00112   virtual void RestartRow();
00113 
        // Advances the iterator with respect to the given PageIteratorLevel and
        // returns a bool.
        // NOTE(review): presumably moves to the next object at `level` and
        // returns false once the end of the page is reached -- confirm.
00125   virtual bool Next(PageIteratorLevel level);
00126 
        // Const query; presumably true when the iterator is at the start of an
        // object at the given level -- confirm.
00140   virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
00141 
        // Const query taking two levels; presumably true when the current
        // `element` is the last one within its enclosing `level` (for example
        // the last word in a line) -- confirm.
00158   virtual bool IsAtFinalElement(PageIteratorLevel level,
00159                                 PageIteratorLevel element) const;
00160 
        // Compares this iterator with `other` and returns an int.
        // NOTE(review): presumably negative/zero/positive for before/same/after
        // positions -- confirm the exact return values.
00167   int Cmp(const PageIterator &other) const;
00168 
00169   // ============= Accessing data ==============.
00170   // Coordinate system:
00171   // Integer coordinates are at the cracks between the pixels.
00172   // The top-left corner of the top-left pixel in the image is at (0,0).
00173   // The bottom-right corner of the bottom-right pixel in the image is at
00174   // (width, height).
00175   // Every bounding box goes from the top-left of the top-left contained
00176   // pixel to the bottom-right of the bottom-right contained pixel, so
00177   // the bounding box of the single top-left pixel in the image is:
00178   // (0,0)->(1,1).
00179   // If an image rectangle has been set in the API, then returned coordinates
00180   // relate to the original (full) image, rather than the rectangle.
00181 
        // Two overloads that report a bounding box for the object at `level`
        // through the four int out-pointers; the second overload additionally
        // takes a `padding` value. Coordinates follow the convention described
        // above.
        // NOTE(review): the bool result presumably signals whether an object
        // exists at the current position -- confirm.
00191   bool BoundingBox(PageIteratorLevel level,
00192                    int* left, int* top, int* right, int* bottom) const;
00193   bool BoundingBox(PageIteratorLevel level, const int padding,
00194                    int* left, int* top, int* right, int* bottom) const;
        // Same parameter list as the first BoundingBox overload.
        // NOTE(review): presumably reports the box in the internal coordinate
        // system (relative to rect_left_/rect_top_ and scaled by scale_) rather
        // than in original-image coordinates -- confirm.
00200   bool BoundingBoxInternal(PageIteratorLevel level,
00201                            int* left, int* top, int* right, int* bottom) const;
00202 
        // Const query; presumably true when there is no object at the given
        // level -- confirm.
00204   bool Empty(PageIteratorLevel level) const;
00205 
        // Returns a PolyBlockType, presumably the type of the current block.
        // NOTE(review): PolyBlockType is presumably declared in publictypes.h --
        // confirm.
00210   PolyBlockType BlockType() const;
00211 
        // Returns a Pta*, presumably the polygon outline of the current block.
        // NOTE(review): the caller presumably owns the returned Pta and must
        // release it; a null return may be possible -- confirm both.
00219   Pta* BlockPolygon() const;
00220 
        // Returns a Pix*, presumably a binary image of the object at `level`.
        // NOTE(review): the caller presumably owns the returned Pix -- confirm.
00227   Pix* GetBinaryImage(PageIteratorLevel level) const;
00228 
        // Returns a Pix* for the object at `level`, taking a `padding` value and
        // reporting a position through the `left`/`top` out-pointers.
        // NOTE(review): presumably `padding` enlarges the extracted region and
        // (left, top) is the top-left corner of the returned image; the caller
        // presumably owns the returned Pix -- confirm.
00239   Pix* GetImage(PageIteratorLevel level, int padding,
00240                 int* left, int* top) const;
00241 
        // Reports a baseline for the object at `level` as two points,
        // (x1, y1) and (x2, y2), through the out-pointers.
        // NOTE(review): the bool result presumably signals whether a baseline is
        // available at the current position -- confirm.
00248   bool Baseline(PageIteratorLevel level,
00249                 int* x1, int* y1, int* x2, int* y2) const;
00250 
        // Reports orientation information through four out-pointers: the
        // orientation, writing direction, text-line order and a deskew angle.
        // NOTE(review): the units and valid range of deskew_angle are not
        // visible here (presumably radians) -- confirm.
00259   void Orientation(tesseract::Orientation *orientation,
00260                    tesseract::WritingDirection *writing_direction,
00261                    tesseract::TextlineOrder *textline_order,
00262                    float *deskew_angle) const;
00263 
        // Reports information about the current paragraph through four
        // out-pointers: justification, whether it is a list item, whether it is
        // a "crown", and the indent of its first line.
        // NOTE(review): the meaning of "crown" and the units of
        // first_line_indent are not visible here -- confirm.
00292   void ParagraphInfo(tesseract::ParagraphJustification *justification,
00293                      bool *is_list_item,
00294                      bool *is_crown,
00295                      int *first_line_indent) const;
00296 
00297   // If the current WERD_RES (it_->word()) is not NULL, sets the BlamerBundle
00298   // of the current word to the given pointer (takes ownership of the pointer)
00299   // and returns true.
00300   // Can only be used when iterating on the word level.
00301   bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
00302 
00303  protected:
        // Helper taking an int offset.
        // NOTE(review): presumably prepares the per-word state (word_,
        // word_length_, blob_index_, cblob_it_) for a new word and then moves to
        // the blob at `offset` -- confirm.
        // NOTE(review): TESS_LOCAL is presumably a visibility macro from
        // platform.h that keeps this symbol out of the exported API -- confirm.
00308   TESS_LOCAL void BeginWord(int offset);
00309 
        // The page results being iterated; set from the constructor argument.
        // NOTE(review): presumably not owned by this object -- confirm.
00311   PAGE_RES* page_res_;
        // The Tesseract instance passed to the constructor.
        // NOTE(review): presumably not owned by this object -- confirm.
00313   Tesseract* tesseract_;
        // Iterator over page_res_; SetWordBlamerBundle's comment refers to the
        // current word as it_->word(). Held by pointer, which keeps the
        // PAGE_RES_IT definition out of this header.
        // NOTE(review): presumably owned by this object -- confirm.
00318   PAGE_RES_IT* it_;
        // NOTE(review): presumably the current input WERD being iterated, and
        // possibly null in some modes -- confirm.
00323   WERD* word_;
        // NOTE(review): presumably the length of the current word -- confirm
        // what unit (blobs/symbols) is counted.
00325   int word_length_;
        // NOTE(review): presumably the index of the current blob within the
        // current word -- confirm.
00327   int blob_index_;
        // NOTE(review): presumably an iterator over the blobs of the current
        // word, owned by this object and possibly null -- confirm.
00333   C_BLOB_IT* cblob_it_;
        // Scale, scaled y-resolution and rectangle; these mirror the
        // constructor's scale, scaled_yres and rect_* arguments.
        // NOTE(review): presumably used to map internal coordinates back to the
        // original image (see the coordinate-system comment above) -- confirm.
00335   int scale_;
00336   int scaled_yres_;
00337   int rect_left_;
00338   int rect_top_;
00339   int rect_width_;
00340   int rect_height_;
00341 };
00342 
00343 }  // namespace tesseract.
00344 
00345 #endif  // TESSERACT_CCMAIN_PAGEITERATOR_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines