tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/resultiterator.h
Go to the documentation of this file.
00001 
00002 // File:        resultiterator.h
00003 // Description: Iterator for tesseract results that is capable of
00004 //              iterating in proper reading order over Bi Directional
00005 //              (e.g. mixed Hebrew and English) text.
00006 // Author:      David Eger
00007 // Created:     Fri May 27 13:58:06 PST 2011
00008 //
00009 // (C) Copyright 2011, Google Inc.
00010 // Licensed under the Apache License, Version 2.0 (the "License");
00011 // you may not use this file except in compliance with the License.
00012 // You may obtain a copy of the License at
00013 // http://www.apache.org/licenses/LICENSE-2.0
00014 // Unless required by applicable law or agreed to in writing, software
00015 // distributed under the License is distributed on an "AS IS" BASIS,
00016 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00017 // See the License for the specific language governing permissions and
00018 // limitations under the License.
00019 //
00021 
00022 #ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H__
00023 #define TESSERACT_CCMAIN_RESULT_ITERATOR_H__
00024 
00025 #include "platform.h"
00026 #include "ltrresultiterator.h"
00027 
00028 template <typename T> class GenericVector;      // forward declaration; used in this header only by reference or pointer
00029 template <typename T> class GenericVectorEqEq;  // forward declaration; used in this header only by pointer
00030 class BLOB_CHOICE_IT;  // forward declaration; not referenced by the declarations visible in this listing
00031 class WERD_RES;        // forward declaration; not referenced by the declarations visible in this listing
00032 class STRING;          // forward declaration; used in this header only by pointer (STRING *text)
00033 
00034 namespace tesseract {
00035 
00036 class Tesseract;  // forward declaration in namespace tesseract; not named by the ResultIterator declarations visible in this listing
00037 
00038 class TESS_API ResultIterator : public LTRResultIterator {
        // Iterator over tesseract results, publicly derived from LTRResultIterator.
        // Per the file header, its purpose is to iterate in proper reading order
        // over bidirectional (e.g. mixed Hebrew and English) text.
        // The only constructor declared here is protected, so among the members
        // visible in this class StartOfParagraph() is the public way to get one.
00039  public:
        // Static factory: takes an existing LTRResultIterator by const reference
        // and returns a pointer to a ResultIterator.
        // NOTE(review): presumably positioned at the start of the paragraph that
        // resit is in, and presumably caller-owned -- neither is stated in this
        // header; confirm against the implementation.
00040   static ResultIterator *StartOfParagraph(const LTRResultIterator &resit);
00041 
        // Virtual destructor with an empty inline body.
00046   virtual ~ResultIterator() {}
00047 
00048   // ============= Moving around within the page ============.
        // The five virtual functions below (Begin, Next, IsAtBeginningOf,
        // IsAtFinalElement, GetUTF8Text) presumably override declarations in the
        // base-class chain -- the base class is not visible here; TODO confirm.

        // Takes no arguments and returns nothing; non-const, so it may change
        // the iterator's state. Presumably rewinds to the first element.
00053   virtual void Begin();
00054 
        // Non-const step operation at the given PageIteratorLevel; returns bool.
        // NOTE(review): the meaning of the return value (e.g. "moved" vs.
        // "reached the end") is not stated in this header -- confirm.
00067   virtual bool Next(PageIteratorLevel level);
00068 
        // Const query taking one PageIteratorLevel; by its name, reports whether
        // the current position begins an element of that level.
00075   virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
00076 
        // Const query taking two levels. By its name, reports whether the
        // current `element` is the final one within the enclosing `level`
        // (assumption from the parameter names; confirm).
00082   virtual bool IsAtFinalElement(PageIteratorLevel level,
00083                                 PageIteratorLevel element) const;
00084 
00085   // ============= Accessing data ==============.
00086 
        // Const accessor returning a non-const char* for the given level.
        // NOTE(review): a mutable char* return suggests a freshly allocated
        // buffer the caller must free, but ownership is not stated here --
        // confirm before relying on it.
00091   virtual char* GetUTF8Text(PageIteratorLevel level) const;
00092 
        // Non-virtual const query; by its name, true when the paragraph is
        // left-to-right. See also the private CurrentParagraphIsLtr() and the
        // cached current_paragraph_is_ltr_ member declared later in this class.
00097   bool ParagraphIsLtr() const;
00098 
00099   // ============= Exposed only for testing =============.
00100 
        // Static overload with no iterator state: the inputs are the paragraph
        // direction flag and a vector of per-word StrongScriptDirection values;
        // reading_order is a non-const pointer, presumably the output vector.
00123   static void CalculateTextlineOrder(
00124       bool paragraph_is_ltr,
00125       const GenericVector<StrongScriptDirection> &word_dirs,
00126       GenericVectorEqEq<int> *reading_order);
00127 
        // Integer constants declared without initializers, so their values are
        // defined outside this header.
        // NOTE(review): presumably sentinel values that can appear in the
        // reading_order output above -- confirm against the implementation.
00128   static const int kMinorRunStart;
00129   static const int kMinorRunEnd;
00130   static const int kComplexWord;
00131 
00132  protected:
        // Explicit (no implicit conversion) constructor from an
        // LTRResultIterator. Marked TESS_LOCAL, in contrast to the TESS_API
        // marking on the class itself.
00139   TESS_LOCAL explicit ResultIterator(const LTRResultIterator &resit);
00140 
00141  private:
        // Const helper returning bool; presumably computes the value that is
        // cached in current_paragraph_is_ltr_ (assumption; confirm).
00146   bool CurrentParagraphIsLtr() const;
00147 
        // Non-static const overloads of CalculateTextlineOrder that take an
        // LTRResultIterator instead of a ready-made direction vector.
        // The second overload additionally takes `ssd`, a non-const pointer to
        // a vector of StrongScriptDirection (presumably an extra output).
00159   void CalculateTextlineOrder(bool paragraph_is_ltr,
00160                               const LTRResultIterator &resit,
00161                               GenericVectorEqEq<int> *indices) const;
00163   void CalculateTextlineOrder(bool paragraph_is_ltr,
00164                               const LTRResultIterator &resit,
00165                               GenericVector<StrongScriptDirection> *ssd,
00166                               GenericVectorEqEq<int> *indices) const;
00167 
        // Const query returning an int; by its name, the index of the current
        // word in left-to-right order (assumption; confirm).
00172   int LTRWordIndex() const;
00173 
        // Const helper that writes through blob_indices (non-const pointer to a
        // vector of int); presumably the output ordering of blobs.
00178   void CalculateBlobOrder(GenericVector<int> *blob_indices) const;
00179 
        // Non-const repositioning helper (may change iterator state).
00181   void MoveToLogicalStartOfTextline();
00182 
        // Non-const repositioning helper (may change iterator state).
00187   void MoveToLogicalStartOfWord();
00188 
        // Const position query returning bool.
00190   bool IsAtFinalSymbolOfWord() const;
00191 
        // Const position query returning bool.
00193   bool IsAtFirstSymbolOfWord() const;
00194 
        // Const helper that takes a non-const STRING pointer; by its name it
        // appends to *text. What the "suffix marks" are is not shown here.
00199   void AppendSuffixMarks(STRING *text) const;
00200 
        // Const helper taking a non-const STRING pointer; by its name it
        // appends the current word's UTF-8 text to *text.
00202   void AppendUTF8WordText(STRING *text) const;
00203 
        // Unlike the other Append* helpers this one is non-const, consistent
        // with its name: it iterates (changing position) while appending.
00211   void IterateAndAppendUTF8TextlineText(STRING *text);
00212 
        // Const helper taking a non-const STRING pointer; by its name it
        // appends the paragraph's UTF-8 text to *text.
00219   void AppendUTF8ParagraphText(STRING *text) const;
00220 
        // Const query taking a minimum level and returning bool; presumably
        // tells callers whether bidi debug output at min_level is enabled.
00222   bool BidiDebug(int min_level) const;
00223 
        // Direction flag stored per iterator; presumably a cache of
        // CurrentParagraphIsLtr() (assumption; confirm).
00224   bool current_paragraph_is_ltr_;
00225 
        // State flag; by its name, true when positioned at the start of a
        // "minor run" (cf. kMinorRunStart). Exact semantics not shown here.
00230   bool at_beginning_of_minor_run_;
00231 
        // State flag; by its name, true while iterating text that runs in the
        // minor direction. Exact semantics not shown here.
00233   bool in_minor_direction_;
00234 };
00235 
00236 }  // namespace tesseract.
00237 
00238 #endif  // TESSERACT_CCMAIN_RESULT_ITERATOR_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines