00001 00002 // File: resultiterator.h 00003 // Description: Iterator for tesseract results that is capable of 00004 // iterating in proper reading order over Bi Directional 00005 // (e.g. mixed Hebrew and English) text. 00006 // Author: David Eger 00007 // Created: Fri May 27 13:58:06 PST 2011 00008 // 00009 // (C) Copyright 2011, Google Inc. 00010 // Licensed under the Apache License, Version 2.0 (the "License"); 00011 // you may not use this file except in compliance with the License. 00012 // You may obtain a copy of the License at 00013 // http://www.apache.org/licenses/LICENSE-2.0 00014 // Unless required by applicable law or agreed to in writing, software 00015 // distributed under the License is distributed on an "AS IS" BASIS, 00016 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00017 // See the License for the specific language governing permissions and 00018 // limitations under the License. 00019 // 00021 00022 #ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H__ 00023 #define TESSERACT_CCMAIN_RESULT_ITERATOR_H__ 00024 00025 #include "ltrresultiterator.h" 00026 #include "genericvector.h" 00027 00028 class BLOB_CHOICE_IT; 00029 class WERD_RES; 00030 class STRING; 00031 00032 namespace tesseract { 00033 00034 class Tesseract; 00035 00036 class ResultIterator : public LTRResultIterator { 00037 public: 00038 static ResultIterator *StartOfParagraph(const LTRResultIterator &resit); 00039 00040 // ResultIterator is copy constructible! 00041 // The default copy constructor works just fine for us. 00042 virtual ~ResultIterator() {} 00043 00044 // ============= Moving around within the page ============. 00045 // Moves the iterator to point to the start of the page to begin an iteration. 00046 virtual void Begin(); 00047 00048 // Moves to the start of the next object at the given level in the 00049 // page hierarchy in the appropriate reading order and returns false if 00050 // the end of the page was reached. 00051 // NOTE that RIL_SYMBOL will skip non-text blocks, but all other 00052 // PageIteratorLevel level values will visit each non-text block once. 00053 // Think of non text blocks as containing a single para, with a single line, 00054 // with a single imaginary word. 00055 // Calls to Next with different levels may be freely intermixed. 00056 // This function iterates words in right-to-left scripts correctly, if 00057 // the appropriate language has been loaded into Tesseract. 00058 virtual bool Next(PageIteratorLevel level); 00059 00060 // IsAtBeginningOf() returns whether we're at the logical beginning of the 00061 // given level. (as opposed to ResultIterator's left-to-right top-to-bottom 00062 // order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). 00063 // For a full description, see pageiterator.h 00064 virtual bool IsAtBeginningOf(PageIteratorLevel level) const; 00065 00066 // Implement PageIterator's IsAtFinalElement correctly in a BiDi context. 00067 // For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we 00068 // point at the last word in a paragraph. See PageIterator for full comment. 00069 virtual bool IsAtFinalElement(PageIteratorLevel level, 00070 PageIteratorLevel element) const; 00071 00072 // ============= Accessing data ==============. 00073 00074 // Returns the null terminated UTF-8 encoded text string for the current 00075 // object at the given level. Use delete [] to free after use. 00076 virtual char* GetUTF8Text(PageIteratorLevel level) const; 00077 00078 // Return whether the current paragraph's dominant reading direction 00079 // is left-to-right (as opposed to right-to-left). 00080 bool ParagraphIsLtr() const; 00081 00082 // ============= Exposed only for testing =============. 00083 00084 // Yields the reading order as a sequence of indices and (optional) 00085 // meta-marks for a set of words (given left-to-right). 00086 // The meta marks are passed as negative values: 00087 // kMinorRunStart Start of minor direction text. 00088 // kMinorRunEnd End of minor direction text. 00089 // kComplexWord The next indexed word contains both left-to-right and 00090 // right-to-left characters and was treated as neutral. 00091 // 00092 // For example, suppose we have five words in a text line, 00093 // indexed [0,1,2,3,4] from the leftmost side of the text line. 00094 // The following are all believable reading_orders: 00095 // 00096 // Left-to-Right (in ltr paragraph): 00097 // { 0, 1, 2, 3, 4 } 00098 // Left-to-Right (in rtl paragraph): 00099 // { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd } 00100 // Right-to-Left (in rtl paragraph): 00101 // { 4, 3, 2, 1, 0 } 00102 // Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: 00103 // { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 } 00104 static void CalculateTextlineOrder( 00105 bool paragraph_is_ltr, 00106 const GenericVector<StrongScriptDirection> &word_dirs, 00107 GenericVectorEqEq<int> *reading_order); 00108 00109 static const int kMinorRunStart; 00110 static const int kMinorRunEnd; 00111 static const int kComplexWord; 00112 00113 protected: 00114 // We presume the data associated with the given iterator will outlive us. 00115 // NB: This is private because it does something that is non-obvious: 00116 // it resets to the beginning of the paragraph instead of staying wherever 00117 // resit might have pointed. 00118 explicit ResultIterator(const LTRResultIterator &resit); 00119 00120 private: 00121 // Calculates the current paragraph's dominant writing direction. 00122 // Typically, members should use current_paragraph_ltr_ instead. 00123 bool CurrentParagraphIsLtr() const; 00124 00125 // Returns word indices as measured from resit->RestartRow() = index 0 00126 // for the reading order of words within a textline given an iterator 00127 // into the middle of the text line. 00128 // In addition to non-negative word indices, the following negative values 00129 // may be inserted: 00130 // kMinorRunStart Start of minor direction text. 00131 // kMinorRunEnd End of minor direction text. 00132 // kComplexWord The previous word contains both left-to-right and 00133 // right-to-left characters and was treated as neutral. 00134 void CalculateTextlineOrder(bool paragraph_is_ltr, 00135 const LTRResultIterator &resit, 00136 GenericVectorEqEq<int> *indices) const; 00137 // Same as above, but the caller's ssd gets filled in if ssd != NULL. 00138 void CalculateTextlineOrder(bool paragraph_is_ltr, 00139 const LTRResultIterator &resit, 00140 GenericVector<StrongScriptDirection> *ssd, 00141 GenericVectorEqEq<int> *indices) const; 00142 00143 // What is the index of the current word in a strict left-to-right reading 00144 // of the row? 00145 int LTRWordIndex() const; 00146 00147 // Given an iterator pointing at a word, returns the logical reading order 00148 // of blob indices for the word. 00149 void CalculateBlobOrder(GenericVector<int> *blob_indices) const; 00150 00151 // Precondition: current_paragraph_is_ltr_ is set. 00152 void MoveToLogicalStartOfTextline(); 00153 00154 // Precondition: current_paragraph_is_ltr_ and in_minor_direction_ are set. 00155 void MoveToLogicalStartOfWord(); 00156 00157 // Are we pointing at the final (reading order) symbol of the word? 00158 bool IsAtFinalSymbolOfWord() const; 00159 00160 // Are we pointing at the first (reading order) symbol of the word? 00161 bool IsAtFirstSymbolOfWord() const; 00162 00163 // Append any extra marks that should be appended to this word when printed. 00164 // Mostly, these are Unicode BiDi control characters. 00165 void AppendSuffixMarks(STRING *text) const; 00166 00167 // Appends the current word in reading order to the given buffer. 00168 void AppendUTF8WordText(STRING *text) const; 00169 00170 // Appends the text of the current text line, *assuming this iterator is 00171 // positioned at the beginning of the text line* This function 00172 // updates the iterator to point to the first position past the text line. 00173 // Each textline is terminated in a single newline character. 00174 // If the textline ends a paragraph, it gets a second terminal newline. 00175 void IterateAndAppendUTF8TextlineText(STRING *text); 00176 00177 // Appends the text of the current paragraph in reading order 00178 // to the given buffer. 00179 // Each textline is terminated in a single newline character, and the 00180 // paragraph gets an extra newline at the end. 00181 void AppendUTF8ParagraphText(STRING *text) const; 00182 00183 // Returns whether the bidi_debug flag is set to at least min_level. 00184 bool BidiDebug(int min_level) const; 00185 00186 bool current_paragraph_is_ltr_; 00187 00188 // Is the currently pointed-at character at the beginning of 00189 // a minor-direction run? 00190 bool at_beginning_of_minor_run_; 00191 00192 // Is the currently pointed-at character in a minor-direction sequence? 00193 bool in_minor_direction_; 00194 }; 00195 00196 } // namespace tesseract. 00197 00198 #endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H__