tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/resultiterator.cpp
Go to the documentation of this file.
00001 
00002 // File:        resultiterator.cpp
00003 // Description: Iterator for tesseract results that is capable of
00004 //              iterating in proper reading order over Bi Directional
00005 //              (e.g. mixed Hebrew and English) text.
00006 // Author:      David Eger
00007 // Created:     Fri May 27 13:58:06 PST 2011
00008 //
00009 // (C) Copyright 2011, Google Inc.
00010 // Licensed under the Apache License, Version 2.0 (the "License");
00011 // you may not use this file except in compliance with the License.
00012 // You may obtain a copy of the License at
00013 // http://www.apache.org/licenses/LICENSE-2.0
00014 // Unless required by applicable law or agreed to in writing, software
00015 // distributed under the License is distributed on an "AS IS" BASIS,
00016 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00017 // See the License for the specific language governing permissions and
00018 // limitations under the License.
00019 //
00021 
00022 #include "resultiterator.h"
00023 
00024 #include "allheaders.h"
00025 #include "pageres.h"
00026 #include "strngs.h"
00027 #include "tesseractclass.h"
00028 #include "unicharset.h"
00029 #include "unicodes.h"
00030 
00031 namespace tesseract {
00032 
00033 ResultIterator::ResultIterator(const LTRResultIterator &resit)
00034     : LTRResultIterator(resit) {
00035   in_minor_direction_ = false;
00036   at_beginning_of_minor_run_ = false;
00037   current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
00038   MoveToLogicalStartOfTextline();
00039 }
00040 
00041 ResultIterator *ResultIterator::StartOfParagraph(
00042     const LTRResultIterator &resit) {
00043   return new ResultIterator(resit);
00044 }
00045 
00046 bool ResultIterator::ParagraphIsLtr() const {
00047   return current_paragraph_is_ltr_;
00048 }
00049 
00050 bool ResultIterator::CurrentParagraphIsLtr() const {
00051   if (!it_->word())
00052     return true;  // doesn't matter.
00053   LTRResultIterator it(*this);
00054   it.RestartParagraph();
00055   // Try to figure out the ltr-ness of the paragraph.  The rules below
00056   // make more sense in the context of a difficult paragraph example.
00057   // Here we denote {ltr characters, RTL CHARACTERS}:
00058   //
00059   //   "don't go in there!" DAIS EH
00060   //   EHT OTNI DEPMUJ FELSMIH NEHT DNA
00061   //                  .GNIDLIUB GNINRUB
00062   //
00063   // On the first line, the left-most word is LTR and the rightmost word
00064   // is RTL.  Thus, we are better off taking the majority direction for
00065   // the whole paragraph contents.  So instead of "the leftmost word is LTR"
00066   // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
00067   // would not do:  Typically an RTL paragraph would *not* start with an LTR
00068   // word.  So our heuristics are as follows:
00069   //
00070   // (1) If the first text line has an RTL word in the left-most position
00071   //     it is RTL.
00072   // (2) If the first text line has an LTR word in the right-most position
00073   //     it is LTR.
00074   // (3) If neither of the above is true, take the majority count for the
00075   //     paragraph -- if there are more rtl words, it is RTL.  If there
00076   //     are more LTR words, it's LTR.
00077   bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
00078   bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
00079   int num_ltr, num_rtl;
00080   num_rtl = leftmost_rtl ? 1 : 0;
00081   num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
00082   for (it.Next(RIL_WORD);
00083        !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
00084        it.Next(RIL_WORD)) {
00085     StrongScriptDirection dir = it.WordDirection();
00086     rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
00087     num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
00088     num_ltr += rightmost_ltr ? 1 : 0;
00089   }
00090   if (leftmost_rtl)
00091     return false;
00092   if (rightmost_ltr)
00093     return true;
00094   // First line is ambiguous.  Take statistics on the whole paragraph.
00095   if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) do {
00096     StrongScriptDirection dir = it.WordDirection();
00097     num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
00098     num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
00099   } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
00100   return num_ltr >= num_rtl;
00101 }
00102 
// Negative sentinel values that CalculateTextlineOrder() interleaves with the
// (non-negative) word indices of a text line's reading order:
//   kMinorRunStart - pushed immediately before the words of a
//                    minor-direction run.
//   kMinorRunEnd   - pushed immediately after the words of such a run.
//   kComplexWord   - pushed immediately after a word whose direction is
//                    DIR_MIX.
00103 const int ResultIterator::kMinorRunStart = -1;
00104 const int ResultIterator::kMinorRunEnd = -2;
00105 const int ResultIterator::kComplexWord = -3;
00106 
00107 void ResultIterator::CalculateBlobOrder(
00108     GenericVector<int> *blob_indices) const {
00109   bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
00110   blob_indices->clear();
00111   if (Empty(RIL_WORD)) return;
00112   if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
00113     // Easy! just return the blobs in order;
00114     for (int i = 0; i < word_length_; i++)
00115       blob_indices->push_back(i);
00116     return;
00117   }
00118 
00119   // The blobs are in left-to-right order, but the current reading context
00120   // is right-to-left.
00121   const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
00122   const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
00123   const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
00124   const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
00125   const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
00126   const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
00127   const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
00128 
00129   // Step 1: Scan for and mark European Number sequences
00130   //   [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
00131   GenericVector<int> letter_types;
00132   for (int i = 0; i < word_length_; i++) {
00133     letter_types.push_back(it_->word()->SymbolDirection(i));
00134   }
00135   // Convert a single separtor sandwiched between two EN's into an EN.
00136   for (int i = 0; i + 2 < word_length_; i++) {
00137     if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
00138         (letter_types[i + 1] == U_EURO_NUM_SEP ||
00139          letter_types[i + 1] == U_COMMON_NUM_SEP)) {
00140       letter_types[i + 1] = U_EURO_NUM;
00141     }
00142   }
00143   // Scan for sequences of European Number Terminators around ENs and convert
00144   // them to ENs.
00145   for (int i = 0; i < word_length_; i++) {
00146     if (letter_types[i] == U_EURO_NUM_TERM) {
00147       int j = i + 1;
00148       while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) { j++; }
00149       if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
00150         // The sequence [i..j] should be converted to all European Numbers.
00151         for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM;
00152       }
00153       j = i - 1;
00154       while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) { j--; }
00155       if (j > -1 && letter_types[j] == U_EURO_NUM) {
00156         // The sequence [j..i] should be converted to all European Numbers.
00157         for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM;
00158       }
00159     }
00160   }
00161   // Step 2: Convert all remaining types to either L or R.
00162   // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
00163   // All other are R.
00164   for (int i = 0; i < word_length_;) {
00165     int ti = letter_types[i];
00166     if (ti == U_LTR || ti == U_EURO_NUM) {
00167       // Left to right sequence; scan to the end of it.
00168       int last_good = i;
00169       for (int j = i + 1; j < word_length_; j++) {
00170         int tj = letter_types[j];
00171         if (tj == U_LTR || tj == U_EURO_NUM) {
00172           last_good = j;
00173         } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
00174           // do nothing.
00175         } else {
00176           break;
00177         }
00178       }
00179       // [i..last_good] is the L sequence
00180       for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR;
00181       i = last_good + 1;
00182     } else {
00183       letter_types[i] = U_RTL;
00184       i++;
00185     }
00186   }
00187 
00188   // At this point, letter_types is entirely U_LTR or U_RTL.
00189   for (int i = word_length_ - 1; i >= 0;) {
00190     if (letter_types[i] == U_RTL) {
00191       blob_indices->push_back(i);
00192       i--;
00193     } else {
00194       // left to right sequence.  scan to the beginning.
00195       int j = i - 1;
00196       for (; j >= 0 && letter_types[j] != U_RTL; j--) { }  // pass
00197       // Now (j, i] is LTR
00198       for (int k = j + 1; k <= i; k++) blob_indices->push_back(k);
00199       i = j;
00200     }
00201   }
00202   ASSERT_HOST(blob_indices->size() == word_length_);
00203 }
00204 
00205 static void PrintScriptDirs(const GenericVector<StrongScriptDirection> &dirs) {
00206   for (int i = 0; i < dirs.size(); i++) {
00207     switch (dirs[i]) {
00208       case DIR_NEUTRAL: tprintf ("N "); break;
00209       case DIR_LEFT_TO_RIGHT: tprintf("L "); break;
00210       case DIR_RIGHT_TO_LEFT: tprintf("R "); break;
00211       case DIR_MIX: tprintf("Z "); break;
00212       default: tprintf("? "); break;
00213     }
00214   }
00215   tprintf("\n");
00216 }
00217 
00218 void ResultIterator::CalculateTextlineOrder(
00219     bool paragraph_is_ltr,
00220     const LTRResultIterator &resit,
00221     GenericVectorEqEq<int> *word_indices) const {
00222   GenericVector<StrongScriptDirection> directions;
00223   CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
00224 }
00225 
00226 void ResultIterator::CalculateTextlineOrder(
00227     bool paragraph_is_ltr,
00228     const LTRResultIterator &resit,
00229     GenericVector<StrongScriptDirection> *dirs_arg,
00230     GenericVectorEqEq<int> *word_indices) const {
00231   GenericVector<StrongScriptDirection> dirs;
00232   GenericVector<StrongScriptDirection> *directions;
00233   directions = (dirs_arg != NULL) ? dirs_arg : &dirs;
00234   directions->truncate(0);
00235 
00236   // A LTRResultIterator goes strictly left-to-right word order.
00237   LTRResultIterator ltr_it(resit);
00238   ltr_it.RestartRow();
00239   if (ltr_it.Empty(RIL_WORD)) return;
00240   do {
00241     directions->push_back(ltr_it.WordDirection());
00242   } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
00243 
00244   word_indices->truncate(0);
00245   CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
00246 }
00247 
00248 void ResultIterator::CalculateTextlineOrder(
00249     bool paragraph_is_ltr,
00250     const GenericVector<StrongScriptDirection> &word_dirs,
00251     GenericVectorEqEq<int> *reading_order) {
00252   reading_order->truncate(0);
00253   if (word_dirs.size() == 0) return;
00254 
00255   // Take all of the runs of minor direction words and insert them
00256   // in reverse order.
00257   int minor_direction, major_direction, major_step, start, end;
00258   if (paragraph_is_ltr) {
00259     start = 0;
00260     end = word_dirs.size();
00261     major_step = 1;
00262     major_direction = DIR_LEFT_TO_RIGHT;
00263     minor_direction = DIR_RIGHT_TO_LEFT;
00264   } else {
00265     start = word_dirs.size() - 1;
00266     end = -1;
00267     major_step = -1;
00268     major_direction = DIR_RIGHT_TO_LEFT;
00269     minor_direction = DIR_LEFT_TO_RIGHT;
00270     // Special rule: if there are neutral words at the right most side
00271     //   of a line adjacent to a left-to-right word in the middle of the
00272     //   line, we interpret the end of the line as a single LTR sequence.
00273     if (word_dirs[start] == DIR_NEUTRAL) {
00274       int neutral_end = start;
00275       while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
00276         neutral_end--;
00277       }
00278       if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
00279         // LTR followed by neutrals.
00280         // Scan for the beginning of the minor left-to-right run.
00281         int left = neutral_end;
00282         for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
00283           if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
00284         }
00285         reading_order->push_back(kMinorRunStart);
00286         for (int i = left; i < word_dirs.size(); i++) {
00287           reading_order->push_back(i);
00288           if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
00289         }
00290         reading_order->push_back(kMinorRunEnd);
00291         start = left - 1;
00292       }
00293     }
00294   }
00295   for (int i = start; i != end;) {
00296     if (word_dirs[i] == minor_direction) {
00297       int j = i;
00298       while (j != end && word_dirs[j] != major_direction)
00299         j += major_step;
00300       if (j == end) j -= major_step;
00301       while (j != i && word_dirs[j] != minor_direction)
00302         j -= major_step;
00303       //  [j..i] is a minor direction run.
00304       reading_order->push_back(kMinorRunStart);
00305       for (int k = j; k != i; k -= major_step) {
00306         reading_order->push_back(k);
00307       }
00308       reading_order->push_back(i);
00309       reading_order->push_back(kMinorRunEnd);
00310       i = j + major_step;
00311     } else {
00312       reading_order->push_back(i);
00313       if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
00314       i += major_step;
00315     }
00316   }
00317 }
00318 
00319 int ResultIterator::LTRWordIndex() const {
00320   int this_word_index = 0;
00321   LTRResultIterator textline(*this);
00322   textline.RestartRow();
00323   while (!textline.PositionedAtSameWord(it_)) {
00324     this_word_index++;
00325     textline.Next(RIL_WORD);
00326   }
00327   return this_word_index;
00328 }
00329 
00330 void ResultIterator::MoveToLogicalStartOfWord() {
00331   if (word_length_ == 0) {
00332     BeginWord(0);
00333     return;
00334   }
00335   GenericVector<int> blob_order;
00336   CalculateBlobOrder(&blob_order);
00337   if (blob_order.size() == 0 || blob_order[0] == 0) return;
00338   BeginWord(blob_order[0]);
00339 }
00340 
00341 bool ResultIterator::IsAtFinalSymbolOfWord() const {
00342   if (!it_->word()) return true;
00343   GenericVector<int> blob_order;
00344   CalculateBlobOrder(&blob_order);
00345   return blob_order.size() == 0 || blob_order.back() == blob_index_;
00346 }
00347 
00348 bool ResultIterator::IsAtFirstSymbolOfWord() const {
00349   if (!it_->word()) return true;
00350   GenericVector<int> blob_order;
00351   CalculateBlobOrder(&blob_order);
00352   return blob_order.size() == 0 || blob_order[0] == blob_index_;
00353 }
00354 
00355 void ResultIterator::AppendSuffixMarks(STRING *text) const {
00356   if (!it_->word()) return;
00357   bool reading_direction_is_ltr =
00358       current_paragraph_is_ltr_ ^ in_minor_direction_;
00359   // scan forward to see what meta-information the word ordering algorithm
00360   // left us.
00361   // If this word is at the  *end* of a minor run, insert the other
00362   // direction's mark;  else if this was a complex word, insert the
00363   // current reading order's mark.
00364   GenericVectorEqEq<int> textline_order;
00365   CalculateTextlineOrder(current_paragraph_is_ltr_,
00366                          *this, &textline_order);
00367   int this_word_index = LTRWordIndex();
00368   int i = textline_order.get_index(this_word_index);
00369   if (i < 0) return;
00370 
00371   int last_non_word_mark = 0;
00372   for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
00373     last_non_word_mark = textline_order[i];
00374   }
00375   if (last_non_word_mark == kComplexWord) {
00376     *text += reading_direction_is_ltr ? kLRM : kRLM;
00377   } else if (last_non_word_mark == kMinorRunEnd) {
00378     if (current_paragraph_is_ltr_) {
00379       *text += kLRM;
00380     } else {
00381       *text += kRLM;
00382     }
00383   }
00384 }
00385 
00386 void ResultIterator::MoveToLogicalStartOfTextline() {
00387   GenericVectorEqEq<int> word_indices;
00388   RestartRow();
00389   CalculateTextlineOrder(current_paragraph_is_ltr_,
00390                          dynamic_cast<const LTRResultIterator&>(*this),
00391                          &word_indices);
00392   int i = 0;
00393   for (; i < word_indices.size() && word_indices[i] < 0; i++) {
00394     if (word_indices[i] == kMinorRunStart) in_minor_direction_ = true;
00395     else if (word_indices[i] == kMinorRunEnd) in_minor_direction_ = false;
00396   }
00397   if (in_minor_direction_) at_beginning_of_minor_run_ = true;
00398   if (i >= word_indices.size()) return;
00399   int first_word_index = word_indices[i];
00400   for (int j = 0; j < first_word_index; j++) {
00401     PageIterator::Next(RIL_WORD);
00402   }
00403   MoveToLogicalStartOfWord();
00404 }
00405 
00406 void ResultIterator::Begin() {
00407   LTRResultIterator::Begin();
00408   current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
00409   in_minor_direction_ = false;
00410   at_beginning_of_minor_run_ = false;
00411   MoveToLogicalStartOfTextline();
00412 }
00413 
00414 bool ResultIterator::Next(PageIteratorLevel level) {
00415   if (it_->block() == NULL) return false; // already at end!
00416   switch (level) {
00417     case RIL_BLOCK:  // explicit fall-through
00418     case RIL_PARA:   // explicit fall-through
00419     case RIL_TEXTLINE:
00420       if (!PageIterator::Next(level)) return false;
00421       if (IsWithinFirstTextlineOfParagraph()) {
00422         // if we've advanced to a new paragraph,
00423         // recalculate current_paragraph_is_ltr_
00424         current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
00425       }
00426       in_minor_direction_ = false;
00427       MoveToLogicalStartOfTextline();
00428       return it_->block() != NULL;
00429     case RIL_SYMBOL:
00430     {
00431       GenericVector<int> blob_order;
00432       CalculateBlobOrder(&blob_order);
00433       int next_blob = 0;
00434       while (next_blob < blob_order.size() &&
00435              blob_index_ != blob_order[next_blob])
00436         next_blob++;
00437       next_blob++;
00438       if (next_blob < blob_order.size()) {
00439         // we're in the same word; simply advance one blob.
00440         BeginWord(blob_order[next_blob]);
00441         at_beginning_of_minor_run_ = false;
00442         return true;
00443       }
00444       level = RIL_WORD;  // we've fallen through to the next word.
00445     }
00446     case RIL_WORD:  // explicit fall-through.
00447     {
00448       if (it_->word() == NULL) return Next(RIL_BLOCK);
00449       GenericVectorEqEq<int> word_indices;
00450       int this_word_index = LTRWordIndex();
00451       CalculateTextlineOrder(current_paragraph_is_ltr_,
00452                              *this,
00453                              &word_indices);
00454       int final_real_index = word_indices.size() - 1;
00455       while (final_real_index > 0 && word_indices[final_real_index] < 0)
00456         final_real_index--;
00457       for (int i = 0; i < final_real_index; i++) {
00458         if (word_indices[i] == this_word_index) {
00459           int j = i + 1;
00460           for (; j < final_real_index && word_indices[j] < 0; j++) {
00461             if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
00462             if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
00463           }
00464           at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
00465           // awesome, we move to word_indices[j]
00466           if (BidiDebug(3)) {
00467             tprintf("Next(RIL_WORD): %d -> %d\n",
00468                     this_word_index, word_indices[j]);
00469           }
00470           PageIterator::RestartRow();
00471           for (int k = 0; k < word_indices[j]; k++) {
00472             PageIterator::Next(RIL_WORD);
00473           }
00474           MoveToLogicalStartOfWord();
00475           return true;
00476         }
00477       }
00478       if (BidiDebug(3)) {
00479         tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
00480       }
00481       // we're going off the end of the text line.
00482       return Next(RIL_TEXTLINE);
00483     }
00484   }
00485   ASSERT_HOST(false);  // shouldn't happen.
00486   return false;
00487 }
00488 
00489 bool ResultIterator::IsAtBeginningOf(PageIteratorLevel level) const {
00490   if (it_->block() == NULL) return false;  // Already at the end!
00491   if (it_->word() == NULL) return true;  // In an image block.
00492   if (level == RIL_SYMBOL) return true;  // Always at beginning of a symbol.
00493 
00494   bool at_word_start = IsAtFirstSymbolOfWord();
00495   if (level == RIL_WORD) return at_word_start;
00496 
00497   ResultIterator line_start(*this);
00498   // move to the first word in the line...
00499   line_start.MoveToLogicalStartOfTextline();
00500 
00501   bool at_textline_start = at_word_start && *line_start.it_ == *it_;
00502   if (level == RIL_TEXTLINE) return at_textline_start;
00503 
00504   // now we move to the left-most word...
00505   line_start.RestartRow();
00506   bool at_block_start = at_textline_start &&
00507       line_start.it_->block() != line_start.it_->prev_block();
00508   if (level == RIL_BLOCK) return at_block_start;
00509 
00510   bool at_para_start = at_block_start ||
00511       (at_textline_start &&
00512        line_start.it_->row()->row->para() !=
00513            line_start.it_->prev_row()->row->para());
00514   if (level == RIL_PARA) return at_para_start;
00515 
00516   ASSERT_HOST(false);  // shouldn't happen.
00517   return false;
00518 }
00519 
00525 bool ResultIterator::IsAtFinalElement(PageIteratorLevel level,
00526                                       PageIteratorLevel element) const {
00527   if (Empty(element)) return true;  // Already at the end!
00528   // The result is true if we step forward by element and find we are
00529   // at the the end of the page or at beginning of *all* levels in:
00530   // [level, element).
00531   // When there is more than one level difference between element and level,
00532   // we could for instance move forward one symbol and still be at the first
00533   // word on a line, so we also have to be at the first symbol in a word.
00534   ResultIterator next(*this);
00535   next.Next(element);
00536   if (next.Empty(element)) return true;  // Reached the end of the page.
00537   while (element > level) {
00538     element = static_cast<PageIteratorLevel>(element - 1);
00539     if (!next.IsAtBeginningOf(element))
00540       return false;
00541   }
00542   return true;
00543 }
00544 
00549 char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
00550   if (it_->word() == NULL) return NULL;  // Already at the end!
00551   STRING text;
00552   switch (level) {
00553     case RIL_BLOCK:
00554       {
00555         ResultIterator pp(*this);
00556         do {
00557           pp.AppendUTF8ParagraphText(&text);
00558         } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
00559       }
00560       break;
00561     case RIL_PARA:
00562       AppendUTF8ParagraphText(&text);
00563       break;
00564     case RIL_TEXTLINE:
00565       {
00566         ResultIterator it(*this);
00567         it.MoveToLogicalStartOfTextline();
00568         it.IterateAndAppendUTF8TextlineText(&text);
00569       }
00570       break;
00571     case RIL_WORD:
00572       AppendUTF8WordText(&text);
00573       break;
00574     case RIL_SYMBOL:
00575       {
00576         bool reading_direction_is_ltr =
00577           current_paragraph_is_ltr_ ^ in_minor_direction_;
00578         if (at_beginning_of_minor_run_) {
00579           text += reading_direction_is_ltr ? kLRM : kRLM;
00580         }
00581         text = it_->word()->BestUTF8(blob_index_, !reading_direction_is_ltr);
00582         if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
00583       }
00584       break;
00585   }
00586   int length = text.length() + 1;
00587   char* result = new char[length];
00588   strncpy(result, text.string(), length);
00589   return result;
00590 }
00591 
00592 void ResultIterator::AppendUTF8WordText(STRING *text) const {
00593   if (!it_->word()) return;
00594   ASSERT_HOST(it_->word()->best_choice != NULL);
00595   bool reading_direction_is_ltr =
00596       current_paragraph_is_ltr_ ^ in_minor_direction_;
00597   if (at_beginning_of_minor_run_) {
00598     *text += reading_direction_is_ltr ? kLRM : kRLM;
00599   }
00600 
00601   GenericVector<int> blob_order;
00602   CalculateBlobOrder(&blob_order);
00603   for (int i = 0; i < blob_order.size(); i++) {
00604     *text += it_->word()->BestUTF8(blob_order[i], !reading_direction_is_ltr);
00605   }
00606   AppendSuffixMarks(text);
00607 }
00608 
00609 void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
00610   if (Empty(RIL_WORD)) {
00611     Next(RIL_WORD);
00612     return;
00613   }
00614   if (BidiDebug(1)) {
00615     GenericVectorEqEq<int> textline_order;
00616     GenericVector<StrongScriptDirection> dirs;
00617     CalculateTextlineOrder(current_paragraph_is_ltr_,
00618                            *this, &dirs, &textline_order);
00619     tprintf("Strong Script dirs     [%p/P=%s]: ", it_->row(),
00620             current_paragraph_is_ltr_ ? "ltr" : "rtl");
00621     PrintScriptDirs(dirs);
00622     tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
00623             current_paragraph_is_ltr_ ? "ltr" : "rtl");
00624     for (int i = 0; i < textline_order.size(); i++) {
00625       tprintf("%d ", textline_order[i]);
00626     }
00627     tprintf("\n");
00628   }
00629 
00630   int words_appended = 0;
00631   do {
00632     AppendUTF8WordText(text);
00633     words_appended++;
00634     *text += " ";
00635   } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
00636   if (BidiDebug(1)) {
00637     tprintf("%d words printed\n", words_appended);
00638   }
00639   text->truncate_at(text->length() - 1);
00640   *text += line_separator_;
00641   // If we just finished a paragraph, add an extra newline.
00642   if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA))
00643     *text += paragraph_separator_;
00644 }
00645 
00646 void ResultIterator::AppendUTF8ParagraphText(STRING *text) const {
00647   ResultIterator it(*this);
00648   it.RestartParagraph();
00649   it.MoveToLogicalStartOfTextline();
00650   if (it.Empty(RIL_WORD)) return;
00651   do {
00652     it.IterateAndAppendUTF8TextlineText(text);
00653   } while (it.it_->block() != NULL && !it.IsAtBeginningOf(RIL_PARA));
00654 }
00655 
00656 bool ResultIterator::BidiDebug(int min_level) const {
00657   int debug_level = 1;
00658   IntParam *p = ParamUtils::FindParam<IntParam>(
00659       "bidi_debug", GlobalParams()->int_params,
00660       tesseract_->params()->int_params);
00661   if (p != NULL) debug_level = (inT32)(*p);
00662   return debug_level >= min_level;
00663 }
00664 
00665 }  // namespace tesseract.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines