tesseract 3.03 - ccmain/paragraphs.cpp
00001 /**********************************************************************
00002  * File:        paragraphs.cpp
00003  * Description: Paragraph detection for tesseract.
00004  * Author:      David Eger
00005  * Created:     25 February 2011
00006  *
00007  * (C) Copyright 2011, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 #ifdef _MSC_VER
00020 #define __func__ __FUNCTION__
00021 #endif
00022 
00023 #include <ctype.h>
00024 
00025 #include "genericvector.h"
00026 #include "helpers.h"
00027 #include "mutableiterator.h"
00028 #include "ocrpara.h"
00029 #include "pageres.h"
00030 #include "paragraphs.h"
00031 #include "paragraphs_internal.h"
00032 #include "publictypes.h"
00033 #include "ratngs.h"
00034 #include "rect.h"
00035 #include "statistc.h"
00036 #include "strngs.h"
00037 #include "tprintf.h"
00038 #include "unicharset.h"
00039 #include "unicodes.h"
00040 
00041 namespace tesseract {
00042 
00043 // Special "weak" ParagraphModels.
00044 const ParagraphModel *kCrownLeft
00045     = reinterpret_cast<ParagraphModel *>(0xDEAD111F);
00046 const ParagraphModel *kCrownRight
00047     = reinterpret_cast<ParagraphModel *>(0xDEAD888F);
00048 
00049 // Given the width of a typical space between words, what is the threshold
00050 // by which we think left and right alignments for paragraphs
00051 // can vary and still be considered aligned.
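// For example, with a typical interword space of 10 pixels, Epsilon(10) == 8,
// so edges within 8 pixels of one another are treated as aligned.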
00052 static int Epsilon(int space_pix) {
00053   return space_pix * 4 / 5;
00054 }
00055 
00056 static bool AcceptableRowArgs(
00057     int debug_level, int min_num_rows, const char *function_name,
00058     const GenericVector<RowScratchRegisters> *rows,
00059     int row_start, int row_end) {
00060   if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
00061     tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n",
00062             row_start, row_end, rows->size());
00063     return false;
00064   }
00065   if (row_end - row_start < min_num_rows) {
00066     if (debug_level > 1) {
00067       tprintf("# Too few rows[%d, %d) for %s.\n",
00068               row_start, row_end, function_name);
00069     }
00070     return false;
00071   }
00072   return true;
00073 }
00074 
00075 // =============================== Debug Code ================================
00076 
00077 // Convert an integer to a decimal string.
00078 static STRING StrOf(int num) {
00079   char buffer[30];
00080   snprintf(buffer, sizeof(buffer), "%d", num);
00081   return STRING(buffer);
00082 }
00083 
00084 // Given a row-major matrix of unicode text and a column separator, print
00085 // a formatted table.  For ASCII, we get good column alignment.
00086 static void PrintTable(const GenericVector<GenericVector<STRING> > &rows,
00087                        const STRING &colsep) {
00088   GenericVector<int> max_col_widths;
00089   for (int r = 0; r < rows.size(); r++) {
00090     int num_columns = rows[r].size();
00091     for (int c = 0; c < num_columns; c++) {
00092       int num_unicodes = 0;
00093       for (int i = 0; i < rows[r][c].size(); i++) {
00094         if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
00095       }
00096       if (c >= max_col_widths.size()) {
00097         max_col_widths.push_back(num_unicodes);
00098       } else {
00099         if (num_unicodes > max_col_widths[c])
00100           max_col_widths[c] = num_unicodes;
00101       }
00102     }
00103   }
00104 
00105   GenericVector<STRING> col_width_patterns;
00106   for (int c = 0; c < max_col_widths.size(); c++) {
00107     col_width_patterns.push_back(
00108         STRING("%-") + StrOf(max_col_widths[c]) + "s");
00109   }
00110 
00111   for (int r = 0; r < rows.size(); r++) {
00112     for (int c = 0; c < rows[r].size(); c++) {
00113       if (c > 0)
00114         tprintf("%s", colsep.string());
00115       tprintf(col_width_patterns[c].string(), rows[r][c].string());
00116     }
00117     tprintf("\n");
00118   }
00119 }
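// The column widths above are measured in Unicode code points rather than
// bytes: in UTF-8 every byte of a multi-byte sequence except the first has
// the bit pattern 10xxxxxx, so bytes matching (b & 0xC0) == 0x80 are skipped.
// A minimal standalone sketch of the same counting trick (illustrative only;
// the helper name CountCodepoints does not appear elsewhere in this file):
static int CountCodepoints(const char *utf8) {
  int num_unicodes = 0;
  for (const char *p = utf8; *p != '\0'; p++) {
    if ((*p & 0xC0) != 0x80) num_unicodes++;  // count only non-continuation bytes
  }
  return num_unicodes;
}
// CountCodepoints("abc") == 3 and CountCodepoints("\xC3\xA9t\xC3\xA9") == 3,
// since the 5-byte UTF-8 string "été" contains three code points.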
00120 
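// Wrap word in Unicode RIGHT-TO-LEFT EMBEDDING / POP DIRECTIONAL FORMATTING
// marks when rtlify is true, so right-to-left text displays sensibly in the
// debug output.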
00121 STRING RtlEmbed(const STRING &word, bool rtlify) {
00122   if (rtlify)
00123     return STRING(kRLE) + word + STRING(kPDF);
00124   return word;
00125 }
00126 
00127 // Print the current thoughts of the paragraph detector.
00128 static void PrintDetectorState(const ParagraphTheory &theory,
00129                                const GenericVector<RowScratchRegisters> &rows) {
00130   GenericVector<GenericVector<STRING> > output;
00131   output.push_back(GenericVector<STRING>());
00132   output.back().push_back("#row");
00133   output.back().push_back("space");
00134   output.back().push_back("..");
00135   output.back().push_back("lword[widthSEL]");
00136   output.back().push_back("rword[widthSEL]");
00137   RowScratchRegisters::AppendDebugHeaderFields(&output.back());
00138   output.back().push_back("text");
00139 
00140   for (int i = 0; i < rows.size(); i++) {
00141     output.push_back(GenericVector<STRING>());
00142     GenericVector<STRING> &row = output.back();
00143     const RowInfo& ri = *rows[i].ri_;
00144     row.push_back(StrOf(i));
00145     row.push_back(StrOf(ri.average_interword_space));
00146     row.push_back(ri.has_leaders ? ".." : " ");
00147     row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
00148                   "[" + StrOf(ri.lword_box.width()) +
00149                   (ri.lword_likely_starts_idea ? "S" : "s") +
00150                   (ri.lword_likely_ends_idea ? "E" : "e") +
00151                   (ri.lword_indicates_list_item ? "L" : "l") +
00152                   "]");
00153     row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
00154                   "[" + StrOf(ri.rword_box.width()) +
00155                   (ri.rword_likely_starts_idea ? "S" : "s") +
00156                   (ri.rword_likely_ends_idea ? "E" : "e") +
00157                   (ri.rword_indicates_list_item ? "L" : "l") +
00158                   "]");
00159     rows[i].AppendDebugInfo(theory, &row);
00160     row.push_back(RtlEmbed(ri.text, !ri.ltr));
00161   }
00162   PrintTable(output, " ");
00163 
00164   tprintf("Active Paragraph Models:\n");
00165   for (int m = 0; m < theory.models().size(); m++) {
00166     tprintf(" %d: %s\n", m + 1, theory.models()[m]->ToString().string());
00167   }
00168 }
00169 
00170 static void DebugDump(
00171     bool should_print,
00172     const STRING &phase,
00173     const ParagraphTheory &theory,
00174     const GenericVector<RowScratchRegisters> &rows) {
00175   if (!should_print)
00176     return;
00177   tprintf("# %s\n", phase.string());
00178   PrintDetectorState(theory, rows);
00179 }
00180 
00181 // Print out the text for rows[row_start, row_end)
00182 static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows,
00183                           int row_start, int row_end) {
00184   tprintf("======================================\n");
00185   for (int row = row_start; row < row_end; row++) {
00186     tprintf("%s\n", rows[row].ri_->text.string());
00187   }
00188   tprintf("======================================\n");
00189 }
00190 
00191 // ============= Brain Dead Language Model (ASCII Version) ===================
00192 
00193 bool IsLatinLetter(int ch) {
00194   return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
00195 }
00196 
00197 bool IsDigitLike(int ch) {
00198   return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
00199 }
00200 
00201 bool IsOpeningPunct(int ch) {
00202   return strchr("'\"({[", ch) != NULL;
00203 }
00204 
00205 bool IsTerminalPunct(int ch) {
00206   return strchr(":'\".?!]})", ch) != NULL;
00207 }
00208 
00209 // Return a pointer to the first character of str that is not in toskip.
00210 const char *SkipChars(const char *str, const char *toskip) {
00211   while (*str != '\0' && strchr(toskip, *str)) { str++; }
00212   return str;
00213 }
00214 
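// Return a pointer to the first character of str for which skip() is false.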
00215 const char *SkipChars(const char *str, bool (*skip)(int)) {
00216   while (*str != '\0' && skip(*str)) { str++; }
00217   return str;
00218 }
00219 
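// Skip exactly one character of str if it is in toskip.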
00220 const char *SkipOne(const char *str, const char *toskip) {
00221   if (*str != '\0' && strchr(toskip, *str)) return str + 1;
00222   return str;
00223 }
00224 
00225 // Return whether it is very likely that this is a numeral marker that could
00226 // start a list item.  Some examples include:
00227 //   A   I   iii.   VI   (2)   3.5.   [C-4]
00228 bool LikelyListNumeral(const STRING &word) {
00229   const char *kRomans = "ivxlmdIVXLMD";
00230   const char *kDigits = "0123456789";
00231   const char *kOpen = "[{(";
00232   const char *kSep = ":;-.,";
00233   const char *kClose = "]})";
00234 
00235   int num_segments = 0;
00236   const char *pos = word.string();
00237   while (*pos != '\0' && num_segments < 3) {
00238     // skip up to two open parens.
00239     const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
00240     const char *numeral_end = SkipChars(numeral_start, kRomans);
00241     if (numeral_end != numeral_start) {
00242       // Got Roman Numeral. Great.
00243     } else {
00244       numeral_end = SkipChars(numeral_start, kDigits);
00245       if (numeral_end == numeral_start) {
00246         // If there's a single latin letter, we can use that.
00247         numeral_end = SkipChars(numeral_start, IsLatinLetter);
00248         if (numeral_end - numeral_start != 1)
00249           break;
00250       }
00251     }
00252     // We got some sort of numeral.
00253     num_segments++;
00254     // Skip any trailing parens or punctuation.
00255     pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
00256     if (pos == numeral_end)
00257       break;
00258   }
00259   return *pos == '\0';
00260 }
00261 
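// Return whether the word is a single character commonly used to mark a
// list item (a bullet-like glyph in ASCII).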
00262 bool LikelyListMark(const STRING &word) {
00263   const char *kListMarks = "0Oo*.,+.";
00264   return word.size() == 1 && strchr(kListMarks, word[0]) != NULL;
00265 }
00266 
00267 bool AsciiLikelyListItem(const STRING &word) {
00268   return LikelyListMark(word) || LikelyListNumeral(word);
00269 }
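// Illustrative examples of the ASCII heuristics above (results follow
// directly from the code; these calls are not made anywhere in this file):
//   AsciiLikelyListItem("iii.")  -> true   (roman numerals plus a separator)
//   AsciiLikelyListItem("(2)")   -> true   (parenthesized digit)
//   AsciiLikelyListItem("[C-4]") -> true   (bracketed letter, separator, digit)
//   AsciiLikelyListItem("*")     -> true   (single-character list mark)
//   AsciiLikelyListItem("Hello") -> false  (ordinary word)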
00270 
00271 // ========== Brain Dead Language Model (Tesseract Version) ================
00272 
00273 // Return the first Unicode Codepoint from werd[pos].
00274 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
00275   if (!u || !werd || pos >= werd->length())
00276     return 0;
00277   return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
00278 }
00279 
00280 // A useful helper class for finding the first j >= i so that word[j]
00281 // does not have the given character type.
00282 class UnicodeSpanSkipper {
00283  public:
00284   UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
00285       : u_(unicharset), word_(word) { wordlen_ = word->length(); }
00286 
00287   // Given an input position, return the first position >= pos not punc.
00288   int SkipPunc(int pos);
00289   // Given an input position, return the first position >= pos not digit.
00290   int SkipDigits(int pos);
00291   // Given an input position, return the first position >= pos not roman.
00292   int SkipRomans(int pos);
00293   // Given an input position, return the first position >= pos not alpha.
00294   int SkipAlpha(int pos);
00295 
00296  private:
00297   const UNICHARSET *u_;
00298   const WERD_CHOICE *word_;
00299   int wordlen_;
00300 };
00301 
00302 int UnicodeSpanSkipper::SkipPunc(int pos) {
00303   while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) pos++;
00304   return pos;
00305 }
00306 
00307 int UnicodeSpanSkipper::SkipDigits(int pos) {
00308   while (pos < wordlen_ && (u_->get_isdigit(word_->unichar_id(pos)) ||
00309                             IsDigitLike(UnicodeFor(u_, word_, pos)))) pos++;
00310   return pos;
00311 }
00312 
00313 int UnicodeSpanSkipper::SkipRomans(int pos) {
00314   const char *kRomans = "ivxlmdIVXLMD";
00315   while (pos < wordlen_) {
00316     int ch = UnicodeFor(u_, word_, pos);
00317     if (ch >= 0xF0 || strchr(kRomans, ch) == 0) break;
00318     pos++;
00319   }
00320   return pos;
00321 }
00322 
00323 int UnicodeSpanSkipper::SkipAlpha(int pos) {
00324   while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) pos++;
00325   return pos;
00326 }
00327 
00328 bool LikelyListMarkUnicode(int ch) {
00329   if (ch < 0x80) {
00330     STRING single_ch;
00331     single_ch += ch;
00332     return LikelyListMark(single_ch);
00333   }
00334   switch (ch) {
00335     // TODO(eger) expand this list of unicodes as needed.
00336     case 0x00B0:  // degree sign
00337     case 0x2022:  // bullet
00338     case 0x25E6:  // white bullet
00339     case 0x00B7:  // middle dot
00340     case 0x25A1:  // white square
00341     case 0x25A0:  // black square
00342     case 0x25AA:  // black small square
00343     case 0x2B1D:  // black very small square
00344     case 0x25BA:  // black right-pointing pointer
00345     case 0x25CF:  // black circle
00346     case 0x25CB:  // white circle
00347       return true;
00348     default:
00349       break;  // Not a known list-mark code point.
00350   }
00351   return false;
00352 }
00353 
00354 // Return whether it is very likely that this is a numeral marker that could
00355 // start a list item.  Some examples include:
00356 //   A   I   iii.   VI   (2)   3.5.   [C-4]
00357 bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
00358   if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0)))
00359     return true;
00360 
00361   UnicodeSpanSkipper m(u, werd);
00362   int num_segments = 0;
00363   int pos = 0;
00364   while (pos < werd->length() && num_segments < 3) {
00365     int numeral_start = m.SkipPunc(pos);
00366     if (numeral_start > pos + 1) break;
00367     int numeral_end = m.SkipRomans(numeral_start);
00368     if (numeral_end == numeral_start) {
00369       numeral_end = m.SkipDigits(numeral_start);
00370       if (numeral_end == numeral_start) {
00371         // If there's a single latin letter, we can use that.
00372         numeral_end = m.SkipAlpha(numeral_start);
00373         if (numeral_end - numeral_start != 1)
00374           break;
00375       }
00376     }
00377     // We got some sort of numeral.
00378     num_segments++;
00379     // Skip any trailing punctuation.
00380     pos = m.SkipPunc(numeral_end);
00381     if (pos == numeral_end)
00382       break;
00383   }
00384   return pos == werd->length();
00385 }
00386 
00387 // ========= Brain Dead Language Model (combined entry points) ================
00388 
00389 // Given the leftmost word of a line either as a Tesseract unicharset + werd
00390 // or a utf8 string, set the following attributes for it:
00391 //   is_list -      this word might be a list number or bullet.
00392 //   starts_idea -  this word is likely to start a sentence.
00393 //   ends_idea -    this word is likely to end a sentence.
00394 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
00395                         const STRING &utf8,
00396                         bool *is_list, bool *starts_idea, bool *ends_idea) {
00397   *is_list = false;
00398   *starts_idea = false;
00399   *ends_idea = false;
00400   if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) {  // Empty
00401     *ends_idea = true;
00402     return;
00403   }
00404 
00405   if (unicharset && werd) {  // We have a proper werd and unicharset so use it.
00406     if (UniLikelyListItem(unicharset, werd)) {
00407       *is_list = true;
00408       *starts_idea = true;
00409       *ends_idea = true;
00410     }
00411     if (unicharset->get_isupper(werd->unichar_id(0))) {
00412       *starts_idea = true;
00413     }
00414     if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
00415       *starts_idea = true;
00416       *ends_idea = true;
00417     }
00418   } else {  // Assume utf8 is mostly ASCII
00419     if (AsciiLikelyListItem(utf8)) {
00420       *is_list = true;
00421       *starts_idea = true;
00422     }
00423     int start_letter = utf8[0];
00424     if (IsOpeningPunct(start_letter)) {
00425       *starts_idea = true;
00426     }
00427     if (IsTerminalPunct(start_letter)) {
00428       *ends_idea = true;
00429     }
00430     if (start_letter >= 'A' && start_letter <= 'Z') {
00431       *starts_idea = true;
00432     }
00433   }
00434 }
00435 
00436 // Given the rightmost word of a line either as a Tesseract unicharset + werd
00437 // or a utf8 string, set the following attributes for it:
00438 //   is_list -      this word might be a list number or bullet.
00439 //   starts_idea -  this word is likely to start a sentence.
00440 //   ends_idea -    this word is likely to end a sentence.
00441 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
00442                          const STRING &utf8,
00443                          bool *is_list, bool *starts_idea, bool *ends_idea) {
00444   *is_list = false;
00445   *starts_idea = false;
00446   *ends_idea = false;
00447   if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) {  // Empty
00448     *ends_idea = true;
00449     return;
00450   }
00451 
00452   if (unicharset && werd) {  // We have a proper werd and unicharset so use it.
00453     if (UniLikelyListItem(unicharset, werd)) {
00454       *is_list = true;
00455       *starts_idea = true;
00456     }
00457     UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
00458     if (unicharset->get_ispunctuation(last_letter)) {
00459       *ends_idea = true;
00460     }
00461   } else {  // Assume utf8 is mostly ASCII
00462     if (AsciiLikelyListItem(utf8)) {
00463       *is_list = true;
00464       *starts_idea = true;
00465     }
00466     int last_letter = utf8[utf8.size() - 1];
00467     if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
00468       *ends_idea = true;
00469     }
00470   }
00471 }
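// Minimal usage sketch of the ASCII fallback path above (illustrative only;
// passing NULL for the unicharset and werd selects the utf8 branch):
//   bool is_list, starts_idea, ends_idea;
//   LeftWordAttributes(NULL, NULL, STRING("(2)"),
//                      &is_list, &starts_idea, &ends_idea);
//   // -> is_list = true, starts_idea = true, ends_idea = false
//   RightWordAttributes(NULL, NULL, STRING("read."),
//                       &is_list, &starts_idea, &ends_idea);
//   // -> is_list = false, starts_idea = false, ends_idea = true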
00472 
00473 // =============== Implementation of RowScratchRegisters =====================
00474 /* static */
00475 void RowScratchRegisters::AppendDebugHeaderFields(
00476     GenericVector<STRING> *header) {
00477   header->push_back("[lmarg,lind;rind,rmarg]");
00478   header->push_back("model");
00479 }
00480 
00481 void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
00482                                           GenericVector<STRING> *dbg) const {
00483   char s[30];
00484   snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]",
00485            lmargin_, lindent_, rindent_, rmargin_);
00486   dbg->push_back(s);
00487   STRING model_string;
00488   model_string += static_cast<char>(GetLineType());
00489   model_string += ":";
00490 
00491   int model_numbers = 0;
00492   for (int h = 0; h < hypotheses_.size(); h++) {
00493     if (hypotheses_[h].model == NULL)
00494       continue;
00495     if (model_numbers > 0)
00496       model_string += ",";
00497     if (StrongModel(hypotheses_[h].model)) {
00498       model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));
00499     } else if (hypotheses_[h].model == kCrownLeft) {
00500       model_string += "CrL";
00501     } else if (hypotheses_[h].model == kCrownRight) {
00502       model_string += "CrR";
00503     }
00504     model_numbers++;
00505   }
00506   if (model_numbers == 0)
00507     model_string += "0";
00508 
00509   dbg->push_back(model_string);
00510 }
00511 
00512 void RowScratchRegisters::Init(const RowInfo &row) {
00513   ri_ = &row;
00514   lmargin_ = 0;
00515   lindent_ = row.pix_ldistance;
00516   rmargin_ = 0;
00517   rindent_ = row.pix_rdistance;
00518 }
00519 
00520 LineType RowScratchRegisters::GetLineType() const {
00521   if (hypotheses_.empty())
00522     return LT_UNKNOWN;
00523   bool has_start = false;
00524   bool has_body = false;
00525   for (int i = 0; i < hypotheses_.size(); i++) {
00526     switch (hypotheses_[i].ty) {
00527       case LT_START: has_start = true; break;
00528       case LT_BODY: has_body = true; break;
00529       default:
00530         tprintf("Encountered bad value in hypothesis list: %c\n",
00531                 hypotheses_[i].ty);
00532         break;
00533     }
00534   }
00535   if (has_start && has_body)
00536     return LT_MULTIPLE;
00537   return has_start ? LT_START : LT_BODY;
00538 }
00539 
00540 LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const {
00541   if (hypotheses_.empty())
00542     return LT_UNKNOWN;
00543   bool has_start = false;
00544   bool has_body = false;
00545   for (int i = 0; i < hypotheses_.size(); i++) {
00546     if (hypotheses_[i].model != model)
00547       continue;
00548     switch (hypotheses_[i].ty) {
00549       case LT_START: has_start = true; break;
00550       case LT_BODY: has_body = true; break;
00551       default:
00552         tprintf("Encountered bad value in hypothesis list: %c\n",
00553                 hypotheses_[i].ty);
00554         break;
00555     }
00556   }
00557   if (has_start && has_body)
00558     return LT_MULTIPLE;
00559   return has_start ? LT_START : LT_BODY;
00560 }
00561 
00562 void RowScratchRegisters::SetStartLine() {
00563   LineType current_lt = GetLineType();
00564   if (current_lt != LT_UNKNOWN && current_lt != LT_START) {
00565     tprintf("Trying to set a line to be START when it's already BODY.\n");
00566   }
00567   if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
00568     hypotheses_.push_back_new(LineHypothesis(LT_START, NULL));
00569   }
00570 }
00571 
00572 void RowScratchRegisters::SetBodyLine() {
00573   LineType current_lt = GetLineType();
00574   if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) {
00575     tprintf("Trying to set a line to be BODY when it's already START.\n");
00576   }
00577   if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
00578     hypotheses_.push_back_new(LineHypothesis(LT_BODY, NULL));
00579   }
00580 }
00581 
00582 void RowScratchRegisters::AddStartLine(const ParagraphModel *model) {
00583   hypotheses_.push_back_new(LineHypothesis(LT_START, model));
00584   int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, NULL));
00585   if (old_idx >= 0)
00586     hypotheses_.remove(old_idx);
00587 }
00588 
00589 void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) {
00590   hypotheses_.push_back_new(LineHypothesis(LT_BODY, model));
00591   int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, NULL));
00592   if (old_idx >= 0)
00593     hypotheses_.remove(old_idx);
00594 }
00595 
00596 void RowScratchRegisters::StartHypotheses(SetOfModels *models) const {
00597   for (int h = 0; h < hypotheses_.size(); h++) {
00598     if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model))
00599       models->push_back_new(hypotheses_[h].model);
00600   }
00601 }
00602 
00603 void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const {
00604   for (int h = 0; h < hypotheses_.size(); h++) {
00605     if (StrongModel(hypotheses_[h].model))
00606       models->push_back_new(hypotheses_[h].model);
00607   }
00608 }
00609 
00610 void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const {
00611   for (int h = 0; h < hypotheses_.size(); h++) {
00612     if (hypotheses_[h].model != NULL)
00613       models->push_back_new(hypotheses_[h].model);
00614   }
00615 }
00616 
00617 const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const {
00618   if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START)
00619     return NULL;
00620   return hypotheses_[0].model;
00621 }
00622 
00623 const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const {
00624   if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY)
00625     return NULL;
00626   return hypotheses_[0].model;
00627 }
00628 
00629 // Discard any hypotheses whose model is not in the given list.
00630 void RowScratchRegisters::DiscardNonMatchingHypotheses(
00631     const SetOfModels &models) {
00632   if (models.empty())
00633     return;
00634   for (int h = hypotheses_.size() - 1; h >= 0; h--) {
00635     if (!models.contains(hypotheses_[h].model)) {
00636       hypotheses_.remove(h);
00637     }
00638   }
00639 }
00640 
00641 // ============ Geometry based Paragraph Detection Algorithm =================
00642 
00643 struct Cluster {
00644   Cluster() : center(0), count(0) {}
00645   Cluster(int cen, int num) : center(cen), count(num) {}
00646 
00647   int center;  // The center of the cluster.
00648   int count;   // The number of entries within the cluster.
00649 };
00650 
00651 class SimpleClusterer {
00652  public:
00653   explicit SimpleClusterer(int max_cluster_width)
00654       : max_cluster_width_(max_cluster_width) {}
00655   void Add(int value) { values_.push_back(value); }
00656   int size() const { return values_.size(); }
00657   void GetClusters(GenericVector<Cluster> *clusters);
00658 
00659  private:
00660   int max_cluster_width_;
00661   GenericVectorEqEq<int> values_;
00662 };
00663 
00664 // Return the index of the cluster closest to value.
00665 int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
00666   int best_index = 0;
00667   for (int i = 0; i < clusters.size(); i++) {
00668     if (abs(value - clusters[i].center) <
00669         abs(value - clusters[best_index].center))
00670       best_index = i;
00671   }
00672   return best_index;
00673 }
00674 
00675 void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {
00676   clusters->clear();
00677   values_.sort();
00678   for (int i = 0; i < values_.size();) {
00679     int orig_i = i;
00680     int lo = values_[i];
00681     int hi = lo;
00682     while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
00683       hi = values_[i];
00684     }
00685     clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
00686   }
00687 }
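// Worked example of the greedy clustering above (illustrative only):
//   SimpleClusterer c(5);                     // max_cluster_width_ == 5
//   int vals[] = {10, 12, 14, 50, 52};
//   for (int i = 0; i < 5; i++) c.Add(vals[i]);
//   GenericVector<Cluster> clusters;
//   c.GetClusters(&clusters);
//   // Values are sorted and grouped greedily from the low end: each value
//   // within max_cluster_width_ of the lowest value of the current group
//   // joins it.  Result: {center 12, count 3} and {center 51, count 2}.
//   // ClosestCluster(clusters, 48) == 1, since 48 is nearer to 51 than to 12.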
00688 
00689 // Calculate left- and right-indent tab stop values seen in
00690 // rows[row_start, row_end) given a tolerance of tolerance.
00691 void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
00692                        int row_start, int row_end,
00693                        int tolerance,
00694                        GenericVector<Cluster> *left_tabs,
00695                        GenericVector<Cluster> *right_tabs) {
00696   if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
00697     return;
00698   // First pass: toss all left and right indents into clusterers.
00699   SimpleClusterer initial_lefts(tolerance);
00700   SimpleClusterer initial_rights(tolerance);
00701   GenericVector<Cluster> initial_left_tabs;
00702   GenericVector<Cluster> initial_right_tabs;
00703   for (int i = row_start; i < row_end; i++) {
00704     initial_lefts.Add((*rows)[i].lindent_);
00705     initial_rights.Add((*rows)[i].rindent_);
00706   }
00707   initial_lefts.GetClusters(&initial_left_tabs);
00708   initial_rights.GetClusters(&initial_right_tabs);
00709 
00710   // Second pass: cluster only lines that are not "stray"
00711   //   An example of a stray line is a page number -- a line whose start
00712   //   and end tab-stops are far outside the typical start and end tab-stops
00713   //   for the block.
00714   //   Put another way, we only cluster data from lines whose start or end
00715   //   tab stop is frequent.
00716   SimpleClusterer lefts(tolerance);
00717   SimpleClusterer rights(tolerance);
00718 
00719   // Outlier elimination.  We might want to switch this to test outlier-ness
00720   // based on how strange a position an outlier is in instead of or in addition
00721   // to how rare it is.  These outliers get re-added if we end up having too
00722 // few tab stops to work with, however.
00723   int infrequent_enough_to_ignore = 0;
00724   if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
00725   if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
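  // For example, in a block of 20 or more rows a tab stop that only one or
  // two rows snap to is treated as infrequent and dropped from this pass
  // (the test below requires count > infrequent_enough_to_ignore).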
00726 
00727   for (int i = row_start; i < row_end; i++) {
00728     int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
00729     int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
00730     if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
00731         initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
00732       lefts.Add((*rows)[i].lindent_);
00733       rights.Add((*rows)[i].rindent_);
00734     }
00735   }
00736   lefts.GetClusters(left_tabs);
00737   rights.GetClusters(right_tabs);
00738 
00739   if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
00740       (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
00741     // One side is really ragged, and the other only has one tab stop,
00742     // so those "insignificant outliers" are probably important, actually.
00743     // This often happens on a page of an index.  Add back in the ones
00744     // we omitted in the first pass.
00745     for (int i = row_start; i < row_end; i++) {
00746       int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
00747       int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
00748       if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
00749             initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
00750         lefts.Add((*rows)[i].lindent_);
00751         rights.Add((*rows)[i].rindent_);
00752       }
00753     }
00754   }
00755   lefts.GetClusters(left_tabs);
00756   rights.GetClusters(right_tabs);
00757 
00758   // If one side is almost a two-indent aligned side, and the other clearly
00759   // isn't, try to prune out the least frequent tab stop from that side.
00760   if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
00761     int to_prune = -1;
00762     for (int i = left_tabs->size() - 1; i >= 0; i--) {
00763       if (to_prune < 0 ||
00764           (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
00765         to_prune = i;
00766       }
00767     }
00768     if (to_prune >= 0 &&
00769         (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
00770       left_tabs->remove(to_prune);
00771     }
00772   }
00773   if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
00774     int to_prune = -1;
00775     for (int i = right_tabs->size() - 1; i >= 0; i--) {
00776       if (to_prune < 0 ||
00777           (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
00778         to_prune = i;
00779       }
00780     }
00781     if (to_prune >= 0 &&
00782         (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
00783       right_tabs->remove(to_prune);
00784     }
00785   }
00786 }
00787 
00788 // Given a paragraph model, mark rows[row_start, row_end) as said model's
00789 // start or body lines.
00790 //
00791 // Case 1: model->first_indent_ != model->body_indent_
00792 //   Differentiating the paragraph start lines from the paragraph body lines in
00793 //   this case is easy, we just see how far each line is indented.
00794 //
00795 // Case 2: model->first_indent_ == model->body_indent_
00796 //   Here, we find end-of-paragraph lines by looking for "short lines."
00797 //   What constitutes a "short line" changes depending on whether the text
00798 //   is ragged-right[left] or fully justified (aligned left and right).
00799 //
00800 //   Case 2a: Ragged Right (or Left) text.  (eop_threshold == 0)
00801 //     We have a new paragraph if the first word of this line would have
00802 //     fit at the end of the previous line.
00803 //
00804 //   Case 2b: Fully Justified.  (eop_threshold > 0)
00805 //     We mark a line as short (end of paragraph) if the offside indent
00806 //     is greater than eop_threshold.
00807 void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows,
00808                        int row_start, int row_end,
00809                        const ParagraphModel *model,
00810                        bool ltr,
00811                        int eop_threshold) {
00812   if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
00813     return;
00814   for (int row = row_start; row < row_end; row++) {
00815     bool valid_first = ValidFirstLine(rows, row, model);
00816     bool valid_body = ValidBodyLine(rows, row, model);
00817     if (valid_first && !valid_body) {
00818       (*rows)[row].AddStartLine(model);
00819     } else if (valid_body && !valid_first) {
00820       (*rows)[row].AddBodyLine(model);
00821     } else if (valid_body && valid_first) {
00822       bool after_eop = (row == row_start);
00823       if (row > row_start) {
00824         if (eop_threshold > 0) {
00825           if (model->justification() == JUSTIFICATION_LEFT) {
00826             after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
00827           } else {
00828             after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
00829           }
00830         } else {
00831          after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row],
00832                                            model->justification());
00833         }
00834       }
00835       if (after_eop) {
00836         (*rows)[row].AddStartLine(model);
00837       } else {
00838         (*rows)[row].AddBodyLine(model);
00839       }
00840     } else {
00841       // Do nothing. Stray row.
00842     }
00843   }
00844 }
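// For rows that fit the model both as a first line and as a body line, the
// decision above reduces to the following predicate.  This is an illustrative
// restatement only; the helper name and parameters are hypothetical and are
// not used elsewhere in this file.
static bool StartsNewParagraph(bool is_first_row_of_block,
                               int eop_threshold,
                               int prev_offside_indent,
                               bool first_word_would_have_fit) {
  if (is_first_row_of_block) return true;
  if (eop_threshold > 0)                  // Case 2b: fully justified text.
    return prev_offside_indent > eop_threshold;
  return first_word_would_have_fit;       // Case 2a: ragged text.
}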
00845 
00846 // GeometricClassifierState holds all of the information we'll use while
00847 // trying to determine a paragraph model for the text lines in a block of
00848 // text:
00849 //   + the rows under consideration [row_start, row_end)
00850 //   + the common left- and right-indent tab stops
00851 //   + does the block start out left-to-right or right-to-left
00852 // Further, this struct holds the data we amass for the (single) ParagraphModel
00853 // we'll assign to the text lines (assuming we get that far).
00854 struct GeometricClassifierState {
00855   GeometricClassifierState(int dbg_level,
00856                            GenericVector<RowScratchRegisters> *r,
00857                            int r_start, int r_end)
00858       : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end),
00859         margin(0) {
00860     tolerance = InterwordSpace(*r, r_start, r_end);
00861     CalculateTabStops(r, r_start, r_end, tolerance,
00862                       &left_tabs, &right_tabs);
00863     if (debug_level >= 3) {
00864       tprintf("Geometry: TabStop cluster tolerance = %d; "
00865               "%d left tabs; %d right tabs\n",
00866               tolerance, left_tabs.size(), right_tabs.size());
00867     }
00868     ltr = (*r)[r_start].ri_->ltr;
00869   }
00870 
00871   void AssumeLeftJustification() {
00872     just = tesseract::JUSTIFICATION_LEFT;
00873     margin = (*rows)[row_start].lmargin_;
00874   }
00875 
00876   void AssumeRightJustification() {
00877     just = tesseract::JUSTIFICATION_RIGHT;
00878     margin = (*rows)[row_start].rmargin_;
00879   }
00880 
00881   // Align tabs are the tab stops the text is aligned to.
00882   const GenericVector<Cluster> &AlignTabs() const {
00883     if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs;
00884     return left_tabs;
00885   }
00886 
00887   // Offside tabs are the tab stops opposite the tabs used to align the text.
00888   //
00889   // Note that for a left-to-right text which is aligned to the right such as
00890   //     this function comment, the offside tabs are the horizontal tab stops
00891   //                 marking the beginning of ("Note", "this" and "marking").
00892   const GenericVector<Cluster> &OffsideTabs() const {
00893     if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs;
00894     return right_tabs;
00895   }
00896 
00897   // Return whether the i'th row extends from the leftmost left tab stop
00898   // to the rightmost right tab stop.
00899   bool IsFullRow(int i) const {
00900     return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
00901         ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
00902   }
00903 
00904   int AlignsideTabIndex(int row_idx) const {
00905     return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
00906   }
00907 
00908   // Given what we know about the paragraph justification (just), would the
00909   // first word of row_b have fit at the end of row_a?
00910   bool FirstWordWouldHaveFit(int row_a, int row_b) {
00911     return ::tesseract::FirstWordWouldHaveFit(
00912         (*rows)[row_a], (*rows)[row_b], just);
00913   }
00914 
00915   void PrintRows() const { PrintRowRange(*rows, row_start, row_end); }
00916 
00917   void Fail(int min_debug_level, const char *why) const {
00918     if (debug_level < min_debug_level) return;
00919     tprintf("# %s\n", why);
00920     PrintRows();
00921   }
00922 
00923   ParagraphModel Model() const {
00924     return ParagraphModel(just, margin, first_indent, body_indent, tolerance);
00925   }
00926 
00927   // We print debug messages whose required level does not exceed debug_level.
00928   int debug_level;
00929 
00930   // The Geometric Classifier was asked to find a single paragraph model
00931   // to fit the text rows (*rows)[row_start, row_end)
00932   GenericVector<RowScratchRegisters> *rows;
00933   int row_start;
00934   int row_end;
00935 
00936   // The amount by which we expect the text edge can vary and still be aligned.
00937   int tolerance;
00938 
00939   // Is the script in this text block left-to-right?
00940   // HORRIBLE ROUGH APPROXIMATION.  TODO(eger): Improve
00941   bool ltr;
00942 
00943   // These left and right tab stops were determined to be the common tab
00944   // stops for the given text.
00945   GenericVector<Cluster> left_tabs;
00946   GenericVector<Cluster> right_tabs;
00947 
00948   // These are parameters we must determine to create a ParagraphModel.
00949   tesseract::ParagraphJustification just;
00950   int margin;
00951   int first_indent;
00952   int body_indent;
00953 
00954   // eop_threshold > 0 if the text is fully justified.  See MarkRowsWithModel()
00955   int eop_threshold;
00956 };
00957 
00958 // Given a section of text where strong textual clues did not help identify
00959 // paragraph breaks, and for which the left and right indents have exactly
00960 // three tab stops between them, attempt to find the paragraph breaks based
00961 // solely on the outline of the text and whether the script is left-to-right.
00962 //
00963 // Algorithm Detail:
00964 //   The selected rows are in the form of a rectangle except
00965 //   for some number of "short lines" of the same length:
00966 //
00967 //   (A1)  xxxxxxxxxxxxx  (B1) xxxxxxxxxxxx
00968 //           xxxxxxxxxxx       xxxxxxxxxx    # A "short" line.
00969 //         xxxxxxxxxxxxx       xxxxxxxxxxxx
00970 //         xxxxxxxxxxxxx       xxxxxxxxxxxx
00971 //
00972 //   We have a slightly different situation if the only short
00973 //   line is at the end of the excerpt.
00974 //
00975 //   (A2) xxxxxxxxxxxxx  (B2) xxxxxxxxxxxx
00976 //        xxxxxxxxxxxxx       xxxxxxxxxxxx
00977 //        xxxxxxxxxxxxx       xxxxxxxxxxxx
00978 //          xxxxxxxxxxx       xxxxxxxxxx     # A "short" line.
00979 //
00980 //   We'll interpret these as follows based on the reasoning in the comment for
00981 //   GeometricClassify():
00982 //       [script direction: first indent, body indent]
00983 //   (A1) LtR: 2,0  RtL: 0,0   (B1) LtR: 0,0  RtL: 2,0
00984 //   (A2) LtR: 2,0  RtL: CrR   (B2) LtR: CrL  RtL: 2,0
00985 void GeometricClassifyThreeTabStopTextBlock(
00986     int debug_level,
00987     GeometricClassifierState &s,
00988     ParagraphTheory *theory) {
00989   int num_rows = s.row_end - s.row_start;
00990   int num_full_rows = 0;
00991   int last_row_full = 0;
00992   for (int i = s.row_start; i < s.row_end; i++) {
00993     if (s.IsFullRow(i)) {
00994       num_full_rows++;
00995       if (i == s.row_end - 1) last_row_full++;
00996     }
00997   }
00998 
00999   if (num_full_rows < 0.7 * num_rows) {
01000     s.Fail(1, "Not enough full lines to know which lines start paras.");
01001     return;
01002   }
01003 
01004   // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
01005   s.eop_threshold = 0;
01006 
01007   if (s.ltr) {
01008     s.AssumeLeftJustification();
01009   } else {
01010     s.AssumeRightJustification();
01011   }
01012 
01013   if (debug_level > 0) {
01014     tprintf("# Not enough variety for clear outline classification. "
01015             "Guessing these are %s aligned based on script.\n",
01016             s.ltr ? "left" : "right");
01017     s.PrintRows();
01018   }
01019 
01020   if (s.AlignTabs().size() == 2) {  // case A1 or A2
01021     s.first_indent = s.AlignTabs()[1].center;
01022     s.body_indent = s.AlignTabs()[0].center;
01023   } else {                      // case B1 or B2
01024     if (num_rows - 1 == num_full_rows - last_row_full) {
01025       // case B2
01026       const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
01027       (*s.rows)[s.row_start].AddStartLine(model);
01028       for (int i = s.row_start + 1; i < s.row_end; i++) {
01029         (*s.rows)[i].AddBodyLine(model);
01030       }
01031       return;
01032     } else {
01033       // case B1
01034       s.first_indent = s.body_indent = s.AlignTabs()[0].center;
01035       s.eop_threshold = (s.OffsideTabs()[0].center +
01036                          s.OffsideTabs()[1].center) / 2;
01037     }
01038   }
01039   const ParagraphModel *model = theory->AddModel(s.Model());
01040   MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,
01041                     s.ltr, s.eop_threshold);
01042   return;
01043 }
01044 
01045 // This function is called if strong textual clues were not available, but
01046 // the caller hopes that the paragraph breaks will be super obvious just
01047 // by the outline of the text.
01048 //
01049 // The particularly difficult case is figuring out what's going on if you
01050 // don't have enough short paragraph-end lines to disambiguate.
01051 //
01052 // For instance, let's say you have the following outline:
01053 //
01054 //   (A1)  xxxxxxxxxxxxxxxxxxxxxx
01055 //           xxxxxxxxxxxxxxxxxxxx
01056 //         xxxxxxxxxxxxxxxxxxxxxx
01057 //         xxxxxxxxxxxxxxxxxxxxxx
01058 //
01059 // Even if we know that the text is left-to-right and so will probably be
01060 // left-aligned, both of the following are possible texts:
01061 //
01062 //  (A1a)  1. Here our list item
01063 //           with two full lines.
01064 //         2. Here a second item.
01065 //         3. Here our third one.
01066 //
01067 //  (A1b)  so ends paragraph one.
01068 //           Here  starts another
01069 //         paragraph  we want  to
01070 //         read.  This  continues
01071 //
01072 // These examples are obvious from the text and should have been caught
01073 // by the StrongEvidenceClassify pass.  However, for languages where we don't
01074 // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
01075 // it's worth guessing that (A1b) is the correct interpretation if there are
01076 // far more "full" lines than "short" lines.
01077 void GeometricClassify(int debug_level,
01078                        GenericVector<RowScratchRegisters> *rows,
01079                        int row_start, int row_end,
01080                        ParagraphTheory *theory) {
01081   if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
01082     return;
01083   if (debug_level > 1) {
01084     tprintf("###############################################\n");
01085     tprintf("##### GeometricClassify( rows[%d:%d) )   ####\n",
01086             row_start, row_end);
01087     tprintf("###############################################\n");
01088   }
01089   RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
01090 
01091   GeometricClassifierState s(debug_level, rows, row_start, row_end);
01092   if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
01093     s.Fail(2, "Too much variety for simple outline classification.");
01094     return;
01095   }
01096   if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
01097     s.Fail(1, "Not enough variety for simple outline classification.");
01098     return;
01099   }
01100   if (s.left_tabs.size() + s.right_tabs.size() == 3) {
01101     GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
01102     return;
01103   }
01104 
01105   // At this point, we know that one side has at least two tab stops, and the
01106   // other side has one or two tab stops.
01107   // Left to determine:
01108   //   (1) Which is the body indent and which is the first line indent?
01109   //   (2) Is the text fully justified?
01110 
01111   // If one side happens to have three or more tab stops, assume that side
01112   // is opposite of the aligned side.
01113   if (s.right_tabs.size() > 2) {
01114     s.AssumeLeftJustification();
01115   } else if (s.left_tabs.size() > 2) {
01116     s.AssumeRightJustification();
01117   } else if (s.ltr) {  // guess based on script direction
01118     s.AssumeLeftJustification();
01119   } else {
01120     s.AssumeRightJustification();
01121   }
01122 
01123   if (s.AlignTabs().size() == 2) {
01124     // For each tab stop on the aligned side, how many of them appear
01125     // to be paragraph start lines?  [first lines]
01126     int firsts[2] = {0, 0};
01127     // Count the first line as a likely paragraph start line.
01128     firsts[s.AlignsideTabIndex(s.row_start)]++;
01129     // For each line, if the first word would have fit on the previous
01130     // line count it as a likely paragraph start line.
01131     bool jam_packed = true;
01132     for (int i = s.row_start + 1; i < s.row_end; i++) {
01133       if (s.FirstWordWouldHaveFit(i - 1, i)) {
01134         firsts[s.AlignsideTabIndex(i)]++;
01135         jam_packed = false;
01136       }
01137     }
01138     // Make an extra accounting for the last line of the paragraph just
01139     // in case it's the only short line in the block.  That is, take its
01140     // first word as typical and see if this looks like the *last* line
01141     // of a paragraph.  If so, mark the *other* indent as probably a first.
01142     if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
01143       firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
01144     }
01145 
01146     int percent0firsts, percent1firsts;
01147     percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
01148     percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
01149 
01150     // TODO(eger): Tune these constants if necessary.
01151     if ((percent0firsts < 20 && 30 < percent1firsts) ||
01152         percent0firsts + 30 < percent1firsts) {
01153       s.first_indent = s.AlignTabs()[1].center;
01154       s.body_indent = s.AlignTabs()[0].center;
01155     } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
01156                percent1firsts + 30 < percent0firsts) {
01157       s.first_indent = s.AlignTabs()[0].center;
01158       s.body_indent = s.AlignTabs()[1].center;
01159     } else {
01160       // Ambiguous! Probably lineated (poetry)
01161       if (debug_level > 1) {
01162         tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
01163                 s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
01164         tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
01165                 s.AlignTabs()[0].center, percent0firsts);
01166         tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
01167                 s.AlignTabs()[1].center, percent1firsts);
01168         s.PrintRows();
01169       }
01170       return;
01171     }
01172   } else {
01173     // There's only one tab stop for the "aligned to" side.
01174     s.first_indent = s.body_indent = s.AlignTabs()[0].center;
01175   }
01176 
01177   // At this point, we have our model.
01178   const ParagraphModel *model = theory->AddModel(s.Model());
01179 
01180   // Now all we have to do is figure out if the text is fully justified or not.
01181   // eop_threshold: default to fully justified unless we see evidence below.
01182   //    See description on MarkRowsWithModel()
01183   s.eop_threshold =
01184       (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
01185   // If the text is not fully justified, reset eop_threshold to 0.
01186   if (s.AlignTabs().size() == 2) {
01187     // Paragraphs with a paragraph-start indent.
01188     for (int i = s.row_start; i < s.row_end - 1; i++) {
01189       if (ValidFirstLine(s.rows, i + 1, model) &&
01190           !NearlyEqual(s.OffsideTabs()[0].center,
01191                        (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
01192         // We found a non-end-of-paragraph short line: not fully justified.
01193         s.eop_threshold = 0;
01194         break;
01195       }
01196     }
01197   } else {
01198     // Paragraphs with no paragraph-start indent.
01199     for (int i = s.row_start; i < s.row_end - 1; i++) {
01200       if (!s.FirstWordWouldHaveFit(i, i + 1) &&
01201           !NearlyEqual(s.OffsideTabs()[0].center,
01202                        (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
01203         // We found a non-end-of-paragraph short line: not fully justified.
01204         s.eop_threshold = 0;
01205         break;
01206       }
01207     }
01208   }
01209   MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
01210 }
01211 
01212 // =============== Implementation of ParagraphTheory =====================
01213 
01214 const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) {
01215   for (int i = 0; i < models_->size(); i++) {
01216     if ((*models_)[i]->Comparable(model))
01217       return (*models_)[i];
01218   }
01219   ParagraphModel *m = new ParagraphModel(model);
01220   models_->push_back(m);
01221   models_we_added_.push_back_new(m);
01222   return m;
01223 }
01224 
01225 void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) {
01226   for (int i = models_->size() - 1; i >= 0; i--) {
01227     ParagraphModel *m = (*models_)[i];
01228     if (!used_models.contains(m) && models_we_added_.contains(m)) {
01229       models_->remove(i);
01230       models_we_added_.remove(models_we_added_.get_index(m));
01231       delete m;
01232     }
01233   }
01234 }
01235 
01236 // Examine rows[start, end) and try to determine if an existing non-centered
01237 // paragraph model would fit them perfectly.  If so, return a pointer to it.
01238 // If not, return NULL.
01239 const ParagraphModel *ParagraphTheory::Fits(
01240     const GenericVector<RowScratchRegisters> *rows, int start, int end) const {
01241   for (int m = 0; m < models_->size(); m++) {
01242     const ParagraphModel *model = (*models_)[m];
01243     if (model->justification() != JUSTIFICATION_CENTER &&
01244         RowsFitModel(rows, start, end, model))
01245       return model;
01246   }
01247   return NULL;
01248 }
01249 
01250 void ParagraphTheory::NonCenteredModels(SetOfModels *models) {
01251   for (int m = 0; m < models_->size(); m++) {
01252     const ParagraphModel *model = (*models_)[m];
01253     if (model->justification() != JUSTIFICATION_CENTER)
01254       models->push_back_new(model);
01255   }
01256 }
01257 
01258 int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
01259   for (int i = 0; i < models_->size(); i++) {
01260     if ((*models_)[i] == model)
01261       return i;
01262   }
01263   return -1;
01264 }
01265 
01266 bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
01267                     int row, const ParagraphModel *model) {
01268   if (!StrongModel(model)) {
01269     tprintf("ValidFirstLine() should only be called with strong models!\n");
01270   }
01271   return StrongModel(model) &&
01272       model->ValidFirstLine(
01273           (*rows)[row].lmargin_, (*rows)[row].lindent_,
01274           (*rows)[row].rindent_, (*rows)[row].rmargin_);
01275 }
01276 
01277 bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
01278                    int row, const ParagraphModel *model) {
01279   if (!StrongModel(model)) {
01280     tprintf("ValidBodyLine() should only be called with strong models!\n");
01281   }
01282   return StrongModel(model) &&
01283       model->ValidBodyLine(
01284           (*rows)[row].lmargin_, (*rows)[row].lindent_,
01285           (*rows)[row].rindent_, (*rows)[row].rmargin_);
01286 }
01287 
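// Return whether rows a and b are aligned closely enough on the crown model's
// aligned edge to belong to the same crown paragraph.  For example, with an
// average interword space of 10 px, Epsilon(10) == 8, so the two edges may
// differ by at most 8 px.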
01288 bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
01289                      int a, int b, const ParagraphModel *model) {
01290   if (model != kCrownRight && model != kCrownLeft) {
01291     tprintf("CrownCompatible() should only be called with crown models!\n");
01292     return false;
01293   }
01294   RowScratchRegisters &row_a = (*rows)[a];
01295   RowScratchRegisters &row_b = (*rows)[b];
01296   if (model == kCrownRight) {
01297     return NearlyEqual(row_a.rindent_ + row_a.rmargin_,
01298                        row_b.rindent_ + row_b.rmargin_,
01299                        Epsilon(row_a.ri_->average_interword_space));
01300   }
01301   return NearlyEqual(row_a.lindent_ + row_a.lmargin_,
01302                      row_b.lindent_ + row_b.lmargin_,
01303                      Epsilon(row_a.ri_->average_interword_space));
01304 }
01305 
01306 
01307 // =============== Implementation of ParagraphModelSmearer ====================
01308 
01309 ParagraphModelSmearer::ParagraphModelSmearer(
01310     GenericVector<RowScratchRegisters> *rows,
01311     int row_start, int row_end, ParagraphTheory *theory)
01312         : theory_(theory), rows_(rows), row_start_(row_start),
01313           row_end_(row_end) {
01314   if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
01315     row_start_ = 0;
01316     row_end_ = 0;
01317     return;
01318   }
01319   SetOfModels no_models;
01320   for (int row = row_start - 1; row <= row_end; row++) {
01321     open_models_.push_back(no_models);
01322   }
01323 }
01324 
01325 // see paragraphs_internal.h
01326 void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
01327   SetOfModels no_models;
01328   if (row_start < row_start_) row_start = row_start_;
01329   if (row_end > row_end_) row_end = row_end_;
01330 
01331   for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
01332        row++) {
01333     if ((*rows_)[row].ri_->num_words == 0) {
01334       OpenModels(row + 1) = no_models;
01335     } else {
01336       SetOfModels &opened = OpenModels(row);
01337       (*rows_)[row].StartHypotheses(&opened);
01338 
01339       // Which models survive the transition from row to row + 1?
01340       SetOfModels still_open;
01341       for (int m = 0; m < opened.size(); m++) {
01342         if (ValidFirstLine(rows_, row, opened[m]) ||
01343             ValidBodyLine(rows_, row, opened[m])) {
01344           // This is basic filtering; we check likely paragraph starty-ness down
01345           // below in Smear() -- you know, whether the first word would have fit
01346           // and such.
01347           still_open.push_back_new(opened[m]);
01348         }
01349       }
01350       OpenModels(row + 1) = still_open;
01351     }
01352   }
01353 }
01354 
01355 // see paragraphs_internal.h
01356 void ParagraphModelSmearer::Smear() {
01357   CalculateOpenModels(row_start_, row_end_);
01358 
01359   // For each row which we're unsure about (that is, it is LT_UNKNOWN or
01360   // we have multiple LT_START hypotheses), see if there's a model that
01361   // was recently used (an "open" model) which might model it well.
01362   for (int i = row_start_; i < row_end_; i++) {
01363     RowScratchRegisters &row = (*rows_)[i];
01364     if (row.ri_->num_words == 0)
01365       continue;
01366 
01367     // Step One:
01368     //   Figure out if there are "open" models which are left-aligned or
01369     //   right-aligned.  This is important for determining whether the
01370     //   "first" word in a row would fit at the "end" of the previous row.
01371     bool left_align_open = false;
01372     bool right_align_open = false;
01373     for (int m = 0; m < OpenModels(i).size(); m++) {
01374       switch (OpenModels(i)[m]->justification()) {
01375         case JUSTIFICATION_LEFT: left_align_open = true; break;
01376         case JUSTIFICATION_RIGHT: right_align_open = true; break;
01377         default: left_align_open = right_align_open = true;
01378       }
01379     }
01380     // Step Two:
01381     //   Use that knowledge to figure out if this row is likely to
01382     //   start a paragraph.
01383     bool likely_start;
01384     if (i == 0) {
01385       likely_start = true;
01386     } else {
01387       if ((left_align_open && right_align_open) ||
01388           (!left_align_open && !right_align_open)) {
01389         likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
01390                                             JUSTIFICATION_LEFT) ||
01391                        LikelyParagraphStart((*rows_)[i - 1], row,
01392                                             JUSTIFICATION_RIGHT);
01393       } else if (left_align_open) {
01394         likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
01395                                             JUSTIFICATION_LEFT);
01396       } else {
01397         likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
01398                                             JUSTIFICATION_RIGHT);
01399       }
01400     }
01401 
01402     // Step Three:
01403     //   If this text line seems like an obvious first line of an
01404     //   open model, or an obvious continuation of an existing
01405     //   modelled paragraph, mark it up.
01406     if (likely_start) {
01407       // Add Start Hypotheses for all Open models that fit.
01408       for (int m = 0; m < OpenModels(i).size(); m++) {
01409         if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
01410           row.AddStartLine(OpenModels(i)[m]);
01411         }
01412       }
01413     } else {
01414       // Add relevant body line hypotheses.
01415       SetOfModels last_line_models;
01416       if (i > 0) {
01417         (*rows_)[i - 1].StrongHypotheses(&last_line_models);
01418       } else {
01419         theory_->NonCenteredModels(&last_line_models);
01420       }
01421       for (int m = 0; m < last_line_models.size(); m++) {
01422         const ParagraphModel *model = last_line_models[m];
01423         if (ValidBodyLine(rows_, i, model))
01424           row.AddBodyLine(model);
01425       }
01426     }
01427 
01428     // Step Four:
01429     //   If we're still quite unsure about this line, go through all
01430     //   models in our theory and see if this row could be the start
01431     //   of any of our models.
01432     if (row.GetLineType() == LT_UNKNOWN ||
01433         (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
01434       SetOfModels all_models;
01435       theory_->NonCenteredModels(&all_models);
01436       for (int m = 0; m < all_models.size(); m++) {
01437         if (ValidFirstLine(rows_, i, all_models[m])) {
01438           row.AddStartLine(all_models[m]);
01439         }
01440       }
01441     }
01442     // Step Five:
01443     //   Since we may have updated the hypotheses about this row, we need
01444     //   to recalculate the Open models for the rest of rows[i + 1, row_end)
01445     if (row.GetLineType() != LT_UNKNOWN) {
01446       CalculateOpenModels(i + 1, row_end_);
01447     }
01448   }
01449 }
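
// A small example of how the smearing behaves (hypothetical rows): suppose
// rows 0-2 carry a strong left-aligned model M, row 3 is LT_UNKNOWN, and
// row 4 resumes with M.  CalculateOpenModels() keeps M "open" across row 3;
// if row 2 ends an idea and row 3's first word would have fit at the end of
// row 2, Step Two flags row 3 as a likely start and Step Three adds a start
// hypothesis for M (when ValidFirstLine() agrees).  Otherwise row 3 picks up
// a body hypothesis from row 2's strong models.  The caller drives this the
// same way StrongEvidenceClassify() does below:
//
//   ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
//   smearer.Smear();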
01450 
01451 // ================ Main Paragraph Detection Algorithm =======================
01452 
01453 // Find out what ParagraphModels are actually used, and discard any
01454 // that are not.
01455 void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows,
01456                          ParagraphTheory *theory) {
01457   SetOfModels used_models;
01458   for (int i = 0; i < rows.size(); i++) {
01459     rows[i].StrongHypotheses(&used_models);
01460   }
01461   theory->DiscardUnusedModels(used_models);
01462 }
01463 
01464 // DowngradeWeakestToCrowns:
01465 //   Forget any flush-{left, right} models unless we see two or more
01466 //   of them in sequence.
01467 //
01468 // In pass 3, we start to classify even flush-left paragraphs (paragraphs
01469 // where the first line and body indent are the same) as having proper Models.
01470 // This is generally dangerous, since if you start imagining that flush-left
01471 // is a typical paragraph model when it is not, it will lead you to chop normal
01472 // indented paragraphs in the middle whenever a sentence happens to start on a
01473 // new line (see "This" above).  What to do?
01474 //   What we do is to take any paragraph which is flush left and is not
01475 // preceded by another paragraph of the same model and convert it to a "Crown"
01476 // paragraph.  This is a weak pseudo-ParagraphModel which is a placeholder
01477 // for later.  It means that the paragraph is flush, but it would be desirable
01478 // to mark it as the same model as the following text if it fits.  This
01479 // downgrade path, FlushLeft -> CrownLeft -> model of following paragraph,
01480 // means that we avoid making flush-left Paragraph Models whenever we see a
01481 // top-of-the-page half-of-a-paragraph, and instead mark it as normal body text.
01482 //
01483 // Implementation:
01484 //
01485 //   Comb backwards through the row scratch registers, and turn any
01486 //   sequence of body lines of equivalent type that abuts the beginning of
01487 //   the block, or a body or start line of a different type, into a crown paragraph.
01488 void DowngradeWeakestToCrowns(int debug_level,
01489                               ParagraphTheory *theory,
01490                               GenericVector<RowScratchRegisters> *rows) {
01491   int start;
01492   for (int end = rows->size(); end > 0; end = start) {
01493     // Search back for a body line of a unique type.
01494     const ParagraphModel *model = NULL;
01495     while (end > 0 &&
01496            (model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) {
01497       end--;
01498     }
01499     if (end == 0) break;
01500     start = end - 1;
01501     while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
01502       start--;  // walk back to the first line that is not the same body type.
01503     }
01504     if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
01505         StrongModel(model) &&
01506         NearlyEqual(model->first_indent(), model->body_indent(),
01507                     model->tolerance())) {
01508         start--;
01509     }
01510     start++;
01511     // Now rows[start, end) is a sequence of unique body hypotheses of model.
01512     if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER)
01513       continue;
01514     if (!StrongModel(model)) {
01515       while (start > 0 &&
01516              CrownCompatible(rows, start - 1, start, model))
01517         start--;
01518     }
01519     if (start == 0 ||
01520         (!StrongModel(model)) ||
01521         (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
01522       // crownify rows[start, end)
01523       const ParagraphModel *crown_model = model;
01524       if (StrongModel(model)) {
01525           if (model->justification() == JUSTIFICATION_LEFT)
01526             crown_model = kCrownLeft;
01527           else
01528             crown_model = kCrownRight;
01529       }
01530       (*rows)[start].SetUnknown();
01531       (*rows)[start].AddStartLine(crown_model);
01532       for (int row = start + 1; row < end; row++) {
01533         (*rows)[row].SetUnknown();
01534         (*rows)[row].AddBodyLine(crown_model);
01535       }
01536     }
01537   }
01538   DiscardUnusedModels(*rows, theory);
01539 }
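
// Example of the downgrade (hypothetical page): a column whose top two lines
// are the tail end of a paragraph begun on the previous page gets modeled in
// pass 3 as a small flush-left paragraph of its own.  Since no valid first
// line of that model precedes the run, those rows are re-marked here with
// kCrownLeft, and ConvertHypothesizedModelRunsToParagraphs() later tries to
// adopt the model of the following text for them instead of keeping a bogus
// flush-left model.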
01540 
01541 
01542 // Clear all hypotheses about lines [start, end) and reset margins.
01543 //
01544 // The empty space between the left of a row and the block boundary (and
01545 // similarly for the right) is split into two pieces: margin and indent.
01546 // In initial processing, we assume the block is tight and the margin for
01547 // all lines is set to zero.   However, if our first pass does not yield
01548 // models for  everything,  it may be  due to an  inset paragraph like a
01549 // block-quote.   In that case, we make a second pass over that unmarked
01550 // section of the page and reset the "margin" portion of the empty space
01551 // to the common amount of space at  the ends of the lines under consid-
01552 // eration.    This would be equivalent to percentile set to 0. However,
01553 // sometimes we have a single character sticking out in the right margin
01554 // of a text block  (like the 'r' in 'for' on line 3 above),  and we can
01555 // really  just ignore it as an outlier.   To express this, we allow the
01556 // user to specify  the percentile (0..100)  of indent values  to use as
01557 // the common margin for each row in the run of rows[start, end).
01558 void RecomputeMarginsAndClearHypotheses(
01559     GenericVector<RowScratchRegisters> *rows, int start, int end,
01560     int percentile) {
01561   if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
01562     return;
01563 
01564   int lmin, lmax, rmin, rmax;
01565   lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
01566   rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
01567   for (int i = start; i < end; i++) {
01568     RowScratchRegisters &sr = (*rows)[i];
01569     sr.SetUnknown();
01570     if (sr.ri_->num_words == 0)
01571       continue;
01572     UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
01573     UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
01574   }
01575   STATS lefts(lmin, lmax + 1);
01576   STATS rights(rmin, rmax + 1);
01577   for (int i = start; i < end; i++) {
01578     RowScratchRegisters &sr = (*rows)[i];
01579     if (sr.ri_->num_words == 0)
01580       continue;
01581     lefts.add(sr.lmargin_ + sr.lindent_, 1);
01582     rights.add(sr.rmargin_ + sr.rindent_, 1);
01583   }
01584   int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
01585   int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
01586   for (int i = start; i < end; i++) {
01587     RowScratchRegisters &sr = (*rows)[i];
01588     int ldelta = ignorable_left - sr.lmargin_;
01589     sr.lmargin_ += ldelta;
01590     sr.lindent_ -= ldelta;
01591     int rdelta = ignorable_right - sr.rmargin_;
01592     sr.rmargin_ += rdelta;
01593     sr.rindent_ -= rdelta;
01594   }
01595 }
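
// A worked example with made-up numbers: four rows whose left edges
// (lmargin_ + lindent_) are 12, 14, 13 and 40 px, with percentile == 10.
// The 10th-percentile left edge works out to about 12 px; with
// ignorable_left == 12, every row gets lmargin_ == 12 and lindent_ becomes
// 0, 2, 1 and 28 px respectively.  The 40 px outlier stays visible as a
// large indent, but it no longer drags the common margin to the right, and
// the sum lmargin_ + lindent_ is unchanged for every row.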
01596 
01597 // Return the median inter-word space in rows[row_start, row_end).
01598 int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
01599                    int row_start, int row_end) {
01600   if (row_end < row_start + 1) return 1;
01601   int word_height = (rows[row_start].ri_->lword_box.height() +
01602                      rows[row_end - 1].ri_->lword_box.height()) / 2;
01603   int word_width = (rows[row_start].ri_->lword_box.width() +
01604                     rows[row_end - 1].ri_->lword_box.width())  / 2;
01605   STATS spacing_widths(0, 5 + word_width);
01606   for (int i = row_start; i < row_end; i++) {
01607     if (rows[i].ri_->num_words > 1) {
01608       spacing_widths.add(rows[i].ri_->average_interword_space, 1);
01609     }
01610   }
01611   int minimum_reasonable_space = word_height / 3;
01612   if (minimum_reasonable_space < 2)
01613     minimum_reasonable_space = 2;
01614   int median = spacing_widths.median();
01615   return (median > minimum_reasonable_space)
01616       ? median : minimum_reasonable_space;
01617 }
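
// For example (made-up values): if the multi-word rows report average
// interword spaces of 9, 10, 11 and 250 px (the last inflated by a leader
// gap), the median is about 10 px.  With a typical word height of 36 px the
// floor word_height / 3 == 12 px wins and 12 is returned; with a word height
// of 24 px the floor is 8 px and the 10 px median is returned instead.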
01618 
01619 // Return whether the first word on the after line can fit in the space at
01620 // the end of the before line (knowing which way the text is aligned and read).
01621 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
01622                            const RowScratchRegisters &after,
01623                            tesseract::ParagraphJustification justification) {
01624   if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
01625     return true;
01626 
01627   if (justification == JUSTIFICATION_UNKNOWN) {
01628     tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
01629   }
01630   int available_space;
01631   if (justification == JUSTIFICATION_CENTER) {
01632     available_space = before.lindent_ + before.rindent_;
01633   } else {
01634     available_space = before.OffsideIndent(justification);
01635   }
01636   available_space -= before.ri_->average_interword_space;
01637 
01638   if (before.ri_->ltr)
01639     return after.ri_->lword_box.width() < available_space;
01640   return after.ri_->rword_box.width() < available_space;
01641 }
01642 
01643 // Return whether the first word on the after line can fit in the space at
01644 // the end of the before line (not knowing which way the text goes) in a left
01645 // or right alignment.
01646 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
01647                            const RowScratchRegisters &after) {
01648   if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
01649     return true;
01650 
01651   int available_space = before.lindent_;
01652   if (before.rindent_ > available_space)
01653     available_space = before.rindent_;
01654   available_space -= before.ri_->average_interword_space;
01655 
01656   if (before.ri_->ltr)
01657     return after.ri_->lword_box.width() < available_space;
01658   return after.ri_->rword_box.width() < available_space;
01659 }
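
// For example (made-up values): for an LTR "before" row with lindent_ == 4,
// rindent_ == 60 and an average interword space of 10 px, available_space is
// max(4, 60) - 10 == 50 px.  If the first word of "after" is 38 px wide, it
// would have fit (38 < 50) -- evidence that "after" starts a new paragraph
// rather than merely wrapping -- whereas a 70 px word would not have fit.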
01660 
01661 bool TextSupportsBreak(const RowScratchRegisters &before,
01662                        const RowScratchRegisters &after) {
01663   if (before.ri_->ltr) {
01664     return before.ri_->rword_likely_ends_idea &&
01665            after.ri_->lword_likely_starts_idea;
01666   } else {
01667     return before.ri_->lword_likely_ends_idea &&
01668            after.ri_->rword_likely_starts_idea;
01669   }
01670 }
01671 
01672 bool LikelyParagraphStart(const RowScratchRegisters &before,
01673                           const RowScratchRegisters &after) {
01674   return before.ri_->num_words == 0 ||
01675       (FirstWordWouldHaveFit(before, after) &&
01676        TextSupportsBreak(before, after));
01677 }
01678 
01679 bool LikelyParagraphStart(const RowScratchRegisters &before,
01680                           const RowScratchRegisters &after,
01681                           tesseract::ParagraphJustification j) {
01682   return before.ri_->num_words == 0 ||
01683       (FirstWordWouldHaveFit(before, after, j) &&
01684        TextSupportsBreak(before, after));
01685 }
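
// Putting the two signals together: if an LTR row "before" ends with a word
// flagged as likely ending an idea (say it ends in a period) and "after"
// opens with a word flagged as likely starting one (say it is capitalized)
// that also would have fit at the end of "before", then
// LikelyParagraphStart(before, after) returns true.  An empty "before" row
// is treated as a paragraph break unconditionally.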
01686 
01687 // Examine rows[start, end) and try to determine what sort of ParagraphModel
01688 // would fit them as a single paragraph.
01689 // If we can't produce a unique model, justification_ = JUSTIFICATION_UNKNOWN.
01690 // If the rows given could be a consistent start to a paragraph, set
01691 // *consistent to true.
01692 ParagraphModel InternalParagraphModelByOutline(
01693     const GenericVector<RowScratchRegisters> *rows,
01694     int start, int end, int tolerance, bool *consistent) {
01695   int ltr_line_count = 0;
01696   for (int i = start; i < end; i++) {
01697     ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
01698   }
01699   bool ltr = (ltr_line_count >= (end - start) / 2);
01700 
01701   *consistent = true;
01702   if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
01703     return ParagraphModel();
01704 
01705   // Ensure the caller only passed us a region with a common rmargin and
01706   // lmargin.
01707   int lmargin = (*rows)[start].lmargin_;
01708   int rmargin = (*rows)[start].rmargin_;
01709   int lmin, lmax, rmin, rmax, cmin, cmax;
01710   lmin = lmax = (*rows)[start + 1].lindent_;
01711   rmin = rmax = (*rows)[start + 1].rindent_;
01712   cmin = cmax = 0;
01713   for (int i = start + 1; i < end; i++) {
01714     if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
01715       tprintf("Margins don't match! Software error.\n");
01716       *consistent = false;
01717       return ParagraphModel();
01718     }
01719     UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
01720     UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
01721     UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
01722   }
01723   int ldiff = lmax - lmin;
01724   int rdiff = rmax - rmin;
01725   int cdiff = cmax - cmin;
01726   if (rdiff > tolerance && ldiff > tolerance) {
01727     if (cdiff < tolerance * 2) {
01728       if (end - start < 3)
01729         return ParagraphModel();
01730       return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
01731     }
01732     *consistent = false;
01733     return ParagraphModel();
01734   }
01735   if (end - start < 3)  // Don't return a model for two line paras.
01736     return ParagraphModel();
01737 
01738   // These booleans keep us from saying something is aligned left when the body
01739   // left variance is too large.
01740   bool body_admits_left_alignment = ldiff < tolerance;
01741   bool body_admits_right_alignment = rdiff < tolerance;
01742 
01743   ParagraphModel left_model =
01744       ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
01745                      (lmin + lmax) / 2, tolerance);
01746   ParagraphModel right_model =
01747       ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
01748                      (rmin + rmax) / 2, tolerance);
01749 
01750   // These booleans keep us from having an indent on the "wrong side" for the
01751   // first line.
01752   bool text_admits_left_alignment = ltr || left_model.is_flush();
01753   bool text_admits_right_alignment = !ltr || right_model.is_flush();
01754 
01755   // At least one of the edges is less than tolerance in variance.
01756   // If the other is obviously ragged, it can't be the one aligned to.
01757   // [Note the last line is included in this raggedness.]
01758   if (tolerance < rdiff) {
01759     if (body_admits_left_alignment && text_admits_left_alignment)
01760       return left_model;
01761     *consistent = false;
01762     return ParagraphModel();
01763   }
01764   if (tolerance < ldiff) {
01765     if (body_admits_right_alignment && text_admits_right_alignment)
01766       return right_model;
01767     *consistent = false;
01768     return ParagraphModel();
01769   }
01770 
01771   // At this point, we know the body text doesn't vary much on either side.
01772 
01773   // If the first line juts out oddly in one direction or the other,
01774   // that likely indicates the side aligned to.
01775   int first_left = (*rows)[start].lindent_;
01776   int first_right = (*rows)[start].rindent_;
01777 
01778   if (ltr && body_admits_left_alignment &&
01779       (first_left < lmin || first_left > lmax))
01780     return left_model;
01781   if (!ltr && body_admits_right_alignment &&
01782       (first_right < rmin || first_right > rmax))
01783     return right_model;
01784 
01785   *consistent = false;
01786   return ParagraphModel();
01787 }
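
// A concrete pass through the logic above (illustrative numbers): with
// tolerance == 8 px, at least three rows, body lindent_ values in [0, 2]
// (ldiff == 2) and body rindent_ values in [5, 90] (rdiff == 85), the right
// edge is clearly ragged, so the left model is returned for LTR text:
// JUSTIFICATION_LEFT, first_indent == rows[start].lindent_ (say 40 px for an
// indented first line) and body_indent == (lmin + lmax) / 2 == 1 px.  Had
// both edges varied by more than the tolerance while rindent_ - lindent_
// stayed nearly constant, a centered model would have been returned instead.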
01788 
01789 // Examine rows[start, end) and try to determine what sort of ParagraphModel
01790 // would fit them as a single paragraph.  If nothing fits,
01791 // justification_ = JUSTIFICATION_UNKNOWN, and the paragraph is printed to
01792 // debug output if we're debugging.
01793 ParagraphModel ParagraphModelByOutline(
01794     int debug_level,
01795     const GenericVector<RowScratchRegisters> *rows,
01796     int start, int end, int tolerance) {
01797   bool unused_consistent;
01798   ParagraphModel retval = InternalParagraphModelByOutline(
01799       rows, start, end, tolerance, &unused_consistent);
01800   if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
01801     tprintf("Could not determine a model for this paragraph:\n");
01802     PrintRowRange(*rows, start, end);
01803   }
01804   return retval;
01805 }
01806 
01807 // Do rows[start, end) form a single instance of the given paragraph model?
01808 bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
01809                   int start, int end, const ParagraphModel *model) {
01810   if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
01811     return false;
01812   if (!ValidFirstLine(rows, start, model)) return false;
01813   for (int i = start + 1 ; i < end; i++) {
01814     if (!ValidBodyLine(rows, i, model)) return false;
01815   }
01816   return true;
01817 }
01818 
01819 // Examine rows[row_start, row_end) as an independent section of text,
01820 // and mark rows that are exceptionally clear as start-of-paragraph
01821 // and paragraph-body lines.
01822 //
01823 // We presume that any lines surrounding rows[row_start, row_end) may
01824 // have wildly different paragraph models, so we don't key any data off
01825 // of those lines.
01826 //
01827 // We only take the very strongest signals, as we don't want to get
01828 // confused and mark up centered text, poetry, or source code as
01829 // clearly part of a typical paragraph.
01830 void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows,
01831                         int row_start, int row_end) {
01832   // Record patently obvious body text.
01833   for (int i = row_start + 1; i < row_end; i++) {
01834     const RowScratchRegisters &prev = (*rows)[i - 1];
01835     RowScratchRegisters &curr = (*rows)[i];
01836     tesseract::ParagraphJustification typical_justification =
01837         prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
01838     if (!curr.ri_->rword_likely_starts_idea &&
01839         !curr.ri_->lword_likely_starts_idea &&
01840         !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
01841       curr.SetBodyLine();
01842     }
01843   }
01844 
01845   // Record patently obvious start paragraph lines.
01846   //
01847   // It's an extremely good signal of the start of a paragraph that
01848   // the first word would have fit on the end of the previous line.
01849   // However, applying just that signal would have us mark random
01850   // start lines of lineated text (poetry and source code) and some
01851   // centered headings as paragraph start lines.  Therefore, we use
01852   // a second qualification for a paragraph start: Not only should
01853   // the first word of this line have fit on the previous line,
01854   // but also, this line should go full to the right of the block,
01855   // disallowing a subsequent word from having fit on this line.
01856 
01857   // First row:
01858   {
01859     RowScratchRegisters &curr = (*rows)[row_start];
01860     RowScratchRegisters &next = (*rows)[row_start + 1];
01861     tesseract::ParagraphJustification j =
01862         curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
01863     if (curr.GetLineType() == LT_UNKNOWN &&
01864         !FirstWordWouldHaveFit(curr, next, j) &&
01865         (curr.ri_->lword_likely_starts_idea ||
01866          curr.ri_->rword_likely_starts_idea)) {
01867       curr.SetStartLine();
01868     }
01869   }
01870   // Middle rows
01871   for (int i = row_start + 1; i < row_end - 1; i++) {
01872     RowScratchRegisters &prev = (*rows)[i - 1];
01873     RowScratchRegisters &curr = (*rows)[i];
01874     RowScratchRegisters &next = (*rows)[i + 1];
01875     tesseract::ParagraphJustification j =
01876         curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
01877     if (curr.GetLineType() == LT_UNKNOWN &&
01878         !FirstWordWouldHaveFit(curr, next, j) &&
01879         LikelyParagraphStart(prev, curr, j)) {
01880       curr.SetStartLine();
01881     }
01882   }
01883   // Last row
01884   {  // the short circuit at the top means we have at least two lines.
01885     RowScratchRegisters &prev = (*rows)[row_end - 2];
01886     RowScratchRegisters &curr = (*rows)[row_end - 1];
01887     tesseract::ParagraphJustification j =
01888         curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
01889     if (curr.GetLineType() == LT_UNKNOWN &&
01890         !FirstWordWouldHaveFit(curr, curr, j) &&
01891         LikelyParagraphStart(prev, curr, j)) {
01892       curr.SetStartLine();
01893     }
01894   }
01895 }
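
// Example of the two-part start test (LTR): row 7 is marked LT_START only if
// (a) its first word would have fit in the space at the end of row 6 -- so
// the line break before it looks deliberate -- and (b) row 8's first word
// would NOT have fit at the end of row 7, so row 7 runs full width and is
// unlikely to be lineated text such as poetry or source code.  The textual
// starts/ends-an-idea clues folded into LikelyParagraphStart() must agree
// as well.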
01896 
01897 // Look for sequences of a start line followed by some body lines in
01898 // rows[row_start, row_end) and create ParagraphModels for them if
01899 // they seem coherent.
01900 void ModelStrongEvidence(int debug_level,
01901                          GenericVector<RowScratchRegisters> *rows,
01902                          int row_start, int row_end,
01903                          bool allow_flush_models,
01904                          ParagraphTheory *theory) {
01905   if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
01906     return;
01907 
01908   int start = row_start;
01909   while (start < row_end) {
01910     while (start < row_end && (*rows)[start].GetLineType() != LT_START)
01911       start++;
01912     if (start >= row_end - 1)
01913       break;
01914 
01915     int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
01916     int end = start;
01917     ParagraphModel last_model;
01918     bool next_consistent;
01919     do {
01920       ++end;
01921       // rows[start, end) was consistent.
01922       // If rows[start, end + 1) is not consistent,
01923       //   just model rows[start, end).
01924       if (end < row_end - 1) {
01925         RowScratchRegisters &next = (*rows)[end];
01926         LineType lt = next.GetLineType();
01927         next_consistent = lt == LT_BODY ||
01928             (lt == LT_UNKNOWN &&
01929              !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
01930       } else {
01931         next_consistent = false;
01932       }
01933       if (next_consistent) {
01934         ParagraphModel next_model = InternalParagraphModelByOutline(
01935             rows, start, end + 1, tolerance, &next_consistent);
01936         if (((*rows)[start].ri_->ltr &&
01937              last_model.justification() == JUSTIFICATION_LEFT &&
01938              next_model.justification() != JUSTIFICATION_LEFT) ||
01939             (!(*rows)[start].ri_->ltr &&
01940              last_model.justification() == JUSTIFICATION_RIGHT &&
01941              next_model.justification() != JUSTIFICATION_RIGHT)) {
01942           next_consistent = false;
01943         }
01944         last_model = next_model;
01945       } else {
01946         next_consistent = false;
01947       }
01948     } while (next_consistent && end < row_end);
01949     // At this point, rows[start, end) looked like it could have been a
01950     // single paragraph.  If we can make a good ParagraphModel for it,
01951     // do so and mark this sequence with that model.
01952     if (end > start + 1) {
01953       // emit a new paragraph if we have more than one line.
01954       const ParagraphModel *model = NULL;
01955       ParagraphModel new_model = ParagraphModelByOutline(
01956           debug_level, rows, start, end,
01957           Epsilon(InterwordSpace(*rows, start, end)));
01958       if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
01959         // couldn't create a good model, oh well.
01960       } else if (new_model.is_flush()) {
01961         if (end == start + 2) {
01962           // It's very likely we just got two paragraph starts in a row.
01963           end = start + 1;
01964         } else if (start == row_start) {
01965           // Mark this as a Crown.
01966           if (new_model.justification() == JUSTIFICATION_LEFT) {
01967             model = kCrownLeft;
01968           } else {
01969             model = kCrownRight;
01970           }
01971         } else if (allow_flush_models) {
01972           model = theory->AddModel(new_model);
01973         }
01974       } else {
01975         model = theory->AddModel(new_model);
01976       }
01977       if (model) {
01978         (*rows)[start].AddStartLine(model);
01979         for (int i = start + 1; i < end; i++) {
01980           (*rows)[i].AddBodyLine(model);
01981         }
01982       }
01983     }
01984     start = end;
01985   }
01986 }
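
// For example (hypothetical run): a row marked LT_START followed by three
// rows that are LT_BODY (or LT_UNKNOWN rows whose first word would not have
// fit on the line above) keeps extending end while
// InternalParagraphModelByOutline() reports a consistent outline.  If the
// resulting model is indented rather than flush, it is added to the theory
// and rows[start, end) are marked with it; a flush model at the very start
// of the range is recorded as kCrownLeft/kCrownRight instead, and flush
// models elsewhere are only kept when allow_flush_models is true.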
01987 
01988 // We examine rows[row_start, row_end) and do the following:
01989 //   (1) Clear all existing hypotheses for the rows being considered.
01990 //   (2) Mark up any rows as exceptionally likely to be paragraph starts
01991 //       or paragraph body lines as such using both geometric and textual
01992 //       clues.
01993 //   (3) Form models for any sequence of start + continuation lines.
01994 //   (4) Smear the paragraph models to cover surrounding text.
01995 void StrongEvidenceClassify(int debug_level,
01996                             GenericVector<RowScratchRegisters> *rows,
01997                             int row_start, int row_end,
01998                             ParagraphTheory *theory) {
01999   if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
02000     return;
02001 
02002   if (debug_level > 1) {
02003     tprintf("#############################################\n");
02004     tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
02005     tprintf("#############################################\n");
02006   }
02007 
02008   RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
02009   MarkStrongEvidence(rows, row_start, row_end);
02010 
02011   DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
02012 
02013   // Create paragraph models.
02014   ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
02015 
02016   DebugDump(debug_level > 2, "Unsmeared hypotheses.", *theory, *rows);
02017 
02018   // At this point, some rows are marked up as paragraphs with model numbers,
02019   // and some rows are marked up as either LT_START or LT_BODY.  Now let's
02020   // smear any good paragraph hypotheses forward and backward.
02021   ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
02022   smearer.Smear();
02023 }
02024 
02025 void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows,
02026                                int row_start, int row_end,
02027                                ParagraphTheory *theory) {
02028   for (int i = row_start + 1; i < row_end - 1; i++) {
02029     if ((*rows)[i - 1].ri_->has_leaders &&
02030         (*rows)[i].ri_->has_leaders &&
02031         (*rows)[i + 1].ri_->has_leaders) {
02032       const ParagraphModel *model = theory->AddModel(
02033           ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
02034       (*rows)[i].AddStartLine(model);
02035     }
02036   }
02037 }
02038 
02039 // Collect sequences of unique hypotheses in row registers and create proper
02040 // paragraphs for them, referencing the paragraphs in row_owners.
02041 void ConvertHypothesizedModelRunsToParagraphs(
02042     int debug_level,
02043     const GenericVector<RowScratchRegisters> &rows,
02044     GenericVector<PARA *> *row_owners,
02045     ParagraphTheory *theory) {
02046   int end = rows.size();
02047   int start;
02048   for (; end > 0; end = start) {
02049     start = end - 1;
02050     const ParagraphModel *model = NULL;
02051     // TODO(eger): Be smarter about dealing with multiple hypotheses.
02052     bool single_line_paragraph = false;
02053     SetOfModels models;
02054     rows[start].NonNullHypotheses(&models);
02055     if (models.size() > 0) {
02056       model = models[0];
02057       if (rows[start].GetLineType(model) != LT_BODY)
02058         single_line_paragraph = true;
02059     }
02060     if (model && !single_line_paragraph) {
02061       // walk back looking for more body lines and then a start line.
02062       while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
02063         // do nothing
02064       }
02065       if (start < 0 || rows[start].GetLineType(model) != LT_START) {
02066         model = NULL;
02067       }
02068     }
02069     if (model == NULL) {
02070       continue;
02071     }
02072     // rows[start, end) should be a paragraph.
02073     PARA *p = new PARA();
02074     if (model == kCrownLeft || model == kCrownRight) {
02075       p->is_very_first_or_continuation = true;
02076       // Crown paragraph.
02077       //   If we can find an existing ParagraphModel that fits, use it,
02078       //   else create a new one.
02079       for (int row = end; row < rows.size(); row++) {
02080         if ((*row_owners)[row] &&
02081             (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
02082             (start == 0 ||
02083              ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
02084           model = (*row_owners)[row]->model;
02085           break;
02086         }
02087       }
02088       if (model == kCrownLeft) {
02089         // No subsequent model fits, so cons one up.
02090         model = theory->AddModel(ParagraphModel(
02091             JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
02092             0, 0, Epsilon(rows[start].ri_->average_interword_space)));
02093       } else if (model == kCrownRight) {
02094         // No subsequent model fits, so cons one up.
02095         model = theory->AddModel(ParagraphModel(
02096             JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rindent_,
02097             0, 0, Epsilon(rows[start].ri_->average_interword_space)));
02098       }
02099     }
02100     rows[start].SetUnknown();
02101     rows[start].AddStartLine(model);
02102     for (int i = start + 1; i < end; i++) {
02103       rows[i].SetUnknown();
02104       rows[i].AddBodyLine(model);
02105     }
02106     p->model = model;
02107     p->has_drop_cap = rows[start].ri_->has_drop_cap;
02108     p->is_list_item =
02109         model->justification() == JUSTIFICATION_RIGHT
02110             ? rows[start].ri_->rword_indicates_list_item
02111             : rows[start].ri_->lword_indicates_list_item;
02112     for (int row = start; row < end; row++) {
02113       if ((*row_owners)[row] != NULL) {
02114         tprintf("Memory leak! ConvertHypothesizedModelRunsToParagraphs() called "
02115                 "more than once!\n");
02116       }
02117       (*row_owners)[row] = p;
02118     }
02119   }
02120 }
02121 
02122 struct Interval {
02123   Interval() : begin(0), end(0) {}
02124   Interval(int b, int e) : begin(b), end(e) {}
02125 
02126   int begin;
02127   int end;
02128 };
02129 
02130 // Return whether rows[row] appears to be stranded, meaning that the evidence
02131 // for this row is very weak due to context.  For instance, two lines of source
02132 // code may happen to be indented at the same tab vector as body text starts,
02133 // leading us to think they are two start-of-paragraph lines.  This is not
02134 // optimal.  However, we also don't want to mark a sequence of short dialog
02135 // as "weak," so our heuristic is:
02136 //   (1) If a line is surrounded by lines of unknown type, it's weak.
02137 //   (2) If two lines in a row are start lines for a given paragraph type, but
02138 //       after that the same paragraph type does not continue, they're weak.
02139 bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int row) {
02140   SetOfModels row_models;
02141   rows[row].StrongHypotheses(&row_models);
02142 
02143   for (int m = 0; m < row_models.size(); m++) {
02144     bool all_starts = rows[row].GetLineType(row_models[m]) == LT_START;
02145     int run_length = 1;
02146     bool continues = true;
02147     for (int i = row - 1; i >= 0 && continues; i--) {
02148       SetOfModels models;
02149       rows[i].NonNullHypotheses(&models);
02150       switch (rows[i].GetLineType(row_models[m])) {
02151         case LT_START: run_length++; break;
02152         case LT_MULTIPLE:  // explicit fall-through
02153         case LT_BODY: run_length++; all_starts = false; break;
02154         case LT_UNKNOWN:  // explicit fall-through
02155         default: continues = false;
02156       }
02157     }
02158     continues = true;
02159     for (int i = row + 1; i < rows.size() && continues; i++) {
02160       SetOfModels models;
02161       rows[i].NonNullHypotheses(&models);
02162       switch (rows[i].GetLineType(row_models[m])) {
02163         case LT_START: run_length++; break;
02164         case LT_MULTIPLE:  // explicit fall-through
02165         case LT_BODY: run_length++; all_starts = false; break;
02166         case LT_UNKNOWN:  // explicit fall-through
02167         default: continues = false;
02168       }
02169     }
02170     if (run_length > 2 || (!all_starts && run_length > 1)) return false;
02171   }
02172   return true;
02173 }
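
// For instance (heuristic 2): two consecutive source-code lines that happen
// to sit at the body-text indent each pick up a lone LT_START hypothesis for
// the body model, but nothing before or after continues that model.  The
// scan then sees run_length == 2 with all_starts still true, so neither the
// "run_length > 2" nor the "!all_starts && run_length > 1" escape fires and
// both rows are reported as stranded.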
02174 
02175 // Go through rows[row_start, row_end) and gather up sequences that need better
02176 // classification.
02177 // + Sequences of non-empty rows without hypotheses.
02178 // + Crown paragraphs not immediately followed by a strongly modeled line.
02179 // + Single line paragraphs surrounded by text that doesn't match the
02180 //   model.
02181 void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows,
02182                       GenericVector<Interval> *to_fix,
02183                       int row_start, int row_end) {
02184   to_fix->clear();
02185   for (int i = row_start; i < row_end; i++) {
02186     bool needs_fixing = false;
02187 
02188     SetOfModels models;
02189     SetOfModels models_w_crowns;
02190     rows[i].StrongHypotheses(&models);
02191     rows[i].NonNullHypotheses(&models_w_crowns);
02192     if (models.empty() && models_w_crowns.size() > 0) {
02193       // Crown paragraph.  Is it followed by a modeled line?
02194       for (int end = i + 1; end < rows.size(); end++) {
02195         SetOfModels end_models;
02196         SetOfModels strong_end_models;
02197         rows[end].NonNullHypotheses(&end_models);
02198         rows[end].StrongHypotheses(&strong_end_models);
02199         if (end_models.size() == 0) {
02200           needs_fixing = true;
02201           break;
02202         } else if (strong_end_models.size() > 0) {
02203           needs_fixing = false;
02204           break;
02205         }
02206       }
02207     } else if (models.empty() && rows[i].ri_->num_words > 0) {
02208       // No models at all.
02209       needs_fixing = true;
02210     }
02211 
02212     if (!needs_fixing && !models.empty()) {
02213       needs_fixing = RowIsStranded(rows, i);
02214     }
02215 
02216     if (needs_fixing) {
02217       if (!to_fix->empty() && to_fix->back().end == i - 1)
02218         to_fix->back().end = i;
02219       else
02220         to_fix->push_back(Interval(i, i));
02221     }
02222   }
02223   // Convert inclusive intervals to half-open intervals.
02224   for (int i = 0; i < to_fix->size(); i++) {
02225     (*to_fix)[i].end = (*to_fix)[i].end + 1;
02226   }
02227 }
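
// For example, if rows 4, 5 and 9 need fixing, the loop above first records
// the inclusive intervals [4, 5] and [9, 9] (extending the interval at the
// back whenever consecutive rows qualify) and the final pass converts them
// into the half-open intervals [4, 6) and [9, 10) that the callers expect.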
02228 
02229 // Given a set of row_owners pointing to PARAs or NULL (no paragraph known),
02230 // normalize each row_owner to point to an actual PARA, and output the
02231 // paragraphs in order onto paragraphs.
02232 void CanonicalizeDetectionResults(
02233     GenericVector<PARA *> *row_owners,
02234     PARA_LIST *paragraphs) {
02235   GenericVector<PARA *> &rows = *row_owners;
02236   paragraphs->clear();
02237   PARA_IT out(paragraphs);
02238   PARA *formerly_null = NULL;
02239   for (int i = 0; i < rows.size(); i++) {
02240     if (rows[i] == NULL) {
02241       if (i == 0 || rows[i - 1] != formerly_null) {
02242         rows[i] = formerly_null = new PARA();
02243       } else {
02244         rows[i] = formerly_null;
02245         continue;
02246       }
02247     } else if (i > 0 && rows[i - 1] == rows[i]) {
02248       continue;
02249     }
02250     out.add_after_then_move(rows[i]);
02251   }
02252 }
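
// For example, given row_owners of {P1, NULL, NULL, P2}, the two NULL rows
// are merged into one freshly allocated PARA (consecutive NULL rows share a
// paragraph; separated runs of NULLs get distinct ones), and the output list
// becomes P1, the new PARA, P2 -- each paragraph emitted exactly once, in
// reading order.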
02253 
02254 // Main entry point for Paragraph Detection Algorithm.
02255 //
02256 // Given a set of equally spaced textlines (described by row_infos),
02257 // split them into paragraphs.
02258 //
02259 // Output:
02260 //   row_owners - one pointer for each row, to the paragraph it belongs to.
02261 //   paragraphs - this is the actual list of PARA objects.
02262 //   models - the list of paragraph models referenced by the PARA objects.
02263 //            caller is responsible for deleting the models.
02264 void DetectParagraphs(int debug_level,
02265                       GenericVector<RowInfo> *row_infos,
02266                       GenericVector<PARA *> *row_owners,
02267                       PARA_LIST *paragraphs,
02268                       GenericVector<ParagraphModel *> *models) {
02269   GenericVector<RowScratchRegisters> rows;
02270   ParagraphTheory theory(models);
02271 
02272   // Initialize row_owners to be a bunch of NULL pointers.
02273   row_owners->init_to_size(row_infos->size(), NULL);
02274 
02275   // Set up row scratch registers for the main algorithm.
02276   rows.init_to_size(row_infos->size(), RowScratchRegisters());
02277   for (int i = 0; i < row_infos->size(); i++) {
02278     rows[i].Init((*row_infos)[i]);
02279   }
02280 
02281   // Pass 1:
02282   //   Detect sequences of lines that all contain leader dots (.....)
02283   //   These are likely Tables of Contents.  If there are three text lines in
02284   //   a row with leader dots, it's pretty safe to say the middle one should
02285   //   be a paragraph of its own.
02286   SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
02287 
02288   DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
02289 
02290   GenericVector<Interval> leftovers;
02291   LeftoverSegments(rows, &leftovers, 0, rows.size());
02292   for (int i = 0; i < leftovers.size(); i++) {
02293     // Pass 2a:
02294     //   Find any strongly evidenced start-of-paragraph lines.  If they're
02295     //   followed by two lines that look like body lines, make a paragraph
02296     //   model for that and see if that model applies throughout the text
02297     //   (that is, "smear" it).
02298     StrongEvidenceClassify(debug_level, &rows,
02299                            leftovers[i].begin, leftovers[i].end, &theory);
02300 
02301     // Pass 2b:
02302     //   If we had any luck in pass 2a, we got part of the page and didn't
02303     //   know how to classify a few runs of rows. Take the segments that
02304     //   didn't find a model and reprocess them individually.
02305     GenericVector<Interval> leftovers2;
02306     LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
02307     bool pass2a_was_useful = leftovers2.size() > 1 ||
02308         (leftovers2.size() == 1 &&
02309          (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
02310     if (pass2a_was_useful) {
02311       for (int j = 0; j < leftovers2.size(); j++) {
02312         StrongEvidenceClassify(debug_level, &rows,
02313                                leftovers2[j].begin, leftovers2[j].end,
02314                                &theory);
02315       }
02316     }
02317   }
02318 
02319   DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
02320 
02321   // Pass 3:
02322   //   These are the dregs for which we didn't have enough strong textual
02323   //   and geometric clues to form matching models for.  Let's see if
02324   //   the geometric clues are simple enough that we could just use those.
02325   LeftoverSegments(rows, &leftovers, 0, rows.size());
02326   for (int i = 0; i < leftovers.size(); i++) {
02327     GeometricClassify(debug_level, &rows,
02328                       leftovers[i].begin, leftovers[i].end, &theory);
02329   }
02330 
02331   // Undo any flush models for which there's little evidence.
02332   DowngradeWeakestToCrowns(debug_level, &theory, &rows);
02333 
02334   DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
02335 
02336   // Pass 4:
02337   //   Take everything that's still not marked up well and clear all markings.
02338   LeftoverSegments(rows, &leftovers, 0, rows.size());
02339   for (int i = 0; i < leftovers.size(); i++) {
02340     for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
02341       rows[j].SetUnknown();
02342     }
02343   }
02344 
02345   DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
02346 
02347   // Convert all of the unique hypothesis runs to PARAs.
02348   ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
02349                                            &theory);
02350 
02351   DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
02352 
02353   // Finally, clean up any dangling NULL row paragraph parents.
02354   CanonicalizeDetectionResults(row_owners, paragraphs);
02355 }
02356 
02357 // ============ Code interfacing with the rest of Tesseract ==================
02358 
02359 void InitializeTextAndBoxesPreRecognition(const MutableIterator &it,
02360                                           RowInfo *info) {
02361   // Set up text, lword_text, and rword_text (mostly for debug printing).
02362   STRING fake_text;
02363   PageIterator pit(static_cast<const PageIterator&>(it));
02364   bool first_word = true;
02365   if (!pit.Empty(RIL_WORD)) {
02366     do {
02367       fake_text += "x";
02368       if (first_word) info->lword_text += "x";
02369       info->rword_text += "x";
02370       if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
02371           !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
02372         fake_text += " ";
02373         info->rword_text = "";
02374         first_word = false;
02375       }
02376     } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) &&
02377              pit.Next(RIL_SYMBOL));
02378   }
02379   if (fake_text.size() == 0) return;
02380 
02381   int lspaces = info->pix_ldistance / info->average_interword_space;
02382   for (int i = 0; i < lspaces; i++) {
02383     info->text += ' ';
02384   }
02385   info->text += fake_text;
02386 
02387   // Set up lword_box, rword_box, and num_words.
02388   PAGE_RES_IT page_res_it = *it.PageResIt();
02389   WERD_RES *word_res = page_res_it.restart_row();
02390   ROW_RES *this_row = page_res_it.row();
02391 
02392   WERD_RES *lword = NULL;
02393   WERD_RES *rword = NULL;
02394   info->num_words = 0;
02395   do {
02396     if (word_res) {
02397       if (!lword) lword = word_res;
02398       if (rword != word_res) info->num_words++;
02399       rword = word_res;
02400     }
02401     word_res = page_res_it.forward();
02402   } while (page_res_it.row() == this_row);
02403   info->lword_box = lword->word->bounding_box();
02404   info->rword_box = rword->word->bounding_box();
02405 }
02406 
02407 
02408 // Given a Tesseract Iterator pointing to a text line, fill in the paragraph
02409 // detector RowInfo with all relevant information from the row.
02410 void InitializeRowInfo(bool after_recognition,
02411                        const MutableIterator &it,
02412                        RowInfo *info) {
02413   if (it.PageResIt()->row() != NULL) {
02414     ROW *row = it.PageResIt()->row()->row;
02415     info->pix_ldistance = row->lmargin();
02416     info->pix_rdistance = row->rmargin();
02417     info->average_interword_space =
02418         row->space() > 0 ? row->space() : MAX(row->x_height(), 1);
02419     info->pix_xheight = row->x_height();
02420     info->has_leaders = false;
02421     info->has_drop_cap = row->has_drop_cap();
02422     info->ltr = true;  // set below depending on word scripts
02423   } else {
02424     info->pix_ldistance = info->pix_rdistance = 0;
02425     info->average_interword_space = 1;
02426     info->pix_xheight = 1.0;
02427     info->has_leaders = false;
02428     info->has_drop_cap = false;
02429     info->ltr = true;
02430   }
02431 
02432   info->num_words = 0;
02433   info->lword_indicates_list_item = false;
02434   info->lword_likely_starts_idea = false;
02435   info->lword_likely_ends_idea = false;
02436   info->rword_indicates_list_item = false;
02437   info->rword_likely_starts_idea = false;
02438   info->rword_likely_ends_idea = false;
02439   info->has_leaders = false;
02440   info->ltr = 1;
02441 
02442   if (!after_recognition) {
02443     InitializeTextAndBoxesPreRecognition(it, info);
02444     return;
02445   }
02446   info->text = "";
02447   char *text = it.GetUTF8Text(RIL_TEXTLINE);
02448   int trailing_ws_idx = strlen(text);  // strip trailing space
02449   while (trailing_ws_idx > 0 &&
02450          // isspace() only takes ASCII
02451          ((text[trailing_ws_idx - 1] & 0x80) == 0) &&
02452          isspace(text[trailing_ws_idx - 1]))
02453     trailing_ws_idx--;
02454   if (trailing_ws_idx > 0) {
02455     int lspaces = info->pix_ldistance / info->average_interword_space;
02456     for (int i = 0; i < lspaces; i++)
02457       info->text += ' ';
02458     for (int i = 0; i < trailing_ws_idx; i++)
02459       info->text += text[i];
02460   }
02461   delete []text;
02462 
02463   if (info->text.size() == 0) {
02464     return;
02465   }
02466 
02467   PAGE_RES_IT page_res_it = *it.PageResIt();
02468   GenericVector<WERD_RES *> werds;
02469   WERD_RES *word_res = page_res_it.restart_row();
02470   ROW_RES *this_row = page_res_it.row();
02471   int num_leaders = 0;
02472   int ltr = 0;
02473   int rtl = 0;
02474   do {
02475     if (word_res && word_res->best_choice->unichar_string().length() > 0) {
02476       werds.push_back(word_res);
02477       ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
02478       rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
02479       if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
02480     }
02481     word_res = page_res_it.forward();
02482   } while (page_res_it.row() == this_row);
02483   info->ltr = ltr >= rtl;
02484   info->has_leaders = num_leaders > 3;
02485   info->num_words = werds.size();
02486   if (werds.size() > 0) {
02487     WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
02488     info->lword_text = lword->best_choice->unichar_string().string();
02489     info->rword_text = rword->best_choice->unichar_string().string();
02490     info->lword_box = lword->word->bounding_box();
02491     info->rword_box = rword->word->bounding_box();
02492     LeftWordAttributes(lword->uch_set, lword->best_choice,
02493                        info->lword_text,
02494                        &info->lword_indicates_list_item,
02495                        &info->lword_likely_starts_idea,
02496                        &info->lword_likely_ends_idea);
02497     RightWordAttributes(rword->uch_set, rword->best_choice,
02498                         info->rword_text,
02499                         &info->rword_indicates_list_item,
02500                         &info->rword_likely_starts_idea,
02501                         &info->rword_likely_ends_idea);
02502   }
02503 }
02504 
02505 // This is called after rows have been identified and words are recognized.
02506 // Much of this could be implemented before word recognition, but text helps
02507 // to identify bulleted lists and gives good signals for sentence boundaries.
02508 void DetectParagraphs(int debug_level,
02509                       bool after_text_recognition,
02510                       const MutableIterator *block_start,
02511                       GenericVector<ParagraphModel *> *models) {
02512   // Clear out any preconceived notions.
02513   if (block_start->Empty(RIL_TEXTLINE)) {
02514     return;
02515   }
02516   BLOCK *block = block_start->PageResIt()->block()->block;
02517   block->para_list()->clear();
02518   bool is_image_block = block->poly_block() && !block->poly_block()->IsText();
02519 
02520   // Convert the Tesseract structures to RowInfos
02521   // for the paragraph detection algorithm.
02522   MutableIterator row(*block_start);
02523   if (row.Empty(RIL_TEXTLINE))
02524     return;  // end of input already.
02525 
02526   GenericVector<RowInfo> row_infos;
02527   do {
02528     if (!row.PageResIt()->row())
02529       continue;  // empty row.
02530     row.PageResIt()->row()->row->set_para(NULL);
02531     row_infos.push_back(RowInfo());
02532     RowInfo &ri = row_infos.back();
02533     InitializeRowInfo(after_text_recognition, row, &ri);
02534   } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
02535            row.Next(RIL_TEXTLINE));
02536 
02537   // If we're called before text recognition, we might not have
02538   // tight block bounding boxes, so trim by the minimum on each side.
02539   if (row_infos.size() > 0) {
02540     int min_lmargin = row_infos[0].pix_ldistance;
02541     int min_rmargin = row_infos[0].pix_rdistance;
02542     for (int i = 1; i < row_infos.size(); i++) {
02543       if (row_infos[i].pix_ldistance < min_lmargin)
02544         min_lmargin = row_infos[i].pix_ldistance;
02545       if (row_infos[i].pix_rdistance < min_rmargin)
02546         min_rmargin = row_infos[i].pix_rdistance;
02547     }
02548     if (min_lmargin > 0 || min_rmargin > 0) {
02549       for (int i = 0; i < row_infos.size(); i++) {
02550         row_infos[i].pix_ldistance -= min_lmargin;
02551         row_infos[i].pix_rdistance -= min_rmargin;
02552       }
02553     }
02554   }
02555 
02556   // Run the paragraph detection algorithm.
02557   GenericVector<PARA *> row_owners;
02558   GenericVector<PARA *> the_paragraphs;
02559   if (!is_image_block) {
02560     DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
02561                      models);
02562   } else {
02563     row_owners.init_to_size(row_infos.size(), NULL);
02564     CanonicalizeDetectionResults(&row_owners, block->para_list());
02565   }
02566 
02567   // Now stitch in the row_owners into the rows.
02568   row = *block_start;
02569   for (int i = 0; i < row_owners.size(); i++) {
02570     while (!row.PageResIt()->row())
02571       row.Next(RIL_TEXTLINE);
02572     row.PageResIt()->row()->row->set_para(row_owners[i]);
02573     row.Next(RIL_TEXTLINE);
02574   }
02575 }
02576 
02577 }  // namespace