tesseract
3.03
|
00001 /********************************************************************** 00002 * File: paragraphs.cpp 00003 * Description: Paragraph detection for tesseract. 00004 * Author: David Eger 00005 * Created: 25 February 2011 00006 * 00007 * (C) Copyright 2011, Google Inc. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 #ifdef _MSC_VER 00020 #define __func__ __FUNCTION__ 00021 #endif 00022 00023 #include <ctype.h> 00024 00025 #include "genericvector.h" 00026 #include "helpers.h" 00027 #include "mutableiterator.h" 00028 #include "ocrpara.h" 00029 #include "pageres.h" 00030 #include "paragraphs.h" 00031 #include "paragraphs_internal.h" 00032 #include "publictypes.h" 00033 #include "ratngs.h" 00034 #include "rect.h" 00035 #include "statistc.h" 00036 #include "strngs.h" 00037 #include "tprintf.h" 00038 #include "unicharset.h" 00039 #include "unicodes.h" 00040 00041 namespace tesseract { 00042 00043 // Special "weak" ParagraphModels. 00044 const ParagraphModel *kCrownLeft 00045 = reinterpret_cast<ParagraphModel *>(0xDEAD111F); 00046 const ParagraphModel *kCrownRight 00047 = reinterpret_cast<ParagraphModel *>(0xDEAD888F); 00048 00049 // Given the width of a typical space between words, what is the threshold 00050 // by which we think left and right alignments for paragraphs 00051 // can vary and still be aligned. 
00052 static int Epsilon(int space_pix) { 00053 return space_pix * 4 / 5; 00054 } 00055 00056 static bool AcceptableRowArgs( 00057 int debug_level, int min_num_rows, const char *function_name, 00058 const GenericVector<RowScratchRegisters> *rows, 00059 int row_start, int row_end) { 00060 if (row_start < 0 || row_end > rows->size() || row_start > row_end) { 00061 tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n", 00062 row_start, row_end, rows->size()); 00063 return false; 00064 } 00065 if (row_end - row_start < min_num_rows) { 00066 if (debug_level > 1) { 00067 tprintf("# Too few rows[%d, %d) for %s.\n", 00068 row_start, row_end, function_name); 00069 } 00070 return false; 00071 } 00072 return true; 00073 } 00074 00075 // =============================== Debug Code ================================ 00076 00077 // Convert an integer to a decimal string. 00078 static STRING StrOf(int num) { 00079 char buffer[30]; 00080 snprintf(buffer, sizeof(buffer), "%d", num); 00081 return STRING(buffer); 00082 } 00083 00084 // Given a row-major matrix of unicode text and a column separator, print 00085 // a formatted table. For ASCII, we get good column alignment. 
00086 static void PrintTable(const GenericVector<GenericVector<STRING> > &rows, 00087 const STRING &colsep) { 00088 GenericVector<int> max_col_widths; 00089 for (int r = 0; r < rows.size(); r++) { 00090 int num_columns = rows[r].size(); 00091 for (int c = 0; c < num_columns; c++) { 00092 int num_unicodes = 0; 00093 for (int i = 0; i < rows[r][c].size(); i++) { 00094 if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++; 00095 } 00096 if (c >= max_col_widths.size()) { 00097 max_col_widths.push_back(num_unicodes); 00098 } else { 00099 if (num_unicodes > max_col_widths[c]) 00100 max_col_widths[c] = num_unicodes; 00101 } 00102 } 00103 } 00104 00105 GenericVector<STRING> col_width_patterns; 00106 for (int c = 0; c < max_col_widths.size(); c++) { 00107 col_width_patterns.push_back( 00108 STRING("%-") + StrOf(max_col_widths[c]) + "s"); 00109 } 00110 00111 for (int r = 0; r < rows.size(); r++) { 00112 for (int c = 0; c < rows[r].size(); c++) { 00113 if (c > 0) 00114 tprintf("%s", colsep.string()); 00115 tprintf(col_width_patterns[c].string(), rows[r][c].string()); 00116 } 00117 tprintf("\n"); 00118 } 00119 } 00120 00121 STRING RtlEmbed(const STRING &word, bool rtlify) { 00122 if (rtlify) 00123 return STRING(kRLE) + word + STRING(kPDF); 00124 return word; 00125 } 00126 00127 // Print the current thoughts of the paragraph detector. 
00128 static void PrintDetectorState(const ParagraphTheory &theory, 00129 const GenericVector<RowScratchRegisters> &rows) { 00130 GenericVector<GenericVector<STRING> > output; 00131 output.push_back(GenericVector<STRING>()); 00132 output.back().push_back("#row"); 00133 output.back().push_back("space"); 00134 output.back().push_back(".."); 00135 output.back().push_back("lword[widthSEL]"); 00136 output.back().push_back("rword[widthSEL]"); 00137 RowScratchRegisters::AppendDebugHeaderFields(&output.back()); 00138 output.back().push_back("text"); 00139 00140 for (int i = 0; i < rows.size(); i++) { 00141 output.push_back(GenericVector<STRING>()); 00142 GenericVector<STRING> &row = output.back(); 00143 const RowInfo& ri = *rows[i].ri_; 00144 row.push_back(StrOf(i)); 00145 row.push_back(StrOf(ri.average_interword_space)); 00146 row.push_back(ri.has_leaders ? ".." : " "); 00147 row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + 00148 "[" + StrOf(ri.lword_box.width()) + 00149 (ri.lword_likely_starts_idea ? "S" : "s") + 00150 (ri.lword_likely_ends_idea ? "E" : "e") + 00151 (ri.lword_indicates_list_item ? "L" : "l") + 00152 "]"); 00153 row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + 00154 "[" + StrOf(ri.rword_box.width()) + 00155 (ri.rword_likely_starts_idea ? "S" : "s") + 00156 (ri.rword_likely_ends_idea ? "E" : "e") + 00157 (ri.rword_indicates_list_item ? 
"L" : "l") + 00158 "]"); 00159 rows[i].AppendDebugInfo(theory, &row); 00160 row.push_back(RtlEmbed(ri.text, !ri.ltr)); 00161 } 00162 PrintTable(output, " "); 00163 00164 tprintf("Active Paragraph Models:\n"); 00165 for (int m = 0; m < theory.models().size(); m++) { 00166 tprintf(" %d: %s\n", m + 1, theory.models()[m]->ToString().string()); 00167 } 00168 } 00169 00170 static void DebugDump( 00171 bool should_print, 00172 const STRING &phase, 00173 const ParagraphTheory &theory, 00174 const GenericVector<RowScratchRegisters> &rows) { 00175 if (!should_print) 00176 return; 00177 tprintf("# %s\n", phase.string()); 00178 PrintDetectorState(theory, rows); 00179 } 00180 00181 // Print out the text for rows[row_start, row_end) 00182 static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows, 00183 int row_start, int row_end) { 00184 tprintf("======================================\n"); 00185 for (int row = row_start; row < row_end; row++) { 00186 tprintf("%s\n", rows[row].ri_->text.string()); 00187 } 00188 tprintf("======================================\n"); 00189 } 00190 00191 // ============= Brain Dead Language Model (ASCII Version) =================== 00192 00193 bool IsLatinLetter(int ch) { 00194 return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); 00195 } 00196 00197 bool IsDigitLike(int ch) { 00198 return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I'; 00199 } 00200 00201 bool IsOpeningPunct(int ch) { 00202 return strchr("'\"({[", ch) != NULL; 00203 } 00204 00205 bool IsTerminalPunct(int ch) { 00206 return strchr(":'\".?!]})", ch) != NULL; 00207 } 00208 00209 // Return a pointer after consuming as much text as qualifies as roman numeral. 
00210 const char *SkipChars(const char *str, const char *toskip) { 00211 while (*str != '\0' && strchr(toskip, *str)) { str++; } 00212 return str; 00213 } 00214 00215 const char *SkipChars(const char *str, bool (*skip)(int)) { 00216 while (*str != '\0' && skip(*str)) { str++; } 00217 return str; 00218 } 00219 00220 const char *SkipOne(const char *str, const char *toskip) { 00221 if (*str != '\0' && strchr(toskip, *str)) return str + 1; 00222 return str; 00223 } 00224 00225 // Return whether it is very likely that this is a numeral marker that could 00226 // start a list item. Some examples include: 00227 // A I iii. VI (2) 3.5. [C-4] 00228 bool LikelyListNumeral(const STRING &word) { 00229 const char *kRomans = "ivxlmdIVXLMD"; 00230 const char *kDigits = "012345789"; 00231 const char *kOpen = "[{("; 00232 const char *kSep = ":;-.,"; 00233 const char *kClose = "]})"; 00234 00235 int num_segments = 0; 00236 const char *pos = word.string(); 00237 while (*pos != '\0' && num_segments < 3) { 00238 // skip up to two open parens. 00239 const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen); 00240 const char *numeral_end = SkipChars(numeral_start, kRomans); 00241 if (numeral_end != numeral_start) { 00242 // Got Roman Numeral. Great. 00243 } else { 00244 numeral_end = SkipChars(numeral_start, kDigits); 00245 if (numeral_end == numeral_start) { 00246 // If there's a single latin letter, we can use that. 00247 numeral_end = SkipChars(numeral_start, IsLatinLetter); 00248 if (numeral_end - numeral_start != 1) 00249 break; 00250 } 00251 } 00252 // We got some sort of numeral. 00253 num_segments++; 00254 // Skip any trailing parens or punctuation. 
00255 pos = SkipChars(SkipChars(numeral_end, kClose), kSep); 00256 if (pos == numeral_end) 00257 break; 00258 } 00259 return *pos == '\0'; 00260 } 00261 00262 bool LikelyListMark(const STRING &word) { 00263 const char *kListMarks = "0Oo*.,+."; 00264 return word.size() == 1 && strchr(kListMarks, word[0]) != NULL; 00265 } 00266 00267 bool AsciiLikelyListItem(const STRING &word) { 00268 return LikelyListMark(word) || LikelyListNumeral(word); 00269 } 00270 00271 // ========== Brain Dead Language Model (Tesseract Version) ================ 00272 00273 // Return the first Unicode Codepoint from werd[pos]. 00274 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) { 00275 if (!u || !werd || pos > werd->length()) 00276 return 0; 00277 return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni(); 00278 } 00279 00280 // A useful helper class for finding the first j >= i so that word[j] 00281 // does not have given character type. 00282 class UnicodeSpanSkipper { 00283 public: 00284 UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word) 00285 : u_(unicharset), word_(word) { wordlen_ = word->length(); } 00286 00287 // Given an input position, return the first position >= pos not punc. 00288 int SkipPunc(int pos); 00289 // Given an input position, return the first position >= pos not digit. 00290 int SkipDigits(int pos); 00291 // Given an input position, return the first position >= pos not roman. 00292 int SkipRomans(int pos); 00293 // Given an input position, return the first position >= pos not alpha. 
00294 int SkipAlpha(int pos); 00295 00296 private: 00297 const UNICHARSET *u_; 00298 const WERD_CHOICE *word_; 00299 int wordlen_; 00300 }; 00301 00302 int UnicodeSpanSkipper::SkipPunc(int pos) { 00303 while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) pos++; 00304 return pos; 00305 } 00306 00307 int UnicodeSpanSkipper::SkipDigits(int pos) { 00308 while (pos < wordlen_ && (u_->get_isdigit(word_->unichar_id(pos)) || 00309 IsDigitLike(UnicodeFor(u_, word_, pos)))) pos++; 00310 return pos; 00311 } 00312 00313 int UnicodeSpanSkipper::SkipRomans(int pos) { 00314 const char *kRomans = "ivxlmdIVXLMD"; 00315 while (pos < wordlen_) { 00316 int ch = UnicodeFor(u_, word_, pos); 00317 if (ch >= 0xF0 || strchr(kRomans, ch) == 0) break; 00318 pos++; 00319 } 00320 return pos; 00321 } 00322 00323 int UnicodeSpanSkipper::SkipAlpha(int pos) { 00324 while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) pos++; 00325 return pos; 00326 } 00327 00328 bool LikelyListMarkUnicode(int ch) { 00329 if (ch < 0x80) { 00330 STRING single_ch; 00331 single_ch += ch; 00332 return LikelyListMark(single_ch); 00333 } 00334 switch (ch) { 00335 // TODO(eger) expand this list of unicodes as needed. 00336 case 0x00B0: // degree sign 00337 case 0x2022: // bullet 00338 case 0x25E6: // white bullet 00339 case 0x00B7: // middle dot 00340 case 0x25A1: // white square 00341 case 0x25A0: // black square 00342 case 0x25AA: // black small square 00343 case 0x2B1D: // black very small square 00344 case 0x25BA: // black right-pointing pointer 00345 case 0x25CF: // black circle 00346 case 0x25CB: // white circle 00347 return true; 00348 default: 00349 break; // fall through 00350 } 00351 return false; 00352 } 00353 00354 // Return whether it is very likely that this is a numeral marker that could 00355 // start a list item. Some examples include: 00356 // A I iii. VI (2) 3.5. 
[C-4] 00357 bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) { 00358 if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) 00359 return true; 00360 00361 UnicodeSpanSkipper m(u, werd); 00362 int num_segments = 0; 00363 int pos = 0; 00364 while (pos < werd->length() && num_segments < 3) { 00365 int numeral_start = m.SkipPunc(pos); 00366 if (numeral_start > pos + 1) break; 00367 int numeral_end = m.SkipRomans(numeral_start); 00368 if (numeral_end == numeral_start) { 00369 numeral_end = m.SkipDigits(numeral_start); 00370 if (numeral_end == numeral_start) { 00371 // If there's a single latin letter, we can use that. 00372 numeral_end = m.SkipAlpha(numeral_start); 00373 if (numeral_end - numeral_start != 1) 00374 break; 00375 } 00376 } 00377 // We got some sort of numeral. 00378 num_segments++; 00379 // Skip any trailing punctuation. 00380 pos = m.SkipPunc(numeral_end); 00381 if (pos == numeral_end) 00382 break; 00383 } 00384 return pos == werd->length(); 00385 } 00386 00387 // ========= Brain Dead Language Model (combined entry points) ================ 00388 00389 // Given the leftmost word of a line either as a Tesseract unicharset + werd 00390 // or a utf8 string, set the following attributes for it: 00391 // is_list - this word might be a list number or bullet. 00392 // starts_idea - this word is likely to start a sentence. 00393 // ends_idea - this word is likely to end a sentence. 00394 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, 00395 const STRING &utf8, 00396 bool *is_list, bool *starts_idea, bool *ends_idea) { 00397 *is_list = false; 00398 *starts_idea = false; 00399 *ends_idea = false; 00400 if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) { // Empty 00401 *ends_idea = true; 00402 return; 00403 } 00404 00405 if (unicharset && werd) { // We have a proper werd and unicharset so use it. 
00406 if (UniLikelyListItem(unicharset, werd)) { 00407 *is_list = true; 00408 *starts_idea = true; 00409 *ends_idea = true; 00410 } 00411 if (unicharset->get_isupper(werd->unichar_id(0))) { 00412 *starts_idea = true; 00413 } 00414 if (unicharset->get_ispunctuation(werd->unichar_id(0))) { 00415 *starts_idea = true; 00416 *ends_idea = true; 00417 } 00418 } else { // Assume utf8 is mostly ASCII 00419 if (AsciiLikelyListItem(utf8)) { 00420 *is_list = true; 00421 *starts_idea = true; 00422 } 00423 int start_letter = utf8[0]; 00424 if (IsOpeningPunct(start_letter)) { 00425 *starts_idea = true; 00426 } 00427 if (IsTerminalPunct(start_letter)) { 00428 *ends_idea = true; 00429 } 00430 if (start_letter >= 'A' && start_letter <= 'Z') { 00431 *starts_idea = true; 00432 } 00433 } 00434 } 00435 00436 // Given the rightmost word of a line either as a Tesseract unicharset + werd 00437 // or a utf8 string, set the following attributes for it: 00438 // is_list - this word might be a list number or bullet. 00439 // starts_idea - this word is likely to start a sentence. 00440 // ends_idea - this word is likely to end a sentence. 00441 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, 00442 const STRING &utf8, 00443 bool *is_list, bool *starts_idea, bool *ends_idea) { 00444 *is_list = false; 00445 *starts_idea = false; 00446 *ends_idea = false; 00447 if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) { // Empty 00448 *ends_idea = true; 00449 return; 00450 } 00451 00452 if (unicharset && werd) { // We have a proper werd and unicharset so use it. 
00453 if (UniLikelyListItem(unicharset, werd)) { 00454 *is_list = true; 00455 *starts_idea = true; 00456 } 00457 UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1); 00458 if (unicharset->get_ispunctuation(last_letter)) { 00459 *ends_idea = true; 00460 } 00461 } else { // Assume utf8 is mostly ASCII 00462 if (AsciiLikelyListItem(utf8)) { 00463 *is_list = true; 00464 *starts_idea = true; 00465 } 00466 int last_letter = utf8[utf8.size() - 1]; 00467 if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) { 00468 *ends_idea = true; 00469 } 00470 } 00471 } 00472 00473 // =============== Implementation of RowScratchRegisters ===================== 00474 /* static */ 00475 void RowScratchRegisters::AppendDebugHeaderFields( 00476 GenericVector<STRING> *header) { 00477 header->push_back("[lmarg,lind;rind,rmarg]"); 00478 header->push_back("model"); 00479 } 00480 00481 void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory, 00482 GenericVector<STRING> *dbg) const { 00483 char s[30]; 00484 snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]", 00485 lmargin_, lindent_, rindent_, rmargin_); 00486 dbg->push_back(s); 00487 STRING model_string; 00488 model_string += static_cast<char>(GetLineType()); 00489 model_string += ":"; 00490 00491 int model_numbers = 0; 00492 for (int h = 0; h < hypotheses_.size(); h++) { 00493 if (hypotheses_[h].model == NULL) 00494 continue; 00495 if (model_numbers > 0) 00496 model_string += ","; 00497 if (StrongModel(hypotheses_[h].model)) { 00498 model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model)); 00499 } else if (hypotheses_[h].model == kCrownLeft) { 00500 model_string += "CrL"; 00501 } else if (hypotheses_[h].model == kCrownRight) { 00502 model_string += "CrR"; 00503 } 00504 model_numbers++; 00505 } 00506 if (model_numbers == 0) 00507 model_string += "0"; 00508 00509 dbg->push_back(model_string); 00510 } 00511 00512 void RowScratchRegisters::Init(const RowInfo &row) { 00513 ri_ = &row; 00514 lmargin_ = 0; 00515 
lindent_ = row.pix_ldistance; 00516 rmargin_ = 0; 00517 rindent_ = row.pix_rdistance; 00518 } 00519 00520 LineType RowScratchRegisters::GetLineType() const { 00521 if (hypotheses_.empty()) 00522 return LT_UNKNOWN; 00523 bool has_start = false; 00524 bool has_body = false; 00525 for (int i = 0; i < hypotheses_.size(); i++) { 00526 switch (hypotheses_[i].ty) { 00527 case LT_START: has_start = true; break; 00528 case LT_BODY: has_body = true; break; 00529 default: 00530 tprintf("Encountered bad value in hypothesis list: %c\n", 00531 hypotheses_[i].ty); 00532 break; 00533 } 00534 } 00535 if (has_start && has_body) 00536 return LT_MULTIPLE; 00537 return has_start ? LT_START : LT_BODY; 00538 } 00539 00540 LineType RowScratchRegisters::GetLineType(const ParagraphModel *model) const { 00541 if (hypotheses_.empty()) 00542 return LT_UNKNOWN; 00543 bool has_start = false; 00544 bool has_body = false; 00545 for (int i = 0; i < hypotheses_.size(); i++) { 00546 if (hypotheses_[i].model != model) 00547 continue; 00548 switch (hypotheses_[i].ty) { 00549 case LT_START: has_start = true; break; 00550 case LT_BODY: has_body = true; break; 00551 default: 00552 tprintf("Encountered bad value in hypothesis list: %c\n", 00553 hypotheses_[i].ty); 00554 break; 00555 } 00556 } 00557 if (has_start && has_body) 00558 return LT_MULTIPLE; 00559 return has_start ? 
LT_START : LT_BODY; 00560 } 00561 00562 void RowScratchRegisters::SetStartLine() { 00563 LineType current_lt = GetLineType(); 00564 if (current_lt != LT_UNKNOWN && current_lt != LT_START) { 00565 tprintf("Trying to set a line to be START when it's already BODY.\n"); 00566 } 00567 if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) { 00568 hypotheses_.push_back_new(LineHypothesis(LT_START, NULL)); 00569 } 00570 } 00571 00572 void RowScratchRegisters::SetBodyLine() { 00573 LineType current_lt = GetLineType(); 00574 if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) { 00575 tprintf("Trying to set a line to be BODY when it's already START.\n"); 00576 } 00577 if (current_lt == LT_UNKNOWN || current_lt == LT_START) { 00578 hypotheses_.push_back_new(LineHypothesis(LT_BODY, NULL)); 00579 } 00580 } 00581 00582 void RowScratchRegisters::AddStartLine(const ParagraphModel *model) { 00583 hypotheses_.push_back_new(LineHypothesis(LT_START, model)); 00584 int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, NULL)); 00585 if (old_idx >= 0) 00586 hypotheses_.remove(old_idx); 00587 } 00588 00589 void RowScratchRegisters::AddBodyLine(const ParagraphModel *model) { 00590 hypotheses_.push_back_new(LineHypothesis(LT_BODY, model)); 00591 int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, NULL)); 00592 if (old_idx >= 0) 00593 hypotheses_.remove(old_idx); 00594 } 00595 00596 void RowScratchRegisters::StartHypotheses(SetOfModels *models) const { 00597 for (int h = 0; h < hypotheses_.size(); h++) { 00598 if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model)) 00599 models->push_back_new(hypotheses_[h].model); 00600 } 00601 } 00602 00603 void RowScratchRegisters::StrongHypotheses(SetOfModels *models) const { 00604 for (int h = 0; h < hypotheses_.size(); h++) { 00605 if (StrongModel(hypotheses_[h].model)) 00606 models->push_back_new(hypotheses_[h].model); 00607 } 00608 } 00609 00610 void RowScratchRegisters::NonNullHypotheses(SetOfModels *models) const { 
00611 for (int h = 0; h < hypotheses_.size(); h++) { 00612 if (hypotheses_[h].model != NULL) 00613 models->push_back_new(hypotheses_[h].model); 00614 } 00615 } 00616 00617 const ParagraphModel *RowScratchRegisters::UniqueStartHypothesis() const { 00618 if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) 00619 return NULL; 00620 return hypotheses_[0].model; 00621 } 00622 00623 const ParagraphModel *RowScratchRegisters::UniqueBodyHypothesis() const { 00624 if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) 00625 return NULL; 00626 return hypotheses_[0].model; 00627 } 00628 00629 // Discard any hypotheses whose model is not in the given list. 00630 void RowScratchRegisters::DiscardNonMatchingHypotheses( 00631 const SetOfModels &models) { 00632 if (models.empty()) 00633 return; 00634 for (int h = hypotheses_.size() - 1; h >= 0; h--) { 00635 if (!models.contains(hypotheses_[h].model)) { 00636 hypotheses_.remove(h); 00637 } 00638 } 00639 } 00640 00641 // ============ Geometry based Paragraph Detection Algorithm ================= 00642 00643 struct Cluster { 00644 Cluster() : center(0), count(0) {} 00645 Cluster(int cen, int num) : center(cen), count(num) {} 00646 00647 int center; // The center of the cluster. 00648 int count; // The number of entries within the cluster. 00649 }; 00650 00651 class SimpleClusterer { 00652 public: 00653 explicit SimpleClusterer(int max_cluster_width) 00654 : max_cluster_width_(max_cluster_width) {} 00655 void Add(int value) { values_.push_back(value); } 00656 int size() const { return values_.size(); } 00657 void GetClusters(GenericVector<Cluster> *clusters); 00658 00659 private: 00660 int max_cluster_width_; 00661 GenericVectorEqEq<int> values_; 00662 }; 00663 00664 // Return the index of the cluster closest to value. 
00665 int ClosestCluster(const GenericVector<Cluster> &clusters, int value) { 00666 int best_index = 0; 00667 for (int i = 0; i < clusters.size(); i++) { 00668 if (abs(value - clusters[i].center) < 00669 abs(value - clusters[best_index].center)) 00670 best_index = i; 00671 } 00672 return best_index; 00673 } 00674 00675 void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) { 00676 clusters->clear(); 00677 values_.sort(); 00678 for (int i = 0; i < values_.size();) { 00679 int orig_i = i; 00680 int lo = values_[i]; 00681 int hi = lo; 00682 while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) { 00683 hi = values_[i]; 00684 } 00685 clusters->push_back(Cluster((hi + lo) / 2, i - orig_i)); 00686 } 00687 } 00688 00689 // Calculate left- and right-indent tab stop values seen in 00690 // rows[row_start, row_end) given a tolerance of tolerance. 00691 void CalculateTabStops(GenericVector<RowScratchRegisters> *rows, 00692 int row_start, int row_end, 00693 int tolerance, 00694 GenericVector<Cluster> *left_tabs, 00695 GenericVector<Cluster> *right_tabs) { 00696 if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) 00697 return; 00698 // First pass: toss all left and right indents into clusterers. 00699 SimpleClusterer initial_lefts(tolerance); 00700 SimpleClusterer initial_rights(tolerance); 00701 GenericVector<Cluster> initial_left_tabs; 00702 GenericVector<Cluster> initial_right_tabs; 00703 for (int i = row_start; i < row_end; i++) { 00704 initial_lefts.Add((*rows)[i].lindent_); 00705 initial_rights.Add((*rows)[i].rindent_); 00706 } 00707 initial_lefts.GetClusters(&initial_left_tabs); 00708 initial_rights.GetClusters(&initial_right_tabs); 00709 00710 // Second pass: cluster only lines that are not "stray" 00711 // An example of a stray line is a page number -- a line whose start 00712 // and end tab-stops are far outside the typical start and end tab-stops 00713 // for the block. 
00714 // Put another way, we only cluster data from lines whose start or end 00715 // tab stop is frequent. 00716 SimpleClusterer lefts(tolerance); 00717 SimpleClusterer rights(tolerance); 00718 00719 // Outlier elimination. We might want to switch this to test outlier-ness 00720 // based on how strange a position an outlier is in instead of or in addition 00721 // to how rare it is. These outliers get re-added if we end up having too 00722 // few tab stops, to work with, however. 00723 int infrequent_enough_to_ignore = 0; 00724 if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1; 00725 if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2; 00726 00727 for (int i = row_start; i < row_end; i++) { 00728 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_); 00729 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_); 00730 if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore || 00731 initial_right_tabs[ridx].count > infrequent_enough_to_ignore) { 00732 lefts.Add((*rows)[i].lindent_); 00733 rights.Add((*rows)[i].rindent_); 00734 } 00735 } 00736 lefts.GetClusters(left_tabs); 00737 rights.GetClusters(right_tabs); 00738 00739 if ((left_tabs->size() == 1 && right_tabs->size() >= 4) || 00740 (right_tabs->size() == 1 && left_tabs->size() >= 4)) { 00741 // One side is really ragged, and the other only has one tab stop, 00742 // so those "insignificant outliers" are probably important, actually. 00743 // This often happens on a page of an index. Add back in the ones 00744 // we omitted in the first pass. 
00745 for (int i = row_start; i < row_end; i++) { 00746 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_); 00747 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_); 00748 if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore || 00749 initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) { 00750 lefts.Add((*rows)[i].lindent_); 00751 rights.Add((*rows)[i].rindent_); 00752 } 00753 } 00754 } 00755 lefts.GetClusters(left_tabs); 00756 rights.GetClusters(right_tabs); 00757 00758 // If one side is almost a two-indent aligned side, and the other clearly 00759 // isn't, try to prune out the least frequent tab stop from that side. 00760 if (left_tabs->size() == 3 && right_tabs->size() >= 4) { 00761 int to_prune = -1; 00762 for (int i = left_tabs->size() - 1; i >= 0; i--) { 00763 if (to_prune < 0 || 00764 (*left_tabs)[i].count < (*left_tabs)[to_prune].count) { 00765 to_prune = i; 00766 } 00767 } 00768 if (to_prune >= 0 && 00769 (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) { 00770 left_tabs->remove(to_prune); 00771 } 00772 } 00773 if (right_tabs->size() == 3 && left_tabs->size() >= 4) { 00774 int to_prune = -1; 00775 for (int i = right_tabs->size() - 1; i >= 0; i--) { 00776 if (to_prune < 0 || 00777 (*right_tabs)[i].count < (*right_tabs)[to_prune].count) { 00778 to_prune = i; 00779 } 00780 } 00781 if (to_prune >= 0 && 00782 (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) { 00783 right_tabs->remove(to_prune); 00784 } 00785 } 00786 } 00787 00788 // Given a paragraph model mark rows[row_start, row_end) as said model 00789 // start or body lines. 00790 // 00791 // Case 1: model->first_indent_ != model->body_indent_ 00792 // Differentiating the paragraph start lines from the paragraph body lines in 00793 // this case is easy, we just see how far each line is indented. 
00794 // 00795 // Case 2: model->first_indent_ == model->body_indent_ 00796 // Here, we find end-of-paragraph lines by looking for "short lines." 00797 // What constitutes a "short line" changes depending on whether the text 00798 // ragged-right[left] or fully justified (aligned left and right). 00799 // 00800 // Case 2a: Ragged Right (or Left) text. (eop_threshold == 0) 00801 // We have a new paragraph it the first word would have at the end 00802 // of the previous line. 00803 // 00804 // Case 2b: Fully Justified. (eop_threshold > 0) 00805 // We mark a line as short (end of paragraph) if the offside indent 00806 // is greater than eop_threshold. 00807 void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows, 00808 int row_start, int row_end, 00809 const ParagraphModel *model, 00810 bool ltr, 00811 int eop_threshold) { 00812 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) 00813 return; 00814 for (int row = row_start; row < row_end; row++) { 00815 bool valid_first = ValidFirstLine(rows, row, model); 00816 bool valid_body = ValidBodyLine(rows, row, model); 00817 if (valid_first && !valid_body) { 00818 (*rows)[row].AddStartLine(model); 00819 } else if (valid_body && !valid_first) { 00820 (*rows)[row].AddBodyLine(model); 00821 } else if (valid_body && valid_first) { 00822 bool after_eop = (row == row_start); 00823 if (row > row_start) { 00824 if (eop_threshold > 0) { 00825 if (model->justification() == JUSTIFICATION_LEFT) { 00826 after_eop = (*rows)[row - 1].rindent_ > eop_threshold; 00827 } else { 00828 after_eop = (*rows)[row - 1].lindent_ > eop_threshold; 00829 } 00830 } else { 00831 after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], 00832 model->justification()); 00833 } 00834 } 00835 if (after_eop) { 00836 (*rows)[row].AddStartLine(model); 00837 } else { 00838 (*rows)[row].AddBodyLine(model); 00839 } 00840 } else { 00841 // Do nothing. Stray row. 
00842 } 00843 } 00844 } 00845 00846 // GeometricClassifierState holds all of the information we'll use while 00847 // trying to determine a paragraph model for the text lines in a block of 00848 // text: 00849 // + the rows under consideration [row_start, row_end) 00850 // + the common left- and right-indent tab stops 00851 // + does the block start out left-to-right or right-to-left 00852 // Further, this struct holds the data we amass for the (single) ParagraphModel 00853 // we'll assign to the text lines (assuming we get that far). 00854 struct GeometricClassifierState { 00855 GeometricClassifierState(int dbg_level, 00856 GenericVector<RowScratchRegisters> *r, 00857 int r_start, int r_end) 00858 : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end), 00859 margin(0) { 00860 tolerance = InterwordSpace(*r, r_start, r_end); 00861 CalculateTabStops(r, r_start, r_end, tolerance, 00862 &left_tabs, &right_tabs); 00863 if (debug_level >= 3) { 00864 tprintf("Geometry: TabStop cluster tolerance = %d; " 00865 "%d left tabs; %d right tabs\n", 00866 tolerance, left_tabs.size(), right_tabs.size()); 00867 } 00868 ltr = (*r)[r_start].ri_->ltr; 00869 } 00870 00871 void AssumeLeftJustification() { 00872 just = tesseract::JUSTIFICATION_LEFT; 00873 margin = (*rows)[row_start].lmargin_; 00874 } 00875 00876 void AssumeRightJustification() { 00877 just = tesseract::JUSTIFICATION_RIGHT; 00878 margin = (*rows)[row_start].rmargin_; 00879 } 00880 00881 // Align tabs are the tab stops the text is aligned to. 00882 const GenericVector<Cluster> &AlignTabs() const { 00883 if (just == tesseract::JUSTIFICATION_RIGHT) return right_tabs; 00884 return left_tabs; 00885 } 00886 00887 // Offside tabs are the tab stops opposite the tabs used to align the text. 
00888 // 00889 // Note that for a left-to-right text which is aligned to the right such as 00890 // this function comment, the offside tabs are the horizontal tab stops 00891 // marking the beginning of ("Note", "this" and "marking"). 00892 const GenericVector<Cluster> &OffsideTabs() const { 00893 if (just == tesseract::JUSTIFICATION_RIGHT) return left_tabs; 00894 return right_tabs; 00895 } 00896 00897 // Return whether the i'th row extends from the leftmost left tab stop 00898 // to the right most right tab stop. 00899 bool IsFullRow(int i) const { 00900 return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 && 00901 ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0; 00902 } 00903 00904 int AlignsideTabIndex(int row_idx) const { 00905 return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just)); 00906 } 00907 00908 // Given what we know about the paragraph justification (just), would the 00909 // first word of row_b have fit at the end of row_a? 00910 bool FirstWordWouldHaveFit(int row_a, int row_b) { 00911 return ::tesseract::FirstWordWouldHaveFit( 00912 (*rows)[row_a], (*rows)[row_b], just); 00913 } 00914 00915 void PrintRows() const { PrintRowRange(*rows, row_start, row_end); } 00916 00917 void Fail(int min_debug_level, const char *why) const { 00918 if (debug_level < min_debug_level) return; 00919 tprintf("# %s\n", why); 00920 PrintRows(); 00921 } 00922 00923 ParagraphModel Model() const { 00924 return ParagraphModel(just, margin, first_indent, body_indent, tolerance); 00925 } 00926 00927 // We print out messages with a debug level at least as great as debug_level. 00928 int debug_level; 00929 00930 // The Geometric Classifier was asked to find a single paragraph model 00931 // to fit the text rows (*rows)[row_start, row_end) 00932 GenericVector<RowScratchRegisters> *rows; 00933 int row_start; 00934 int row_end; 00935 00936 // The amount by which we expect the text edge can vary and still be aligned. 
00937 int tolerance; 00938 00939 // Is the script in this text block left-to-right? 00940 // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve 00941 bool ltr; 00942 00943 // These left and right tab stops were determined to be the common tab 00944 // stops for the given text. 00945 GenericVector<Cluster> left_tabs; 00946 GenericVector<Cluster> right_tabs; 00947 00948 // These are parameters we must determine to create a ParagraphModel. 00949 tesseract::ParagraphJustification just; 00950 int margin; 00951 int first_indent; 00952 int body_indent; 00953 00954 // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel() 00955 int eop_threshold; 00956 }; 00957 00958 // Given a section of text where strong textual clues did not help identifying 00959 // paragraph breaks, and for which the left and right indents have exactly 00960 // three tab stops between them, attempt to find the paragraph breaks based 00961 // solely on the outline of the text and whether the script is left-to-right. 00962 // 00963 // Algorithm Detail: 00964 // The selected rows are in the form of a rectangle except 00965 // for some number of "short lines" of the same length: 00966 // 00967 // (A1) xxxxxxxxxxxxx (B1) xxxxxxxxxxxx 00968 // xxxxxxxxxxx xxxxxxxxxx # A "short" line. 00969 // xxxxxxxxxxxxx xxxxxxxxxxxx 00970 // xxxxxxxxxxxxx xxxxxxxxxxxx 00971 // 00972 // We have a slightly different situation if the only short 00973 // line is at the end of the excerpt. 00974 // 00975 // (A2) xxxxxxxxxxxxx (B2) xxxxxxxxxxxx 00976 // xxxxxxxxxxxxx xxxxxxxxxxxx 00977 // xxxxxxxxxxxxx xxxxxxxxxxxx 00978 // xxxxxxxxxxx xxxxxxxxxx # A "short" line. 
00979 // 00980 // We'll interpret these as follows based on the reasoning in the comment for 00981 // GeometricClassify(): 00982 // [script direction: first indent, body indent] 00983 // (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0 00984 // (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0 00985 void GeometricClassifyThreeTabStopTextBlock( 00986 int debug_level, 00987 GeometricClassifierState &s, 00988 ParagraphTheory *theory) { 00989 int num_rows = s.row_end - s.row_start; 00990 int num_full_rows = 0; 00991 int last_row_full = 0; 00992 for (int i = s.row_start; i < s.row_end; i++) { 00993 if (s.IsFullRow(i)) { 00994 num_full_rows++; 00995 if (i == s.row_end - 1) last_row_full++; 00996 } 00997 } 00998 00999 if (num_full_rows < 0.7 * num_rows) { 01000 s.Fail(1, "Not enough full lines to know which lines start paras."); 01001 return; 01002 } 01003 01004 // eop_threshold gets set if we're fully justified; see MarkRowsWithModel() 01005 s.eop_threshold = 0; 01006 01007 if (s.ltr) { 01008 s.AssumeLeftJustification(); 01009 } else { 01010 s.AssumeRightJustification(); 01011 } 01012 01013 if (debug_level > 0) { 01014 tprintf("# Not enough variety for clear outline classification. " 01015 "Guessing these are %s aligned based on script.\n", 01016 s.ltr ? "left" : "right"); 01017 s.PrintRows(); 01018 } 01019 01020 if (s.AlignTabs().size() == 2) { // case A1 or A2 01021 s.first_indent = s.AlignTabs()[1].center; 01022 s.body_indent = s.AlignTabs()[0].center; 01023 } else { // case B1 or B2 01024 if (num_rows - 1 == num_full_rows - last_row_full) { 01025 // case B2 01026 const ParagraphModel *model = s.ltr ? 
kCrownLeft : kCrownRight; 01027 (*s.rows)[s.row_start].AddStartLine(model); 01028 for (int i = s.row_start + 1; i < s.row_end; i++) { 01029 (*s.rows)[i].AddBodyLine(model); 01030 } 01031 return; 01032 } else { 01033 // case B1 01034 s.first_indent = s.body_indent = s.AlignTabs()[0].center; 01035 s.eop_threshold = (s.OffsideTabs()[0].center + 01036 s.OffsideTabs()[1].center) / 2; 01037 } 01038 } 01039 const ParagraphModel *model = theory->AddModel(s.Model()); 01040 MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, 01041 s.ltr, s.eop_threshold); 01042 return; 01043 } 01044 01045 // This function is called if strong textual clues were not available, but 01046 // the caller hopes that the paragraph breaks will be super obvious just 01047 // by the outline of the text. 01048 // 01049 // The particularly difficult case is figuring out what's going on if you 01050 // don't have enough short paragraph end lines to tell us what's going on. 01051 // 01052 // For instance, let's say you have the following outline: 01053 // 01054 // (A1) xxxxxxxxxxxxxxxxxxxxxx 01055 // xxxxxxxxxxxxxxxxxxxx 01056 // xxxxxxxxxxxxxxxxxxxxxx 01057 // xxxxxxxxxxxxxxxxxxxxxx 01058 // 01059 // Even if we know that the text is left-to-right and so will probably be 01060 // left-aligned, both of the following are possible texts: 01061 // 01062 // (A1a) 1. Here our list item 01063 // with two full lines. 01064 // 2. Here a second item. 01065 // 3. Here our third one. 01066 // 01067 // (A1b) so ends paragraph one. 01068 // Here starts another 01069 // paragraph we want to 01070 // read. This continues 01071 // 01072 // These examples are obvious from the text and should have been caught 01073 // by the StrongEvidenceClassify pass. However, for languages where we don't 01074 // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese), 01075 // it's worth guessing that (A1b) is the correct interpretation if there are 01076 // far more "full" lines than "short" lines. 
01077 void GeometricClassify(int debug_level, 01078 GenericVector<RowScratchRegisters> *rows, 01079 int row_start, int row_end, 01080 ParagraphTheory *theory) { 01081 if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) 01082 return; 01083 if (debug_level > 1) { 01084 tprintf("###############################################\n"); 01085 tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n", 01086 row_start, row_end); 01087 tprintf("###############################################\n"); 01088 } 01089 RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10); 01090 01091 GeometricClassifierState s(debug_level, rows, row_start, row_end); 01092 if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) { 01093 s.Fail(2, "Too much variety for simple outline classification."); 01094 return; 01095 } 01096 if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) { 01097 s.Fail(1, "Not enough variety for simple outline classification."); 01098 return; 01099 } 01100 if (s.left_tabs.size() + s.right_tabs.size() == 3) { 01101 GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory); 01102 return; 01103 } 01104 01105 // At this point, we know that one side has at least two tab stops, and the 01106 // other side has one or two tab stops. 01107 // Left to determine: 01108 // (1) Which is the body indent and which is the first line indent? 01109 // (2) Is the text fully justified? 01110 01111 // If one side happens to have three or more tab stops, assume that side 01112 // is opposite of the aligned side. 
01113 if (s.right_tabs.size() > 2) { 01114 s.AssumeLeftJustification(); 01115 } else if (s.left_tabs.size() > 2) { 01116 s.AssumeRightJustification(); 01117 } else if (s.ltr) { // guess based on script direction 01118 s.AssumeLeftJustification(); 01119 } else { 01120 s.AssumeRightJustification(); 01121 } 01122 01123 if (s.AlignTabs().size() == 2) { 01124 // For each tab stop on the aligned side, how many of them appear 01125 // to be paragraph start lines? [first lines] 01126 int firsts[2] = {0, 0}; 01127 // Count the first line as a likely paragraph start line. 01128 firsts[s.AlignsideTabIndex(s.row_start)]++; 01129 // For each line, if the first word would have fit on the previous 01130 // line count it as a likely paragraph start line. 01131 bool jam_packed = true; 01132 for (int i = s.row_start + 1; i < s.row_end; i++) { 01133 if (s.FirstWordWouldHaveFit(i - 1, i)) { 01134 firsts[s.AlignsideTabIndex(i)]++; 01135 jam_packed = false; 01136 } 01137 } 01138 // Make an extra accounting for the last line of the paragraph just 01139 // in case it's the only short line in the block. That is, take its 01140 // first word as typical and see if this looks like the *last* line 01141 // of a paragraph. If so, mark the *other* indent as probably a first. 01142 if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) { 01143 firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++; 01144 } 01145 01146 int percent0firsts, percent1firsts; 01147 percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count; 01148 percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count; 01149 01150 // TODO(eger): Tune these constants if necessary. 
01151 if ((percent0firsts < 20 && 30 < percent1firsts) || 01152 percent0firsts + 30 < percent1firsts) { 01153 s.first_indent = s.AlignTabs()[1].center; 01154 s.body_indent = s.AlignTabs()[0].center; 01155 } else if ((percent1firsts < 20 && 30 < percent0firsts) || 01156 percent1firsts + 30 < percent0firsts) { 01157 s.first_indent = s.AlignTabs()[0].center; 01158 s.body_indent = s.AlignTabs()[1].center; 01159 } else { 01160 // Ambiguous! Probably lineated (poetry) 01161 if (debug_level > 1) { 01162 tprintf("# Cannot determine %s indent likely to start paragraphs.\n", 01163 s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right"); 01164 tprintf("# Indent of %d looks like a first line %d%% of the time.\n", 01165 s.AlignTabs()[0].center, percent0firsts); 01166 tprintf("# Indent of %d looks like a first line %d%% of the time.\n", 01167 s.AlignTabs()[1].center, percent1firsts); 01168 s.PrintRows(); 01169 } 01170 return; 01171 } 01172 } else { 01173 // There's only one tab stop for the "aligned to" side. 01174 s.first_indent = s.body_indent = s.AlignTabs()[0].center; 01175 } 01176 01177 // At this point, we have our model. 01178 const ParagraphModel *model = theory->AddModel(s.Model()); 01179 01180 // Now all we have to do is figure out if the text is fully justified or not. 01181 // eop_threshold: default to fully justified unless we see evidence below. 01182 // See description on MarkRowsWithModel() 01183 s.eop_threshold = 01184 (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2; 01185 // If the text is not fully justified, re-set the eop_threshold to 0. 01186 if (s.AlignTabs().size() == 2) { 01187 // Paragraphs with a paragraph-start indent. 01188 for (int i = s.row_start; i < s.row_end - 1; i++) { 01189 if (ValidFirstLine(s.rows, i + 1, model) && 01190 !NearlyEqual(s.OffsideTabs()[0].center, 01191 (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) { 01192 // We found a non-end-of-paragraph short line: not fully justified. 
01193 s.eop_threshold = 0; 01194 break; 01195 } 01196 } 01197 } else { 01198 // Paragraphs with no paragraph-start indent. 01199 for (int i = s.row_start; i < s.row_end - 1; i++) { 01200 if (!s.FirstWordWouldHaveFit(i, i + 1) && 01201 !NearlyEqual(s.OffsideTabs()[0].center, 01202 (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) { 01203 // We found a non-end-of-paragraph short line: not fully justified. 01204 s.eop_threshold = 0; 01205 break; 01206 } 01207 } 01208 } 01209 MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold); 01210 } 01211 01212 // =============== Implementation of ParagraphTheory ===================== 01213 01214 const ParagraphModel *ParagraphTheory::AddModel(const ParagraphModel &model) { 01215 for (int i = 0; i < models_->size(); i++) { 01216 if ((*models_)[i]->Comparable(model)) 01217 return (*models_)[i]; 01218 } 01219 ParagraphModel *m = new ParagraphModel(model); 01220 models_->push_back(m); 01221 models_we_added_.push_back_new(m); 01222 return m; 01223 } 01224 01225 void ParagraphTheory::DiscardUnusedModels(const SetOfModels &used_models) { 01226 for (int i = models_->size() - 1; i >= 0; i--) { 01227 ParagraphModel *m = (*models_)[i]; 01228 if (!used_models.contains(m) && models_we_added_.contains(m)) { 01229 models_->remove(i); 01230 models_we_added_.remove(models_we_added_.get_index(m)); 01231 delete m; 01232 } 01233 } 01234 } 01235 01236 // Examine rows[start, end) and try to determine if an existing non-centered 01237 // paragraph model would fit them perfectly. If so, return a pointer to it. 01238 // If not, return NULL. 
01239 const ParagraphModel *ParagraphTheory::Fits( 01240 const GenericVector<RowScratchRegisters> *rows, int start, int end) const { 01241 for (int m = 0; m < models_->size(); m++) { 01242 const ParagraphModel *model = (*models_)[m]; 01243 if (model->justification() != JUSTIFICATION_CENTER && 01244 RowsFitModel(rows, start, end, model)) 01245 return model; 01246 } 01247 return NULL; 01248 } 01249 01250 void ParagraphTheory::NonCenteredModels(SetOfModels *models) { 01251 for (int m = 0; m < models_->size(); m++) { 01252 const ParagraphModel *model = (*models_)[m]; 01253 if (model->justification() != JUSTIFICATION_CENTER) 01254 models->push_back_new(model); 01255 } 01256 } 01257 01258 int ParagraphTheory::IndexOf(const ParagraphModel *model) const { 01259 for (int i = 0; i < models_->size(); i++) { 01260 if ((*models_)[i] == model) 01261 return i; 01262 } 01263 return -1; 01264 } 01265 01266 bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows, 01267 int row, const ParagraphModel *model) { 01268 if (!StrongModel(model)) { 01269 tprintf("ValidFirstLine() should only be called with strong models!\n"); 01270 } 01271 return StrongModel(model) && 01272 model->ValidFirstLine( 01273 (*rows)[row].lmargin_, (*rows)[row].lindent_, 01274 (*rows)[row].rindent_, (*rows)[row].rmargin_); 01275 } 01276 01277 bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows, 01278 int row, const ParagraphModel *model) { 01279 if (!StrongModel(model)) { 01280 tprintf("ValidBodyLine() should only be called with strong models!\n"); 01281 } 01282 return StrongModel(model) && 01283 model->ValidBodyLine( 01284 (*rows)[row].lmargin_, (*rows)[row].lindent_, 01285 (*rows)[row].rindent_, (*rows)[row].rmargin_); 01286 } 01287 01288 bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows, 01289 int a, int b, const ParagraphModel *model) { 01290 if (model != kCrownRight && model != kCrownLeft) { 01291 tprintf("CrownCompatible() should only be called with crown 
models!\n"); 01292 return false; 01293 } 01294 RowScratchRegisters &row_a = (*rows)[a]; 01295 RowScratchRegisters &row_b = (*rows)[b]; 01296 if (model == kCrownRight) { 01297 return NearlyEqual(row_a.rindent_ + row_a.rmargin_, 01298 row_b.rindent_ + row_b.rmargin_, 01299 Epsilon(row_a.ri_->average_interword_space)); 01300 } 01301 return NearlyEqual(row_a.lindent_ + row_a.lmargin_, 01302 row_b.lindent_ + row_b.lmargin_, 01303 Epsilon(row_a.ri_->average_interword_space)); 01304 } 01305 01306 01307 // =============== Implementation of ParagraphModelSmearer ==================== 01308 01309 ParagraphModelSmearer::ParagraphModelSmearer( 01310 GenericVector<RowScratchRegisters> *rows, 01311 int row_start, int row_end, ParagraphTheory *theory) 01312 : theory_(theory), rows_(rows), row_start_(row_start), 01313 row_end_(row_end) { 01314 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) { 01315 row_start_ = 0; 01316 row_end_ = 0; 01317 return; 01318 } 01319 SetOfModels no_models; 01320 for (int row = row_start - 1; row <= row_end; row++) { 01321 open_models_.push_back(no_models); 01322 } 01323 } 01324 01325 // see paragraphs_internal.h 01326 void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) { 01327 SetOfModels no_models; 01328 if (row_start < row_start_) row_start = row_start_; 01329 if (row_end > row_end_) row_end = row_end_; 01330 01331 for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; 01332 row++) { 01333 if ((*rows_)[row].ri_->num_words == 0) { 01334 OpenModels(row + 1) = no_models; 01335 } else { 01336 SetOfModels &opened = OpenModels(row); 01337 (*rows_)[row].StartHypotheses(&opened); 01338 01339 // Which models survive the transition from row to row + 1? 
01340 SetOfModels still_open; 01341 for (int m = 0; m < opened.size(); m++) { 01342 if (ValidFirstLine(rows_, row, opened[m]) || 01343 ValidBodyLine(rows_, row, opened[m])) { 01344 // This is basic filtering; we check likely paragraph starty-ness down 01345 // below in Smear() -- you know, whether the first word would have fit 01346 // and such. 01347 still_open.push_back_new(opened[m]); 01348 } 01349 } 01350 OpenModels(row + 1) = still_open; 01351 } 01352 } 01353 } 01354 01355 // see paragraphs_internal.h 01356 void ParagraphModelSmearer::Smear() { 01357 CalculateOpenModels(row_start_, row_end_); 01358 01359 // For each row which we're unsure about (that is, it is LT_UNKNOWN or 01360 // we have multiple LT_START hypotheses), see if there's a model that 01361 // was recently used (an "open" model) which might model it well. 01362 for (int i = row_start_; i < row_end_; i++) { 01363 RowScratchRegisters &row = (*rows_)[i]; 01364 if (row.ri_->num_words == 0) 01365 continue; 01366 01367 // Step One: 01368 // Figure out if there are "open" models which are left-alined or 01369 // right-aligned. This is important for determining whether the 01370 // "first" word in a row would fit at the "end" of the previous row. 01371 bool left_align_open = false; 01372 bool right_align_open = false; 01373 for (int m = 0; m < OpenModels(i).size(); m++) { 01374 switch (OpenModels(i)[m]->justification()) { 01375 case JUSTIFICATION_LEFT: left_align_open = true; break; 01376 case JUSTIFICATION_RIGHT: right_align_open = true; break; 01377 default: left_align_open = right_align_open = true; 01378 } 01379 } 01380 // Step Two: 01381 // Use that knowledge to figure out if this row is likely to 01382 // start a paragraph. 
01383 bool likely_start; 01384 if (i == 0) { 01385 likely_start = true; 01386 } else { 01387 if ((left_align_open && right_align_open) || 01388 (!left_align_open && !right_align_open)) { 01389 likely_start = LikelyParagraphStart((*rows_)[i - 1], row, 01390 JUSTIFICATION_LEFT) || 01391 LikelyParagraphStart((*rows_)[i - 1], row, 01392 JUSTIFICATION_RIGHT); 01393 } else if (left_align_open) { 01394 likely_start = LikelyParagraphStart((*rows_)[i - 1], row, 01395 JUSTIFICATION_LEFT); 01396 } else { 01397 likely_start = LikelyParagraphStart((*rows_)[i - 1], row, 01398 JUSTIFICATION_RIGHT); 01399 } 01400 } 01401 01402 // Step Three: 01403 // If this text line seems like an obvious first line of an 01404 // open model, or an obvious continuation of an existing 01405 // modelled paragraph, mark it up. 01406 if (likely_start) { 01407 // Add Start Hypotheses for all Open models that fit. 01408 for (int m = 0; m < OpenModels(i).size(); m++) { 01409 if (ValidFirstLine(rows_, i, OpenModels(i)[m])) { 01410 row.AddStartLine(OpenModels(i)[m]); 01411 } 01412 } 01413 } else { 01414 // Add relevant body line hypotheses. 01415 SetOfModels last_line_models; 01416 if (i > 0) { 01417 (*rows_)[i - 1].StrongHypotheses(&last_line_models); 01418 } else { 01419 theory_->NonCenteredModels(&last_line_models); 01420 } 01421 for (int m = 0; m < last_line_models.size(); m++) { 01422 const ParagraphModel *model = last_line_models[m]; 01423 if (ValidBodyLine(rows_, i, model)) 01424 row.AddBodyLine(model); 01425 } 01426 } 01427 01428 // Step Four: 01429 // If we're still quite unsure about this line, go through all 01430 // models in our theory and see if this row could be the start 01431 // of any of our models. 
01432 if (row.GetLineType() == LT_UNKNOWN || 01433 (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) { 01434 SetOfModels all_models; 01435 theory_->NonCenteredModels(&all_models); 01436 for (int m = 0; m < all_models.size(); m++) { 01437 if (ValidFirstLine(rows_, i, all_models[m])) { 01438 row.AddStartLine(all_models[m]); 01439 } 01440 } 01441 } 01442 // Step Five: 01443 // Since we may have updated the hypotheses about this row, we need 01444 // to recalculate the Open models for the rest of rows[i + 1, row_end) 01445 if (row.GetLineType() != LT_UNKNOWN) { 01446 CalculateOpenModels(i + 1, row_end_); 01447 } 01448 } 01449 } 01450 01451 // ================ Main Paragraph Detection Algorithm ======================= 01452 01453 // Find out what ParagraphModels are actually used, and discard any 01454 // that are not. 01455 void DiscardUnusedModels(const GenericVector<RowScratchRegisters> &rows, 01456 ParagraphTheory *theory) { 01457 SetOfModels used_models; 01458 for (int i = 0; i < rows.size(); i++) { 01459 rows[i].StrongHypotheses(&used_models); 01460 } 01461 theory->DiscardUnusedModels(used_models); 01462 } 01463 01464 // DowngradeWeakestToCrowns: 01465 // Forget any flush-{left, right} models unless we see two or more 01466 // of them in sequence. 01467 // 01468 // In pass 3, we start to classify even flush-left paragraphs (paragraphs 01469 // where the first line and body indent are the same) as having proper Models. 01470 // This is generally dangerous, since if you start imagining that flush-left 01471 // is a typical paragraph model when it is not, it will lead you to chop normal 01472 // indented paragraphs in the middle whenever a sentence happens to start on a 01473 // new line (see "This" above). What to do? 01474 // What we do is to take any paragraph which is flush left and is not 01475 // preceded by another paragraph of the same model and convert it to a "Crown" 01476 // paragraph. 
This is a weak pseudo-ParagraphModel which is a placeholder 01477 // for later. It means that the paragraph is flush, but it would be desirable 01478 // to mark it as the same model as following text if it fits. This downgrade 01479 // FlushLeft -> CrownLeft -> Model of following paragraph. Means that we 01480 // avoid making flush left Paragraph Models whenever we see a top-of-the-page 01481 // half-of-a-paragraph. and instead we mark it the same as normal body text. 01482 // 01483 // Implementation: 01484 // 01485 // Comb backwards through the row scratch registers, and turn any 01486 // sequences of body lines of equivalent type abutted against the beginning 01487 // or a body or start line of a different type into a crown paragraph. 01488 void DowngradeWeakestToCrowns(int debug_level, 01489 ParagraphTheory *theory, 01490 GenericVector<RowScratchRegisters> *rows) { 01491 int start; 01492 for (int end = rows->size(); end > 0; end = start) { 01493 // Search back for a body line of a unique type. 01494 const ParagraphModel *model = NULL; 01495 while (end > 0 && 01496 (model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) { 01497 end--; 01498 } 01499 if (end == 0) break; 01500 start = end - 1; 01501 while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) { 01502 start--; // walk back to the first line that is not the same body type. 01503 } 01504 if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && 01505 StrongModel(model) && 01506 NearlyEqual(model->first_indent(), model->body_indent(), 01507 model->tolerance())) { 01508 start--; 01509 } 01510 start++; 01511 // Now rows[start, end) is a sequence of unique body hypotheses of model. 
01512 if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) 01513 continue; 01514 if (!StrongModel(model)) { 01515 while (start > 0 && 01516 CrownCompatible(rows, start - 1, start, model)) 01517 start--; 01518 } 01519 if (start == 0 || 01520 (!StrongModel(model)) || 01521 (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) { 01522 // crownify rows[start, end) 01523 const ParagraphModel *crown_model = model; 01524 if (StrongModel(model)) { 01525 if (model->justification() == JUSTIFICATION_LEFT) 01526 crown_model = kCrownLeft; 01527 else 01528 crown_model = kCrownRight; 01529 } 01530 (*rows)[start].SetUnknown(); 01531 (*rows)[start].AddStartLine(crown_model); 01532 for (int row = start + 1; row < end; row++) { 01533 (*rows)[row].SetUnknown(); 01534 (*rows)[row].AddBodyLine(crown_model); 01535 } 01536 } 01537 } 01538 DiscardUnusedModels(*rows, theory); 01539 } 01540 01541 01542 // Clear all hypotheses about lines [start, end) and reset margins. 01543 // 01544 // The empty space between the left of a row and the block boundary (and 01545 // similarly for the right) is split into two pieces: margin and indent. 01546 // In initial processing, we assume the block is tight and the margin for 01547 // all lines is set to zero. However, if our first pass does not yield 01548 // models for everything, it may be due to an inset paragraph like a 01549 // block-quote. In that case, we make a second pass over that unmarked 01550 // section of the page and reset the "margin" portion of the empty space 01551 // to the common amount of space at the ends of the lines under consid- 01552 // eration. This would be equivalent to percentile set to 0. However, 01553 // sometimes we have a single character sticking out in the right margin 01554 // of a text block (like the 'r' in 'for' on line 3 above), and we can 01555 // really just ignore it as an outlier. 
To express this, we allow the 01556 // user to specify the percentile (0..100) of indent values to use as 01557 // the common margin for each row in the run of rows[start, end). 01558 void RecomputeMarginsAndClearHypotheses( 01559 GenericVector<RowScratchRegisters> *rows, int start, int end, 01560 int percentile) { 01561 if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) 01562 return; 01563 01564 int lmin, lmax, rmin, rmax; 01565 lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_; 01566 rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_; 01567 for (int i = start; i < end; i++) { 01568 RowScratchRegisters &sr = (*rows)[i]; 01569 sr.SetUnknown(); 01570 if (sr.ri_->num_words == 0) 01571 continue; 01572 UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax); 01573 UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax); 01574 } 01575 STATS lefts(lmin, lmax + 1); 01576 STATS rights(rmin, rmax + 1); 01577 for (int i = start; i < end; i++) { 01578 RowScratchRegisters &sr = (*rows)[i]; 01579 if (sr.ri_->num_words == 0) 01580 continue; 01581 lefts.add(sr.lmargin_ + sr.lindent_, 1); 01582 rights.add(sr.rmargin_ + sr.rindent_, 1); 01583 } 01584 int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0); 01585 int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0); 01586 for (int i = start; i < end; i++) { 01587 RowScratchRegisters &sr = (*rows)[i]; 01588 int ldelta = ignorable_left - sr.lmargin_; 01589 sr.lmargin_ += ldelta; 01590 sr.lindent_ -= ldelta; 01591 int rdelta = ignorable_right - sr.rmargin_; 01592 sr.rmargin_ += rdelta; 01593 sr.rindent_ -= rdelta; 01594 } 01595 } 01596 01597 // Return the median inter-word space in rows[row_start, row_end). 
01598 int InterwordSpace(const GenericVector<RowScratchRegisters> &rows, 01599 int row_start, int row_end) { 01600 if (row_end < row_start + 1) return 1; 01601 int word_height = (rows[row_start].ri_->lword_box.height() + 01602 rows[row_end - 1].ri_->lword_box.height()) / 2; 01603 int word_width = (rows[row_start].ri_->lword_box.width() + 01604 rows[row_end - 1].ri_->lword_box.width()) / 2; 01605 STATS spacing_widths(0, 5 + word_width); 01606 for (int i = row_start; i < row_end; i++) { 01607 if (rows[i].ri_->num_words > 1) { 01608 spacing_widths.add(rows[i].ri_->average_interword_space, 1); 01609 } 01610 } 01611 int minimum_reasonable_space = word_height / 3; 01612 if (minimum_reasonable_space < 2) 01613 minimum_reasonable_space = 2; 01614 int median = spacing_widths.median(); 01615 return (median > minimum_reasonable_space) 01616 ? median : minimum_reasonable_space; 01617 } 01618 01619 // Return whether the first word on the after line can fit in the space at 01620 // the end of the before line (knowing which way the text is aligned and read). 
01621 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, 01622 const RowScratchRegisters &after, 01623 tesseract::ParagraphJustification justification) { 01624 if (before.ri_->num_words == 0 || after.ri_->num_words == 0) 01625 return true; 01626 01627 if (justification == JUSTIFICATION_UNKNOWN) { 01628 tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n"); 01629 } 01630 int available_space; 01631 if (justification == JUSTIFICATION_CENTER) { 01632 available_space = before.lindent_ + before.rindent_; 01633 } else { 01634 available_space = before.OffsideIndent(justification); 01635 } 01636 available_space -= before.ri_->average_interword_space; 01637 01638 if (before.ri_->ltr) 01639 return after.ri_->lword_box.width() < available_space; 01640 return after.ri_->rword_box.width() < available_space; 01641 } 01642 01643 // Return whether the first word on the after line can fit in the space at 01644 // the end of the before line (not knowing which way the text goes) in a left 01645 // or right alignemnt. 
01646 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, 01647 const RowScratchRegisters &after) { 01648 if (before.ri_->num_words == 0 || after.ri_->num_words == 0) 01649 return true; 01650 01651 int available_space = before.lindent_; 01652 if (before.rindent_ > available_space) 01653 available_space = before.rindent_; 01654 available_space -= before.ri_->average_interword_space; 01655 01656 if (before.ri_->ltr) 01657 return after.ri_->lword_box.width() < available_space; 01658 return after.ri_->rword_box.width() < available_space; 01659 } 01660 01661 bool TextSupportsBreak(const RowScratchRegisters &before, 01662 const RowScratchRegisters &after) { 01663 if (before.ri_->ltr) { 01664 return before.ri_->rword_likely_ends_idea && 01665 after.ri_->lword_likely_starts_idea; 01666 } else { 01667 return before.ri_->lword_likely_ends_idea && 01668 after.ri_->rword_likely_starts_idea; 01669 } 01670 } 01671 01672 bool LikelyParagraphStart(const RowScratchRegisters &before, 01673 const RowScratchRegisters &after) { 01674 return before.ri_->num_words == 0 || 01675 (FirstWordWouldHaveFit(before, after) && 01676 TextSupportsBreak(before, after)); 01677 } 01678 01679 bool LikelyParagraphStart(const RowScratchRegisters &before, 01680 const RowScratchRegisters &after, 01681 tesseract::ParagraphJustification j) { 01682 return before.ri_->num_words == 0 || 01683 (FirstWordWouldHaveFit(before, after, j) && 01684 TextSupportsBreak(before, after)); 01685 } 01686 01687 // Examine rows[start, end) and try to determine what sort of ParagraphModel 01688 // would fit them as a single paragraph. 01689 // If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN. 01690 // If the rows given could be a consistent start to a paragraph, set *consistent 01691 // true. 
01692 ParagraphModel InternalParagraphModelByOutline( 01693 const GenericVector<RowScratchRegisters> *rows, 01694 int start, int end, int tolerance, bool *consistent) { 01695 int ltr_line_count = 0; 01696 for (int i = start; i < end; i++) { 01697 ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr); 01698 } 01699 bool ltr = (ltr_line_count >= (end - start) / 2); 01700 01701 *consistent = true; 01702 if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) 01703 return ParagraphModel(); 01704 01705 // Ensure the caller only passed us a region with a common rmargin and 01706 // lmargin. 01707 int lmargin = (*rows)[start].lmargin_; 01708 int rmargin = (*rows)[start].rmargin_; 01709 int lmin, lmax, rmin, rmax, cmin, cmax; 01710 lmin = lmax = (*rows)[start + 1].lindent_; 01711 rmin = rmax = (*rows)[start + 1].rindent_; 01712 cmin = cmax = 0; 01713 for (int i = start + 1; i < end; i++) { 01714 if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) { 01715 tprintf("Margins don't match! Software error.\n"); 01716 *consistent = false; 01717 return ParagraphModel(); 01718 } 01719 UpdateRange((*rows)[i].lindent_, &lmin, &lmax); 01720 UpdateRange((*rows)[i].rindent_, &rmin, &rmax); 01721 UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax); 01722 } 01723 int ldiff = lmax - lmin; 01724 int rdiff = rmax - rmin; 01725 int cdiff = cmax - cmin; 01726 if (rdiff > tolerance && ldiff > tolerance) { 01727 if (cdiff < tolerance * 2) { 01728 if (end - start < 3) 01729 return ParagraphModel(); 01730 return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance); 01731 } 01732 *consistent = false; 01733 return ParagraphModel(); 01734 } 01735 if (end - start < 3) // Don't return a model for two line paras. 01736 return ParagraphModel(); 01737 01738 // These booleans keep us from saying something is aligned left when the body 01739 // left variance is too large. 
01740 bool body_admits_left_alignment = ldiff < tolerance; 01741 bool body_admits_right_alignment = rdiff < tolerance; 01742 01743 ParagraphModel left_model = 01744 ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_, 01745 (lmin + lmax) / 2, tolerance); 01746 ParagraphModel right_model = 01747 ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_, 01748 (rmin + rmax) / 2, tolerance); 01749 01750 // These booleans keep us from having an indent on the "wrong side" for the 01751 // first line. 01752 bool text_admits_left_alignment = ltr || left_model.is_flush(); 01753 bool text_admits_right_alignment = !ltr || right_model.is_flush(); 01754 01755 // At least one of the edges is less than tolerance in variance. 01756 // If the other is obviously ragged, it can't be the one aligned to. 01757 // [Note the last line is included in this raggedness.] 01758 if (tolerance < rdiff) { 01759 if (body_admits_left_alignment && text_admits_left_alignment) 01760 return left_model; 01761 *consistent = false; 01762 return ParagraphModel(); 01763 } 01764 if (tolerance < ldiff) { 01765 if (body_admits_right_alignment && text_admits_right_alignment) 01766 return right_model; 01767 *consistent = false; 01768 return ParagraphModel(); 01769 } 01770 01771 // At this point, we know the body text doesn't vary much on either side. 01772 01773 // If the first line juts out oddly in one direction or the other, 01774 // that likely indicates the side aligned to. 
01775 int first_left = (*rows)[start].lindent_; 01776 int first_right = (*rows)[start].rindent_; 01777 01778 if (ltr && body_admits_left_alignment && 01779 (first_left < lmin || first_left > lmax)) 01780 return left_model; 01781 if (!ltr && body_admits_right_alignment && 01782 (first_right < rmin || first_right > rmax)) 01783 return right_model; 01784 01785 *consistent = false; 01786 return ParagraphModel(); 01787 } 01788 01789 // Examine rows[start, end) and try to determine what sort of ParagraphModel 01790 // would fit them as a single paragraph. If nothing fits, 01791 // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug 01792 // output if we're debugging. 01793 ParagraphModel ParagraphModelByOutline( 01794 int debug_level, 01795 const GenericVector<RowScratchRegisters> *rows, 01796 int start, int end, int tolerance) { 01797 bool unused_consistent; 01798 ParagraphModel retval = InternalParagraphModelByOutline( 01799 rows, start, end, tolerance, &unused_consistent); 01800 if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) { 01801 tprintf("Could not determine a model for this paragraph:\n"); 01802 PrintRowRange(*rows, start, end); 01803 } 01804 return retval; 01805 } 01806 01807 // Do rows[start, end) form a single instance of the given paragraph model? 01808 bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows, 01809 int start, int end, const ParagraphModel *model) { 01810 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) 01811 return false; 01812 if (!ValidFirstLine(rows, start, model)) return false; 01813 for (int i = start + 1 ; i < end; i++) { 01814 if (!ValidBodyLine(rows, i, model)) return false; 01815 } 01816 return true; 01817 } 01818 01819 // Examine rows[row_start, row_end) as an independent section of text, 01820 // and mark rows that are exceptionally clear as start-of-paragraph 01821 // and paragraph-body lines. 
01822 // 01823 // We presume that any lines surrounding rows[row_start, row_end) may 01824 // have wildly different paragraph models, so we don't key any data off 01825 // of those lines. 01826 // 01827 // We only take the very strongest signals, as we don't want to get 01828 // confused and marking up centered text, poetry, or source code as 01829 // clearly part of a typical paragraph. 01830 void MarkStrongEvidence(GenericVector<RowScratchRegisters> *rows, 01831 int row_start, int row_end) { 01832 // Record patently obvious body text. 01833 for (int i = row_start + 1; i < row_end; i++) { 01834 const RowScratchRegisters &prev = (*rows)[i - 1]; 01835 RowScratchRegisters &curr = (*rows)[i]; 01836 tesseract::ParagraphJustification typical_justification = 01837 prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; 01838 if (!curr.ri_->rword_likely_starts_idea && 01839 !curr.ri_->lword_likely_starts_idea && 01840 !FirstWordWouldHaveFit(prev, curr, typical_justification)) { 01841 curr.SetBodyLine(); 01842 } 01843 } 01844 01845 // Record patently obvious start paragraph lines. 01846 // 01847 // It's an extremely good signal of the start of a paragraph that 01848 // the first word would have fit on the end of the previous line. 01849 // However, applying just that signal would have us mark random 01850 // start lines of lineated text (poetry and source code) and some 01851 // centered headings as paragraph start lines. Therefore, we use 01852 // a second qualification for a paragraph start: Not only should 01853 // the first word of this line have fit on the previous line, 01854 // but also, this line should go full to the right of the block, 01855 // disallowing a subsequent word from having fit on this line. 01856 01857 // First row: 01858 { 01859 RowScratchRegisters &curr = (*rows)[row_start]; 01860 RowScratchRegisters &next = (*rows)[row_start + 1]; 01861 tesseract::ParagraphJustification j = 01862 curr.ri_->ltr ? 
JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; 01863 if (curr.GetLineType() == LT_UNKNOWN && 01864 !FirstWordWouldHaveFit(curr, next, j) && 01865 (curr.ri_->lword_likely_starts_idea || 01866 curr.ri_->rword_likely_starts_idea)) { 01867 curr.SetStartLine(); 01868 } 01869 } 01870 // Middle rows 01871 for (int i = row_start + 1; i < row_end - 1; i++) { 01872 RowScratchRegisters &prev = (*rows)[i - 1]; 01873 RowScratchRegisters &curr = (*rows)[i]; 01874 RowScratchRegisters &next = (*rows)[i + 1]; 01875 tesseract::ParagraphJustification j = 01876 curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; 01877 if (curr.GetLineType() == LT_UNKNOWN && 01878 !FirstWordWouldHaveFit(curr, next, j) && 01879 LikelyParagraphStart(prev, curr, j)) { 01880 curr.SetStartLine(); 01881 } 01882 } 01883 // Last row 01884 { // the short circuit at the top means we have at least two lines. 01885 RowScratchRegisters &prev = (*rows)[row_end - 2]; 01886 RowScratchRegisters &curr = (*rows)[row_end - 1]; 01887 tesseract::ParagraphJustification j = 01888 curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT; 01889 if (curr.GetLineType() == LT_UNKNOWN && 01890 !FirstWordWouldHaveFit(curr, curr, j) && 01891 LikelyParagraphStart(prev, curr, j)) { 01892 curr.SetStartLine(); 01893 } 01894 } 01895 } 01896 01897 // Look for sequences of a start line followed by some body lines in 01898 // rows[row_start, row_end) and create ParagraphModels for them if 01899 // they seem coherent. 
01900 void ModelStrongEvidence(int debug_level, 01901 GenericVector<RowScratchRegisters> *rows, 01902 int row_start, int row_end, 01903 bool allow_flush_models, 01904 ParagraphTheory *theory) { 01905 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) 01906 return; 01907 01908 int start = row_start; 01909 while (start < row_end) { 01910 while (start < row_end && (*rows)[start].GetLineType() != LT_START) 01911 start++; 01912 if (start >= row_end - 1) 01913 break; 01914 01915 int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space); 01916 int end = start; 01917 ParagraphModel last_model; 01918 bool next_consistent; 01919 do { 01920 ++end; 01921 // rows[row, end) was consistent. 01922 // If rows[row, end + 1) is not consistent, 01923 // just model rows[row, end) 01924 if (end < row_end - 1) { 01925 RowScratchRegisters &next = (*rows)[end]; 01926 LineType lt = next.GetLineType(); 01927 next_consistent = lt == LT_BODY || 01928 (lt == LT_UNKNOWN && 01929 !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end])); 01930 } else { 01931 next_consistent = false; 01932 } 01933 if (next_consistent) { 01934 ParagraphModel next_model = InternalParagraphModelByOutline( 01935 rows, start, end + 1, tolerance, &next_consistent); 01936 if (((*rows)[start].ri_->ltr && 01937 last_model.justification() == JUSTIFICATION_LEFT && 01938 next_model.justification() != JUSTIFICATION_LEFT) || 01939 (!(*rows)[start].ri_->ltr && 01940 last_model.justification() == JUSTIFICATION_RIGHT && 01941 next_model.justification() != JUSTIFICATION_RIGHT)) { 01942 next_consistent = false; 01943 } 01944 last_model = next_model; 01945 } else { 01946 next_consistent = false; 01947 } 01948 } while (next_consistent && end < row_end); 01949 // At this point, rows[start, end) looked like it could have been a 01950 // single paragraph. If we can make a good ParagraphModel for it, 01951 // do so and mark this sequence with that model. 
01952 if (end > start + 1) { 01953 // emit a new paragraph if we have more than one line. 01954 const ParagraphModel *model = NULL; 01955 ParagraphModel new_model = ParagraphModelByOutline( 01956 debug_level, rows, start, end, 01957 Epsilon(InterwordSpace(*rows, start, end))); 01958 if (new_model.justification() == JUSTIFICATION_UNKNOWN) { 01959 // couldn't create a good model, oh well. 01960 } else if (new_model.is_flush()) { 01961 if (end == start + 2) { 01962 // It's very likely we just got two paragraph starts in a row. 01963 end = start + 1; 01964 } else if (start == row_start) { 01965 // Mark this as a Crown. 01966 if (new_model.justification() == JUSTIFICATION_LEFT) { 01967 model = kCrownLeft; 01968 } else { 01969 model = kCrownRight; 01970 } 01971 } else if (allow_flush_models) { 01972 model = theory->AddModel(new_model); 01973 } 01974 } else { 01975 model = theory->AddModel(new_model); 01976 } 01977 if (model) { 01978 (*rows)[start].AddStartLine(model); 01979 for (int i = start + 1; i < end; i++) { 01980 (*rows)[i].AddBodyLine(model); 01981 } 01982 } 01983 } 01984 start = end; 01985 } 01986 } 01987 01988 // We examine rows[row_start, row_end) and do the following: 01989 // (1) Clear all existing hypotheses for the rows being considered. 01990 // (2) Mark up any rows as exceptionally likely to be paragraph starts 01991 // or paragraph body lines as such using both geometric and textual 01992 // clues. 01993 // (3) Form models for any sequence of start + continuation lines. 01994 // (4) Smear the paragraph models to cover surrounding text. 
01995 void StrongEvidenceClassify(int debug_level, 01996 GenericVector<RowScratchRegisters> *rows, 01997 int row_start, int row_end, 01998 ParagraphTheory *theory) { 01999 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) 02000 return; 02001 02002 if (debug_level > 1) { 02003 tprintf("#############################################\n"); 02004 tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end); 02005 tprintf("#############################################\n"); 02006 } 02007 02008 RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10); 02009 MarkStrongEvidence(rows, row_start, row_end); 02010 02011 DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows); 02012 02013 // Create paragraph models. 02014 ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory); 02015 02016 DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows); 02017 02018 // At this point, some rows are marked up as paragraphs with model numbers, 02019 // and some rows are marked up as either LT_START or LT_BODY. Now let's 02020 // smear any good paragraph hypotheses forward and backward. 02021 ParagraphModelSmearer smearer(rows, row_start, row_end, theory); 02022 smearer.Smear(); 02023 } 02024 02025 void SeparateSimpleLeaderLines(GenericVector<RowScratchRegisters> *rows, 02026 int row_start, int row_end, 02027 ParagraphTheory *theory) { 02028 for (int i = row_start + 1; i < row_end - 1; i++) { 02029 if ((*rows)[i - 1].ri_->has_leaders && 02030 (*rows)[i].ri_->has_leaders && 02031 (*rows)[i + 1].ri_->has_leaders) { 02032 const ParagraphModel *model = theory->AddModel( 02033 ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0)); 02034 (*rows)[i].AddStartLine(model); 02035 } 02036 } 02037 } 02038 02039 // Collect sequences of unique hypotheses in row registers and create proper 02040 // paragraphs for them, referencing the paragraphs in row_owners. 
02041 void ConvertHypothesizedModelRunsToParagraphs( 02042 int debug_level, 02043 const GenericVector<RowScratchRegisters> &rows, 02044 GenericVector<PARA *> *row_owners, 02045 ParagraphTheory *theory) { 02046 int end = rows.size(); 02047 int start; 02048 for (; end > 0; end = start) { 02049 start = end - 1; 02050 const ParagraphModel *model = NULL; 02051 // TODO(eger): Be smarter about dealing with multiple hypotheses. 02052 bool single_line_paragraph = false; 02053 SetOfModels models; 02054 rows[start].NonNullHypotheses(&models); 02055 if (models.size() > 0) { 02056 model = models[0]; 02057 if (rows[start].GetLineType(model) != LT_BODY) 02058 single_line_paragraph = true; 02059 } 02060 if (model && !single_line_paragraph) { 02061 // walk back looking for more body lines and then a start line. 02062 while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) { 02063 // do nothing 02064 } 02065 if (start < 0 || rows[start].GetLineType(model) != LT_START) { 02066 model = NULL; 02067 } 02068 } 02069 if (model == NULL) { 02070 continue; 02071 } 02072 // rows[start, end) should be a paragraph. 02073 PARA *p = new PARA(); 02074 if (model == kCrownLeft || model == kCrownRight) { 02075 p->is_very_first_or_continuation = true; 02076 // Crown paragraph. 02077 // If we can find an existing ParagraphModel that fits, use it, 02078 // else create a new one. 02079 for (int row = end; row < rows.size(); row++) { 02080 if ((*row_owners)[row] && 02081 (ValidBodyLine(&rows, start, (*row_owners)[row]->model) && 02082 (start == 0 || 02083 ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) { 02084 model = (*row_owners)[row]->model; 02085 break; 02086 } 02087 } 02088 if (model == kCrownLeft) { 02089 // No subsequent model fits, so cons one up. 
02090 model = theory->AddModel(ParagraphModel( 02091 JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_, 02092 0, 0, Epsilon(rows[start].ri_->average_interword_space))); 02093 } else if (model == kCrownRight) { 02094 // No subsequent model fits, so cons one up. 02095 model = theory->AddModel(ParagraphModel( 02096 JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rmargin_, 02097 0, 0, Epsilon(rows[start].ri_->average_interword_space))); 02098 } 02099 } 02100 rows[start].SetUnknown(); 02101 rows[start].AddStartLine(model); 02102 for (int i = start + 1; i < end; i++) { 02103 rows[i].SetUnknown(); 02104 rows[i].AddBodyLine(model); 02105 } 02106 p->model = model; 02107 p->has_drop_cap = rows[start].ri_->has_drop_cap; 02108 p->is_list_item = 02109 model->justification() == JUSTIFICATION_RIGHT 02110 ? rows[start].ri_->rword_indicates_list_item 02111 : rows[start].ri_->lword_indicates_list_item; 02112 for (int row = start; row < end; row++) { 02113 if ((*row_owners)[row] != NULL) { 02114 tprintf("Memory leak! ConvertHypothesizeModelRunsToParagraphs() called " 02115 "more than once!\n"); 02116 } 02117 (*row_owners)[row] = p; 02118 } 02119 } 02120 } 02121 02122 struct Interval { 02123 Interval() : begin(0), end(0) {} 02124 Interval(int b, int e) : begin(b), end(e) {} 02125 02126 int begin; 02127 int end; 02128 }; 02129 02130 // Return whether rows[row] appears to be stranded, meaning that the evidence 02131 // for this row is very weak due to context. For instance, two lines of source 02132 // code may happen to be indented at the same tab vector as body text starts, 02133 // leading us to think they are two start-of-paragraph lines. This is not 02134 // optimal. However, we also don't want to mark a sequence of short dialog 02135 // as "weak," so our heuristic is: 02136 // (1) If a line is surrounded by lines of unknown type, it's weak. 
02137 // (2) If two lines in a row are start lines for a given paragraph type, but 02138 // after that the same paragraph type does not continue, they're weak. 02139 bool RowIsStranded(const GenericVector<RowScratchRegisters> &rows, int row) { 02140 SetOfModels row_models; 02141 rows[row].StrongHypotheses(&row_models); 02142 02143 for (int m = 0; m < row_models.size(); m++) { 02144 bool all_starts = rows[row].GetLineType(); 02145 int run_length = 1; 02146 bool continues = true; 02147 for (int i = row - 1; i >= 0 && continues; i--) { 02148 SetOfModels models; 02149 rows[i].NonNullHypotheses(&models); 02150 switch (rows[i].GetLineType(row_models[m])) { 02151 case LT_START: run_length++; break; 02152 case LT_MULTIPLE: // explicit fall-through 02153 case LT_BODY: run_length++; all_starts = false; break; 02154 case LT_UNKNOWN: // explicit fall-through 02155 default: continues = false; 02156 } 02157 } 02158 continues = true; 02159 for (int i = row + 1; i < rows.size() && continues; i++) { 02160 SetOfModels models; 02161 rows[i].NonNullHypotheses(&models); 02162 switch (rows[i].GetLineType(row_models[m])) { 02163 case LT_START: run_length++; break; 02164 case LT_MULTIPLE: // explicit fall-through 02165 case LT_BODY: run_length++; all_starts = false; break; 02166 case LT_UNKNOWN: // explicit fall-through 02167 default: continues = false; 02168 } 02169 } 02170 if (run_length > 2 || (!all_starts && run_length > 1)) return false; 02171 } 02172 return true; 02173 } 02174 02175 // Go through rows[row_start, row_end) and gather up sequences that need better 02176 // classification. 02177 // + Sequences of non-empty rows without hypotheses. 02178 // + Crown paragraphs not immediately followed by a strongly modeled line. 02179 // + Single line paragraphs surrounded by text that doesn't match the 02180 // model. 
02181 void LeftoverSegments(const GenericVector<RowScratchRegisters> &rows, 02182 GenericVector<Interval> *to_fix, 02183 int row_start, int row_end) { 02184 to_fix->clear(); 02185 for (int i = row_start; i < row_end; i++) { 02186 bool needs_fixing = false; 02187 02188 SetOfModels models; 02189 SetOfModels models_w_crowns; 02190 rows[i].StrongHypotheses(&models); 02191 rows[i].NonNullHypotheses(&models_w_crowns); 02192 if (models.empty() && models_w_crowns.size() > 0) { 02193 // Crown paragraph. Is it followed by a modeled line? 02194 for (int end = i + 1; end < rows.size(); end++) { 02195 SetOfModels end_models; 02196 SetOfModels strong_end_models; 02197 rows[end].NonNullHypotheses(&end_models); 02198 rows[end].StrongHypotheses(&strong_end_models); 02199 if (end_models.size() == 0) { 02200 needs_fixing = true; 02201 break; 02202 } else if (strong_end_models.size() > 0) { 02203 needs_fixing = false; 02204 break; 02205 } 02206 } 02207 } else if (models.empty() && rows[i].ri_->num_words > 0) { 02208 // No models at all. 02209 needs_fixing = true; 02210 } 02211 02212 if (!needs_fixing && !models.empty()) { 02213 needs_fixing = RowIsStranded(rows, i); 02214 } 02215 02216 if (needs_fixing) { 02217 if (!to_fix->empty() && to_fix->back().end == i - 1) 02218 to_fix->back().end = i; 02219 else 02220 to_fix->push_back(Interval(i, i)); 02221 } 02222 } 02223 // Convert inclusive intervals to half-open intervals. 02224 for (int i = 0; i < to_fix->size(); i++) { 02225 (*to_fix)[i].end = (*to_fix)[i].end + 1; 02226 } 02227 } 02228 02229 // Given a set of row_owners pointing to PARAs or NULL (no paragraph known), 02230 // normalize each row_owner to point to an actual PARA, and output the 02231 // paragraphs in order onto paragraphs. 
02232 void CanonicalizeDetectionResults( 02233 GenericVector<PARA *> *row_owners, 02234 PARA_LIST *paragraphs) { 02235 GenericVector<PARA *> &rows = *row_owners; 02236 paragraphs->clear(); 02237 PARA_IT out(paragraphs); 02238 PARA *formerly_null = NULL; 02239 for (int i = 0; i < rows.size(); i++) { 02240 if (rows[i] == NULL) { 02241 if (i == 0 || rows[i - 1] != formerly_null) { 02242 rows[i] = formerly_null = new PARA(); 02243 } else { 02244 rows[i] = formerly_null; 02245 continue; 02246 } 02247 } else if (i > 0 && rows[i - 1] == rows[i]) { 02248 continue; 02249 } 02250 out.add_after_then_move(rows[i]); 02251 } 02252 } 02253 02254 // Main entry point for Paragraph Detection Algorithm. 02255 // 02256 // Given a set of equally spaced textlines (described by row_infos), 02257 // Split them into paragraphs. 02258 // 02259 // Output: 02260 // row_owners - one pointer for each row, to the paragraph it belongs to. 02261 // paragraphs - this is the actual list of PARA objects. 02262 // models - the list of paragraph models referenced by the PARA objects. 02263 // caller is responsible for deleting the models. 02264 void DetectParagraphs(int debug_level, 02265 GenericVector<RowInfo> *row_infos, 02266 GenericVector<PARA *> *row_owners, 02267 PARA_LIST *paragraphs, 02268 GenericVector<ParagraphModel *> *models) { 02269 GenericVector<RowScratchRegisters> rows; 02270 ParagraphTheory theory(models); 02271 02272 // Initialize row_owners to be a bunch of NULL pointers. 02273 row_owners->init_to_size(row_infos->size(), NULL); 02274 02275 // Set up row scratch registers for the main algorithm. 02276 rows.init_to_size(row_infos->size(), RowScratchRegisters()); 02277 for (int i = 0; i < row_infos->size(); i++) { 02278 rows[i].Init((*row_infos)[i]); 02279 } 02280 02281 // Pass 1: 02282 // Detect sequences of lines that all contain leader dots (.....) 02283 // These are likely Tables of Contents. 
If there are three text lines in 02284 // a row with leader dots, it's pretty safe to say the middle one should 02285 // be a paragraph of its own. 02286 SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory); 02287 02288 DebugDump(debug_level > 1, "End of Pass 1", theory, rows); 02289 02290 GenericVector<Interval> leftovers; 02291 LeftoverSegments(rows, &leftovers, 0, rows.size()); 02292 for (int i = 0; i < leftovers.size(); i++) { 02293 // Pass 2a: 02294 // Find any strongly evidenced start-of-paragraph lines. If they're 02295 // followed by two lines that look like body lines, make a paragraph 02296 // model for that and see if that model applies throughout the text 02297 // (that is, "smear" it). 02298 StrongEvidenceClassify(debug_level, &rows, 02299 leftovers[i].begin, leftovers[i].end, &theory); 02300 02301 // Pass 2b: 02302 // If we had any luck in pass 2a, we got part of the page and didn't 02303 // know how to classify a few runs of rows. Take the segments that 02304 // didn't find a model and reprocess them individually. 02305 GenericVector<Interval> leftovers2; 02306 LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end); 02307 bool pass2a_was_useful = leftovers2.size() > 1 || 02308 (leftovers2.size() == 1 && 02309 (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size())); 02310 if (pass2a_was_useful) { 02311 for (int j = 0; j < leftovers2.size(); j++) { 02312 StrongEvidenceClassify(debug_level, &rows, 02313 leftovers2[j].begin, leftovers2[j].end, 02314 &theory); 02315 } 02316 } 02317 } 02318 02319 DebugDump(debug_level > 1, "End of Pass 2", theory, rows); 02320 02321 // Pass 3: 02322 // These are the dregs for which we didn't have enough strong textual 02323 // and geometric clues to form matching models for. Let's see if 02324 // the geometric clues are simple enough that we could just use those. 
02325 LeftoverSegments(rows, &leftovers, 0, rows.size()); 02326 for (int i = 0; i < leftovers.size(); i++) { 02327 GeometricClassify(debug_level, &rows, 02328 leftovers[i].begin, leftovers[i].end, &theory); 02329 } 02330 02331 // Undo any flush models for which there's little evidence. 02332 DowngradeWeakestToCrowns(debug_level, &theory, &rows); 02333 02334 DebugDump(debug_level > 1, "End of Pass 3", theory, rows); 02335 02336 // Pass 4: 02337 // Take everything that's still not marked up well and clear all markings. 02338 LeftoverSegments(rows, &leftovers, 0, rows.size()); 02339 for (int i = 0; i < leftovers.size(); i++) { 02340 for (int j = leftovers[i].begin; j < leftovers[i].end; j++) { 02341 rows[j].SetUnknown(); 02342 } 02343 } 02344 02345 DebugDump(debug_level > 1, "End of Pass 4", theory, rows); 02346 02347 // Convert all of the unique hypothesis runs to PARAs. 02348 ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, 02349 &theory); 02350 02351 DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows); 02352 02353 // Finally, clean up any dangling NULL row paragraph parents. 02354 CanonicalizeDetectionResults(row_owners, paragraphs); 02355 } 02356 02357 // ============ Code interfacing with the rest of Tesseract ================== 02358 02359 void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, 02360 RowInfo *info) { 02361 // Set up text, lword_text, and rword_text (mostly for debug printing). 
02362 STRING fake_text; 02363 PageIterator pit(static_cast<const PageIterator&>(it)); 02364 bool first_word = true; 02365 if (!pit.Empty(RIL_WORD)) { 02366 do { 02367 fake_text += "x"; 02368 if (first_word) info->lword_text += "x"; 02369 info->rword_text += "x"; 02370 if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) && 02371 !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) { 02372 fake_text += " "; 02373 info->rword_text = ""; 02374 first_word = false; 02375 } 02376 } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) && 02377 pit.Next(RIL_SYMBOL)); 02378 } 02379 if (fake_text.size() == 0) return; 02380 02381 int lspaces = info->pix_ldistance / info->average_interword_space; 02382 for (int i = 0; i < lspaces; i++) { 02383 info->text += ' '; 02384 } 02385 info->text += fake_text; 02386 02387 // Set up lword_box, rword_box, and num_words. 02388 PAGE_RES_IT page_res_it = *it.PageResIt(); 02389 WERD_RES *word_res = page_res_it.restart_row(); 02390 ROW_RES *this_row = page_res_it.row(); 02391 02392 WERD_RES *lword = NULL; 02393 WERD_RES *rword = NULL; 02394 info->num_words = 0; 02395 do { 02396 if (word_res) { 02397 if (!lword) lword = word_res; 02398 if (rword != word_res) info->num_words++; 02399 rword = word_res; 02400 } 02401 word_res = page_res_it.forward(); 02402 } while (page_res_it.row() == this_row); 02403 info->lword_box = lword->word->bounding_box(); 02404 info->rword_box = rword->word->bounding_box(); 02405 } 02406 02407 02408 // Given a Tesseract Iterator pointing to a text line, fill in the paragraph 02409 // detector RowInfo with all relevant information from the row. 02410 void InitializeRowInfo(bool after_recognition, 02411 const MutableIterator &it, 02412 RowInfo *info) { 02413 if (it.PageResIt()->row() != NULL) { 02414 ROW *row = it.PageResIt()->row()->row; 02415 info->pix_ldistance = row->lmargin(); 02416 info->pix_rdistance = row->rmargin(); 02417 info->average_interword_space = 02418 row->space() > 0 ? 
row->space() : MAX(row->x_height(), 1); 02419 info->pix_xheight = row->x_height(); 02420 info->has_leaders = false; 02421 info->has_drop_cap = row->has_drop_cap(); 02422 info->ltr = true; // set below depending on word scripts 02423 } else { 02424 info->pix_ldistance = info->pix_rdistance = 0; 02425 info->average_interword_space = 1; 02426 info->pix_xheight = 1.0; 02427 info->has_leaders = false; 02428 info->has_drop_cap = false; 02429 info->ltr = true; 02430 } 02431 02432 info->num_words = 0; 02433 info->lword_indicates_list_item = false; 02434 info->lword_likely_starts_idea = false; 02435 info->lword_likely_ends_idea = false; 02436 info->rword_indicates_list_item = false; 02437 info->rword_likely_starts_idea = false; 02438 info->rword_likely_ends_idea = false; 02439 info->has_leaders = false; 02440 info->ltr = 1; 02441 02442 if (!after_recognition) { 02443 InitializeTextAndBoxesPreRecognition(it, info); 02444 return; 02445 } 02446 info->text = ""; 02447 char *text = it.GetUTF8Text(RIL_TEXTLINE); 02448 int trailing_ws_idx = strlen(text); // strip trailing space 02449 while (trailing_ws_idx > 0 && 02450 // isspace() only takes ASCII 02451 ((text[trailing_ws_idx - 1] & 0x80) == 0) && 02452 isspace(text[trailing_ws_idx - 1])) 02453 trailing_ws_idx--; 02454 if (trailing_ws_idx > 0) { 02455 int lspaces = info->pix_ldistance / info->average_interword_space; 02456 for (int i = 0; i < lspaces; i++) 02457 info->text += ' '; 02458 for (int i = 0; i < trailing_ws_idx; i++) 02459 info->text += text[i]; 02460 } 02461 delete []text; 02462 02463 if (info->text.size() == 0) { 02464 return; 02465 } 02466 02467 PAGE_RES_IT page_res_it = *it.PageResIt(); 02468 GenericVector<WERD_RES *> werds; 02469 WERD_RES *word_res = page_res_it.restart_row(); 02470 ROW_RES *this_row = page_res_it.row(); 02471 int num_leaders = 0; 02472 int ltr = 0; 02473 int rtl = 0; 02474 do { 02475 if (word_res && word_res->best_choice->unichar_string().length() > 0) { 02476 werds.push_back(word_res); 02477 ltr 
+= word_res->AnyLtrCharsInWord() ? 1 : 0; 02478 rtl += word_res->AnyRtlCharsInWord() ? 1 : 0; 02479 if (word_res->word->flag(W_REP_CHAR)) num_leaders++; 02480 } 02481 word_res = page_res_it.forward(); 02482 } while (page_res_it.row() == this_row); 02483 info->ltr = ltr >= rtl; 02484 info->has_leaders = num_leaders > 3; 02485 info->num_words = werds.size(); 02486 if (werds.size() > 0) { 02487 WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1]; 02488 info->lword_text = lword->best_choice->unichar_string().string(); 02489 info->rword_text = rword->best_choice->unichar_string().string(); 02490 info->lword_box = lword->word->bounding_box(); 02491 info->rword_box = rword->word->bounding_box(); 02492 LeftWordAttributes(lword->uch_set, lword->best_choice, 02493 info->lword_text, 02494 &info->lword_indicates_list_item, 02495 &info->lword_likely_starts_idea, 02496 &info->lword_likely_ends_idea); 02497 RightWordAttributes(rword->uch_set, rword->best_choice, 02498 info->rword_text, 02499 &info->rword_indicates_list_item, 02500 &info->rword_likely_starts_idea, 02501 &info->rword_likely_ends_idea); 02502 } 02503 }

// This is called after rows have been identified and words are recognized.
// Much of this could be implemented before word recognition, but text helps
// to identify bulleted lists and gives good signals for sentence boundaries.
//
// Arguments:
//   debug_level: forwarded to the row-based DetectParagraphs() overload.
//   after_text_recognition: forwarded to InitializeRowInfo() for every row.
//   block_start: iterator positioned inside the block to process.  It is only
//     copied, never advanced.  If it is empty at RIL_TEXTLINE the function
//     returns without touching anything.
//   models: forwarded to the row-based DetectParagraphs() overload.
//
// Side effects: clears the block's para_list(), resets the para pointer of
// every non-empty row visited to NULL, and finally points each of those rows
// at the PARA selected for it (NULL for every row of a non-text block).
void DetectParagraphs(int debug_level,
                      bool after_text_recognition,
                      const MutableIterator *block_start,
                      GenericVector<ParagraphModel *> *models) {
  // Clear out any preconceived notions.
  if (block_start->Empty(RIL_TEXTLINE)) {
    return;
  }
  BLOCK *block = block_start->PageResIt()->block()->block;
  block->para_list()->clear();
  bool is_image_block = block->poly_block() && !block->poly_block()->IsText();

  // Convert the Tesseract structures to RowInfos
  // for the paragraph detection algorithm.
  MutableIterator row(*block_start);
  if (row.Empty(RIL_TEXTLINE))
    return;  // end of input already.

  GenericVector<RowInfo> row_infos;
  do {
    // Note: "continue" in a do/while jumps to the loop condition, so the
    // iterator is still advanced past an empty row.
    if (!row.PageResIt()->row())
      continue;  // empty row.
    row.PageResIt()->row()->row->set_para(NULL);
    row_infos.push_back(RowInfo());
    RowInfo &ri = row_infos.back();
    InitializeRowInfo(after_text_recognition, row, &ri);
  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
           row.Next(RIL_TEXTLINE));

  // If we're called before text recognition, we might not have
  // tight block bounding boxes, so trim by the minimum on each side.
  if (row_infos.size() > 0) {
    int min_lmargin = row_infos[0].pix_ldistance;
    int min_rmargin = row_infos[0].pix_rdistance;
    for (int i = 1; i < row_infos.size(); i++) {
      if (row_infos[i].pix_ldistance < min_lmargin)
        min_lmargin = row_infos[i].pix_ldistance;
      if (row_infos[i].pix_rdistance < min_rmargin)
        min_rmargin = row_infos[i].pix_rdistance;
    }
    if (min_lmargin > 0 || min_rmargin > 0) {
      for (int i = 0; i < row_infos.size(); i++) {
        row_infos[i].pix_ldistance -= min_lmargin;
        row_infos[i].pix_rdistance -= min_rmargin;
      }
    }
  }

  // Run the paragraph detection algorithm.
  GenericVector<PARA *> row_owners;
  if (!is_image_block) {
    DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
                     models);
  } else {
    // Non-text block: every row gets a NULL owner.
    row_owners.init_to_size(row_infos.size(), NULL);
    CanonicalizeDetectionResults(&row_owners, block->para_list());
  }

  // Now stitch in the row_owners into the rows.
  row = *block_start;
  for (int i = 0; i < row_owners.size(); i++) {
    // Skip the same empty rows that were skipped while building row_infos.
    // If the iterator runs out before every owner has been placed, stop
    // instead of spinning forever on an exhausted iterator.
    while (!row.PageResIt()->row()) {
      if (!row.Next(RIL_TEXTLINE))
        return;
    }
    row.PageResIt()->row()->row->set_para(row_owners[i]);
    row.Next(RIL_TEXTLINE);
  }
}

}  // namespace