00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
00021 #define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
00022
00023 #include "paragraphs.h"
00024 #include "strings.h"
00025
00026
00027
00028
00029 class WERD_CHOICE;
00030
00031 namespace tesseract {
00032
00033
00034 bool AsciiLikelyListItem(const STRING &word);
00035
00036
00037 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
00038
00039
00040
00041 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
00042 const STRING &utf8,
00043 bool *is_list, bool *starts_idea, bool *ends_idea);
00044
00045
00046 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
00047 const STRING &utf8,
00048 bool *is_list, bool *starts_idea, bool *ends_idea);
00049
// Classification of a text line with respect to paragraph structure.
// Each enumerator's value is a printable ASCII character.
// NOTE(review): the per-value meanings below are inferred from the names and
// from the Start/Body setters on RowScratchRegisters -- confirm.
enum LineType {
  LT_START    = 'S',  // Presumably: the line begins a paragraph.
  LT_BODY     = 'C',  // Presumably: the line continues a paragraph.
  LT_UNKNOWN  = 'U',  // Default type of a default-constructed LineHypothesis.
  LT_MULTIPLE = 'M',  // Presumably: both start and body remain possible.
};
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068 extern const ParagraphModel *kCrownLeft;
00069 extern const ParagraphModel *kCrownRight;
00070
00071 inline bool StrongModel(const ParagraphModel *model) {
00072 return model != NULL && model != kCrownLeft && model != kCrownRight;
00073 }
00074
00075 struct LineHypothesis {
00076 LineHypothesis() : ty(LT_UNKNOWN), model(NULL) {}
00077 LineHypothesis(LineType line_type, const ParagraphModel *m)
00078 : ty(line_type), model(m) {}
00079 LineHypothesis(const LineHypothesis &other)
00080 : ty(other.ty), model(other.model) {}
00081
00082 bool operator==(const LineHypothesis &other) const {
00083 return ty == other.ty && model == other.model;
00084 }
00085
00086 LineType ty;
00087 const ParagraphModel *model;
00088 };
00089
00090 class ParagraphTheory;
00091
00092 typedef GenericVectorEqEq<const ParagraphModel *> SetOfModels;
00093
00094
00095
00096 class RowScratchRegisters {
00097 public:
00098
00099 void Init(const RowInfo &row);
00100
00101 LineType GetLineType() const;
00102
00103 LineType GetLineType(const ParagraphModel *model) const;
00104
00105
00106
00107 void SetStartLine();
00108
00109
00110
00111 void SetBodyLine();
00112
00113
00114 void AddStartLine(const ParagraphModel *model);
00115
00116 void AddBodyLine(const ParagraphModel *model);
00117
00118
00119 void SetUnknown() { hypotheses_.truncate(0); }
00120
00121
00122 void StartHypotheses(SetOfModels *models) const;
00123
00124
00125 void StrongHypotheses(SetOfModels *models) const;
00126
00127
00128 void NonNullHypotheses(SetOfModels *models) const;
00129
00130
00131 void DiscardNonMatchingHypotheses(const SetOfModels &models);
00132
00133
00134
00135 const ParagraphModel *UniqueStartHypothesis() const;
00136
00137
00138
00139 const ParagraphModel *UniqueBodyHypothesis() const;
00140
00141
00142 int OffsideIndent(tesseract::ParagraphJustification just) const {
00143 switch (just) {
00144 case tesseract::JUSTIFICATION_RIGHT: return lindent_;
00145 case tesseract::JUSTIFICATION_LEFT: return rindent_;
00146 default: return lindent_ > rindent_ ? lindent_ : rindent_;
00147 }
00148 }
00149
00150
00151 int AlignsideIndent(tesseract::ParagraphJustification just) const {
00152 switch (just) {
00153 case tesseract::JUSTIFICATION_RIGHT: return rindent_;
00154 case tesseract::JUSTIFICATION_LEFT: return lindent_;
00155 default: return lindent_ > rindent_ ? lindent_ : rindent_;
00156 }
00157 }
00158
00159
00160 static void AppendDebugHeaderFields(GenericVector<STRING> *header);
00161
00162
00163 void AppendDebugInfo(const ParagraphTheory &theory,
00164 GenericVector<STRING> *dbg) const;
00165
00166 const RowInfo *ri_;
00167
00168
00169
00170
00171
00172
00173 int lmargin_;
00174 int lindent_;
00175 int rindent_;
00176 int rmargin_;
00177
00178 private:
00179
00180 GenericVectorEqEq<LineHypothesis> hypotheses_;
00181 };
00182
00183
00184
00185 class ParagraphTheory {
00186 public:
00187
00188
00189 explicit ParagraphTheory(GenericVector<ParagraphModel *> *models)
00190 : models_(models) {}
00191 GenericVector<ParagraphModel *> &models() { return *models_; }
00192 const GenericVector<ParagraphModel *> &models() const { return *models_; }
00193
00194
00195
00196 const ParagraphModel *AddModel(const ParagraphModel &model);
00197
00198
00199 void DiscardUnusedModels(const SetOfModels &used_models);
00200
00201
00202 void NonCenteredModels(SetOfModels *models);
00203
00204
00205
00206 const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows,
00207 int start, int end) const;
00208
00209 int IndexOf(const ParagraphModel *model) const;
00210
00211 private:
00212 GenericVector<ParagraphModel *> *models_;
00213 GenericVectorEqEq<ParagraphModel *> models_we_added_;
00214 };
00215
00216 bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
00217 int row, const ParagraphModel *model);
00218 bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
00219 int row, const ParagraphModel *model);
00220 bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
00221 int a, int b, const ParagraphModel *model);
00222
00223
00224
00225
00226
00227
00228
00229
00230 class ParagraphModelSmearer {
00231 public:
00232 ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows,
00233 int row_start, int row_end,
00234 ParagraphTheory *theory);
00235
00236
00237
00238
00239 void Smear();
00240
00241 private:
00242
00243
00244
00245
00246
00247 void CalculateOpenModels(int row_start, int row_end);
00248
00249 SetOfModels &OpenModels(int row) {
00250 return open_models_[row - row_start_ + 1];
00251 }
00252
00253 ParagraphTheory *theory_;
00254 GenericVector<RowScratchRegisters> *rows_;
00255 int row_start_;
00256 int row_end_;
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266 GenericVector<SetOfModels> open_models_;
00267 };
00268
00269
00270
00271
00272 void RecomputeMarginsAndClearHypotheses(
00273 GenericVector<RowScratchRegisters> *rows, int start, int end,
00274 int percentile);
00275
00276
00277 int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
00278 int row_start, int row_end);
00279
00280
00281
00282 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
00283 const RowScratchRegisters &after,
00284 tesseract::ParagraphJustification justification);
00285
00286
00287
00288 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
00289 const RowScratchRegisters &after);
00290
00291
00292 bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
00293 int start, int end, const ParagraphModel *model);
00294
00295
00296 bool LikelyParagraphStart(const RowScratchRegisters &before,
00297 const RowScratchRegisters &after,
00298 tesseract::ParagraphJustification j);
00299
00300
00301
00302
00303 void CanonicalizeDetectionResults(
00304 GenericVector<PARA *> *row_owners,
00305 PARA_LIST *paragraphs);
00306
00307 }
00308 #endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_