tesseract  3.03
tesseract Namespace Reference

Classes

class  TessBaseAPI
class  TessResultRenderer
class  TessTextRenderer
class  TessHOcrRenderer
class  TessPDFRenderer
class  TessUnlvRenderer
class  TessBoxTextRenderer
class  CubeRecoContext
class  CubeClassifier
class  CubeTessClassifier
struct  DocQualCallbacks
class  EquationDetect
class  LTRResultIterator
class  ChoiceIterator
class  MutableIterator
class  PageIterator
struct  BlobData
class  UnicodeSpanSkipper
struct  Cluster
class  SimpleClusterer
struct  GeometricClassifierState
struct  Interval
class  RowInfo
struct  LineHypothesis
class  RowScratchRegisters
class  ParagraphTheory
class  ParagraphModelSmearer
class  ResultIterator
class  TesseractCubeCombiner
struct  TesseractStats
struct  WordData
class  Tesseract
class  ImageThresholder
class  BoxWord
class  CCStruct
class  DetLineFit
class  DPPoint
struct  FontSpacingInfo
struct  FontInfo
struct  FontSet
class  FontInfoTable
struct  ParamsTrainingHypothesis
class  ParamsTrainingBundle
class  UnicharIdArrayUtils
class  AmbigSpec
class  UnicharAmbigs
class  BitVector
class  CCUtilMutex
class  CCUtil
class  DoublePtr
class  GenericHeap
class  PointerVector
class  IndexMap
class  IndexMapBiDi
struct  KDPair
struct  KDPairInc
struct  KDPairDec
class  KDPtrPair
struct  KDPtrPairInc
struct  KDPtrPairDec
class  KDVector
class  ObjectCache
struct  ParamsVectors
class  ParamUtils
class  Param
class  IntParam
class  BoolParam
class  StringParam
class  DoubleParam
class  TFile
class  TessdataManager
class  Classify
class  ErrorCounter
class  IntFeatureDist
class  IntFeatureMap
class  IntFeatureSpace
class  ClassPruner
struct  ShapeDist
class  MasterTrainer
class  SampleIterator
class  ShapeClassifier
struct  UnicharRating
struct  ShapeRating
struct  ShapeQueueEntry
struct  UnicharAndFonts
class  Shape
class  ShapeTable
class  TessClassifier
class  TrainingSample
class  TrainingSampleSet
class  AltList
class  BeamSearch
class  Bmp8
class  CachedFile
class  CharAltList
struct  Bigram
struct  CharBigram
struct  CharBigramTable
class  CharBigrams
class  CharSamp
class  CharSampEnum
class  CharSampSet
class  CharSet
class  CharClassifier
class  CharClassifierFactory
class  ConCompPt
class  ConComp
class  ConvNetCharClassifier
class  CubeLineObject
class  CubeLineSegmenter
class  CubeObject
class  CubeSearchObject
class  CubeTuningParams
class  CubeUtils
class  FeatureBase
class  FeatureBmp
class  FeatureChebyshev
class  FeatureHybrid
class  HybridNeuralNetCharClassifier
class  LangModEdge
class  LangModel
class  SearchColumn
class  SearchNode
class  SearchNodeHashTable
class  SearchObject
class  TessLangModEdge
class  TessLangModel
class  TuningParams
class  WordAltList
class  WordListLangModel
struct  PairSizeInfo
struct  FontPairSizeInfo
class  WordSizeModel
class  WordUnigrams
class  CUtil
struct  NodeChild
class  Dawg
struct  DawgPosition
class  DawgPositionVector
class  SquishedDawg
struct  DawgLoader
class  DawgCache
struct  DawgArgs
class  Dict
class  Trie
class  InputFileBuffer
class  NeuralNet
class  Neuron
struct  AlignedBlobParams
class  AlignedBlob
class  BaselineRow
class  BaselineBlock
class  BaselineDetect
class  GridBase
class  IntGrid
class  BBGrid
struct  PtrHash
class  GridSearch
class  TabEventHandler
class  BlobGrid
class  CCNonTextDetect
class  ColumnFinder
class  ColPartition
class  ColPartitionGrid
class  ColPartitionSet
class  PixelHistogram
class  ShiroRekhaSplitter
class  EquationDetectBase
class  ImageFind
class  LineFinder
class  StrokeWidth
class  TabFind
class  ColSegment
class  TableFinder
class  StructuredTable
class  TableRecognizer
class  TabConstraint
class  TabVector
class  TextlineProjection
class  Textord
class  WorkingPartSet
class  BoxChar
class  File
class  InputBuffer
class  OutputBuffer
class  IcuErrorCode
class  LigatureTable
class  PangoFontInfo
class  FontUtils
class  StringRenderer
struct  SpacingProperties
struct  AssociateStats
class  AssociateUtils
class  LanguageModel
struct  LMConsistencyInfo
class  LMPainPoints
struct  LanguageModelDawgInfo
struct  LanguageModelNgramInfo
struct  ViterbiStateEntry
struct  LanguageModelState
struct  BestChoiceBundle
class  ParamsModel
class  SegSearchPending
class  FRAGMENT
class  Wordrec

Typedefs

typedef int(Dict::* DictFunc )(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
typedef double(Dict::* ProbabilityInContextFunc )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
typedef float(Dict::* ParamsModelClassifyFunc )(const char *lang, void *path)
typedef void(Wordrec::* FillLatticeFunc )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
typedef TessCallback4< const
UNICHARSET &, int,
PageIterator *, Pix * > 
TruthCallback
typedef GenericVectorEqEq
< const ParagraphModel * > 
SetOfModels
typedef void(Tesseract::* WordRecognizer )(WordData *word_data, WERD_RES *word)
typedef GenericVector
< ParamsTrainingHypothesis > 
ParamsTrainingHypothesisList
typedef GenericVector< UNICHAR_ID > UnicharIdVector
typedef GenericVector
< AmbigSpec_LIST * > 
UnicharAmbigsVector
typedef bool(* FileReader )(const STRING &filename, GenericVector< char > *data)
typedef bool(* FileWriter )(const GenericVector< char > &data, const STRING &filename)
typedef KDPairInc< int, int > IntKDPair
typedef GenericHeap
< ShapeQueueEntry > 
ShapeQueue
typedef signed int char_32
typedef basic_string< char_32 > string_32
typedef GenericVector< NodeChild > NodeChildVector
typedef GenericVector< int > SuccessorList
typedef GenericVector
< SuccessorList * > 
SuccessorListsVector
typedef GenericVector< Dawg * > DawgVector
typedef GridSearch< BLOBNBOX,
BLOBNBOX_CLIST, BLOBNBOX_C_IT > 
BlobGridSearch
typedef GridSearch
< ColPartition,
ColPartition_CLIST,
ColPartition_C_IT > 
ColPartitionGridSearch
typedef GenericVector
< ColPartitionSet * > 
PartSetVector
typedef TessResultCallback1
< bool, int > 
WidthCallback
typedef BBGrid< ColSegment,
ColSegment_CLIST,
ColSegment_C_IT > 
ColSegmentGrid
typedef GridSearch< ColSegment,
ColSegment_CLIST,
ColSegment_C_IT > 
ColSegmentGridSearch
typedef hash_map< string,
string, StringHash > 
LigHash
typedef GenericHeap
< MatrixCoordPair > 
PainPointHeap
typedef unsigned char LanguageModelFlagsType

Enumerations

enum  LineType { LT_START = 'S', LT_BODY = 'C', LT_UNKNOWN = 'U', LT_MULTIPLE = 'M' }
enum  CMD_EVENTS { ACTION_1_CMD_EVENT, RECOG_WERDS, RECOG_PSEUDO, ACTION_2_CMD_EVENT }
enum  NormalizationMode { NM_BASELINE = -3, NM_CHAR_ISOTROPIC = -2, NM_CHAR_ANISOTROPIC = -1 }
enum  kParamsTrainingFeatureType {
  PTRAIN_DIGITS_SHORT, PTRAIN_DIGITS_MED, PTRAIN_DIGITS_LONG, PTRAIN_NUM_SHORT,
  PTRAIN_NUM_MED, PTRAIN_NUM_LONG, PTRAIN_DOC_SHORT, PTRAIN_DOC_MED,
  PTRAIN_DOC_LONG, PTRAIN_DICT_SHORT, PTRAIN_DICT_MED, PTRAIN_DICT_LONG,
  PTRAIN_FREQ_SHORT, PTRAIN_FREQ_MED, PTRAIN_FREQ_LONG, PTRAIN_SHAPE_COST_PER_CHAR,
  PTRAIN_NGRAM_COST_PER_CHAR, PTRAIN_NUM_BAD_PUNC, PTRAIN_NUM_BAD_CASE, PTRAIN_XHEIGHT_CONSISTENCY,
  PTRAIN_NUM_BAD_CHAR_TYPE, PTRAIN_NUM_BAD_SPACING, PTRAIN_NUM_BAD_FONT, PTRAIN_RATING_PER_CHAR,
  PTRAIN_NUM_FEATURE_TYPES
}
enum  Orientation { ORIENTATION_PAGE_UP = 0, ORIENTATION_PAGE_RIGHT = 1, ORIENTATION_PAGE_DOWN = 2, ORIENTATION_PAGE_LEFT = 3 }
enum  WritingDirection { WRITING_DIRECTION_LEFT_TO_RIGHT = 0, WRITING_DIRECTION_RIGHT_TO_LEFT = 1, WRITING_DIRECTION_TOP_TO_BOTTOM = 2 }
enum  TextlineOrder { TEXTLINE_ORDER_LEFT_TO_RIGHT = 0, TEXTLINE_ORDER_RIGHT_TO_LEFT = 1, TEXTLINE_ORDER_TOP_TO_BOTTOM = 2 }
enum  PageSegMode {
  PSM_OSD_ONLY, PSM_AUTO_OSD, PSM_AUTO_ONLY, PSM_AUTO,
  PSM_SINGLE_COLUMN, PSM_SINGLE_BLOCK_VERT_TEXT, PSM_SINGLE_BLOCK, PSM_SINGLE_LINE,
  PSM_SINGLE_WORD, PSM_CIRCLE_WORD, PSM_SINGLE_CHAR, PSM_SPARSE_TEXT,
  PSM_SPARSE_TEXT_OSD, PSM_COUNT
}
enum  PageIteratorLevel {
  RIL_BLOCK, RIL_PARA, RIL_TEXTLINE, RIL_WORD,
  RIL_SYMBOL
}
enum  ParagraphJustification { JUSTIFICATION_UNKNOWN, JUSTIFICATION_LEFT, JUSTIFICATION_CENTER, JUSTIFICATION_RIGHT }
enum  OcrEngineMode { OEM_TESSERACT_ONLY, OEM_CUBE_ONLY, OEM_TESSERACT_CUBE_COMBINED, OEM_DEFAULT }
enum  ScriptPos { SP_NORMAL, SP_SUBSCRIPT, SP_SUPERSCRIPT, SP_DROPCAP }
enum  AmbigType {
  NOT_AMBIG, REPLACE_AMBIG, DEFINITE_AMBIG, SIMILAR_AMBIG,
  CASE_AMBIG, AMBIG_TYPE_COUNT
}
enum  SetParamConstraint { SET_PARAM_CONSTRAINT_NONE, SET_PARAM_CONSTRAINT_DEBUG_ONLY, SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY, SET_PARAM_CONSTRAINT_NON_INIT_ONLY }
enum  TessdataType {
  TESSDATA_LANG_CONFIG, TESSDATA_UNICHARSET, TESSDATA_AMBIGS, TESSDATA_INTTEMP,
  TESSDATA_PFFMTABLE, TESSDATA_NORMPROTO, TESSDATA_PUNC_DAWG, TESSDATA_SYSTEM_DAWG,
  TESSDATA_NUMBER_DAWG, TESSDATA_FREQ_DAWG, TESSDATA_FIXED_LENGTH_DAWGS, TESSDATA_CUBE_UNICHARSET,
  TESSDATA_CUBE_SYSTEM_DAWG, TESSDATA_SHAPE_TABLE, TESSDATA_BIGRAM_DAWG, TESSDATA_UNAMBIG_DAWG,
  TESSDATA_PARAMS_MODEL, TESSDATA_NUM_ENTRIES
}
enum  CharSegmentationType { CST_FRAGMENT, CST_WHOLE, CST_IMPROPER, CST_NGRAM }
enum  CountTypes {
  CT_UNICHAR_TOP_OK, CT_UNICHAR_TOP1_ERR, CT_UNICHAR_TOP2_ERR, CT_UNICHAR_TOPN_ERR,
  CT_UNICHAR_TOPTOP_ERR, CT_OK_MULTI_UNICHAR, CT_OK_JOINED, CT_OK_BROKEN,
  CT_REJECT, CT_FONT_ATTR_ERR, CT_OK_MULTI_FONT, CT_NUM_RESULTS,
  CT_RANK, CT_REJECTED_JUNK, CT_ACCEPTED_JUNK, CT_SIZE
}
enum  DawgType {
  DAWG_TYPE_PUNCTUATION, DAWG_TYPE_WORD, DAWG_TYPE_NUMBER, DAWG_TYPE_PATTERN,
  DAWG_TYPE_COUNT
}
enum  XHeightConsistencyEnum { XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT }
enum  ColumnSpanningType {
  CST_NOISE, CST_FLOWING, CST_HEADING, CST_PULLOUT,
  CST_COUNT
}
enum  NeighbourPartitionType {
  NPT_HTEXT, NPT_VTEXT, NPT_WEAK_HTEXT, NPT_WEAK_VTEXT,
  NPT_IMAGE, NPT_COUNT
}
enum  LeftOrRight { LR_LEFT, LR_RIGHT }
enum  ColSegType {
  COL_UNKNOWN, COL_TEXT, COL_TABLE, COL_MIXED,
  COL_COUNT
}
enum  TabAlignment {
  TA_LEFT_ALIGNED, TA_LEFT_RAGGED, TA_CENTER_JUSTIFIED, TA_RIGHT_ALIGNED,
  TA_RIGHT_RAGGED, TA_SEPARATOR, TA_COUNT
}
enum  LMPainPointsType {
  LM_PPTYPE_BLAMER, LM_PPTYPE_AMBIG, LM_PPTYPE_PATH, LM_PPTYPE_SHAPE,
  LM_PPTYPE_NUM
}

Functions

double prec (double x)
long dist2 (int x1, int y1, int x2, int y2)
TBOX char_box_to_tbox (Box *char_box, TBOX word_box, int x_offset)
bool IsTextOrEquationType (PolyBlockType type)
bool IsLeftIndented (const EquationDetect::IndentType type)
bool IsRightIndented (const EquationDetect::IndentType type)
STRING RtlEmbed (const STRING &word, bool rtlify)
bool IsLatinLetter (int ch)
bool IsDigitLike (int ch)
bool IsOpeningPunct (int ch)
bool IsTerminalPunct (int ch)
const char * SkipChars (const char *str, const char *toskip)
const char * SkipChars (const char *str, bool(*skip)(int))
const char * SkipOne (const char *str, const char *toskip)
bool LikelyListNumeral (const STRING &word)
bool LikelyListMark (const STRING &word)
bool AsciiLikelyListItem (const STRING &word)
int UnicodeFor (const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
bool LikelyListMarkUnicode (int ch)
bool UniLikelyListItem (const UNICHARSET *u, const WERD_CHOICE *werd)
void LeftWordAttributes (const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
void RightWordAttributes (const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
int ClosestCluster (const GenericVector< Cluster > &clusters, int value)
void CalculateTabStops (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, int tolerance, GenericVector< Cluster > *left_tabs, GenericVector< Cluster > *right_tabs)
void MarkRowsWithModel (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, const ParagraphModel *model, bool ltr, int eop_threshold)
void GeometricClassifyThreeTabStopTextBlock (int debug_level, GeometricClassifierState &s, ParagraphTheory *theory)
void GeometricClassify (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
bool ValidFirstLine (const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool ValidBodyLine (const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool CrownCompatible (const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
void DiscardUnusedModels (const GenericVector< RowScratchRegisters > &rows, ParagraphTheory *theory)
void DowngradeWeakestToCrowns (int debug_level, ParagraphTheory *theory, GenericVector< RowScratchRegisters > *rows)
void RecomputeMarginsAndClearHypotheses (GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
int InterwordSpace (const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end)
bool FirstWordWouldHaveFit (const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
bool FirstWordWouldHaveFit (const RowScratchRegisters &before, const RowScratchRegisters &after)
bool TextSupportsBreak (const RowScratchRegisters &before, const RowScratchRegisters &after)
bool LikelyParagraphStart (const RowScratchRegisters &before, const RowScratchRegisters &after)
bool LikelyParagraphStart (const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification j)
ParagraphModel InternalParagraphModelByOutline (const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance, bool *consistent)
ParagraphModel ParagraphModelByOutline (int debug_level, const GenericVector< RowScratchRegisters > *rows, int start, int end, int tolerance)
bool RowsFitModel (const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
void MarkStrongEvidence (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end)
void ModelStrongEvidence (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, bool allow_flush_models, ParagraphTheory *theory)
void StrongEvidenceClassify (int debug_level, GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
void SeparateSimpleLeaderLines (GenericVector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
void ConvertHypothesizedModelRunsToParagraphs (int debug_level, const GenericVector< RowScratchRegisters > &rows, GenericVector< PARA * > *row_owners, ParagraphTheory *theory)
bool RowIsStranded (const GenericVector< RowScratchRegisters > &rows, int row)
void LeftoverSegments (const GenericVector< RowScratchRegisters > &rows, GenericVector< Interval > *to_fix, int row_start, int row_end)
void CanonicalizeDetectionResults (GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs)
void DetectParagraphs (int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA * > *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel * > *models)
void InitializeTextAndBoxesPreRecognition (const MutableIterator &it, RowInfo *info)
void InitializeRowInfo (bool after_recognition, const MutableIterator &it, RowInfo *info)
void DetectParagraphs (int debug_level, bool after_text_recognition, const MutableIterator *block_start, GenericVector< ParagraphModel * > *models)
bool StrongModel (const ParagraphModel *model)
bool read_t (PAGE_RES_IT *page_res_it, TBOX *tbox)
void YOutlierPieces (WERD_RES *word, int rebuilt_blob_index, int super_y_bottom, int sub_y_top, ScriptPos *leading_pos, int *num_leading_outliers, ScriptPos *trailing_pos, int *num_trailing_outliers)
bool CompareFontInfo (const FontInfo &fi1, const FontInfo &fi2)
bool CompareFontSet (const FontSet &fs1, const FontSet &fs2)
void FontInfoDeleteCallback (FontInfo f)
void FontSetDeleteCallback (FontSet fs)
bool read_info (FILE *f, FontInfo *fi, bool swap)
bool write_info (FILE *f, const FontInfo &fi)
bool read_spacing_info (FILE *f, FontInfo *fi, bool swap)
bool write_spacing_info (FILE *f, const FontInfo &fi)
bool read_set (FILE *f, FontSet *fs, bool swap)
bool write_set (FILE *f, const FontSet &fs)
int OtsuThreshold (Pix *src_pix, int left, int top, int width, int height, int **thresholds, int **hi_values)
void HistogramRect (Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
int OtsuStats (const int *histogram, int *H_out, int *omega0_out)
int ParamsTrainingFeatureByName (const char *name)
const char * ScriptPosToString (enum ScriptPos script_pos)
 ELISTIZE (AmbigSpec)
 ELISTIZEH (AmbigSpec)
bool LoadDataFromFile (const STRING &filename, GenericVector< char > *data)
bool SaveDataToFile (const GenericVector< char > &data, const STRING &filename)
template<typename T >
bool cmp_eq (T const &t1, T const &t2)
template<typename T >
int sort_cmp (const void *t1, const void *t2)
template<typename T >
int sort_ptr_cmp (const void *t1, const void *t2)
TrainingSample * BlobToTrainingSample (const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
uinT8 NormalizeDirection (uinT8 dir, const FCOORD &unnormed_pos, const DENORM &denorm, const DENORM *root_denorm)
void ClearFeatureSpaceWindow (NORM_METHOD norm_method, ScrollView *window)
void CallWithUTF8 (TessCallback1< const char * > *cb, const WERD_CHOICE *wc)
Pix * GridReducedPix (const TBOX &box, int gridsize, ICOORD bleft, int *left, int *bottom)
Pix * TraceOutlineOnReducedPix (C_OUTLINE *outline, int gridsize, ICOORD bleft, int *left, int *bottom)
Pix * TraceBlockOnReducedPix (BLOCK *block, int gridsize, ICOORD bleft, int *left, int *bottom)
template<class BBC >
int SortByBoxLeft (const void *void1, const void *void2)
template<class BBC >
int SortRightToLeft (const void *void1, const void *void2)
template<class BBC >
int SortByBoxBottom (const void *void1, const void *void2)
template<typename T >
void DeleteObject (T *object)
void ParseCommandLineFlags (const char *usage, int *argc, char ***argv, const bool remove_flags)
ShapeTable * LoadShapeTable (const STRING &file_prefix)
void WriteShapeTable (const STRING &file_prefix, const ShapeTable &shape_table)
MasterTrainer * LoadTrainingData (int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
Pix * DegradeImage (Pix *input, int exposure, float *rotation)
void UTF8ToUTF32 (const char *utf8_str, GenericVector< char32 > *str32)
void UTF32ToUTF8 (const GenericVector< char32 > &str32, STRING *utf8_str)
bool is_hyphen_punc (const char32 ch)
bool is_single_quote (const char32 ch)
bool is_double_quote (const char32 ch)
STRING NormalizeUTF8String (const char *str8)
void NormalizeChar32 (char32 ch, GenericVector< char32 > *str)
char32 OCRNormalize (char32 ch)
bool IsOCREquivalent (char32 ch1, char32 ch2)
bool IsValidCodepoint (const char32 ch)
bool IsWhitespace (const char32 ch)
bool IsUTF8Whitespace (const char *text)
int SpanUTF8Whitespace (const char *text)
int SpanUTF8NotWhitespace (const char *text)
bool IsInterchangeValid (const char32 ch)
bool IsInterchangeValid7BitAscii (const char32 ch)
char32 FullwidthToHalfwidth (const char32 ch)
Pix * CairoARGB32ToPixFormat (cairo_surface_t *surface)
void ExtractFontProperties (const string &utf8_text, StringRenderer *render, const string &output_base)
bool MakeIndividualGlyphs (Pix *pix, const vector< BoxChar * > &vbox, const int input_tiff_page)
 ELISTIZE (ViterbiStateEntry)
 ELISTIZEH (ViterbiStateEntry)
template<class BLOB_CHOICE >
int SortByUnicharID (const void *void1, const void *void2)
template<class BLOB_CHOICE >
int SortByRating (const void *void1, const void *void2)

Variables

const int kBasicBufSize = 2048
const float kMathDigitDensityTh1 = 0.25
const float kMathDigitDensityTh2 = 0.1
const float kMathItalicDensityTh = 0.5
const float kUnclearDensityTh = 0.25
const int kSeedBlobsCountTh = 10
const int kLeftIndentAlignmentCountTh = 1
const int kMaxCharTopRange = 48
const int kMinCredibleResolution = 70
 Minimum believable resolution.
const int kDefaultResolution = 300
 Default resolution used if input is not believable.
const int kMaxCircleErosions = 8
const ParagraphModel * kCrownLeft = reinterpret_cast<ParagraphModel *>(0xDEAD111F)
const ParagraphModel * kCrownRight = reinterpret_cast<ParagraphModel *>(0xDEAD888F)
const inT16 kMaxBoxEdgeDiff = 2
const int kBoxClipTolerance = 2
const int kNumEndPoints = 3
const int kMinPointsForErrorCount = 16
const int kMaxRealDistance = 2.0
const int kHistogramSize = 256
const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1)
CCUtilMutex tprintfMutex
const char * kUTF8LineSeparator = "\u2028"
const char * kUTF8ParagraphSeparator = "\u2029"
const char * kLRM = "\u200E"
const char * kRLM = "\u200F"
const char * kRLE = "\u202A"
const char * kPDF = "\u202C"
const char * kHyphenLikeUTF8 []
const char * kApostropheLikeUTF8 []
const char kUniversalAmbigsFile []
const int ksizeofUniversalAmbigsFile = sizeof(kUniversalAmbigsFile)
const double kRatingEpsilon = 1.0 / 32
const int kMaxOffsetDist = 32
const double kMinPCLengthIncrease = 1.0 / 1024
const int kMinClusteredShapes = 1
const int kMaxUnicharsPerCluster = 2000
const float kFontMergeDistance = 0.025
const float kInfiniteDist = 999.0f
const int kRandomizingCenter = 128
const int kTestChar = -1
const int kSquareLimit = 25
const int kPrime1 = 17
const int kPrime2 = 13
const int kMinOutlierSamples = 5
const int kStateCnt = 4
const int kNumLiteralCnt = 5
const int case_state_table [6][4]
const char kDoNotReverse [] = "RRP_DO_NO_REVERSE"
const char kReverseIfHasRTL [] = "RRP_REVERSE_IF_HAS_RTL"
const char kForceReverse [] = "RRP_FORCE_REVERSE"
const char *const RTLReversePolicyNames []
const double kAlignedFraction = 0.03125
const double kRaggedFraction = 2.5
const double kAlignedGapFraction = 0.75
const double kRaggedGapFraction = 1.0
const int kVLineAlignment = 3
const int kVLineGutter = 1
const int kVLineSearchSize = 150
const int kMinRaggedTabs = 5
const int kMinAlignedTabs = 4
const int kVLineMinLength = 500
const double kMinTabGradient = 4.0
const int kMaxSkewFactor = 15
const char * kTextordDebugPix = "psdebug_pix"
const double kMaxSmallNeighboursPerPix = 1.0 / 32
const int kMaxLargeOverlapsWithSmall = 3
const int kMaxMediumOverlapsWithSmall = 12
const int kMaxLargeOverlapsWithMedium = 12
const int kOriginalNoiseMultiple = 8
const int kNoisePadding = 4
const double kPhotoOffsetFraction = 0.375
const double kMinGoodTextPARatio = 1.5
const int kMinColumnWidth = 100
const int kMaxIncompatibleColumnCount = 2
const double kMarginOverlapFraction = 0.25
const double kHorizontalGapMergeFraction = 0.5
const double kMinNonNoiseFraction = 0.5
const double kMinGutterWidthGrid = 0.5
const double kMaxDistToPartSizeRatio = 1.5
bool textord_tabfind_show_initial_partitions = false
bool textord_tabfind_show_reject_blobs = false
int textord_tabfind_show_partitions = 0
bool textord_tabfind_show_columns = false
bool textord_tabfind_show_blocks = false
bool textord_tabfind_find_tables = true
const int kMaxPartnerDepth = 4
const double kMaxSpacingDrift = 1.0 / 72
const double kMaxTopSpacingFraction = 0.25
const double kMaxSameBlockLineSpacing = 3
const double kMaxSizeRatio = 1.5
const double kMaxLeaderGapFractionOfMax = 0.25
const double kMaxLeaderGapFractionOfMin = 0.5
const int kMinLeaderCount = 5
const int kLeaderCutCost = 8
const int kMinStrongTextValue = 6
const int kMinChainTextValue = 3
const int kHorzStrongTextlineCount = 8
const int kHorzStrongTextlineHeight = 10
const int kHorzStrongTextlineAspect = 5
const double kMaxBaselineError = 0.4375
const double kMinBaselineCoverage = 0.5
const int kMaxRMSColorNoise = 128
const int kMaxColorDistance = 900
const int kRGBRMSColors = 4
bool textord_tabfind_show_color_fit = false
const int kMaxPadFactor = 6
const int kMaxNeighbourDistFactor = 4
const int kMaxCaptionLines = 7
const double kMinCaptionGapRatio = 2.0
const double kMinCaptionGapHeightRatio = 0.5
const double kBigPartSizeRatio = 1.75
const double kStrokeWidthFractionTolerance = 0.25
const double kStrokeWidthConstantTolerance = 2.0
const double kTinyEnoughTextlineOverlapFraction = 0.25
const double kMaxPartitionSpacing = 1.75
const int kSmoothDecisionMargin = 4
const double kMinRectangularFraction = 0.125
const double kMaxRectangularFraction = 0.75
const double kMaxRectangularGradient = 0.1
const int kMinImageFindSize = 100
const double kRMSFitScaling = 8.0
const int kMinColorDifference = 16
const int kThinLineFraction = 20
 Denominator of resolution makes max pixel width to allow thin lines.
const int kMinLineLengthFraction = 4
 Denominator of resolution makes min pixels to demand line lengths to be.
const int kCrackSpacing = 100
 Spacing of cracks across the page to break up tall vertical lines.
const int kLineFindGridSize = 50
 Grid size used by line finder. Not very critical.
const int kMinThickLineWidth = 12
const int kMaxLineResidue = 6
const double kThickLengthMultiple = 0.75
const double kMaxNonLineDensity = 0.25
const double kMaxStaveHeight = 1.0
const double kMinMusicPixelFraction = 0.75
int textord_tabfind_show_strokewidths = 0
bool textord_tabfind_only_strokewidths = false
bool textord_tabfind_vertical_text = true
bool textord_tabfind_force_vertical_text = false
bool textord_tabfind_vertical_horizontal_mix = true
double textord_tabfind_vertical_text_ratio = 0.5
const double kStrokeWidthTolerance = 1.5
const double kStrokeWidthFractionCJK = 0.25
const double kStrokeWidthCJK = 2.0
const int kCJKRadius = 2
const double kCJKBrokenDistanceFraction = 0.25
const int kCJKMaxComponents = 8
const double kCJKAspectRatio = 1.25
const double kCJKAspectRatioIncrease = 1.0625
const int kMaxCJKSizeRatio = 5
const double kBrokenCJKIterationFraction = 0.125
const double kDiacriticXPadRatio = 7.0
const double kDiacriticYPadRatio = 1.75
const double kMinDiacriticSizeRatio = 1.0625
const double kMaxDiacriticDistanceRatio = 1.25
const double kMaxDiacriticGapToBaseCharHeight = 1.0
const int kSearchRadius = 2
const int kLineTrapLongest = 4
const int kLineTrapShortest = 2
const int kMostlyOneDirRatio = 3
const double kLineResidueAspectRatio = 8.0
const int kLineResiduePadRatio = 3
const double kLineResidueSizeRatio = 1.75
const float kSizeRatioToReject = 2.0
const int kMaxLargeOverlaps = 3
const double kNeighbourSearchFactor = 2.5
const int kTabRadiusFactor = 5
const int kMinVerticalSearch = 3
const int kMaxVerticalSearch = 12
const int kMaxRaggedSearch = 25
const int kMinLinesInColumn = 10
const double kMinFractionalLinesInColumn = 0.125
const double kMinGutterWidthAbsolute = 0.02
const double kMaxGutterWidthAbsolute = 2.00
const int kRaggedGutterMultiple = 5
const double kLineFragmentAspectRatio = 10.0
const double kSmoothFactor = 0.25
const double kCharVerticalOverlapFraction = 0.375
const double kMaxHorizontalGap = 3.0
const int kMinEvaluatedTabs = 3
const int kMaxTextLineBlobRatio = 5
const int kMinTextLineBlobRatio = 3
const double kMinImageArea = 0.5
const double kCosMaxSkewAngle = 0.866025
bool textord_tabfind_show_initialtabs = false
bool textord_tabfind_show_finaltabs = false
double textord_tabfind_aligned_gap_fraction = 0.75
const int kColumnWidthFactor = 20
const int kMaxVerticalSpacing = 500
const int kMaxBlobWidth = 500
const double kSplitPartitionSize = 2.0
const double kAllowTextHeight = 0.5
const double kAllowTextWidth = 0.6
const double kAllowTextArea = 0.8
const double kAllowBlobHeight = 0.3
const double kAllowBlobWidth = 0.4
const double kAllowBlobArea = 0.05
const int kMinBoxesInTextPartition = 10
const int kMaxBoxesInDataPartition = 20
const double kMaxGapInTextPartition = 4.0
const double kMinMaxGapInTextPartition = 0.5
const double kMaxBlobOverlapFactor = 4.0
const double kMaxTableCellXheight = 2.0
const int kMaxColumnHeaderDistance = 4
const double kTableColumnThreshold = 3.0
const int kRulingVerticalMargin = 3
const double kMinOverlapWithTable = 0.6
const int kSideSpaceMargin = 10
const double kSmallTableProjectionThreshold = 0.35
const double kLargeTableProjectionThreshold = 0.45
const int kLargeTableRowCount = 6
const int kMinRowsInTable = 3
const double kRequiredFullJustifiedSpacing = 4.0
const int kAdjacentLeaderSearchPadding = 2
const double kParagraphEndingPreviousLineRatio = 1.3
const double kMaxParagraphEndingLeftSpaceMultiple = 3.0
const double kMinParagraphEndingTextToWhitespaceRatio = 3.0
const double kMaxXProjectionGapFactor = 2.0
const double kStrokeWidthFractionalTolerance = 0.25
bool textord_dump_table_images = false
bool textord_show_tables = false
bool textord_tablefind_show_mark = false
bool textord_tablefind_show_stats = false
bool textord_tablefind_recognize_tables = false
const double kHorizontalSpacing = 0.30
const double kVerticalSpacing = -0.2
const int kCellSplitRowThreshold = 0
const int kCellSplitColumnThreshold = 0
const int kLinedTableMinVerticalLines = 3
const int kLinedTableMinHorizontalLines = 3
const double kRequiredColumns = 0.7
const double kMarginFactor = 1.1
const double kMaxRowSize = 2.5
const double kGoodRowNumberOfColumnsSmall [] = { 2, 2, 2, 2, 2, 3, 3 }
const int kGoodRowNumberOfColumnsSmallSize
const double kGoodRowNumberOfColumnsLarge = 0.7
const double kMinFilledArea = 0.35
const int kGutterMultiple = 4
const int kGutterToNeighbourRatio = 3
const int kSimilarVectorDist = 10
const int kSimilarRaggedDist = 50
const int kMaxFillinMultiple = 11
const double kMinGutterFraction = 0.5
const double kLineCountReciprocal = 4.0
const double kMinAlignedGutter = 0.25
const double kMinRaggedGutter = 1.5
double textord_tabvector_vertical_gap_fraction = 0.5
double textord_tabvector_vertical_box_ratio = 0.5
const char * kAlignmentNames []
const float kRotationRange = 0.02f
const int kExposureFactor = 16
const int kSaltnPepper = 5
const int kMinRampSize = 1000
const int kMinLigature = 0xfb00
const int kMaxLigature = 0xfb4f

Detailed Description

recog_pseudo_word

Make a word from the selected blobs and run Tess on them.

Parameters:
page_res: recognise blobs
selection_box: within this box

fp_eval_word_spacing() Evaluation function for fixed pitch word lists.

Basically, count the number of "nice" characters - those which are in tess acceptable words or in dict words and are not rejected. Penalise any potential noise chars.

process_selected_words()

Walk the current block list applying the specified word processor function to each word that overlaps the selection_box.

build_menu()

Construct the menu tree used by the command window

process_cmd_win_event()

Process a command returned from the command window (Just call the appropriate command handler)

word_blank_and_set_display() Word processor

Blank display of word then redisplay word according to current display mode settings

---------------------------------------------------------------------------- Public Code ----------------------------------------------------------------------------

---------------------------------------------------------------------------- Include Files and Type Defines ----------------------------------------------------------------------------

---------------------------------------------------------------------------- Include Files and Type Defines ---------------------------------------------------------------------------- ---------------------------------------------------------------------------- Public Code ----------------------------------------------------------------------------

---------------------------------------------------------------------------- Private Code ----------------------------------------------------------------------------


Typedef Documentation

typedef GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> tesseract::BlobGridSearch

Definition at line 31 of file blobgrid.h.

typedef signed int tesseract::char_32

Definition at line 40 of file string_32.h.

typedef GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> tesseract::ColPartitionGridSearch

Definition at line 908 of file colpartition.h.

typedef BBGrid<ColSegment, ColSegment_CLIST, ColSegment_C_IT> tesseract::ColSegmentGrid

Definition at line 118 of file tablefind.h.

typedef GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> tesseract::ColSegmentGridSearch

Definition at line 121 of file tablefind.h.

Definition at line 50 of file dict.h.

typedef int(Dict::* tesseract::DictFunc)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const

Definition at line 77 of file baseapi.h.

typedef bool(* tesseract::FileReader)(const STRING &filename, GenericVector< char > *data)

Definition at line 323 of file genericvector.h.

typedef bool(* tesseract::FileWriter)(const GenericVector< char > &data, const STRING &filename)

Definition at line 326 of file genericvector.h.

typedef void(Wordrec::* tesseract::FillLatticeFunc)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)

Definition at line 86 of file baseapi.h.

typedef KDPairInc<int, int> tesseract::IntKDPair

Definition at line 179 of file kdpair.h.

typedef unsigned char tesseract::LanguageModelFlagsType

Definition at line 37 of file lm_state.h.

typedef hash_map<string, string, StringHash> tesseract::LigHash

Definition at line 32 of file ligature_table.h.

Definition at line 67 of file dawg.h.

typedef float(Dict::* tesseract::ParamsModelClassifyFunc)(const char *lang, void *path)

Definition at line 84 of file baseapi.h.

typedef double(Dict::* tesseract::ProbabilityInContextFunc)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

Definition at line 79 of file baseapi.h.

Definition at line 94 of file paragraphs_internal.h.

Definition at line 141 of file shapetable.h.

typedef basic_string<char_32> tesseract::string_32

Definition at line 41 of file string_32.h.

Definition at line 68 of file dawg.h.

Definition at line 69 of file dawg.h.

Definition at line 91 of file baseapi.h.

typedef GenericVector<AmbigSpec_LIST *> tesseract::UnicharAmbigsVector

Definition at line 142 of file ambigs.h.

Definition at line 34 of file ambigs.h.

Definition at line 45 of file tabfind.h.

typedef void(Tesseract::* tesseract::WordRecognizer)(WordData *word_data, WERD_RES *word)

Definition at line 151 of file tesseractclass.h.


Enumeration Type Documentation

Enumerator:
NOT_AMBIG 
REPLACE_AMBIG 
DEFINITE_AMBIG 
SIMILAR_AMBIG 
CASE_AMBIG 
AMBIG_TYPE_COUNT 

Definition at line 44 of file ambigs.h.

               {
  NOT_AMBIG,        // the ngram pair is not ambiguous
  REPLACE_AMBIG,    // ocred ngram should always be substituted with correct
  DEFINITE_AMBIG,   // add correct ngram to the classifier results (1-1)
  SIMILAR_AMBIG,    // use pairwise classifier for ocred/correct pair (1-1)
  CASE_AMBIG,       // this is a case ambiguity (1-1)

  AMBIG_TYPE_COUNT  // number of enum entries
};
Enumerator:
CST_FRAGMENT 
CST_WHOLE 
CST_IMPROPER 
CST_NGRAM 

Definition at line 53 of file classify.h.

                          {
  CST_FRAGMENT,  // A partial character.
  CST_WHOLE,     // A correctly segmented character.
  CST_IMPROPER,  // More than one but less than 2 characters.
  CST_NGRAM      // Multiple characters.
};
Enumerator:
ACTION_1_CMD_EVENT 
RECOG_WERDS 
RECOG_PSEUDO 
ACTION_2_CMD_EVENT 

Definition at line 470 of file tessedit.cpp.

Enumerator:
COL_UNKNOWN 
COL_TEXT 
COL_TABLE 
COL_MIXED 
COL_COUNT 

Definition at line 30 of file tablefind.h.

Enumerator:
CST_NOISE 
CST_FLOWING 
CST_HEADING 
CST_PULLOUT 
CST_COUNT 

Definition at line 47 of file colpartition.h.

                        {
  CST_NOISE,        // Strictly between columns.
  CST_FLOWING,      // Strictly within a single column.
  CST_HEADING,      // Spans multiple columns.
  CST_PULLOUT,      // Touches multiple columns, but doesn't span them.
  CST_COUNT         // Number of entries.
};
Enumerator:
CT_UNICHAR_TOP_OK 
CT_UNICHAR_TOP1_ERR 
CT_UNICHAR_TOP2_ERR 
CT_UNICHAR_TOPN_ERR 
CT_UNICHAR_TOPTOP_ERR 
CT_OK_MULTI_UNICHAR 
CT_OK_JOINED 
CT_OK_BROKEN 
CT_REJECT 
CT_FONT_ATTR_ERR 
CT_OK_MULTI_FONT 
CT_NUM_RESULTS 
CT_RANK 
CT_REJECTED_JUNK 
CT_ACCEPTED_JUNK 
CT_SIZE 

Definition at line 69 of file errorcounter.h.

                {
  CT_UNICHAR_TOP_OK,     // Top shape contains correct unichar id.
  // The rank of the results in TOP1, TOP2, TOPN is determined by a gap of
  // kRatingEpsilon from the first result in each group. The real top choice
  // is measured using TOPTOP.
  CT_UNICHAR_TOP1_ERR,   // Top shape does not contain correct unichar id.
  CT_UNICHAR_TOP2_ERR,   // Top 2 shapes don't contain correct unichar id.
  CT_UNICHAR_TOPN_ERR,   // No output shape contains correct unichar id.
  CT_UNICHAR_TOPTOP_ERR,   // Very top choice not correct.
  CT_OK_MULTI_UNICHAR,   // Top shape id has correct unichar id, and others.
  CT_OK_JOINED,          // Top shape id is correct but marked joined.
  CT_OK_BROKEN,          // Top shape id is correct but marked broken.
  CT_REJECT,             // Classifier hates this.
  CT_FONT_ATTR_ERR,      // Top unichar OK, but font attributes incorrect.
  CT_OK_MULTI_FONT,      // CT_FONT_ATTR_OK but there are multiple font attrs.
  CT_NUM_RESULTS,        // Number of answers produced.
  CT_RANK,               // Rank of correct answer.
  CT_REJECTED_JUNK,      // Junk that was correctly rejected.
  CT_ACCEPTED_JUNK,      // Junk that was incorrectly classified otherwise.

  CT_SIZE                // Number of types for array sizing.
};
Enumerator:
DAWG_TYPE_PUNCTUATION 
DAWG_TYPE_WORD 
DAWG_TYPE_NUMBER 
DAWG_TYPE_PATTERN 
DAWG_TYPE_COUNT 

Definition at line 71 of file dawg.h.

Enumerator:
PTRAIN_DIGITS_SHORT 
PTRAIN_DIGITS_MED 
PTRAIN_DIGITS_LONG 
PTRAIN_NUM_SHORT 
PTRAIN_NUM_MED 
PTRAIN_NUM_LONG 
PTRAIN_DOC_SHORT 
PTRAIN_DOC_MED 
PTRAIN_DOC_LONG 
PTRAIN_DICT_SHORT 
PTRAIN_DICT_MED 
PTRAIN_DICT_LONG 
PTRAIN_FREQ_SHORT 
PTRAIN_FREQ_MED 
PTRAIN_FREQ_LONG 
PTRAIN_SHAPE_COST_PER_CHAR 
PTRAIN_NGRAM_COST_PER_CHAR 
PTRAIN_NUM_BAD_PUNC 
PTRAIN_NUM_BAD_CASE 
PTRAIN_XHEIGHT_CONSISTENCY 
PTRAIN_NUM_BAD_CHAR_TYPE 
PTRAIN_NUM_BAD_SPACING 
PTRAIN_NUM_BAD_FONT 
PTRAIN_RATING_PER_CHAR 
PTRAIN_NUM_FEATURE_TYPES 

Definition at line 39 of file params_training_featdef.h.

                                {
  // Digits
  PTRAIN_DIGITS_SHORT,             // 0
  PTRAIN_DIGITS_MED,               // 1
  PTRAIN_DIGITS_LONG,              // 2
  // Number or pattern (NUMBER_PERM, USER_PATTERN_PERM)
  PTRAIN_NUM_SHORT,                // 3
  PTRAIN_NUM_MED,                  // 4
  PTRAIN_NUM_LONG,                 // 5
  // Document word (DOC_DAWG_PERM)
  PTRAIN_DOC_SHORT,                // 6
  PTRAIN_DOC_MED,                  // 7
  PTRAIN_DOC_LONG,                 // 8
  // Word (SYSTEM_DAWG_PERM, USER_DAWG_PERM, COMPOUND_PERM)
  PTRAIN_DICT_SHORT,               // 9
  PTRAIN_DICT_MED,                 // 10
  PTRAIN_DICT_LONG,                // 11
  // Frequent word (FREQ_DAWG_PERM)
  PTRAIN_FREQ_SHORT,               // 12
  PTRAIN_FREQ_MED,                 // 13
  PTRAIN_FREQ_LONG,                // 14
  PTRAIN_SHAPE_COST_PER_CHAR,      // 15
  PTRAIN_NGRAM_COST_PER_CHAR,      // 16
  PTRAIN_NUM_BAD_PUNC,             // 17
  PTRAIN_NUM_BAD_CASE,             // 18
  PTRAIN_XHEIGHT_CONSISTENCY,      // 19
  PTRAIN_NUM_BAD_CHAR_TYPE,        // 20
  PTRAIN_NUM_BAD_SPACING,          // 21
  PTRAIN_NUM_BAD_FONT,             // 22
  PTRAIN_RATING_PER_CHAR,          // 23

  PTRAIN_NUM_FEATURE_TYPES
};
Enumerator:
LR_LEFT 
LR_RIGHT 

Definition at line 39 of file strokewidth.h.

Enumerator:
LT_START 
LT_BODY 
LT_UNKNOWN 
LT_MULTIPLE 

Definition at line 54 of file paragraphs_internal.h.

              {
  LT_START = 'S',     // First line of a paragraph.
  LT_BODY = 'C',      // Continuation line of a paragraph.
  LT_UNKNOWN = 'U',   // No clues.
  LT_MULTIPLE = 'M',  // Matches for both LT_START and LT_BODY.
};
Enumerator:
LM_PPTYPE_BLAMER 
LM_PPTYPE_AMBIG 
LM_PPTYPE_PATH 
LM_PPTYPE_SHAPE 
LM_PPTYPE_NUM 

Definition at line 37 of file lm_pain_points.h.

Enumerator:
NPT_HTEXT 
NPT_VTEXT 
NPT_WEAK_HTEXT 
NPT_WEAK_VTEXT 
NPT_IMAGE 
NPT_COUNT 

Definition at line 1499 of file colpartitiongrid.cpp.

                            {
  NPT_HTEXT,       // Definite horizontal text.
  NPT_VTEXT,       // Definite vertical text.
  NPT_WEAK_HTEXT,  // Weakly horizontal text. Counts as HTEXT for HTEXT, but
                   // image for image and VTEXT.
  NPT_WEAK_VTEXT,  // Weakly vertical text. Counts as VTEXT for VTEXT, but
                   // image for image and HTEXT.
  NPT_IMAGE,       // Definite non-text.
  NPT_COUNT        // Number of array elements.
};
Enumerator:
NM_BASELINE 
NM_CHAR_ISOTROPIC 
NM_CHAR_ANISOTROPIC 

Definition at line 44 of file normalis.h.

                       {
  NM_BASELINE = -3,         // The original BL normalization mode.
  NM_CHAR_ISOTROPIC = -2,   // Character normalization but isotropic.
  NM_CHAR_ANISOTROPIC = -1  // The original CN normalization mode.
};

When Tesseract/Cube is initialized we can choose to instantiate/load/run only the Tesseract part, only the Cube part or both along with the combiner. The preference of which engine to use is stored in tessedit_ocr_engine_mode.

ATTENTION: When modifying this enum, please make sure to make the appropriate changes to all the enums mirroring it (e.g. OCREngine in cityblock/workflow/detection/detection_storage.proto). Such enums will mention the connection to OcrEngineMode in the comments.

Enumerator:
OEM_TESSERACT_ONLY 
OEM_CUBE_ONLY 
OEM_TESSERACT_CUBE_COMBINED 
OEM_DEFAULT 

Definition at line 244 of file publictypes.h.

                   {
  OEM_TESSERACT_ONLY,           // Run Tesseract only - fastest
  OEM_CUBE_ONLY,                // Run Cube only - better accuracy, but slower
  OEM_TESSERACT_CUBE_COMBINED,  // Run both and combine results - best accuracy
  OEM_DEFAULT                   // Specify this mode when calling init_*(),
                                // to indicate that any of the above modes
                                // should be automatically inferred from the
                                // variables in the language-specific config,
                                // command-line configs, or if not specified
                                // in any of the above should be set to the
                                // default OEM_TESSERACT_ONLY.
};

+------------------+  Orientation Example:
| 1 Aaaa Aaaa Aaaa |  ====================
| Aaa aa aaa aa    |  To left is a diagram of some (1) English and
| aaaaaa A aa aaa. |  (2) Chinese text and a (3) photo credit.
|                2 |
|   #######  c c C |  Upright Latin characters are represented as A and a.
|   #######  c c c |  '<' represents a latin character rotated
| < #######  c c c |      anti-clockwise 90 degrees.
| < #######  c   c |
| < #######  .   c |  Upright Chinese characters are represented C and c.
| 3 #######      c |
+------------------+  NOTA BENE: enum values here should match goodoc.proto

If you orient your head so that "up" aligns with Orientation, then the characters will appear "right side up" and readable.

In the example above, both the English and Chinese paragraphs are oriented so their "up" is the top of the page (page up). The photo credit is read with one's head turned leftward ("up" is to page left).

The values of this enum match the convention of Tesseract's osdetect.h

Enumerator:
ORIENTATION_PAGE_UP 
ORIENTATION_PAGE_RIGHT 
ORIENTATION_PAGE_DOWN 
ORIENTATION_PAGE_LEFT 

Definition at line 108 of file publictypes.h.

enum of the elements of the page hierarchy, used in ResultIterator to provide functions that operate on each level without having to have 5x as many functions.

Enumerator:
RIL_BLOCK 
RIL_PARA 
RIL_TEXTLINE 
RIL_WORD 
RIL_SYMBOL 

Definition at line 195 of file publictypes.h.

                       {
  RIL_BLOCK,     // Block of text/image/separator line.
  RIL_PARA,      // Paragraph within a block.
  RIL_TEXTLINE,  // Line within a paragraph.
  RIL_WORD,      // Word within a textline.
  RIL_SYMBOL     // Symbol/character within a word.
};

Possible modes for page layout analysis. These *must* be kept in order of decreasing amount of layout analysis to be done, except for OSD_ONLY, so that the inequality test macros below work.

Enumerator:
PSM_OSD_ONLY 

Orientation and script detection only.

PSM_AUTO_OSD 

Automatic page segmentation with orientation and script detection. (OSD)

PSM_AUTO_ONLY 

Automatic page segmentation, but no OSD, or OCR.

PSM_AUTO 

Fully automatic page segmentation, but no OSD.

PSM_SINGLE_COLUMN 

Assume a single column of text of variable sizes.

PSM_SINGLE_BLOCK_VERT_TEXT 

Assume a single uniform block of vertically aligned text.

PSM_SINGLE_BLOCK 

Assume a single uniform block of text. (Default.)

PSM_SINGLE_LINE 

Treat the image as a single text line.

PSM_SINGLE_WORD 

Treat the image as a single word.

PSM_CIRCLE_WORD 

Treat the image as a single word in a circle.

PSM_SINGLE_CHAR 

Treat the image as a single character.

PSM_SPARSE_TEXT 

Find as much text as possible in no particular order.

PSM_SPARSE_TEXT_OSD 

Sparse text with orientation and script det.

PSM_COUNT 

Number of enum entries.

Definition at line 151 of file publictypes.h.

JUSTIFICATION_UNKNOWN The alignment is not clearly one of the other options. This could happen for example if there are only one or two lines of text or the text looks like source code or poetry.

NOTA BENE: Fully justified paragraphs (text aligned to both left and right margins) are marked by Tesseract with JUSTIFICATION_LEFT if their text is written with a left-to-right script and with JUSTIFICATION_RIGHT if their text is written in a right-to-left script.

Interpretation for text read in vertical lines: "Left" is wherever the starting reading position is.

JUSTIFICATION_LEFT Each line, except possibly the first, is flush to the same left tab stop.

JUSTIFICATION_CENTER The text lines of the paragraph are centered about a line going down through the middle of the text lines.

JUSTIFICATION_RIGHT Each line, except possibly the first, is flush to the same right tab stop.

Enumerator:
JUSTIFICATION_UNKNOWN 
JUSTIFICATION_LEFT 
JUSTIFICATION_CENTER 
JUSTIFICATION_RIGHT 

Definition at line 227 of file publictypes.h.

Enumerator:
SP_NORMAL 
SP_SUBSCRIPT 
SP_SUPERSCRIPT 
SP_DROPCAP 

Definition at line 245 of file ratngs.h.

Enumerator:
SET_PARAM_CONSTRAINT_NONE 
SET_PARAM_CONSTRAINT_DEBUG_ONLY 
SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY 
SET_PARAM_CONSTRAINT_NON_INIT_ONLY 

Definition at line 36 of file params.h.

Enumerator:
TA_LEFT_ALIGNED 
TA_LEFT_RAGGED 
TA_CENTER_JUSTIFIED 
TA_RIGHT_ALIGNED 
TA_RIGHT_RAGGED 
TA_SEPARATOR 
TA_COUNT 

Definition at line 43 of file tabvector.h.

Enumerator:
TESSDATA_LANG_CONFIG 
TESSDATA_UNICHARSET 
TESSDATA_AMBIGS 
TESSDATA_INTTEMP 
TESSDATA_PFFMTABLE 
TESSDATA_NORMPROTO 
TESSDATA_PUNC_DAWG 
TESSDATA_SYSTEM_DAWG 
TESSDATA_NUMBER_DAWG 
TESSDATA_FREQ_DAWG 
TESSDATA_FIXED_LENGTH_DAWGS 
TESSDATA_CUBE_UNICHARSET 
TESSDATA_CUBE_SYSTEM_DAWG 
TESSDATA_SHAPE_TABLE 
TESSDATA_BIGRAM_DAWG 
TESSDATA_UNAMBIG_DAWG 
TESSDATA_PARAMS_MODEL 
TESSDATA_NUM_ENTRIES 

Definition at line 53 of file tessdatamanager.h.

The text lines are read in the given sequence.

In English, the order is top-to-bottom. In Chinese, vertical text lines are read right-to-left. Mongolian is written in vertical columns top to bottom like Chinese, but the lines are ordered left-to-right.

Note that only some combinations make sense. For example, WRITING_DIRECTION_LEFT_TO_RIGHT implies TEXTLINE_ORDER_TOP_TO_BOTTOM

Enumerator:
TEXTLINE_ORDER_LEFT_TO_RIGHT 
TEXTLINE_ORDER_RIGHT_TO_LEFT 
TEXTLINE_ORDER_TOP_TO_BOTTOM 

Definition at line 140 of file publictypes.h.

The grapheme clusters within a line of text are laid out logically in this direction, judged when looking at the text line rotated so that its Orientation is "page up".

For English text, the writing direction is left-to-right. For the Chinese text in the above example, the writing direction is top-to-bottom.

Enumerator:
WRITING_DIRECTION_LEFT_TO_RIGHT 
WRITING_DIRECTION_RIGHT_TO_LEFT 
WRITING_DIRECTION_TOP_TO_BOTTOM 

Definition at line 123 of file publictypes.h.

Enumerator:
XH_GOOD 
XH_SUBNORMAL 
XH_INCONSISTENT 

Definition at line 78 of file dict.h.


Function Documentation

bool tesseract::AsciiLikelyListItem ( const STRING word)

Definition at line 267 of file paragraphs.cpp.

                                             {
  // A word is treated as a likely list item when it looks like either a
  // list mark or a list numeral; the numeral check only runs if the mark
  // check fails.
  if (LikelyListMark(word)) {
    return true;
  }
  return LikelyListNumeral(word);
}
TrainingSample * tesseract::BlobToTrainingSample ( const TBLOB blob,
bool  nonlinear_norm,
INT_FX_RESULT_STRUCT fx_info,
GenericVector< INT_FEATURE_STRUCT > *  bl_features 
)

Definition at line 81 of file intfx.cpp.

                                                    {
  // Extracts features from the blob (baseline-normalized features go to
  // bl_features, character-normalized ones stay local) and builds a
  // TrainingSample from the character-normalized features.
  // Returns NULL when no features were produced or no sample was created.
  GenericVector<INT_FEATURE_STRUCT> cn_features;
  Classify::ExtractFeatures(blob, nonlinear_norm, bl_features,
                            &cn_features, fx_info, NULL);
  // TODO(rays) Use blob->PreciseBoundingBox() instead.
  TBOX box = blob.bounding_box();
  const int num_features = fx_info->NumCN;
  if (num_features <= 0) {
    return NULL;
  }
  TrainingSample* sample = TrainingSample::CopyFromFeatures(
      *fx_info, box, &cn_features[0], num_features);
  if (sample == NULL) {
    return NULL;
  }

  // Map two opposite corners of the box back through the blob's denorm and
  // store the resulting box (in original image coordinates) in the sample.
  TPOINT topleft, botright;
  topleft.x = box.left();
  topleft.y = box.top();
  botright.x = box.right();
  botright.y = box.bottom();
  TPOINT original_topleft, original_botright;
  blob.denorm().DenormTransform(NULL, topleft, &original_topleft);
  blob.denorm().DenormTransform(NULL, botright, &original_botright);
  sample->set_bounding_box(TBOX(original_topleft.x, original_botright.y,
                                original_botright.x, original_topleft.y));
  return sample;
}
Pix* tesseract::CairoARGB32ToPixFormat ( cairo_surface_t *  surface)

Definition at line 77 of file stringrenderer.cpp.

                                                      {
  // Copies the pixel data of a Cairo image surface into a newly created
  // 32-bit Pix with the same width and height, and returns that Pix.
  // Returns NULL (after printing a message) if the surface is not in
  // CAIRO_FORMAT_ARGB32.
  if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
    printf("Unexpected surface format %d\n",
           cairo_image_surface_get_format(surface));
    return NULL;
  }
  const int width = cairo_image_surface_get_width(surface);
  const int height = cairo_image_surface_get_height(surface);
  Pix* pix = pixCreate(width, height, 32);
  // Length of one source row as reported by Cairo.
  int byte_stride = cairo_image_surface_get_stride(surface);

  // Copy one row at a time. Each destination row starts one byte past the
  // beginning of the Pix row, so every source byte lands one byte later
  // (presumably to map Cairo's ARGB32 byte layout onto the channel order
  // the Pix expects -- TODO confirm). The last row copies one byte fewer,
  // presumably so the shifted write stays inside the Pix buffer.
  for (int i = 0; i < height; ++i) {
    memcpy(reinterpret_cast<unsigned char*>(pix->data + i * pix->wpl) + 1,
           cairo_image_surface_get_data(surface) + i * byte_stride,
           byte_stride - ((i == height - 1) ? 1 : 0));
  }
  return pix;
}
void tesseract::CalculateTabStops ( GenericVector< RowScratchRegisters > *  rows,
int  row_start,
int  row_end,
int  tolerance,
GenericVector< Cluster > *  left_tabs,
GenericVector< Cluster > *  right_tabs 
)

Definition at line 691 of file paragraphs.cpp.

                                                           {
  // Clusters the left and right indents of rows [row_start, row_end) and
  // writes the resulting clusters to *left_tabs and *right_tabs.
  // `tolerance` is passed to every SimpleClusterer used here.
  // Does nothing if AcceptableRowArgs() rejects the row arguments.
  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
    return;
  // First pass: toss all left and right indents into clusterers.
  SimpleClusterer initial_lefts(tolerance);
  SimpleClusterer initial_rights(tolerance);
  GenericVector<Cluster> initial_left_tabs;
  GenericVector<Cluster> initial_right_tabs;
  for (int i = row_start; i < row_end; i++) {
    initial_lefts.Add((*rows)[i].lindent_);
    initial_rights.Add((*rows)[i].rindent_);
  }
  initial_lefts.GetClusters(&initial_left_tabs);
  initial_rights.GetClusters(&initial_right_tabs);

  // Second pass: cluster only lines that are not "stray"
  //   An example of a stray line is a page number -- a line whose start
  //   and end tab-stops are far outside the typical start and end tab-stops
  //   for the block.
  //   Put another way, we only cluster data from lines whose start or end
  //   tab stop is frequent.
  SimpleClusterer lefts(tolerance);
  SimpleClusterer rights(tolerance);

  // Outlier elimination.  We might want to switch this to test outlier-ness
  // based on how strange a position an outlier is in instead of or in addition
  // to how rare it is.  These outliers get re-added if we end up having too
  // few tab stops, to work with, however.
  // The cut-off grows with the number of rows: 0 below 8 rows, 1 from 8 rows,
  // 2 from 20 rows.
  int infrequent_enough_to_ignore = 0;
  if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
  if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;

  // Keep a row if the first-pass cluster nearest to either of its indents
  // has more members than the cut-off.
  for (int i = row_start; i < row_end; i++) {
    int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
    int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
    if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
        initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
      lefts.Add((*rows)[i].lindent_);
      rights.Add((*rows)[i].rindent_);
    }
  }
  lefts.GetClusters(left_tabs);
  rights.GetClusters(right_tabs);

  if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
      (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
    // One side is really ragged, and the other only has one tab stop,
    // so those "insignificant outliers" are probably important, actually.
    // This often happens on a page of an index.  Add back in the ones
    // we omitted in the first pass.
    // The test below is the exact negation of the keep-test above, so only
    // the previously skipped rows are added.
    for (int i = row_start; i < row_end; i++) {
      int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
      int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
      if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
            initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
        lefts.Add((*rows)[i].lindent_);
        rights.Add((*rows)[i].rindent_);
      }
    }
  }
  // Read the clusters again, since rows may have been added back above.
  lefts.GetClusters(left_tabs);
  rights.GetClusters(right_tabs);

  // If one side is almost a two-indent aligned side, and the other clearly
  // isn't, try to prune out the least frequent tab stop from that side.
  if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
    // Find the left tab stop with the smallest count (scanning from the end,
    // so on ties the highest index wins).
    int to_prune = -1;
    for (int i = left_tabs->size() - 1; i >= 0; i--) {
      if (to_prune < 0 ||
          (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
        to_prune = i;
      }
    }
    // Only drop it if it is rare enough to count as an outlier.
    if (to_prune >= 0 &&
        (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
      left_tabs->remove(to_prune);
    }
  }
  // Same pruning with the roles of the two sides swapped.
  if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
    int to_prune = -1;
    for (int i = right_tabs->size() - 1; i >= 0; i--) {
      if (to_prune < 0 ||
          (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
        to_prune = i;
      }
    }
    if (to_prune >= 0 &&
        (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
      right_tabs->remove(to_prune);
    }
  }
}
void tesseract::CallWithUTF8 ( TessCallback1< const char * > *  cb,
const WERD_CHOICE wc 
)

Definition at line 112 of file dawg.cpp.

                                                                          {
  // Fetch the choice's string form (the lengths output is not requested)
  // and hand it to the callback as a C string.
  STRING choice_text;
  wc->string_and_lengths(&choice_text, NULL);
  const char* utf8 = choice_text.string();
  cb->Run(utf8);
}
void tesseract::CanonicalizeDetectionResults ( GenericVector< PARA * > *  row_owners,
PARA_LIST *  paragraphs 
)

Definition at line 2232 of file paragraphs.cpp.

                           {
  // Rebuilds *paragraphs from *row_owners so that each distinct run of rows
  // sharing an owner contributes its PARA exactly once, in row order.
  // Rows with no owner (NULL) are given one: consecutive ownerless rows
  // share a single freshly allocated PARA.
  GenericVector<PARA *> &owners = *row_owners;
  paragraphs->clear();
  PARA_IT out(paragraphs);
  PARA *filler = NULL;  // PARA most recently created for ownerless rows.
  for (int i = 0; i < owners.size(); i++) {
    const bool has_previous = i > 0;
    if (owners[i] != NULL) {
      // Same owner as the row before: already emitted.
      if (has_previous && owners[i - 1] == owners[i]) {
        continue;
      }
    } else if (has_previous && owners[i - 1] == filler) {
      // Continue the current run of ownerless rows.
      owners[i] = filler;
      continue;
    } else {
      // Start a new run of ownerless rows.
      filler = new PARA();
      owners[i] = filler;
    }
    out.add_after_then_move(owners[i]);
  }
}
TBOX tesseract::char_box_to_tbox ( Box *  char_box,
TBOX  word_box,
int  x_offset 
)

Definition at line 42 of file cube_control.cpp.

                                                                  {
  // Converts a character Box (read via boxGetGeometry as left/top/width/
  // height) into a TBOX positioned relative to word_box: x is shifted by
  // word_box.left() - x_offset, and the top is measured down from
  // word_box.bottom() + word_box.height().
  l_int32 box_x;
  l_int32 box_y;
  l_int32 box_w;
  l_int32 box_h;
  boxGetGeometry(char_box, &box_x, &box_y, &box_w, &box_h);

  const l_int32 left = box_x + (word_box.left() - x_offset);
  const l_int32 top = word_box.bottom() + word_box.height() - box_y;
  const l_int32 right = left + box_w;
  const l_int32 bottom = top - box_h;
  return TBOX(left, bottom, right, top);
}
void tesseract::ClearFeatureSpaceWindow ( NORM_METHOD  norm_method,
ScrollView window 
)

Definition at line 1149 of file intproto.cpp.

                                                                          {
  // Clears the window and redraws the feature-space guides in grey: the
  // outer limit rectangle, plus either four horizontal guide lines (for
  // the `baseline` norm method) or a centered rectangle (any other method).
  window->Clear();
  window->Pen(ScrollView::GREY);

  // Feature space limit rectangle.
  window->Rectangle(0, 0, INT_MAX_X, INT_MAX_Y);

  if (norm_method != baseline) {
    window->Rectangle(INT_XCENTER - INT_XRADIUS, INT_YCENTER - INT_YRADIUS,
                      INT_XCENTER + INT_XRADIUS, INT_YCENTER + INT_YRADIUS);
    return;
  }

  // Full-width horizontal lines at the descender, baseline, x-height and
  // cap-height levels, in that order.
  window->SetCursor(0, INT_DESCENDER);
  window->DrawTo(INT_MAX_X, INT_DESCENDER);

  window->SetCursor(0, INT_BASELINE);
  window->DrawTo(INT_MAX_X, INT_BASELINE);

  window->SetCursor(0, INT_XHEIGHT);
  window->DrawTo(INT_MAX_X, INT_XHEIGHT);

  window->SetCursor(0, INT_CAPHEIGHT);
  window->DrawTo(INT_MAX_X, INT_CAPHEIGHT);
}
int tesseract::ClosestCluster ( const GenericVector< Cluster > &  clusters,
int  value 
)

Definition at line 665 of file paragraphs.cpp.

                                                                      {
  // Returns the index of the cluster whose center is nearest to `value`.
  // Ties keep the earliest index; an empty vector yields 0.
  int winner = 0;
  // Index 0 is the initial winner, so the scan can start at 1.
  for (int candidate = 1; candidate < clusters.size(); ++candidate) {
    if (abs(value - clusters[candidate].center) <
        abs(value - clusters[winner].center)) {
      winner = candidate;
    }
  }
  return winner;
}
template<typename T >
bool tesseract::cmp_eq ( T const &  t1,
T const &  t2 
)

Definition at line 356 of file genericvector.h.

                                        {
  // Equality predicate that defers entirely to operator== on T.
  const bool same = (t1 == t2);
  return same;
}
bool tesseract::CompareFontInfo ( const FontInfo &  fi1,
const FontInfo &  fi2 
)

Definition at line 120 of file fontinfo.cpp.

                                                               {
  // Two FontInfo entries are considered equal when their names match.
  // Fonts sharing a name are required to share their properties too, so
  // the name alone is enough to look an entry up in the table.
  const int name_order = strcmp(fi1.name, fi2.name);
  return name_order == 0;
}
bool tesseract::CompareFontSet ( const FontSet &  fs1,
const FontSet &  fs2 
)

Definition at line 128 of file fontinfo.cpp.

                                                            {
  // Two FontSets are equal when they have the same size and the same
  // config at every position.
  const int count = fs1.size;
  if (count != fs2.size) {
    return false;
  }
  bool identical = true;
  // Stop scanning at the first mismatch.
  for (int i = 0; identical && i < count; ++i) {
    if (fs1.configs[i] != fs2.configs[i]) {
      identical = false;
    }
  }
  return identical;
}
void tesseract::ConvertHypothesizedModelRunsToParagraphs ( int  debug_level,
const GenericVector< RowScratchRegisters > &  rows,
GenericVector< PARA * > *  row_owners,
ParagraphTheory *  theory 
)

Definition at line 2041 of file paragraphs.cpp.

                             {
  // Walks the rows from last to first. Each run of rows explained by one
  // hypothesized model (a start line followed by body lines, or a single
  // non-body line) becomes a new PARA, which is recorded in (*row_owners)
  // for every row of the run. Rows for which no such run is found are left
  // without an owner.
  // A crown model (kCrownLeft / kCrownRight) is replaced by a concrete
  // model: one already owned by a later row if it fits, otherwise a new
  // model added to `theory`.
  int end = rows.size();
  int start;
  for (; end > 0; end = start) {
    start = end - 1;
    const ParagraphModel *model = NULL;
    // TODO(eger): Be smarter about dealing with multiple hypotheses.
    bool single_line_paragraph = false;
    SetOfModels models;
    rows[start].NonNullHypotheses(&models);
    if (models.size() > 0) {
      model = models[0];
      if (rows[start].GetLineType(model) != LT_BODY)
        single_line_paragraph = true;
    }
    if (model && !single_line_paragraph) {
      // walk back looking for more body lines and then a start line.
      while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
        // do nothing
      }
      // start is -1 here when the body run began at row 0, i.e. there is no
      // earlier row that could be the start line.
      if (start < 0 || rows[start].GetLineType(model) != LT_START) {
        model = NULL;
      }
    }
    if (model == NULL) {
      continue;
    }
    // rows[start, end) should be a paragraph.
    PARA *p = new PARA();
    if (model == kCrownLeft || model == kCrownRight) {
      p->is_very_first_or_continuation = true;
      // Crown paragraph.
      //   If we can find an existing ParagraphModel that fits, use it,
      //   else create a new one.
      for (int row = end; row < rows.size(); row++) {
        if ((*row_owners)[row] &&
            (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
            (start == 0 ||
             ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
          model = (*row_owners)[row]->model;
          break;
        }
      }
      if (model == kCrownLeft) {
        // No subsequent model fits, so cons one up.
        model = theory->AddModel(ParagraphModel(
            JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
            0, 0, Epsilon(rows[start].ri_->average_interword_space)));
      } else if (model == kCrownRight) {
        // No subsequent model fits, so cons one up.
        // Right-hand counterpart of the left case: margin plus indent on
        // the right side, the same quantity CrownCompatible() compares.
        model = theory->AddModel(ParagraphModel(
            JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rindent_,
            0, 0, Epsilon(rows[start].ri_->average_interword_space)));
      }
    }
    // Replace the hypotheses of the run with the chosen model: the first
    // row becomes its start line, the rest its body lines.
    rows[start].SetUnknown();
    rows[start].AddStartLine(model);
    for (int i = start + 1; i < end; i++) {
      rows[i].SetUnknown();
      rows[i].AddBodyLine(model);
    }
    p->model = model;
    p->has_drop_cap = rows[start].ri_->has_drop_cap;
    // The list-item hint comes from the word on the side the text is
    // justified to.
    p->is_list_item =
        model->justification() == JUSTIFICATION_RIGHT
            ? rows[start].ri_->rword_indicates_list_item
            : rows[start].ri_->lword_indicates_list_item;
    for (int row = start; row < end; row++) {
      // An owner that is already set would be overwritten (and lost) below.
      if ((*row_owners)[row] != NULL) {
        tprintf("Memory leak! ConvertHypothesizedModelRunsToParagraphs() called "
                "more than once!\n");
      }
      (*row_owners)[row] = p;
    }
  }
}
bool tesseract::CrownCompatible ( const GenericVector< RowScratchRegisters > *  rows,
int  a,
int  b,
const ParagraphModel model 
)

Definition at line 1288 of file paragraphs.cpp.

                                                                {
  // Reports whether rows a and b share (within tolerance) the same
  // indent + margin on the side selected by a crown model.  Any model other
  // than kCrownLeft / kCrownRight is logged and rejected.
  const bool right_crown = (model == kCrownRight);
  if (!right_crown && model != kCrownLeft) {
    tprintf("CrownCompatible() should only be called with crown models!\n");
    return false;
  }

  RowScratchRegisters &first = (*rows)[a];
  RowScratchRegisters &second = (*rows)[b];

  // The tolerance is always derived from row a's average interword space.
  if (right_crown) {
    // Right crown: compare the total right-hand offset of the two rows.
    return NearlyEqual(first.rindent_ + first.rmargin_,
                       second.rindent_ + second.rmargin_,
                       Epsilon(first.ri_->average_interword_space));
  }
  // Left crown: compare the total left-hand offset of the two rows.
  return NearlyEqual(first.lindent_ + first.lmargin_,
                     second.lindent_ + second.lmargin_,
                     Epsilon(first.ri_->average_interword_space));
}
struct Pix * tesseract::DegradeImage ( Pix *  input,
int  exposure,
float *  rotation 
) [read]

Definition at line 68 of file degradeimage.cpp.

                                                             {
  // Returns a degraded 8-bit version of input: optional erosion, a blur,
  // an optional rotation, a darkening/lightening offset, a positional ramp
  // and random noise.
  //
  // input    - consumed: it is passed to pixDestroy() below, so the caller
  //            must not use it afterwards.
  // exposure - >= 2 erodes before the blur; 1 or >= 3 erodes after it;
  //            <= 0 adds an extra darkening bias.  The greyscale offset is
  //            always shifted by exposure * kExposureFactor.
  // rotation - if NULL no rotation is applied.  Otherwise a non-zero
  //            *rotation is used as the clockwise angle in radians, a zero
  //            value requests a random angle, and the angle actually used is
  //            written back to *rotation.
  Pix* pix = pixConvertTo8(input, false);
  pixDestroy(&input);
  input = pix;
  // Dimensions are sampled once, before any of the later transformations.
  int width = pixGetWidth(input);
  int height = pixGetHeight(input);
  if (exposure >= 2) {
    // An erosion simulates the spreading darkening of a dark copy.
    // This is backwards to binary morphology,
    // see http://www.leptonica.com/grayscale-morphology.html
    pix = input;
    input = pixErodeGray(pix, 3, 3);
    pixDestroy(&pix);
  }
  // A convolution is essential to any mode as no scanner produces an
  // image as sharp as the electronic image.
  pix = pixBlockconv(input, 1, 1);
  pixDestroy(&input);
  // A small random rotation helps to make the edges jaggy in a realistic way.
  if (rotation != NULL) {
    float radians_clockwise;
    if (*rotation) {
      radians_clockwise = *rotation;
    } else {
      radians_clockwise = (2.0*rand_r(&random_seed)/RAND_MAX - 1.0) *
          kRotationRange;
    }

    input = pixRotate(pix, radians_clockwise,
                      L_ROTATE_AREA_MAP, L_BRING_IN_WHITE,
                      0, 0);
    // Rotate the boxes to match.
    *rotation = radians_clockwise;
    pixDestroy(&pix);
  } else {
    input = pix;
  }

  if (exposure >= 3 || exposure == 1) {
    // Erosion after the convolution is not as heavy as before, so it is
    // good for level 1 and in addition as a level 3.
    // This is backwards to binary morphology,
    // see http://www.leptonica.com/grayscale-morphology.html
    pix = input;
    input = pixErodeGray(pix, 3, 3);
    pixDestroy(&pix);
  }
  // The convolution really needed to be 2x2 to be realistic enough, but
  // we only have 3x3, so we have to bias the image darker or lose thin
  // strokes.
  int erosion_offset = 0;
  // For light and 0 exposure, there is no dilation, so compensate for the
  // convolution with a big darkening bias which is undone for lighter
  // exposures.
  if (exposure <= 0)
    erosion_offset = -3 * kExposureFactor;
  // Add in a general offset of the greyscales for the exposure level so
  // a threshold of 128 gives a reasonable binary result.
  erosion_offset -= exposure * kExposureFactor;
  // Add a gradual fade over the page and a small amount of salt and pepper
  // noise to simulate noise in the sensor/paper fibres and varying
  // illumination.
  l_uint32* data = pixGetData(input);
  // Use the accessor rather than reaching into the Pix struct so this keeps
  // compiling when the struct layout is not visible to client code.
  const int wpl = pixGetWpl(input);
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int pixel = GET_DATA_BYTE(data, x);
      pixel += rand_r(&random_seed) % (kSaltnPepper*2 + 1) - kSaltnPepper;
      // The positional ramp is only applied to sufficiently large images.
      if (height + width > kMinRampSize)
        pixel -= (2*x + y) * 32 / (height + width);
      pixel += erosion_offset;
      // Clamp back into the 8-bit range before storing.
      if (pixel < 0)
        pixel = 0;
      if (pixel > 255)
        pixel = 255;
      SET_DATA_BYTE(data, x, pixel);
    }
    data += wpl;
  }
  return input;
}
template<typename T >
void tesseract::DeleteObject ( T *  object)

Definition at line 165 of file tablefind.cpp.

                                                   {
  // Releases a single heap object with scalar delete.  A function form of
  // `delete`, presumably so it can be passed as a callback -- TODO confirm
  // against the callers in tablefind.cpp.
  delete object;
}
void tesseract::DetectParagraphs ( int  debug_level,
GenericVector< RowInfo > *  row_infos,
GenericVector< PARA * > *  row_owners,
PARA_LIST *  paragraphs,
GenericVector< ParagraphModel * > *  models 
)

Definition at line 2264 of file paragraphs.cpp.

                                                               {
  // Runs the multi-pass paragraph detection over the rows described by
  // row_infos.
  //   debug_level - values > 0 dump the final segmentation, values > 1 also
  //                 dump the state after each pass.
  //   row_infos   - one RowInfo per text row (input).
  //   row_owners  - resized here to one entry per row and initialised to
  //                 NULL; filled in by the conversion/canonicalisation steps
  //                 at the end.
  //   paragraphs  - handed to CanonicalizeDetectionResults() together with
  //                 row_owners.
  //   models      - backing store for the ParagraphTheory built here.
  GenericVector<RowScratchRegisters> rows;
  ParagraphTheory theory(models);

  // Initialize row_owners to be a bunch of NULL pointers.
  row_owners->init_to_size(row_infos->size(), NULL);

  // Set up row scratch registers for the main algorithm.
  rows.init_to_size(row_infos->size(), RowScratchRegisters());
  for (int i = 0; i < row_infos->size(); i++) {
    rows[i].Init((*row_infos)[i]);
  }

  // Pass 1:
  //   Detect sequences of lines that all contain leader dots (.....)
  //   These are likely Tables of Contents.  If there are three text lines in
  //   a row with leader dots, it's pretty safe to say the middle one should
  //   be a paragraph of its own.
  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);

  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);

  GenericVector<Interval> leftovers;
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (int i = 0; i < leftovers.size(); i++) {
    // Pass 2a:
    //   Find any strongly evidenced start-of-paragraph lines.  If they're
    //   followed by two lines that look like body lines, make a paragraph
    //   model for that and see if that model applies throughout the text
    //   (that is, "smear" it).
    StrongEvidenceClassify(debug_level, &rows,
                           leftovers[i].begin, leftovers[i].end, &theory);

    // Pass 2b:
    //   If we had any luck in pass 2a, we got part of the page and didn't
    //   know how to classify a few runs of rows. Take the segments that
    //   didn't find a model and reprocess them individually.
    GenericVector<Interval> leftovers2;
    LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
    // NOTE(review): "useful" is judged against the whole-page interval
    // [0, rows.size()) rather than against leftovers[i]'s own bounds --
    // confirm that this is the intended comparison.
    bool pass2a_was_useful = leftovers2.size() > 1 ||
        (leftovers2.size() == 1 &&
         (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
    if (pass2a_was_useful) {
      for (int j = 0; j < leftovers2.size(); j++) {
        StrongEvidenceClassify(debug_level, &rows,
                               leftovers2[j].begin, leftovers2[j].end,
                               &theory);
      }
    }
  }

  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);

  // Pass 3:
  //   These are the dregs for which we didn't have enough strong textual
  //   and geometric clues to form matching models for.  Let's see if
  //   the geometric clues are simple enough that we could just use those.
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (int i = 0; i < leftovers.size(); i++) {
    GeometricClassify(debug_level, &rows,
                      leftovers[i].begin, leftovers[i].end, &theory);
  }

  // Undo any flush models for which there's little evidence.
  DowngradeWeakestToCrowns(debug_level, &theory, &rows);

  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);

  // Pass 4:
  //   Take everything that's still not marked up well and clear all markings.
  LeftoverSegments(rows, &leftovers, 0, rows.size());
  for (int i = 0; i < leftovers.size(); i++) {
    for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
      rows[j].SetUnknown();
    }
  }

  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);

  // Convert all of the unique hypothesis runs to PARAs.
  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
                                           &theory);

  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);

  // Finally, clean up any dangling NULL row paragraph parents.
  CanonicalizeDetectionResults(row_owners, paragraphs);
}
void tesseract::DetectParagraphs ( int  debug_level,
bool  after_text_recognition,
const MutableIterator *  block_start,
GenericVector< ParagraphModel * > *  models 
)

Definition at line 2508 of file paragraphs.cpp.

                                                               {
  // Clear out any preconceived notions.
  if (block_start->Empty(RIL_TEXTLINE)) {
    return;
  }
  BLOCK *block = block_start->PageResIt()->block()->block;
  block->para_list()->clear();
  bool is_image_block = block->poly_block() && !block->poly_block()->IsText();

  // Convert the Tesseract structures to RowInfos
  // for the paragraph detection algorithm.
  MutableIterator row(*block_start);
  if (row.Empty(RIL_TEXTLINE))
    return;  // end of input already.

  GenericVector<RowInfo> row_infos;
  do {
    if (!row.PageResIt()->row())
      continue;  // empty row.
    row.PageResIt()->row()->row->set_para(NULL);
    row_infos.push_back(RowInfo());
    RowInfo &ri = row_infos.back();
    InitializeRowInfo(after_text_recognition, row, &ri);
  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
           row.Next(RIL_TEXTLINE));

  // If we're called before text recognition, we might not have
  // tight block bounding boxes, so trim by the minimum on each side.
  if (row_infos.size() > 0) {
    int min_lmargin = row_infos[0].pix_ldistance;
    int min_rmargin = row_infos[0].pix_rdistance;
    for (int i = 1; i < row_infos.size(); i++) {
      if (row_infos[i].pix_ldistance < min_lmargin)
        min_lmargin = row_infos[i].pix_ldistance;
      if (row_infos[i].pix_rdistance < min_rmargin)
        min_rmargin = row_infos[i].pix_rdistance;
    }
    if (min_lmargin > 0 || min_rmargin > 0) {
      for (int i = 0; i < row_infos.size(); i++) {
        row_infos[i].pix_ldistance -= min_lmargin;
        row_infos[i].pix_rdistance -= min_rmargin;
      }
    }
  }

  // Run the paragraph detection algorithm.
  GenericVector<PARA *> row_owners;
  GenericVector<PARA *> the_paragraphs;
  if (!is_image_block) {
    DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
                     models);
  } else {
    row_owners.init_to_size(row_infos.size(), NULL);
    CanonicalizeDetectionResults(&row_owners, block->para_list());
  }

  // Now stitch in the row_owners into the rows.
  row = *block_start;
  for (int i = 0; i < row_owners.size(); i++) {
    while (!row.PageResIt()->row())
      row.Next(RIL_TEXTLINE);
    row.PageResIt()->row()->row->set_para(row_owners[i]);
    row.Next(RIL_TEXTLINE);
  }
}
void tesseract::DiscardUnusedModels ( const GenericVector< RowScratchRegisters > &  rows,
ParagraphTheory *  theory 
)

Definition at line 1455 of file paragraphs.cpp.

                                                  {
  // Gather the models reported by each row's StrongHypotheses(), visiting
  // the rows in order, then let the theory discard everything else.
  SetOfModels models_in_use;
  const int row_count = rows.size();
  int idx = 0;
  while (idx < row_count) {
    rows[idx].StrongHypotheses(&models_in_use);
    ++idx;
  }
  theory->DiscardUnusedModels(models_in_use);
}
long tesseract::dist2 ( int  x1,
int  y1,
int  x2,
int  y2 
)

Definition at line 60 of file pdfrenderer.cpp.

                                           {
  return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1);
}
void tesseract::DowngradeWeakestToCrowns ( int  debug_level,
ParagraphTheory *  theory,
GenericVector< RowScratchRegisters > *  rows 
)

Definition at line 1488 of file paragraphs.cpp.

                                                                        {
  // Walks the rows from the bottom up, one run of same-model body lines at a
  // time, and re-marks selected runs with a crown model (start line followed
  // by body lines).  Models that end up unused are discarded at the end.
  // debug_level is not referenced in this body.
  int start;
  // Each iteration handles rows[start, end) and then continues with the rows
  // above it (end = start).
  for (int end = rows->size(); end > 0; end = start) {
    // Search back for a body line of a unique type.
    const ParagraphModel *model = NULL;
    while (end > 0 &&
           (model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) {
      end--;
    }
    if (end == 0) break;
    start = end - 1;
    while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
      start--;  // walk back to the first line that is not the same body type.
    }
    // Also absorb the row above when it is a unique start line of the same
    // strong model and that model's first and body indents are nearly equal.
    if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
        StrongModel(model) &&
        NearlyEqual(model->first_indent(), model->body_indent(),
                    model->tolerance())) {
        start--;
    }
    // start currently indexes the row just above the run; step back onto the
    // run's first row.
    start++;
    // Now rows[start, end) is a sequence of unique body hypotheses of model.
    // Strong centered models are left untouched.
    if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER)
      continue;
    // For a non-strong model, extend the run upwards over rows that are
    // crown-compatible with their successor.
    if (!StrongModel(model)) {
      while (start > 0 &&
             CrownCompatible(rows, start - 1, start, model))
        start--;
    }
    if (start == 0 ||
        (!StrongModel(model)) ||
        (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
      // crownify rows[start, end)
      // A strong model is replaced by the crown model matching its
      // justification; a non-strong model is re-applied as is.
      const ParagraphModel *crown_model = model;
      if (StrongModel(model)) {
          if (model->justification() == JUSTIFICATION_LEFT)
            crown_model = kCrownLeft;
          else
            crown_model = kCrownRight;
      }
      (*rows)[start].SetUnknown();
      (*rows)[start].AddStartLine(crown_model);
      for (int row = start + 1; row < end; row++) {
        (*rows)[row].SetUnknown();
        (*rows)[row].AddBodyLine(crown_model);
      }
    }
  }
  DiscardUnusedModels(*rows, theory);
}
tesseract::ELISTIZE ( ViterbiStateEntry  )
tesseract::ELISTIZE ( AmbigSpec  )
tesseract::ELISTIZEH ( AmbigSpec  )
tesseract::ELISTIZEH ( ViterbiStateEntry  )
void tesseract::ExtractFontProperties ( const string &  utf8_text,
StringRenderer render,
const string &  output_base 
)

Definition at line 195 of file text2image.cpp.

                                                      {
  // Renders utf8_text page by page, derives per-character spacing
  // properties and kerned gaps from the resulting boxes, and writes them as
  // text to "<output_base>.fontinfo".
  //   utf8_text   - text to render; the loops below process the rendered
  //                 boxes two at a time.
  //   render      - renderer used for RenderToImage()/GetBoxes()/font().
  //   output_base - path prefix of the output file.
  map<string, SpacingProperties> spacing_map;
  map<string, SpacingProperties>::iterator spacing_map_it0;
  map<string, SpacingProperties>::iterator spacing_map_it1;
  int x_bearing, x_advance;
  int len = utf8_text.length();
  int offset = 0;
  const char* text = utf8_text.c_str();
  while (offset < len) {
    // RenderToImage()'s return value is added to offset, i.e. it is treated
    // as the number of bytes consumed by this page.
    offset += render->RenderToImage(text + offset, strlen(text + offset), NULL);
    const vector<BoxChar*> &boxes = render->GetBoxes();

    // If the page break split a bigram, correct the offset so we try the bigram
    // on the next iteration.
    if (boxes.size() > 2 && !IsWhitespaceBox(boxes[boxes.size() - 1]) &&
        IsWhitespaceBox(boxes[boxes.size() - 2])) {
      if (boxes.size() > 3) {
        tprintf("WARNING: Adjusting to bad page break after '%s%s'\n",
                boxes[boxes.size() - 4]->ch().c_str(),
                boxes[boxes.size() - 3]->ch().c_str());
      }
      offset -= boxes[boxes.size() - 1]->ch().size();
    }

    // Walk the boxes two at a time; leading whitespace boxes are skipped so
    // that b and b+1 are taken as the two characters of one bigram.
    for (int b = 0; b < boxes.size(); b += 2) {
      while (b < boxes.size() && IsWhitespaceBox(boxes[b])) ++b;
      if (b + 1 >= boxes.size()) break;
      const string &ch0 = boxes[b]->ch();
      // We encountered a ligature. This happens in at least two scenarios:
      // One is when the rendered bigram forms a grapheme cluster (eg. the
      // second character in the bigram is a combining vowel), in which case we
      // correctly output only one bounding box.
      // A second far less frequent case is when caused some fonts like 'DejaVu
      // Sans Ultra-Light' force Pango to render a ligatured character even if
      // the input consists of the separated characters.  NOTE(ranjith): As per
      // behdad@ this is not currently controllable at the level of the Pango
      // API.
      // Safeguard against these cases here by just skipping the bigram.
      if (IsWhitespaceBox(boxes[b+1])) {
        tprintf("WARNING: Found unexpected ligature: %s\n", ch0.c_str());
        continue;
      }
      // Horizontal gap between the right edge of the first box and the left
      // edge of the second.
      int xgap = (boxes[b+1]->box()->x -
                  (boxes[b]->box()->x + boxes[b]->box()->w));
      spacing_map_it0 = spacing_map.find(ch0);
      // NOTE(review): ok_count is only incremented when a character is newly
      // inserted into spacing_map, so the kerning entry below is recorded
      // only when both characters are first seen in this bigram -- confirm
      // that this is intended.
      int ok_count = 0;
      if (spacing_map_it0 == spacing_map.end() &&
          render->font().GetSpacingProperties(ch0, &x_bearing, &x_advance)) {
        spacing_map[ch0] = SpacingProperties(
            x_bearing, x_advance - x_bearing - boxes[b]->box()->w);
        spacing_map_it0 = spacing_map.find(ch0);
        ++ok_count;
      }
      const string &ch1 = boxes[b+1]->ch();
      tlog(3, "%s%s\n", ch0.c_str(), ch1.c_str());
      spacing_map_it1 = spacing_map.find(ch1);
      if (spacing_map_it1 == spacing_map.end() &&
          render->font().GetSpacingProperties(ch1, &x_bearing, &x_advance)) {
        spacing_map[ch1] = SpacingProperties(
            x_bearing, x_advance - x_bearing - boxes[b+1]->box()->w);
        spacing_map_it1 = spacing_map.find(ch1);
        ++ok_count;
      }
      // Record a kerned gap only when the measured gap differs from the sum
      // of the two characters' default gaps.
      if (ok_count == 2 && xgap != (spacing_map_it0->second.x_gap_after +
                                    spacing_map_it1->second.x_gap_before)) {
        spacing_map_it0->second.kerned_x_gaps[ch1] = xgap;
      }
    }
    render->ClearBoxes();
  }
  // Serialise: first line is the entry count, then one line per character of
  // "<char> <gap_before> <gap_after> <n_kerned>" followed by n_kerned
  // " <char> <gap>" pairs.
  string output_string;
  const int kBufSize = 1024;
  char buf[kBufSize];
  snprintf(buf, kBufSize, "%d\n", static_cast<int>(spacing_map.size()));
  output_string.append(buf);
  map<string, SpacingProperties>::const_iterator spacing_map_it;
  for (spacing_map_it = spacing_map.begin();
       spacing_map_it != spacing_map.end(); ++spacing_map_it) {
    snprintf(buf, kBufSize,
             "%s %d %d %d", spacing_map_it->first.c_str(),
             spacing_map_it->second.x_gap_before,
             spacing_map_it->second.x_gap_after,
             static_cast<int>(spacing_map_it->second.kerned_x_gaps.size()));
    output_string.append(buf);
    map<string, int>::const_iterator kern_it;
    for (kern_it = spacing_map_it->second.kerned_x_gaps.begin();
         kern_it != spacing_map_it->second.kerned_x_gaps.end(); ++kern_it) {
      snprintf(buf, kBufSize,
               " %s %d", kern_it->first.c_str(), kern_it->second);
      output_string.append(buf);
    }
    output_string.append("\n");
  }
  File::WriteStringToFileOrDie(output_string, output_base + ".fontinfo");
}
bool tesseract::FirstWordWouldHaveFit ( const RowScratchRegisters &  before,
const RowScratchRegisters &  after,
tesseract::ParagraphJustification  justification 
)

Definition at line 1621 of file paragraphs.cpp.

                                                                          {
  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
    return true;

  if (justification == JUSTIFICATION_UNKNOWN) {
    tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
  }
  int available_space;
  if (justification == JUSTIFICATION_CENTER) {
    available_space = before.lindent_ + before.rindent_;
  } else {
    available_space = before.OffsideIndent(justification);
  }
  available_space -= before.ri_->average_interword_space;

  if (before.ri_->ltr)
    return after.ri_->lword_box.width() < available_space;
  return after.ri_->rword_box.width() < available_space;
}
bool tesseract::FirstWordWouldHaveFit ( const RowScratchRegisters &  before,
const RowScratchRegisters &  after 
)

Definition at line 1646 of file paragraphs.cpp.

                                                             {
  // Justification-agnostic variant: the free space on `before` is taken to
  // be the larger of its two indents.  A line without words on either side
  // always counts as fitting.
  const bool a_line_is_empty =
      before.ri_->num_words == 0 || after.ri_->num_words == 0;
  if (a_line_is_empty)
    return true;

  int room = (before.rindent_ > before.lindent_) ? before.rindent_
                                                 : before.lindent_;
  // Reserve one interword space in front of the word.
  room -= before.ri_->average_interword_space;

  // The leading word is the left word for LTR rows, the right word otherwise.
  if (!before.ri_->ltr)
    return after.ri_->rword_box.width() < room;
  return after.ri_->lword_box.width() < room;
}
void tesseract::FontInfoDeleteCallback ( FontInfo  f)

Definition at line 139 of file fontinfo.cpp.

                                        {
  if (f.spacing_vec != NULL) {
    f.spacing_vec->delete_data_pointers();
    delete f.spacing_vec;
  }
  delete[] f.name;
}
void tesseract::FontSetDeleteCallback ( FontSet  fs)

Definition at line 146 of file fontinfo.cpp.

                                       {
  // Frees the configs array referenced by fs (array delete).
  delete[] fs.configs;
}

Definition at line 239 of file normstrngs.cpp.

                                             {
  // NOTE(review): the signature line for this entry is missing from this
  // listing.  From the body, the function takes a code point `ch` and returns
  // a code point; presumably the fullwidth-to-halfwidth converter declared
  // in normstrngs.cpp -- verify against the source at the line cited above.
  //
  // Maps a code point from the U+FF00..U+FFEF block (or U+3000) to the first
  // code unit produced by ICU's "Fullwidth-Halfwidth" transliterator; every
  // other input, and invalid code points, are returned unchanged.

  // Return unchanged if not in the fullwidth-halfwidth Unicode block.
  if (ch < 0xFF00 || ch > 0xFFEF || !IsValidCodepoint(ch)) {
    // U+3000 is the one value outside the block that is still converted.
    if (ch != 0x3000) return ch;
  }
  // Special case for fullwidth left and right "white parentheses".
  if (ch == 0xFF5F) return 0x2985;
  if (ch == 0xFF60) return 0x2986;
  // Construct a full-to-half width transliterator.
  IcuErrorCode error_code;
  icu::UnicodeString uch_str(static_cast<UChar32>(ch));
  const icu::Transliterator* fulltohalf = icu::Transliterator::createInstance(
      "Fullwidth-Halfwidth", UTRANS_FORWARD, error_code);
  error_code.assertSuccess();
  error_code.reset();

  // The transliterator is created per call and released right after use.
  fulltohalf->transliterate(uch_str);
  delete fulltohalf;
  ASSERT_HOST(uch_str.length() != 0);
  return uch_str[0];
}
void tesseract::GeometricClassify ( int  debug_level,
GenericVector< RowScratchRegisters > *  rows,
int  row_start,
int  row_end,
ParagraphTheory *  theory 
)

Definition at line 1077 of file paragraphs.cpp.

                                                {
  // Tries to classify rows[row_start, row_end) purely from the tab stops on
  // their left and right sides.  On success a model is added to theory and
  // the rows are marked with it; otherwise the function returns without
  // marking them.
  //   debug_level - values > 1 print a banner and the ambiguity diagnostics.
  //   rows        - row registers; margins are recomputed and hypotheses
  //                 cleared for the given range before classification.
  //   theory      - receives the model that is built here.
  if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
    return;
  if (debug_level > 1) {
    tprintf("###############################################\n");
    tprintf("##### GeometricClassify( rows[%d:%d) )   ####\n",
            row_start, row_end);
    tprintf("###############################################\n");
  }
  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);

  GeometricClassifierState s(debug_level, rows, row_start, row_end);
  // Bail out when both sides have more than two tab stops, or when neither
  // side has more than one.
  if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
    s.Fail(2, "Too much variety for simple outline classification.");
    return;
  }
  if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
    s.Fail(1, "Not enough variety for simple outline classification.");
    return;
  }
  // Exactly three tab stops in total is delegated to a dedicated routine.
  if (s.left_tabs.size() + s.right_tabs.size() == 3) {
    GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
    return;
  }

  // At this point, we know that one side has at least two tab stops, and the
  // other side has one or two tab stops.
  // Left to determine:
  //   (1) Which is the body indent and which is the first line indent?
  //   (2) Is the text fully justified?

  // If one side happens to have three or more tab stops, assume that side
  // is opposite of the aligned side.
  if (s.right_tabs.size() > 2) {
    s.AssumeLeftJustification();
  } else if (s.left_tabs.size() > 2) {
    s.AssumeRightJustification();
  } else if (s.ltr) {  // guess based on script direction
    s.AssumeLeftJustification();
  } else {
    s.AssumeRightJustification();
  }

  if (s.AlignTabs().size() == 2) {
    // For each tab stop on the aligned side, how many of them appear
    // to be paragraph start lines?  [first lines]
    int firsts[2] = {0, 0};
    // Count the first line as a likely paragraph start line.
    firsts[s.AlignsideTabIndex(s.row_start)]++;
    // For each line, if the first word would have fit on the previous
    // line count it as a likely paragraph start line.
    bool jam_packed = true;
    for (int i = s.row_start + 1; i < s.row_end; i++) {
      if (s.FirstWordWouldHaveFit(i - 1, i)) {
        firsts[s.AlignsideTabIndex(i)]++;
        jam_packed = false;
      }
    }
    // Make an extra accounting for the last line of the paragraph just
    // in case it's the only short line in the block.  That is, take its
    // first word as typical and see if this looks like the *last* line
    // of a paragraph.  If so, mark the *other* indent as probably a first.
    if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
      firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
    }

    // Percentage of the rows at each aligned-side tab stop that look like
    // paragraph starts.
    int percent0firsts, percent1firsts;
    percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
    percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;

    // TODO(eger): Tune these constants if necessary.
    if ((percent0firsts < 20 && 30 < percent1firsts) ||
        percent0firsts + 30 < percent1firsts) {
      s.first_indent = s.AlignTabs()[1].center;
      s.body_indent = s.AlignTabs()[0].center;
    } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
               percent1firsts + 30 < percent0firsts) {
      s.first_indent = s.AlignTabs()[0].center;
      s.body_indent = s.AlignTabs()[1].center;
    } else {
      // Ambiguous! Probably lineated (poetry)
      if (debug_level > 1) {
        tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
                s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
        tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
                s.AlignTabs()[0].center, percent0firsts);
        tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
                s.AlignTabs()[1].center, percent1firsts);
        s.PrintRows();
      }
      return;
    }
  } else {
    // There's only one tab stop for the "aligned to" side.
    s.first_indent = s.body_indent = s.AlignTabs()[0].center;
  }

  // At this point, we have our model.
  const ParagraphModel *model = theory->AddModel(s.Model());

  // Now all we have to do is figure out if the text is fully justified or not.
  // eop_threshold: default to fully justified unless we see evidence below.
  //    See description on MarkRowsWithModel()
  s.eop_threshold =
      (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
  // If the text is not fully justified, re-set the eop_threshold to 0.
  if (s.AlignTabs().size() == 2) {
    // Paragraphs with a paragraph-start indent.
    for (int i = s.row_start; i < s.row_end - 1; i++) {
      if (ValidFirstLine(s.rows, i + 1, model) &&
          !NearlyEqual(s.OffsideTabs()[0].center,
                       (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
        // We found a non-end-of-paragraph short line: not fully justified.
        s.eop_threshold = 0;
        break;
      }
    }
  } else {
    // Paragraphs with no paragraph-start indent.
    for (int i = s.row_start; i < s.row_end - 1; i++) {
      if (!s.FirstWordWouldHaveFit(i, i + 1) &&
          !NearlyEqual(s.OffsideTabs()[0].center,
                       (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
        // We found a non-end-of-paragraph short line: not fully justified.
        s.eop_threshold = 0;
        break;
      }
    }
  }
  MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
}
void tesseract::GeometricClassifyThreeTabStopTextBlock ( int  debug_level,
GeometricClassifierState &  s,
ParagraphTheory *  theory 
)

Definition at line 985 of file paragraphs.cpp.

                             {
  int num_rows = s.row_end - s.row_start;
  int num_full_rows = 0;
  int last_row_full = 0;
  for (int i = s.row_start; i < s.row_end; i++) {
    if (s.IsFullRow(i)) {
      num_full_rows++;
      if (i == s.row_end - 1) last_row_full++;
    }
  }

  if (num_full_rows < 0.7 * num_rows) {
    s.Fail(1, "Not enough full lines to know which lines start paras.");
    return;
  }

  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
  s.eop_threshold = 0;

  if (s.ltr) {
    s.AssumeLeftJustification();
  } else {
    s.AssumeRightJustification();
  }

  if (debug_level > 0) {
    tprintf("# Not enough variety for clear outline classification. "
            "Guessing these are %s aligned based on script.\n",
            s.ltr ? "left" : "right");
    s.PrintRows();
  }

  if (s.AlignTabs().size() == 2) {  // case A1 or A2
    s.first_indent = s.AlignTabs()[1].center;
    s.body_indent = s.AlignTabs()[0].center;
  } else {                      // case B1 or B2
    if (num_rows - 1 == num_full_rows - last_row_full) {
      // case B2
      const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
      (*s.rows)[s.row_start].AddStartLine(model);
      for (int i = s.row_start + 1; i < s.row_end; i++) {
        (*s.rows)[i].AddBodyLine(model);
      }
      return;
    } else {
      // case B1
      s.first_indent = s.body_indent = s.AlignTabs()[0].center;
      s.eop_threshold = (s.OffsideTabs()[0].center +
                         s.OffsideTabs()[1].center) / 2;
    }
  }
  const ParagraphModel *model = theory->AddModel(s.Model());
  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,
                    s.ltr, s.eop_threshold);
  return;
}
Pix* tesseract::GridReducedPix ( const TBOX box,
int  gridsize,
ICOORD  bleft,
int *  left,
int *  bottom 
)

Definition at line 212 of file bbgrid.cpp.

                                                          {
  // Creates a 1-bit-deep Pix with one pixel per grid cell covering box,
  // padded by one cell on every side.  The grid coordinates of the padded
  // lower-left cell are written to *left and *bottom.
  const int origin_x = bleft.x();
  const int origin_y = bleft.y();

  // Grid bounds of the box, widened by one cell all round.
  const int cell_left = (box.left() - origin_x) / gridsize - 1;
  const int cell_bottom = (box.bottom() - origin_y) / gridsize - 1;
  const int cell_right = (box.right() - origin_x) / gridsize + 1;
  const int cell_top = (box.top() - origin_y) / gridsize + 1;

  *left = cell_left;
  *bottom = cell_bottom;

  // Both bounds are inclusive, hence the +1.
  const int cells_wide = cell_right - cell_left + 1;
  const int cells_high = cell_top - cell_bottom + 1;
  return pixCreate(cells_wide, cells_high, 1);
}
void tesseract::HistogramRect ( Pix *  src_pix,
int  channel,
int  left,
int  top,
int  width,
int  height,
int *  histogram 
)

Definition at line 157 of file otsuthr.cpp.

                                   {
  PERF_COUNT_START("HistogramRect")
  int num_channels = pixGetDepth(src_pix) / 8;
  channel = ClipToRange(channel, 0, num_channels - 1);
  int bottom = top + height;
  memset(histogram, 0, sizeof(*histogram) * kHistogramSize);
  int src_wpl = pixGetWpl(src_pix);
  l_uint32* srcdata = pixGetData(src_pix);
  for (int y = top; y < bottom; ++y) {
    const l_uint32* linedata = srcdata + y * src_wpl;
    for (int x = 0; x < width; ++x) {
      int pixel = GET_DATA_BYTE(const_cast<void*>(
          reinterpret_cast<const void *>(linedata)),
          (x + left) * num_channels + channel);
      ++histogram[pixel];
    }
  }
  PERF_COUNT_END
}
void tesseract::InitializeRowInfo ( bool  after_recognition,
const MutableIterator &  it,
RowInfo *  info 
)

Definition at line 2410 of file paragraphs.cpp.

                                      {
  if (it.PageResIt()->row() != NULL) {
    ROW *row = it.PageResIt()->row()->row;
    info->pix_ldistance = row->lmargin();
    info->pix_rdistance = row->rmargin();
    info->average_interword_space =
        row->space() > 0 ? row->space() : MAX(row->x_height(), 1);
    info->pix_xheight = row->x_height();
    info->has_leaders = false;
    info->has_drop_cap = row->has_drop_cap();
    info->ltr = true;  // set below depending on word scripts
  } else {
    info->pix_ldistance = info->pix_rdistance = 0;
    info->average_interword_space = 1;
    info->pix_xheight = 1.0;
    info->has_leaders = false;
    info->has_drop_cap = false;
    info->ltr = true;
  }

  info->num_words = 0;
  info->lword_indicates_list_item = false;
  info->lword_likely_starts_idea = false;
  info->lword_likely_ends_idea = false;
  info->rword_indicates_list_item = false;
  info->rword_likely_starts_idea = false;
  info->rword_likely_ends_idea = false;
  info->has_leaders = false;
  info->ltr = 1;

  if (!after_recognition) {
    InitializeTextAndBoxesPreRecognition(it, info);
    return;
  }
  info->text = "";
  char *text = it.GetUTF8Text(RIL_TEXTLINE);
  int trailing_ws_idx = strlen(text);  // strip trailing space
  while (trailing_ws_idx > 0 &&
         // isspace() only takes ASCII
         ((text[trailing_ws_idx - 1] & 0x80) == 0) &&
         isspace(text[trailing_ws_idx - 1]))
    trailing_ws_idx--;
  if (trailing_ws_idx > 0) {
    int lspaces = info->pix_ldistance / info->average_interword_space;
    for (int i = 0; i < lspaces; i++)
      info->text += ' ';
    for (int i = 0; i < trailing_ws_idx; i++)
      info->text += text[i];
  }
  delete []text;

  if (info->text.size() == 0) {
    return;
  }

  PAGE_RES_IT page_res_it = *it.PageResIt();
  GenericVector<WERD_RES *> werds;
  WERD_RES *word_res = page_res_it.restart_row();
  ROW_RES *this_row = page_res_it.row();
  int num_leaders = 0;
  int ltr = 0;
  int rtl = 0;
  do {
    if (word_res && word_res->best_choice->unichar_string().length() > 0) {
      werds.push_back(word_res);
      ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
      rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
      if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
    }
    word_res = page_res_it.forward();
  } while (page_res_it.row() == this_row);
  info->ltr = ltr >= rtl;
  info->has_leaders = num_leaders > 3;
  info->num_words = werds.size();
  if (werds.size() > 0) {
    WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
    info->lword_text = lword->best_choice->unichar_string().string();
    info->rword_text = rword->best_choice->unichar_string().string();
    info->lword_box = lword->word->bounding_box();
    info->rword_box = rword->word->bounding_box();
    LeftWordAttributes(lword->uch_set, lword->best_choice,
                       info->lword_text,
                       &info->lword_indicates_list_item,
                       &info->lword_likely_starts_idea,
                       &info->lword_likely_ends_idea);
    RightWordAttributes(rword->uch_set, rword->best_choice,
                        info->rword_text,
                        &info->rword_indicates_list_item,
                        &info->rword_likely_starts_idea,
                        &info->rword_likely_ends_idea);
  }
}
void tesseract::InitializeTextAndBoxesPreRecognition ( const MutableIterator &  it,
RowInfo *  info 
)

Definition at line 2359 of file paragraphs.cpp.

                                                         {
  // Set up text, lword_text, and rword_text (mostly for debug printing).
  STRING fake_text;
  PageIterator pit(static_cast<const PageIterator&>(it));
  bool first_word = true;
  if (!pit.Empty(RIL_WORD)) {
    do {
      fake_text += "x";
      if (first_word) info->lword_text += "x";
      info->rword_text += "x";
      if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
          !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
        fake_text += " ";
        info->rword_text = "";
        first_word = false;
      }
    } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) &&
             pit.Next(RIL_SYMBOL));
  }
  if (fake_text.size() == 0) return;

  int lspaces = info->pix_ldistance / info->average_interword_space;
  for (int i = 0; i < lspaces; i++) {
    info->text += ' ';
  }
  info->text += fake_text;

  // Set up lword_box, rword_box, and num_words.
  PAGE_RES_IT page_res_it = *it.PageResIt();
  WERD_RES *word_res = page_res_it.restart_row();
  ROW_RES *this_row = page_res_it.row();

  WERD_RES *lword = NULL;
  WERD_RES *rword = NULL;
  info->num_words = 0;
  do {
    if (word_res) {
      if (!lword) lword = word_res;
      if (rword != word_res) info->num_words++;
      rword = word_res;
    }
    word_res = page_res_it.forward();
  } while (page_res_it.row() == this_row);
  info->lword_box = lword->word->bounding_box();
  info->rword_box = rword->word->bounding_box();
}
ParagraphModel tesseract::InternalParagraphModelByOutline ( const GenericVector< RowScratchRegisters > *  rows,
int  start,
int  end,
int  tolerance,
bool *  consistent 
)

Definition at line 1692 of file paragraphs.cpp.

                                                         {
  // Tries to derive a ParagraphModel for rows[start, end) purely from the
  // left/right indents of the rows.
  //
  // rows:       row registers; rows[start, end) is the candidate paragraph.
  // tolerance:  maximum indent spread still treated as "aligned".
  // consistent: set to false when the rows cannot belong to one paragraph
  //             under any model; left true when the region is merely too
  //             short (or not clearly enough shaped) to model.
  // Returns a default-constructed ParagraphModel when no model is produced.
  *consistent = true;
  // Validate the range before touching any row: everything below indexes
  // rows[start] and rows[start + 1].
  if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
    return ParagraphModel();

  // The region counts as left-to-right when at least half of its rows are.
  int ltr_line_count = 0;
  for (int i = start; i < end; i++) {
    ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
  }
  bool ltr = (ltr_line_count >= (end - start) / 2);

  // Ensure the caller only passed us a region with a common rmargin and
  // lmargin.
  int lmargin = (*rows)[start].lmargin_;
  int rmargin = (*rows)[start].rmargin_;
  // Indent ranges are gathered over the body rows only (start + 1 onwards);
  // the first row is compared against them further down.
  int lmin, lmax, rmin, rmax, cmin, cmax;
  lmin = lmax = (*rows)[start + 1].lindent_;
  rmin = rmax = (*rows)[start + 1].rindent_;
  cmin = cmax = 0;
  for (int i = start + 1; i < end; i++) {
    if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
      tprintf("Margins don't match! Software error.\n");
      *consistent = false;
      return ParagraphModel();
    }
    UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
    UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
    UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
  }
  int ldiff = lmax - lmin;
  int rdiff = rmax - rmin;
  int cdiff = cmax - cmin;
  // Both edges ragged: the only remaining possibility is centered text.
  if (rdiff > tolerance && ldiff > tolerance) {
    if (cdiff < tolerance * 2) {
      if (end - start < 3)
        return ParagraphModel();
      return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
    }
    *consistent = false;
    return ParagraphModel();
  }
  if (end - start < 3)  // Don't return a model for two line paras.
    return ParagraphModel();

  // These booleans keep us from saying something is aligned left when the body
  // left variance is too large.
  bool body_admits_left_alignment = ldiff < tolerance;
  bool body_admits_right_alignment = rdiff < tolerance;

  ParagraphModel left_model =
      ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
                     (lmin + lmax) / 2, tolerance);
  ParagraphModel right_model =
      ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
                     (rmin + rmax) / 2, tolerance);

  // These booleans keep us from having an indent on the "wrong side" for the
  // first line.
  bool text_admits_left_alignment = ltr || left_model.is_flush();
  bool text_admits_right_alignment = !ltr || right_model.is_flush();

  // At least one of the edges is less than tolerance in variance.
  // If the other is obviously ragged, it can't be the one aligned to.
  // [Note the last line is included in this raggedness.]
  if (tolerance < rdiff) {
    if (body_admits_left_alignment && text_admits_left_alignment)
      return left_model;
    *consistent = false;
    return ParagraphModel();
  }
  if (tolerance < ldiff) {
    if (body_admits_right_alignment && text_admits_right_alignment)
      return right_model;
    *consistent = false;
    return ParagraphModel();
  }

  // At this point, we know the body text doesn't vary much on either side.

  // If the first line juts out oddly in one direction or the other,
  // that likely indicates the side aligned to.
  int first_left = (*rows)[start].lindent_;
  int first_right = (*rows)[start].rindent_;

  if (ltr && body_admits_left_alignment &&
      (first_left < lmin || first_left > lmax))
    return left_model;
  if (!ltr && body_admits_right_alignment &&
      (first_right < rmin || first_right > rmax))
    return right_model;

  *consistent = false;
  return ParagraphModel();
}
int tesseract::InterwordSpace ( const GenericVector< RowScratchRegisters > &  rows,
int  row_start,
int  row_end 
)

Definition at line 1598 of file paragraphs.cpp.

                                               {
  // Estimates the interword space for rows[row_start, row_end): the median
  // of the per-row average spaces (rows with more than one word only), but
  // never less than a third of the typical word height and never less
  // than 2. An empty range yields 1.
  if (row_end <= row_start) return 1;

  // Typical word size: mean of the left-word boxes of the first and last row.
  int typical_height = (rows[row_start].ri_->lword_box.height() +
                        rows[row_end - 1].ri_->lword_box.height()) / 2;
  int typical_width = (rows[row_start].ri_->lword_box.width() +
                       rows[row_end - 1].ri_->lword_box.width())  / 2;

  STATS gap_stats(0, 5 + typical_width);
  for (int r = row_start; r < row_end; ++r) {
    if (rows[r].ri_->num_words <= 1)
      continue;  // A single-word row carries no spacing information.
    gap_stats.add(rows[r].ri_->average_interword_space, 1);
  }

  int floor_space = typical_height / 3;
  if (floor_space < 2)
    floor_space = 2;
  int median_gap = gap_stats.median();
  if (median_gap > floor_space)
    return median_gap;
  return floor_space;
}
bool tesseract::is_double_quote ( const char32  ch)

Definition at line 97 of file normstrngs.cpp.

                                      {
  static const int kNumDoubleQuoteUnicodes = 8;
  static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
    '"',
    0x201C,  // left double quotation mark (English, others)
    0x201D,  // right double quotation mark (Danish, Finnish, Swedish, Norw.)
    0x201F,  // double high-reversed-9 quotation mark (PropList.txt)
    0x2033,  // double prime
    0x301D,  // reversed double prime quotation mark (East Asian langs, horiz.)
    0x301E,  // close double prime (East Asian languages written horizontally)
    0xFF02,  // fullwidth quotation mark
  };
  for (int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
    if (kDoubleQuoteUnicodes[i] == ch)
      return true;
  }
  return false;
}
bool tesseract::is_hyphen_punc ( const char32  ch)

Definition at line 58 of file normstrngs.cpp.

                                     {
  static const int kNumHyphenPuncUnicodes = 13;
  static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
    '-',
    0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015,  // hyphen..horizontal bar
    0x207b,  // superscript minus
    0x208b,  // subscript minus
    0x2212,  // minus sign
    0xfe58,  // small em dash
    0xfe63,  // small hyphen-minus
    0xff0d,  // fullwidth hyphen-minus
  };
  for (int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
    if (kHyphenPuncUnicodes[i] == ch)
      return true;
  }
  return false;
}
bool tesseract::is_single_quote ( const char32  ch)

Definition at line 77 of file normstrngs.cpp.

                                      {
  static const int kNumSingleQuoteUnicodes = 8;
  static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
    '\'',
    '`',
    0x2018,  // left single quotation mark (English, others)
    0x2019,  // right single quotation mark (Danish, Finnish, Swedish, Norw.)
             // We may have to introduce a comma set with 0x201a
    0x201B,  // single high-reveresed-9 quotation mark (PropList.txt)
    0x2032,  // prime
    0x300C,  // left corner bracket (East Asian languages)
    0xFF07,  // fullwidth apostrophe
  };
  for (int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
    if (kSingleQuoteUnicodes[i] == ch)
      return true;
  }
  return false;
}
bool tesseract::IsDigitLike ( int  ch)

Definition at line 197 of file paragraphs.cpp.

                         {
  return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
}

Definition at line 208 of file normstrngs.cpp.

                                         {
  // NOTE(review): the signature of this entry is missing from this listing;
  // the body only shows that it takes a codepoint `ch` and yields a bool.
  //
  // Accepts ch when all of the following hold:
  //  - IsValidCodepoint(ch) is true,
  //  - ch is outside 0xFDD0..0xFDEF and is not one of the two last
  //    codepoints (xFFFE / xFFFF) of any of the 17 ranges listed below,
  //  - ch is not an ISO control character according to u_isISOControl(),
  //    except that '\n', '\f', '\t' and '\r' are still accepted.
  return IsValidCodepoint(ch) &&
      !(ch >= 0xFDD0 && ch <= 0xFDEF) &&  // Noncharacters.
      !(ch >= 0xFFFE && ch <= 0xFFFF) &&
      !(ch >= 0x1FFFE && ch <= 0x1FFFF) &&
      !(ch >= 0x2FFFE && ch <= 0x2FFFF) &&
      !(ch >= 0x3FFFE && ch <= 0x3FFFF) &&
      !(ch >= 0x4FFFE && ch <= 0x4FFFF) &&
      !(ch >= 0x5FFFE && ch <= 0x5FFFF) &&
      !(ch >= 0x6FFFE && ch <= 0x6FFFF) &&
      !(ch >= 0x7FFFE && ch <= 0x7FFFF) &&
      !(ch >= 0x8FFFE && ch <= 0x8FFFF) &&
      !(ch >= 0x9FFFE && ch <= 0x9FFFF) &&
      !(ch >= 0xAFFFE && ch <= 0xAFFFF) &&
      !(ch >= 0xBFFFE && ch <= 0xBFFFF) &&
      !(ch >= 0xCFFFE && ch <= 0xCFFFF) &&
      !(ch >= 0xDFFFE && ch <= 0xDFFFF) &&
      !(ch >= 0xEFFFE && ch <= 0xEFFFF) &&
      !(ch >= 0xFFFFE && ch <= 0xFFFFF) &&
      !(ch >= 0x10FFFE && ch <= 0x10FFFF) &&
      (!u_isISOControl(static_cast<UChar32>(ch)) ||
       ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r');
}

Definition at line 232 of file normstrngs.cpp.

                                                  {
  // NOTE(review): the signature of this entry is missing from this listing;
  // the body only shows that it takes a codepoint `ch` and yields a bool.
  //
  // Accepts ch when it is a valid codepoint, is at most 128, and is either
  // not an ISO control character (per u_isISOControl()) or is one of
  // '\n', '\f', '\t', '\r'.
  // NOTE(review): the bound is inclusive, so 128 (0x80) itself passes the
  // range test; if the intent is "7-bit only" this should presumably be
  // `ch < 128` -- confirm against callers.
  return IsValidCodepoint(ch) &&
      ch <= 128 &&
      (!u_isISOControl(static_cast<UChar32>(ch)) ||
       ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r');
}
bool tesseract::IsLatinLetter ( int  ch)

Definition at line 193 of file paragraphs.cpp.

                           {
  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}
bool tesseract::IsLeftIndented ( const EquationDetect::IndentType  type) [inline]

Definition at line 95 of file equationdetect.cpp.

                                                                {
  // A left indent is present for LEFT_INDENT and for BOTH_INDENT.
  if (type == EquationDetect::LEFT_INDENT)
    return true;
  if (type == EquationDetect::BOTH_INDENT)
    return true;
  return false;
}
bool tesseract::IsOCREquivalent ( char32  ch1,
char32  ch2 
)

Definition at line 166 of file normstrngs.cpp.

                                             {
  return OCRNormalize(ch1) == OCRNormalize(ch2);
}
bool tesseract::IsOpeningPunct ( int  ch)

Definition at line 201 of file paragraphs.cpp.

                            {
  return strchr("'\"({[", ch) != NULL;
}
bool tesseract::IsRightIndented ( const EquationDetect::IndentType  type) [inline]

Definition at line 100 of file equationdetect.cpp.

                                                                 {
  // A right indent is present for RIGHT_INDENT and for BOTH_INDENT.
  if (type == EquationDetect::RIGHT_INDENT)
    return true;
  if (type == EquationDetect::BOTH_INDENT)
    return true;
  return false;
}
bool tesseract::IsTerminalPunct ( int  ch)

Definition at line 205 of file paragraphs.cpp.

                             {
  return strchr(":'\".?!]})", ch) != NULL;
}

Definition at line 91 of file equationdetect.cpp.

                                                     {
  // NOTE(review): the signature of this entry is missing from this listing;
  // the body only shows that it takes a block type `type` and yields a bool.
  //
  // True when PTIsTextType(type) holds or type is exactly PT_EQUATION.
  return PTIsTextType(type) || type == PT_EQUATION;
}
bool tesseract::IsUTF8Whitespace ( const char *  text)

Definition at line 182 of file normstrngs.cpp.

                                        {
  // The string is all whitespace when the value reported by
  // SpanUTF8Whitespace() equals the byte length of the whole string.
  const size_t text_len = strlen(text);
  return SpanUTF8Whitespace(text) == text_len;
}
bool tesseract::IsValidCodepoint ( const char32  ch)

Definition at line 170 of file normstrngs.cpp.

                                       {
  // In the range [0, 0xD800) or [0xE000, 0x10FFFF]
  return (static_cast<uinT32>(ch) < 0xD800)
      || (ch >= 0xE000 && ch <= 0x10FFFF);
}
bool tesseract::IsWhitespace ( const char32  ch)

Definition at line 176 of file normstrngs.cpp.

                                   {
  ASSERT_HOST_MSG(IsValidCodepoint(ch),
                  "Invalid Unicode codepoint: 0x%x\n", ch);
  return u_isUWhiteSpace(static_cast<UChar32>(ch));
}
void tesseract::LeftoverSegments ( const GenericVector< RowScratchRegisters > &  rows,
GenericVector< Interval > *  to_fix,
int  row_start,
int  row_end 
)

Definition at line 2181 of file paragraphs.cpp.

                                                  {
  // Collects into *to_fix the runs of rows within [row_start, row_end) that
  // still need work, as half-open [begin, end) intervals. Any previous
  // contents of *to_fix are discarded.
  //
  // A row needs fixing when:
  //  - it has non-null hypotheses but no strong ones, and scanning forward
  //    a row with no hypotheses at all is reached before a row with a
  //    strong hypothesis; or
  //  - it has no strong hypotheses, no non-null hypotheses, and at least
  //    one word; or
  //  - it has strong hypotheses but RowIsStranded() reports it as stranded.
  to_fix->clear();
  for (int i = row_start; i < row_end; i++) {
    bool needs_fixing = false;

    SetOfModels models;
    SetOfModels models_w_crowns;
    rows[i].StrongHypotheses(&models);
    rows[i].NonNullHypotheses(&models_w_crowns);
    if (models.empty() && models_w_crowns.size() > 0) {
      // Crown paragraph.  Is it followed by a modeled line?
      // Note: this look-ahead runs to the end of rows, not just to row_end.
      for (int end = i + 1; end < rows.size(); end++) {
        SetOfModels end_models;
        SetOfModels strong_end_models;
        rows[end].NonNullHypotheses(&end_models);
        rows[end].StrongHypotheses(&strong_end_models);
        if (end_models.size() == 0) {
          needs_fixing = true;
          break;
        } else if (strong_end_models.size() > 0) {
          needs_fixing = false;
          break;
        }
      }
    } else if (models.empty() && rows[i].ri_->num_words > 0) {
      // No models at all.
      needs_fixing = true;
    }

    if (!needs_fixing && !models.empty()) {
      needs_fixing = RowIsStranded(rows, i);
    }

    if (needs_fixing) {
      // Extend the last interval when this row directly follows it;
      // otherwise open a new single-row interval. Ends are inclusive here.
      if (!to_fix->empty() && to_fix->back().end == i - 1)
        to_fix->back().end = i;
      else
        to_fix->push_back(Interval(i, i));
    }
  }
  // Convert inclusive intervals to half-open intervals.
  for (int i = 0; i < to_fix->size(); i++) {
    (*to_fix)[i].end = (*to_fix)[i].end + 1;
  }
}
void tesseract::LeftWordAttributes ( const UNICHARSET unicharset,
const WERD_CHOICE werd,
const STRING utf8,
bool *  is_list,
bool *  starts_idea,
bool *  ends_idea 
)

Definition at line 394 of file paragraphs.cpp.

                                                                           {
  // Classifies the leftmost word of a line.
  //
  // unicharset, werd: used as pointers below; either may be NULL, in which
  //                   case the classification falls back to utf8.
  // utf8:             text of the word.
  // is_list:          out; the word looks like a list item marker.
  // starts_idea:      out; the word plausibly starts a new idea.
  // ends_idea:        out; the word plausibly ends an idea.
  // All three outputs are reset to false first.
  *is_list = false;
  *starts_idea = false;
  *ends_idea = false;
  if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) {  // Empty
    *ends_idea = true;
    return;
  }

  if (unicharset && werd) {  // We have a proper werd and unicharset so use it.
    // werd has at least one unichar here (the empty case returned above),
    // so unichar_id(0) is the word's first character.
    if (UniLikelyListItem(unicharset, werd)) {
      *is_list = true;
      *starts_idea = true;
      *ends_idea = true;
    }
    if (unicharset->get_isupper(werd->unichar_id(0))) {
      *starts_idea = true;
    }
    if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
      *starts_idea = true;
      *ends_idea = true;
    }
  } else {  // Assume utf8 is mostly ASCII
    // Only the first byte of utf8 is inspected in this branch.
    if (AsciiLikelyListItem(utf8)) {
      *is_list = true;
      *starts_idea = true;
    }
    int start_letter = utf8[0];
    if (IsOpeningPunct(start_letter)) {
      *starts_idea = true;
    }
    if (IsTerminalPunct(start_letter)) {
      *ends_idea = true;
    }
    if (start_letter >= 'A' && start_letter <= 'Z') {
      *starts_idea = true;
    }
  }
}
bool tesseract::LikelyListMark ( const STRING word)

Definition at line 262 of file paragraphs.cpp.

                                        {
  // Only a single-character word can be a list mark, and that character
  // must be one of the bullet stand-ins in kListMarks.
  if (word.size() != 1)
    return false;
  const char *kListMarks = "0Oo*.,+.";
  const char *hit = strchr(kListMarks, word[0]);
  return hit != NULL;
}

Definition at line 328 of file paragraphs.cpp.

                                   {
  // NOTE(review): the signature of this entry is missing from this listing;
  // the body only shows that it takes a codepoint `ch` and yields a bool.
  //
  // Codepoints below 0x80 are wrapped in a one-character STRING and judged
  // by LikelyListMark(); anything else is a list mark only when it appears
  // in the switch below.
  if (ch < 0x80) {
    STRING single_ch;
    single_ch += ch;
    return LikelyListMark(single_ch);
  }
  switch (ch) {
    // TODO(eger) expand this list of unicodes as needed.
    case 0x00B0:  // degree sign
    case 0x2022:  // bullet
    case 0x25E6:  // white bullet
    case 0x00B7:  // middle dot
    case 0x25A1:  // white square
    case 0x25A0:  // black square
    case 0x25AA:  // black small square
    case 0x2B1D:  // black very small square
    case 0x25BA:  // black right-pointing pointer
    case 0x25CF:  // black circle
    case 0x25CB:  // white circle
      return true;
    default:
      break;  // not a listed mark; leave the switch and return false below
  }
  return false;
}
bool tesseract::LikelyListNumeral ( const STRING word)

Definition at line 228 of file paragraphs.cpp.

                                           {
  // Returns true when the whole of word parses as a list numeral such as
  // "(iv)", "12.", "a)" or "1.2.3": up to three segments, each made of
  // optional opening brackets, a numeral (Roman letters, decimal digits or
  // a single Latin letter), then closing brackets and/or separators.
  const char *kRomans = "ivxlmdIVXLMD";
  const char *kDigits = "0123456789";  // Was missing '6'.
  const char *kOpen = "[{(";
  const char *kSep = ":;-.,";
  const char *kClose = "]})";

  int num_segments = 0;
  const char *pos = word.string();
  while (*pos != '\0' && num_segments < 3) {
    // skip up to two open parens.
    const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
    const char *numeral_end = SkipChars(numeral_start, kRomans);
    if (numeral_end != numeral_start) {
      // Got Roman Numeral. Great.
    } else {
      numeral_end = SkipChars(numeral_start, kDigits);
      if (numeral_end == numeral_start) {
        // If there's a single latin letter, we can use that.
        numeral_end = SkipChars(numeral_start, IsLatinLetter);
        if (numeral_end - numeral_start != 1)
          break;
      }
    }
    // We got some sort of numeral.
    num_segments++;
    // Skip any trailing parens or punctuation.
    pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
    // A segment with nothing after its numeral ends the parse.
    if (pos == numeral_end)
      break;
  }
  // Accept only when every character of word was consumed.
  return *pos == '\0';
}
bool tesseract::LikelyParagraphStart ( const RowScratchRegisters &  before,
const RowScratchRegisters &  after 
)

Definition at line 1672 of file paragraphs.cpp.

                                                            {
  // `after` likely starts a paragraph when `before` is an empty row, or
  // when the first word of `after` would have fit on `before` and the text
  // supports a break between the two.
  if (before.ri_->num_words == 0)
    return true;
  if (!FirstWordWouldHaveFit(before, after))
    return false;
  return TextSupportsBreak(before, after);
}
bool tesseract::LikelyParagraphStart ( const RowScratchRegisters &  before,
const RowScratchRegisters &  after,
tesseract::ParagraphJustification  j 
)

Definition at line 1679 of file paragraphs.cpp.

                                                             {
  return before.ri_->num_words == 0 ||
      (FirstWordWouldHaveFit(before, after, j) &&
       TextSupportsBreak(before, after));
}
bool tesseract::LoadDataFromFile ( const STRING filename,
GenericVector< char > *  data 
) [inline]

Definition at line 330 of file genericvector.h.

                                                        {
  // Reads the whole of filename into *data, followed by one extra 0 byte.
  // Returns false when the file cannot be opened, its size cannot be
  // determined, or fewer bytes than expected are read.
  FILE* fp = fopen(filename.string(), "rb");
  if (fp == NULL) return false;
  bool result = false;
  if (fseek(fp, 0, SEEK_END) == 0) {
    // ftell() reports failure as -1; check it before converting to size_t
    // so that an error is not turned into a huge allocation size.
    long file_size = ftell(fp);
    if (file_size >= 0 && fseek(fp, 0, SEEK_SET) == 0) {
      size_t size = static_cast<size_t>(file_size);
      // Pad with a 0, just in case we treat the result as a string.
      data->init_to_size(size + 1, 0);
      result = fread(&(*data)[0], 1, size, fp) == size;
    }
  }
  fclose(fp);
  return result;
}
ShapeTable * tesseract::LoadShapeTable ( const STRING file_prefix)

Definition at line 119 of file commontraining.cpp.

                                                      {
  // Loads the shape table stored at file_prefix + kShapeTableFileSuffix.
  // Returns a newly allocated ShapeTable, or NULL when the file is absent
  // or cannot be deserialized; a message is printed in every case.
  STRING table_path = file_prefix;
  table_path += kShapeTableFileSuffix;

  FILE* table_fp = fopen(table_path.string(), "rb");
  if (table_fp == NULL) {
    tprintf("Warning: No shape table file present: %s\n",
            table_path.string());
    return NULL;
  }

  ShapeTable* loaded = new ShapeTable;
  if (loaded->DeSerialize(false, table_fp)) {
    int shape_count = loaded->NumShapes();
    tprintf("Read shape table %s of %d shapes\n",
            table_path.string(), shape_count);
  } else {
    delete loaded;
    loaded = NULL;
    tprintf("Error: Failed to read shape table %s\n",
            table_path.string());
  }
  fclose(table_fp);
  return loaded;
}
MasterTrainer * tesseract::LoadTrainingData ( int  argc,
const char *const *  argv,
bool  replication,
ShapeTable **  shape_table,
STRING file_prefix 
)

Definition at line 174 of file commontraining.cpp.

                                                     {
  // Builds a MasterTrainer either from the .tr files named on the command
  // line (FLAGS_T empty) or from a previously serialized trainer (FLAGS_T).
  //
  // argc, argv:  command line, consumed through GetNextFilename().
  // replication: forwarded to the MasterTrainer constructor.
  // shape_table: if non-NULL, receives the loaded shape table, or a flat
  //              one built here when none could be loaded.
  // file_prefix: out; set to "" or to FLAGS_D + "/".
  // Returns the trainer, or NULL on failure (the trainer is deleted first).
  InitFeatureDefs(&feature_defs);
  InitIntegerFX();
  *file_prefix = "";
  if (!FLAGS_D.empty()) {
    *file_prefix += FLAGS_D.c_str();
    *file_prefix += "/";
  }
  // If we are shape clustering (NULL shape_table) or we successfully load
  // a shape_table written by a previous shape clustering, then
  // shape_analysis will be true, meaning that the MasterTrainer will replace
  // some members of the unicharset with their fragments.
  bool shape_analysis = false;
  if (shape_table != NULL) {
    *shape_table = LoadShapeTable(*file_prefix);
    if (*shape_table != NULL)
      shape_analysis = true;
  } else {
    shape_analysis = true;
  }
  MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC,
                                             shape_analysis,
                                             replication,
                                             FLAGS_debug_level);
  IntFeatureSpace fs;
  fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
  if (FLAGS_T.empty()) {
    trainer->LoadUnicharset(FLAGS_U.c_str());
    // Get basic font information from font_properties.
    if (!FLAGS_F.empty()) {
      if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
        delete trainer;
        return NULL;
      }
    }
    if (!FLAGS_X.empty()) {
      if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
        delete trainer;
        return NULL;
      }
    }
    trainer->SetFeatureSpace(fs);
    const char* page_name;
    // Load training data from .tr files on the command line.
    while ((page_name = GetNextFilename(argc, argv)) != NULL) {
      tprintf("Reading %s ...\n", page_name);
      trainer->ReadTrainingSamples(page_name, feature_defs, false);

      // If there is a file with [lang].[fontname].exp[num].fontinfo present,
      // read font spacing information in to fontinfo_table.
      // Built with STRING, exactly like image_name below, instead of a
      // hand-sized char buffer filled with strncpy/strcpy.
      STRING fontinfo_file_name = page_name;
      // Chop off the tr and replace with fontinfo.
      fontinfo_file_name.truncate_at(fontinfo_file_name.length() - 2);
      fontinfo_file_name += "fontinfo";
      trainer->AddSpacingInfo(fontinfo_file_name.string());

      // Load the images into memory if required by the classifier.
      if (FLAGS_load_images) {
        STRING image_name = page_name;
        // Chop off the tr and replace with tif. Extension must be tif!
        image_name.truncate_at(image_name.length() - 2);
        image_name += "tif";
        trainer->LoadPageImages(image_name.string());
      }
    }
    trainer->PostLoadCleanup();
    // Write the master trainer if required.
    if (!FLAGS_output_trainer.empty()) {
      FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
      if (fp == NULL) {
        tprintf("Can't create saved trainer data!\n");
      } else {
        trainer->Serialize(fp);
        fclose(fp);
      }
    }
  } else {
    bool success = false;
    tprintf("Loading master trainer from file:%s\n",
            FLAGS_T.c_str());
    FILE* fp = fopen(FLAGS_T.c_str(), "rb");
    if (fp == NULL) {
      tprintf("Can't read file %s to initialize master trainer\n",
              FLAGS_T.c_str());
    } else {
      success = trainer->DeSerialize(false, fp);
      fclose(fp);
    }
    if (!success) {
      tprintf("Deserialize of master trainer failed!\n");
      delete trainer;
      return NULL;
    }
    trainer->SetFeatureSpace(fs);
  }
  trainer->PreTrainingSetup();
  if (!FLAGS_O.empty() &&
      !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
    fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
    delete trainer;
    return NULL;
  }
  if (shape_table != NULL) {
    // If we previously failed to load a shapetable, then shape clustering
    // wasn't run so make a flat one now.
    if (*shape_table == NULL) {
      *shape_table = new ShapeTable;
      trainer->SetupFlatShapeTable(*shape_table);
      tprintf("Flat shape table summary: %s\n",
              (*shape_table)->SummaryStr().string());
    }
    (*shape_table)->set_unicharset(trainer->unicharset());
  }
  return trainer;
}
bool tesseract::MakeIndividualGlyphs ( Pix *  pix,
const vector< BoxChar * > &  vbox,
const int  input_tiff_page 
)

Definition at line 293 of file text2image.cpp.

                                                     {
  // If checks fail, return false without exiting text2image
  if (!pix) {
    tprintf("ERROR: MakeIndividualGlyphs(): Input Pix* is NULL\n");
    return false;
  } else if (FLAGS_glyph_resized_size <= 0) {
    tprintf("ERROR: --glyph_resized_size must be positive\n");
    return false;
  } else if (FLAGS_glyph_num_border_pixels_to_pad < 0) {
    tprintf("ERROR: --glyph_num_border_pixels_to_pad must be 0 or positive\n");
    return false;
  }

  const int n_boxes = vbox.size();
  int n_boxes_saved = 0;
  int current_tiff_page = 0;
  int y_previous = 0;
  static int glyph_count = 0;
  for (int i = 0; i < n_boxes; i++) {
    // Get one bounding box
    Box* b = vbox[i]->mutable_box();
    if (!b) continue;
    const int x = b->x;
    const int y = b->y;
    const int w = b->w;
    const int h = b->h;
    // Check present tiff page (for multipage tiff)
    if (y < y_previous-pixGetHeight(pix)/10) {
      tprintf("ERROR: Wrap-around encountered, at i=%d\n", i);
      current_tiff_page++;
    }
    if (current_tiff_page < input_tiff_page) continue;
    else if (current_tiff_page > input_tiff_page) break;
    // Check box validity
    if (x < 0 || y < 0 ||
        (x+w-1) >= pixGetWidth(pix) ||
        (y+h-1) >= pixGetHeight(pix)) {
      tprintf("ERROR: MakeIndividualGlyphs(): Index out of range, at i=%d"
              " (x=%d, y=%d, w=%d, h=%d\n)", i, x, y, w, h);
      continue;
    } else if (w < FLAGS_glyph_num_border_pixels_to_pad &&
               h < FLAGS_glyph_num_border_pixels_to_pad) {
      tprintf("ERROR: Input image too small to be a character, at i=%d\n", i);
      continue;
    }
    // Crop the boxed character
    Pix* pix_glyph = pixClipRectangle(pix, b, NULL);
    if (!pix_glyph) {
      tprintf("ERROR: MakeIndividualGlyphs(): Failed to clip, at i=%d\n", i);
      continue;
    }
    // Resize to square
    Pix* pix_glyph_sq = pixScaleToSize(pix_glyph,
                                       FLAGS_glyph_resized_size,
                                       FLAGS_glyph_resized_size);
    if (!pix_glyph_sq) {
      tprintf("ERROR: MakeIndividualGlyphs(): Failed to resize, at i=%d\n", i);
      continue;
    }
    // Zero-pad
    Pix* pix_glyph_sq_pad = pixAddBorder(pix_glyph_sq,
                                         FLAGS_glyph_num_border_pixels_to_pad,
                                         0);
    if (!pix_glyph_sq_pad) {
      tprintf("ERROR: MakeIndividualGlyphs(): Failed to zero-pad, at i=%d\n", i);
      continue;
    }
    // Write out
    Pix* pix_glyph_sq_pad_8 = pixConvertTo8(pix_glyph_sq_pad, false);
    char filename[1024];
    snprintf(filename, 1024, "%s_%d.jpg", FLAGS_outputbase.c_str(),
             glyph_count++);
    if (pixWriteJpeg(filename, pix_glyph_sq_pad_8, 100, 0)) {
      tprintf("ERROR: MakeIndividualGlyphs(): Failed to write JPEG to %s,"
              " at i=%d\n", filename, i);
      continue;
    }

    pixDestroy(&pix_glyph);
    pixDestroy(&pix_glyph_sq);
    pixDestroy(&pix_glyph_sq_pad);
    pixDestroy(&pix_glyph_sq_pad_8);
    n_boxes_saved++;
    y_previous = y;
  }
  if (n_boxes_saved == 0) {
    return false;
  } else {
    tprintf("Total number of characters saved = %d\n", n_boxes_saved);
    return true;
  }
}
void tesseract::MarkRowsWithModel ( GenericVector< RowScratchRegisters > *  rows,
int  row_start,
int  row_end,
const ParagraphModel model,
bool  ltr,
int  eop_threshold 
)

Definition at line 807 of file paragraphs.cpp.

                                          {
  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
    return;
  for (int row = row_start; row < row_end; row++) {
    bool valid_first = ValidFirstLine(rows, row, model);
    bool valid_body = ValidBodyLine(rows, row, model);
    if (valid_first && !valid_body) {
      (*rows)[row].AddStartLine(model);
    } else if (valid_body && !valid_first) {
      (*rows)[row].AddBodyLine(model);
    } else if (valid_body && valid_first) {
      bool after_eop = (row == row_start);
      if (row > row_start) {
        if (eop_threshold > 0) {
          if (model->justification() == JUSTIFICATION_LEFT) {
            after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
          } else {
            after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
          }
        } else {
         after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row],
                                           model->justification());
        }
      }
      if (after_eop) {
        (*rows)[row].AddStartLine(model);
      } else {
        (*rows)[row].AddBodyLine(model);
      }
    } else {
      // Do nothing. Stray row.
    }
  }
}
void tesseract::MarkStrongEvidence ( GenericVector< RowScratchRegisters > *  rows,
int  row_start,
int  row_end 
)

Definition at line 1830 of file paragraphs.cpp.

                                                    {
  // Record patently obvious body text.
  for (int i = row_start + 1; i < row_end; i++) {
    const RowScratchRegisters &prev = (*rows)[i - 1];
    RowScratchRegisters &curr = (*rows)[i];
    tesseract::ParagraphJustification typical_justification =
        prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (!curr.ri_->rword_likely_starts_idea &&
        !curr.ri_->lword_likely_starts_idea &&
        !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
      curr.SetBodyLine();
    }
  }

  // Record patently obvious start paragraph lines.
  //
  // It's an extremely good signal of the start of a paragraph that
  // the first word would have fit on the end of the previous line.
  // However, applying just that signal would have us mark random
  // start lines of lineated text (poetry and source code) and some
  // centered headings as paragraph start lines.  Therefore, we use
  // a second qualification for a paragraph start: Not only should
  // the first word of this line have fit on the previous line,
  // but also, this line should go full to the right of the block,
  // disallowing a subsequent word from having fit on this line.

  // First row:
  {
    RowScratchRegisters &curr = (*rows)[row_start];
    RowScratchRegisters &next = (*rows)[row_start + 1];
    tesseract::ParagraphJustification j =
        curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN &&
        !FirstWordWouldHaveFit(curr, next, j) &&
        (curr.ri_->lword_likely_starts_idea ||
         curr.ri_->rword_likely_starts_idea)) {
      curr.SetStartLine();
    }
  }
  // Middle rows
  for (int i = row_start + 1; i < row_end - 1; i++) {
    RowScratchRegisters &prev = (*rows)[i - 1];
    RowScratchRegisters &curr = (*rows)[i];
    RowScratchRegisters &next = (*rows)[i + 1];
    tesseract::ParagraphJustification j =
        curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN &&
        !FirstWordWouldHaveFit(curr, next, j) &&
        LikelyParagraphStart(prev, curr, j)) {
      curr.SetStartLine();
    }
  }
  // Last row
  {  // the short circuit at the top means we have at least two lines.
    RowScratchRegisters &prev = (*rows)[row_end - 2];
    RowScratchRegisters &curr = (*rows)[row_end - 1];
    tesseract::ParagraphJustification j =
        curr.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
    if (curr.GetLineType() == LT_UNKNOWN &&
        !FirstWordWouldHaveFit(curr, curr, j) &&
        LikelyParagraphStart(prev, curr, j)) {
      curr.SetStartLine();
    }
  }
}
void tesseract::ModelStrongEvidence ( int  debug_level,
GenericVector< RowScratchRegisters > *  rows,
int  row_start,
int  row_end,
bool  allow_flush_models,
ParagraphTheory *  theory 
)

Definition at line 1900 of file paragraphs.cpp.

                                                  {
  // Builds paragraph models for runs of rows in rows[row_start, row_end)
  // that begin at a row already marked LT_START, and attaches the chosen
  // model to every row of each run.
  //   allow_flush_models: when false, a flush model is never added to the
  //     theory; a flush run that begins at row_start is still marked with a
  //     crown model (kCrownLeft / kCrownRight) either way.
  //   theory: receives every newly accepted model through AddModel().
  // Does nothing if AcceptableRowArgs() rejects the arguments.
  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
    return;

  int start = row_start;
  while (start < row_end) {
    // Advance to the next row that is marked as a paragraph start.
    while (start < row_end && (*rows)[start].GetLineType() != LT_START)
      start++;
    // A start line on the very last row (or no start line at all) leaves
    // nothing to extend, so stop.
    if (start >= row_end - 1)
      break;

    int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
    int end = start;
    ParagraphModel last_model;
    bool next_consistent;
    // Grow the candidate paragraph rows[start, end) one row at a time for as
    // long as the next row stays consistent with it.
    do {
      ++end;
      // rows[start, end) was consistent.
      // If rows[start, end + 1) is not consistent,
      //   just model rows[start, end)
      if (end < row_end - 1) {
        RowScratchRegisters &next = (*rows)[end];
        LineType lt = next.GetLineType();
        // A body line always continues the run; an unknown line continues it
        // only when its first word would not have fit on the previous row.
        next_consistent = lt == LT_BODY ||
            (lt == LT_UNKNOWN &&
             !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
      } else {
        next_consistent = false;
      }
      if (next_consistent) {
        ParagraphModel next_model = InternalParagraphModelByOutline(
            rows, start, end + 1, tolerance, &next_consistent);
        // Reject the extension if it would lose the justification that
        // matches the text direction of the first row (left for ltr,
        // right otherwise).
        if (((*rows)[start].ri_->ltr &&
             last_model.justification() == JUSTIFICATION_LEFT &&
             next_model.justification() != JUSTIFICATION_LEFT) ||
            (!(*rows)[start].ri_->ltr &&
             last_model.justification() == JUSTIFICATION_RIGHT &&
             next_model.justification() != JUSTIFICATION_RIGHT)) {
          next_consistent = false;
        }
        last_model = next_model;
      } else {
        next_consistent = false;
      }
    } while (next_consistent && end < row_end);
    // At this point, rows[start, end) looked like it could have been a
    // single paragraph.  If we can make a good ParagraphModel for it,
    // do so and mark this sequence with that model.
    if (end > start + 1) {
      // emit a new paragraph if we have more than one line.
      const ParagraphModel *model = NULL;
      ParagraphModel new_model = ParagraphModelByOutline(
          debug_level, rows, start, end,
          Epsilon(InterwordSpace(*rows, start, end)));
      if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
        // couldn't create a good model, oh well.
      } else if (new_model.is_flush()) {
        if (end == start + 2) {
          // It's very likely we just got two paragraph starts in a row.
          // Shrink the run so the second row is re-examined as a start.
          end = start + 1;
        } else if (start == row_start) {
          // Mark this as a Crown.
          if (new_model.justification() == JUSTIFICATION_LEFT) {
            model = kCrownLeft;
          } else {
            model = kCrownRight;
          }
        } else if (allow_flush_models) {
          model = theory->AddModel(new_model);
        }
      } else {
        model = theory->AddModel(new_model);
      }
      // Only annotate rows when a model was actually selected above.
      if (model) {
        (*rows)[start].AddStartLine(model);
        for (int i = start + 1; i < end; i++) {
          (*rows)[i].AddBodyLine(model);
        }
      }
    }
    // Resume the search at the first row that was not consumed.
    start = end;
  }
}

void tesseract::NormalizeChar32 ( char32  ch,
GenericVector< char32 > *  str 
)

Definition at line 131 of file normstrngs.cpp.

                                                            {
  // Replaces the contents of *str with the NFKC normalization of ch, passing
  // every resulting unit through OCRNormalize.  If the normalized form
  // contains a space, *str is set to just the original ch instead.
  IcuErrorCode icu_status;
  const icu::Normalizer2* normalizer = icu::Normalizer2::getInstance(
      NULL, "nfkc", UNORM2_COMPOSE, icu_status);
  icu_status.assertSuccess();
  icu_status.reset();

  icu::UnicodeString source(static_cast<UChar32>(ch));
  icu::UnicodeString normalized = normalizer->normalize(source, icu_status);
  icu_status.assertSuccess();

  str->clear();
  const int unit_count = normalized.length();
  for (int idx = 0; idx < unit_count; ++idx) {
    if (normalized[idx] == ' ') {
      // NFKC introduced a space: behave as if normalization were a no-op.
      str->clear();
      str->push_back(ch);
      return;
    }
    str->push_back(OCRNormalize(static_cast<char32>(normalized[idx])));
  }
}
uinT8 tesseract::NormalizeDirection ( uinT8  dir,
const FCOORD unnormed_pos,
const DENORM denorm,
const DENORM root_denorm 
)

Definition at line 171 of file intfx.cpp.

                                                                          {
  // Maps a direction code through the normalization transform: a unit step
  // in direction dir is taken from unnormed_pos, both end points are
  // transformed, and the direction of the transformed step is returned.
  FCOORD raw_tip;
  raw_tip.from_direction(dir);
  raw_tip += unnormed_pos;

  FCOORD mapped_base, mapped_tip;
  denorm.NormTransform(root_denorm, unnormed_pos, &mapped_base);
  denorm.NormTransform(root_denorm, raw_tip, &mapped_tip);

  // Back to a vector relative to the transformed start point.
  mapped_tip -= mapped_base;
  return mapped_tip.to_direction();
}
STRING tesseract::NormalizeUTF8String ( const char *  str8)

Definition at line 116 of file normstrngs.cpp.

                                             {
  // Decodes str8 to UTF-32, normalizes it one code point at a time with
  // NormalizeChar32, and returns the re-encoded UTF-8 result.
  GenericVector<char32> decoded, normalized, scratch;
  UTF8ToUTF32(str8, &decoded);

  const int num_chars = decoded.length();
  for (int src = 0; src < num_chars; ++src) {
    scratch.clear();
    NormalizeChar32(decoded[src], &scratch);
    // Append whatever this code point expanded to.
    for (int k = 0; k < scratch.length(); ++k)
      normalized.push_back(scratch[k]);
  }

  STRING encoded;
  UTF32ToUTF8(normalized, &encoded);
  return encoded;
}

char32 tesseract::OCRNormalize ( char32  ch)

Definition at line 156 of file normstrngs.cpp.

                               {
  // Collapses hyphen-like, single-quote-like and double-quote-like code
  // points onto their ASCII forms; anything else is returned unchanged.
  if (is_hyphen_punc(ch)) return '-';
  if (is_single_quote(ch)) return '\'';
  if (is_double_quote(ch)) return '"';
  return ch;
}
int tesseract::OtsuStats ( const int *  histogram,
int *  H_out,
int *  omega0_out 
)

Definition at line 182 of file otsuthr.cpp.

                                                                 {
  // Scans a histogram of kHistogramSize bins for the split point t that
  // maximizes sig_sq_B = (mu_1 - mu_0)^2 * omega_0 * omega_1, where
  // omega_0 / mu_0 are the count and mean of bins [0, t] and omega_1 / mu_1
  // those of the remaining bins.
  //   H_out:      if non-NULL, receives the total count over all bins.
  //   omega0_out: if non-NULL, receives omega_0 at the chosen split.
  // Returns the chosen t, or -1 if no split was evaluated (the loop skips
  // every t while omega_0 is zero and stops once omega_1 reaches zero).
  int H = 0;
  double mu_T = 0.0;
  // H is the total pixel count, mu_T the sum of (bin index * count).
  for (int i = 0; i < kHistogramSize; ++i) {
    H += histogram[i];
    mu_T += static_cast<double>(i) * histogram[i];
  }

  // Now maximize sig_sq_B over t.
  // http://www.ctie.monash.edu.au/hargreave/Cornall_Terry_328.pdf
  int best_t = -1;
  int omega_0, omega_1;
  int best_omega_0 = 0;
  double best_sig_sq_B = 0.0;
  double mu_0, mu_1, mu_t;
  omega_0 = 0;
  mu_t = 0.0;
  for (int t = 0; t < kHistogramSize - 1; ++t) {
    // Running count and weighted sum for bins [0, t].
    omega_0 += histogram[t];
    mu_t += t * static_cast<double>(histogram[t]);
    // Nothing below the split yet: no valid partition.
    if (omega_0 == 0)
      continue;
    omega_1 = H - omega_0;
    // Everything is at or below the split: later t cannot do better.
    if (omega_1 == 0)
      break;
    mu_0 = mu_t / omega_0;
    mu_1 = (mu_T - mu_t) / omega_1;
    double sig_sq_B = mu_1 - mu_0;
    sig_sq_B *= sig_sq_B * omega_0 * omega_1;
    // The first valid split is always taken; later ones must be strictly
    // better.
    if (best_t < 0 || sig_sq_B > best_sig_sq_B) {
      best_sig_sq_B = sig_sq_B;
      best_t = t;
      best_omega_0 = omega_0;
    }
  }
  if (H_out != NULL) *H_out = H;
  if (omega0_out != NULL) *omega0_out = best_omega_0;
  return best_t;
}
int tesseract::OtsuThreshold ( Pix *  src_pix,
int  left,
int  top,
int  width,
int  height,
int **  thresholds,
int **  hi_values 
)

Definition at line 39 of file otsuthr.cpp.

                                                     {
  // Computes a per-channel threshold for the given rectangle of src_pix.
  // Allocates *thresholds and *hi_values with new[] (num_channels entries
  // each); no matching delete[] appears in this function.  A channel whose
  // split is degenerate keeps -1 in both arrays.  hi_values[ch] is set to
  // 0 when more than 75% of the pixels are at or below the threshold, to 1
  // when fewer than 25% are, and otherwise stays -1 unless no channel
  // qualified, in which case the best remaining channel is filled in.
  // Returns the number of channels (pix depth / 8).
  // NOTE(review): if pixGetDepth() returns less than 8, num_channels is 0,
  // the arrays are empty, and the final write to (*hi_values)[best_hi_index]
  // is out of bounds - confirm callers never pass such an image.
  // NOTE(review): the OpenCL and CPU branches repeat the same per-channel
  // decision logic and must be kept in sync by hand.
  int num_channels = pixGetDepth(src_pix) / 8;
  // Of all channels with no good hi_value, keep the best so we can always
  // produce at least one answer.
  PERF_COUNT_START("OtsuThreshold")
  int best_hi_value = 1;
  int best_hi_index = 0;
  bool any_good_hivalue = false;
  double best_hi_dist = 0.0;
  *thresholds = new int[num_channels];
  *hi_values = new int[num_channels];
  // all of channel 0 then all of channel 1...
  int *histogramAllChannels = new int[kHistogramSize * num_channels];

  // only use opencl if compiled w/ OpenCL and selected device is opencl
#ifdef USE_OPENCL
    // Calculate Histogram on GPU
    OpenclDevice od;
    // The GPU path is only taken for 1- or 4-channel images and a rectangle
    // anchored at the top-left corner.
    if (od.selectedDeviceIsOpenCL() &&
        (num_channels == 1 || num_channels == 4) && top == 0 && left == 0 ) {
      od.HistogramRectOCL(
          (const unsigned char*)pixGetData(src_pix),
          num_channels,
          pixGetWpl(src_pix) * 4,
          left,
          top,
          width,
          height,
          kHistogramSize,
          histogramAllChannels);

    // Calculate Threshold from Histogram on cpu
    for (int ch = 0; ch < num_channels; ++ch) {
      (*thresholds)[ch] = -1;
      (*hi_values)[ch] = -1;
      int *histogram = &histogramAllChannels[kHistogramSize * ch];
      int H;
      int best_omega_0;
      int best_t = OtsuStats(histogram, &H, &best_omega_0);
      if (best_omega_0 == 0 || best_omega_0 == H) {
         // This channel is empty.
         continue;
       }
      // To be a convincing foreground we must have a small fraction of H
      // or to be a convincing background we must have a large fraction of H.
      // In between we assume this channel contains no thresholding information.
      int hi_value = best_omega_0 < H * 0.5;
      (*thresholds)[ch] = best_t;
      if (best_omega_0 > H * 0.75) {
        any_good_hivalue = true;
        (*hi_values)[ch] = 0;
      } else if (best_omega_0 < H * 0.25) {
        any_good_hivalue = true;
        (*hi_values)[ch] = 1;
      } else {
        // In case all channels are like this, keep the best of the bad lot.
        double hi_dist = hi_value ? (H - best_omega_0) : best_omega_0;
        if (hi_dist > best_hi_dist) {
          best_hi_dist = hi_dist;
          best_hi_value = hi_value;
          best_hi_index = ch;
        }
      }
    }
  } else {
#endif
    // CPU path: histogram each channel separately, then apply the same
    // decision logic as above.
    for (int ch = 0; ch < num_channels; ++ch) {
      (*thresholds)[ch] = -1;
      (*hi_values)[ch] = -1;
      // Compute the histogram of the image rectangle.
      int histogram[kHistogramSize];
      HistogramRect(src_pix, ch, left, top, width, height, histogram);
      int H;
      int best_omega_0;
      int best_t = OtsuStats(histogram, &H, &best_omega_0);
      if (best_omega_0 == 0 || best_omega_0 == H) {
         // This channel is empty.
         continue;
       }
      // To be a convincing foreground we must have a small fraction of H
      // or to be a convincing background we must have a large fraction of H.
      // In between we assume this channel contains no thresholding information.
      int hi_value = best_omega_0 < H * 0.5;
      (*thresholds)[ch] = best_t;
      if (best_omega_0 > H * 0.75) {
        any_good_hivalue = true;
        (*hi_values)[ch] = 0;
      } else if (best_omega_0 < H * 0.25) {
        any_good_hivalue = true;
        (*hi_values)[ch] = 1;
      } else {
        // In case all channels are like this, keep the best of the bad lot.
        double hi_dist = hi_value ? (H - best_omega_0) : best_omega_0;
        if (hi_dist > best_hi_dist) {
          best_hi_dist = hi_dist;
          best_hi_value = hi_value;
          best_hi_index = ch;
        }
      }
    }
#ifdef USE_OPENCL
  }
#endif  // USE_OPENCL
  delete[] histogramAllChannels;

  if (!any_good_hivalue) {
    // Use the best of the ones that were not good enough.
    (*hi_values)[best_hi_index] = best_hi_value;
  }
  PERF_COUNT_END
  return num_channels;
}
ParagraphModel tesseract::ParagraphModelByOutline ( int  debug_level,
const GenericVector< RowScratchRegisters > *  rows,
int  start,
int  end,
int  tolerance 
)

Definition at line 1793 of file paragraphs.cpp.

                                       {
  // Thin wrapper around InternalParagraphModelByOutline that discards the
  // consistency flag and, at debug_level >= 2, reports the rows for which
  // no justification could be determined.
  bool ignored_consistency;
  ParagraphModel outline_model = InternalParagraphModelByOutline(
      rows, start, end, tolerance, &ignored_consistency);
  if (debug_level >= 2) {
    if (outline_model.justification() == JUSTIFICATION_UNKNOWN) {
      tprintf("Could not determine a model for this paragraph:\n");
      PrintRowRange(*rows, start, end);
    }
  }
  return outline_model;
}
int tesseract::ParamsTrainingFeatureByName ( const char *  name)

Definition at line 26 of file params_training_featdef.cpp.

                                                  {
  // Returns the index in kParamsTrainingFeatureTypeName whose entry equals
  // name, or -1 if name is NULL or no entry matches.  NULL table entries are
  // skipped.
  if (name == NULL) return -1;

  const int num_names = sizeof(kParamsTrainingFeatureTypeName) /
    sizeof(kParamsTrainingFeatureTypeName[0]);
  for (int idx = 0; idx < num_names; ++idx) {
    if (kParamsTrainingFeatureTypeName[idx] != NULL &&
        strcmp(name, kParamsTrainingFeatureTypeName[idx]) == 0) {
      return idx;
    }
  }
  return -1;
}
void tesseract::ParseCommandLineFlags ( const char *  usage,
int *  argc,
char ***  argv,
const bool  remove_flags 
)

Definition at line 312 of file commandlineflags.cpp.

                                                    {
  // Forwards all four arguments unchanged to InitGoogle(); this function
  // adds no processing of its own.
  InitGoogle(usage, argc, argv, remove_flags);
}
double tesseract::prec ( double  x)

Definition at line 52 of file pdfrenderer.cpp.

                      {
  // Rounds x to three decimal places.  Any zero result (including negative
  // zero, which also compares equal to 0) is returned as plain 0.
  const double kScale = 1000.0;
  const double rounded = round(x * kScale) / kScale;
  return (rounded == 0) ? 0 : rounded;
}
bool tesseract::read_info ( FILE *  f,
FontInfo *  fi,
bool  swap 
)

Definition at line 152 of file fontinfo.cpp.

                                                 {
  // Reads a font name (a 32-bit length followed by that many bytes) and the
  // font properties word from f into fi.  When swap is true, the length and
  // the properties are byte-reversed with Reverse32 after reading.
  // Returns false on a short read or an invalid length.  fi->name is pointed
  // at the newly allocated buffer before the name bytes are read, so it is
  // set even when the later reads fail.
  inT32 size;
  if (fread(&size, sizeof(size), 1, f) != 1) return false;
  if (swap)
    Reverse32(&size);
  // A negative length can only come from a corrupt file; refuse it instead
  // of handing a bogus size to new[].
  if (size < 0) return false;
  // Widen before adding 1 so the largest inT32 cannot overflow, and
  // value-initialize so fi->name is a valid C string even if the read below
  // comes up short.
  char* font_name = new char[static_cast<size_t>(size) + 1]();
  fi->name = font_name;
  if (static_cast<int>(fread(font_name, sizeof(*font_name), size, f)) != size)
    return false;
  font_name[size] = '\0';
  if (fread(&fi->properties, sizeof(fi->properties), 1, f) != 1) return false;
  if (swap)
    Reverse32(&fi->properties);
  return true;
}
bool tesseract::read_set ( FILE *  f,
FontSet *  fs,
bool  swap 
)

Definition at line 238 of file fontinfo.cpp.

                                               {
  // Reads a FontSet from f: a size field followed by that many config
  // entries.  When swap is true, every value is byte-reversed with Reverse32
  // after reading.  fs->configs is allocated here with new[].
  // Returns false on a short read or a negative size.
  if (fread(&fs->size, sizeof(fs->size), 1, f) != 1) return false;
  if (swap)
    Reverse32(&fs->size);
  // A negative count can only come from a corrupt file; refuse it instead of
  // handing a bogus size to new[].
  if (fs->size < 0) return false;
  fs->configs = new int[fs->size];
  for (int i = 0; i < fs->size; ++i) {
    if (fread(&fs->configs[i], sizeof(fs->configs[i]), 1, f) != 1) return false;
    if (swap)
      Reverse32(&fs->configs[i]);
  }
  return true;
}
bool tesseract::read_spacing_info ( FILE *  f,
FontInfo *  fi,
bool  swap 
)

Definition at line 177 of file fontinfo.cpp.

                                                         {
  // Reads the spacing table of fi from f.  The table starts with an entry
  // count; each entry holds x_gap_before, x_gap_after and a kerning size,
  // followed (when the kerning size is positive) by the serialized kerned
  // unichar ids and gaps.  A negative kerning size marks an empty slot.
  // When swap is true, the values are byte-reversed after reading.
  // Returns false on any read failure; entries already handed to
  // fi->add_spacing() stay with fi.
  inT32 vec_size, kern_size;
  if (fread(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
  if (swap) Reverse32(&vec_size);
  ASSERT_HOST(vec_size >= 0);
  if (vec_size == 0) return true;
  fi->init_spacing(vec_size);
  for (int i = 0; i < vec_size; ++i) {
    FontSpacingInfo *fs = new FontSpacingInfo();
    if (fread(&fs->x_gap_before, sizeof(fs->x_gap_before), 1, f) != 1 ||
        fread(&fs->x_gap_after, sizeof(fs->x_gap_after), 1, f) != 1 ||
        fread(&kern_size, sizeof(kern_size), 1, f) != 1) {
      // fs has not been handed to fi yet, so it must be released here.
      delete fs;
      return false;
    }
    if (swap) {
      ReverseN(&(fs->x_gap_before), sizeof(fs->x_gap_before));
      ReverseN(&(fs->x_gap_after), sizeof(fs->x_gap_after));
      Reverse32(&kern_size);
    }
    if (kern_size < 0) {  // indication of a NULL entry in fi->spacing_vec
      delete fs;
      continue;
    }
    if (kern_size > 0 && (!fs->kerned_unichar_ids.DeSerialize(swap, f) ||
                          !fs->kerned_x_gaps.DeSerialize(swap, f))) {
      // Same as above: still owned locally on this failure path.
      delete fs;
      return false;
    }
    fi->add_spacing(i, fs);
  }
  return true;
}
bool tesseract::read_t ( PAGE_RES_IT page_res_it,
TBOX tbox 
)

Definition at line 53 of file recogtraining.cpp.

                                                  {
  // Advances page_res_it to the next word, stores that word's bounding box
  // in *tbox and steps the iterator past it.  Returns false (leaving *tbox
  // untouched) when no further word exists.
  while (page_res_it->block() != NULL && page_res_it->word() == NULL) {
    page_res_it->forward();
  }

  if (page_res_it->word() == NULL) {
    return false;
  }

  *tbox = page_res_it->word()->word->bounding_box();
  page_res_it->forward();

  // If tbox->left() is negative, the training image has vertical text and
  // all the coordinates of bounding boxes of page_res are rotated by 90
  // degrees in a counterclockwise direction. We need to rotate the TBOX back
  // in order to compare with the TBOXes of box files.
  if (tbox->left() < 0) {
    tbox->rotate(FCOORD(0.0, -1.0));
  }
  return true;
}
void tesseract::RecomputeMarginsAndClearHypotheses ( GenericVector< RowScratchRegisters > *  rows,
int  start,
int  end,
int  percentile 
)

Definition at line 1558 of file paragraphs.cpp.

                    {
  // Resets every row in rows[start, end) with SetUnknown() and re-splits
  // each row's left/right offset into margin and indent so that all rows
  // share one margin value per side.  The shared margin is the given
  // percentile (clipped to [0, 100]) of (margin + indent) over the rows that
  // contain words.  Does nothing if AcceptableRowArgs() rejects the
  // arguments.
  if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
    return;

  // First pass: clear hypotheses and find the range of total offsets.
  int lmin, lmax, rmin, rmax;
  lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
  rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
  for (int i = start; i < end; i++) {
    RowScratchRegisters &sr = (*rows)[i];
    sr.SetUnknown();
    // Rows without words are reset but do not influence the statistics.
    if (sr.ri_->num_words == 0)
      continue;
    UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
    UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
  }
  // Second pass: histogram the offsets of the rows that contain words.
  STATS lefts(lmin, lmax + 1);
  STATS rights(rmin, rmax + 1);
  for (int i = start; i < end; i++) {
    RowScratchRegisters &sr = (*rows)[i];
    if (sr.ri_->num_words == 0)
      continue;
    lefts.add(sr.lmargin_ + sr.lindent_, 1);
    rights.add(sr.rmargin_ + sr.rindent_, 1);
  }
  int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
  int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
  // Third pass: move each margin to the shared value and shift the indent
  // by the opposite amount, so margin + indent is unchanged for every row.
  for (int i = start; i < end; i++) {
    RowScratchRegisters &sr = (*rows)[i];
    int ldelta = ignorable_left - sr.lmargin_;
    sr.lmargin_ += ldelta;
    sr.lindent_ -= ldelta;
    int rdelta = ignorable_right - sr.rmargin_;
    sr.rmargin_ += rdelta;
    sr.rindent_ -= rdelta;
  }
}
void tesseract::RightWordAttributes ( const UNICHARSET unicharset,
const WERD_CHOICE werd,
const STRING utf8,
bool *  is_list,
bool *  starts_idea,
bool *  ends_idea 
)

Definition at line 441 of file paragraphs.cpp.

                                                                            {
  // Fills the three output flags for a word.  An empty word only sets
  // *ends_idea.  Otherwise the word is tested for being a list item (which
  // also sets *starts_idea) and its last symbol is tested for punctuation
  // (which sets *ends_idea).  The unicharset-based tests are used when both
  // unicharset and werd are available; the ASCII-based ones otherwise.
  *is_list = false;
  *starts_idea = false;
  *ends_idea = false;

  if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) {  // Empty
    *ends_idea = true;
    return;
  }

  const bool have_unichars = unicharset && werd;
  const bool list_item = have_unichars
      ? UniLikelyListItem(unicharset, werd)
      : AsciiLikelyListItem(utf8);  // Assume utf8 is mostly ASCII
  if (list_item) {
    *is_list = true;
    *starts_idea = true;
  }

  if (have_unichars) {
    const UNICHAR_ID final_id = werd->unichar_id(werd->length() - 1);
    if (unicharset->get_ispunctuation(final_id)) {
      *ends_idea = true;
    }
  } else {
    const int final_char = utf8[utf8.size() - 1];
    if (IsOpeningPunct(final_char) || IsTerminalPunct(final_char)) {
      *ends_idea = true;
    }
  }
}
bool tesseract::RowIsStranded ( const GenericVector< RowScratchRegisters > &  rows,
int  row 
)

Definition at line 2139 of file paragraphs.cpp.

                                                                            {
  // Returns false as soon as one of the row's strong hypothesis models also
  // explains enough neighbouring rows: the contiguous run of rows around
  // `row` that have a START, BODY or MULTIPLE line type for that model must
  // be longer than 2, or longer than 1 when it is not made up of start
  // lines only.  Returns true if no model reaches that run length.
  SetOfModels row_models;
  rows[row].StrongHypotheses(&row_models);

  for (int m = 0; m < row_models.size(); m++) {
    // NOTE(review): all_starts is initialized from GetLineType() converted
    // to bool, i.e. it is true for any non-zero line type - confirm this is
    // the intended starting value.
    bool all_starts = rows[row].GetLineType();
    int run_length = 1;
    bool continues = true;
    // Walk upwards from the row while the model keeps explaining the rows.
    for (int i = row - 1; i >= 0 && continues; i--) {
      // NOTE(review): `models` is filled here but not read again in this
      // function.
      SetOfModels models;
      rows[i].NonNullHypotheses(&models);
      switch (rows[i].GetLineType(row_models[m])) {
        case LT_START: run_length++; break;
        case LT_MULTIPLE:  // explicit fall-through
        case LT_BODY: run_length++; all_starts = false; break;
        case LT_UNKNOWN:  // explicit fall-through
        default: continues = false;
      }
    }
    continues = true;
    // Then walk downwards in the same way.
    for (int i = row + 1; i < rows.size() && continues; i++) {
      SetOfModels models;
      rows[i].NonNullHypotheses(&models);
      switch (rows[i].GetLineType(row_models[m])) {
        case LT_START: run_length++; break;
        case LT_MULTIPLE:  // explicit fall-through
        case LT_BODY: run_length++; all_starts = false; break;
        case LT_UNKNOWN:  // explicit fall-through
        default: continues = false;
      }
    }
    if (run_length > 2 || (!all_starts && run_length > 1)) return false;
  }
  return true;
}
bool tesseract::RowsFitModel ( const GenericVector< RowScratchRegisters > *  rows,
int  start,
int  end,
const ParagraphModel model 
)

Definition at line 1808 of file paragraphs.cpp.

                                                                   {
  // True when rows[start] is a valid first line for model and every row in
  // rows[start + 1, end) is a valid body line for it.  Returns false if
  // AcceptableRowArgs() rejects the arguments.
  if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) {
    return false;
  }
  bool fits = ValidFirstLine(rows, start, model);
  // Stop checking body lines at the first one that does not fit.
  for (int body = start + 1; fits && body < end; body++) {
    fits = ValidBodyLine(rows, body, model);
  }
  return fits;
}
STRING tesseract::RtlEmbed ( const STRING word,
bool  rtlify 
)

Definition at line 121 of file paragraphs.cpp.

                                                 {
  // Returns word unchanged unless rtlify is set, in which case the word is
  // wrapped between kRLE and kPDF.
  if (!rtlify) {
    return word;
  }
  return STRING(kRLE) + word + STRING(kPDF);
}
bool tesseract::SaveDataToFile ( const GenericVector< char > &  data,
const STRING filename 
) [inline]

Definition at line 345 of file genericvector.h.

                                                  {
  // Writes the bytes of data to filename (opened in "wb" mode).  Returns
  // true only if the file could be opened and fwrite reported writing
  // data.size() bytes.
  FILE* out = fopen(filename.string(), "wb");
  if (out == NULL) {
    return false;
  }
  const int written =
      static_cast<int>(fwrite(&data[0], 1, data.size(), out));
  fclose(out);
  return written == data.size();
}
const char * tesseract::ScriptPosToString ( enum ScriptPos  script_pos)

Definition at line 181 of file ratngs.cpp.

                                                         {
  // Short printable name for a ScriptPos value; any value not listed here
  // yields "SP_UNKNOWN".
  if (script_pos == SP_NORMAL) return "NORM";
  if (script_pos == SP_SUBSCRIPT) return "SUB";
  if (script_pos == SP_SUPERSCRIPT) return "SUPER";
  if (script_pos == SP_DROPCAP) return "DROPC";
  return "SP_UNKNOWN";
}
void tesseract::SeparateSimpleLeaderLines ( GenericVector< RowScratchRegisters > *  rows,
int  row_start,
int  row_end,
ParagraphTheory *  theory 
)

Definition at line 2025 of file paragraphs.cpp.

                                                        {
  // For every interior row whose two neighbours and itself all have
  // leaders, adds a model with JUSTIFICATION_UNKNOWN to the theory and marks
  // the row as a start line of that model.
  for (int mid = row_start + 1; mid < row_end - 1; mid++) {
    const bool leaders_on_all_three = (*rows)[mid - 1].ri_->has_leaders &&
                                      (*rows)[mid].ri_->has_leaders &&
                                      (*rows)[mid + 1].ri_->has_leaders;
    if (!leaders_on_all_three) {
      continue;
    }
    (*rows)[mid].AddStartLine(theory->AddModel(
        ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0)));
  }
}
const char* tesseract::SkipChars ( const char *  str,
const char *  toskip 
)

Definition at line 210 of file paragraphs.cpp.

                                                           {
  // Returns a pointer to the first character of str that is not contained in
  // toskip, or to the terminating NUL if every character is.
  const char *cursor = str;
  for (; *cursor != '\0'; cursor++) {
    if (!strchr(toskip, *cursor)) break;
  }
  return cursor;
}
const char* tesseract::SkipChars ( const char *  str,
bool(*)(int)  skip 
)

Definition at line 215 of file paragraphs.cpp.

                                                          {
  // Returns a pointer to the first character of str for which skip() is
  // false, or to the terminating NUL if there is none.
  const char *cursor = str;
  for (; *cursor != '\0'; cursor++) {
    if (!skip(*cursor)) break;
  }
  return cursor;
}
const char* tesseract::SkipOne ( const char *  str,
const char *  toskip 
)

Definition at line 220 of file paragraphs.cpp.

                                                         {
  // Steps over at most one leading character of str, and only if that
  // character is contained in toskip.
  if (*str == '\0') {
    return str;
  }
  return strchr(toskip, *str) ? str + 1 : str;
}
template<typename T >
int tesseract::sort_cmp ( const void *  t1,
const void *  t2 
)

Definition at line 365 of file genericvector.h.

                                             {
  // Three-way comparison of two T objects passed by address, using only
  // operator<: -1 if *t1 < *t2, 1 if *t2 < *t1, 0 otherwise.
  const T& lhs = *static_cast<const T *> (t1);
  const T& rhs = *static_cast<const T *> (t2);
  if (lhs < rhs) return -1;
  if (rhs < lhs) return 1;
  return 0;
}
template<typename T >
int tesseract::sort_ptr_cmp ( const void *  t1,
const void *  t2 
)

Definition at line 382 of file genericvector.h.

                                                 {
  // Same three-way comparison as for plain objects, but t1 and t2 point at
  // T* elements, so each is dereferenced twice.
  const T& lhs = **reinterpret_cast<T * const *>(t1);
  const T& rhs = **reinterpret_cast<T * const *>(t2);
  if (lhs < rhs) return -1;
  if (rhs < lhs) return 1;
  return 0;
}
template<class BBC >
int tesseract::SortByBoxBottom ( const void *  void1,
const void *  void2 
)

Definition at line 408 of file bbgrid.h.

                                                          {
  // Orders by bounding-box bottom, breaking ties by top, then left, then
  // right.  The arguments point at BBC* elements.
  const BBC* p1 = *reinterpret_cast<const BBC* const *>(void1);
  const BBC* p2 = *reinterpret_cast<const BBC* const *>(void2);
  int delta = p1->bounding_box().bottom() - p2->bounding_box().bottom();
  if (delta == 0)
    delta = p1->bounding_box().top() - p2->bounding_box().top();
  if (delta == 0)
    delta = p1->bounding_box().left() - p2->bounding_box().left();
  if (delta == 0)
    delta = p1->bounding_box().right() - p2->bounding_box().right();
  return delta;
}
template<class BBC >
int tesseract::SortByBoxLeft ( const void *  void1,
const void *  void2 
)

Definition at line 372 of file bbgrid.h.

                                                        {
  // Orders by bounding-box left edge, breaking ties by right, then bottom,
  // then top.  The arguments point at BBC* elements.
  const BBC* p1 = *reinterpret_cast<const BBC* const *>(void1);
  const BBC* p2 = *reinterpret_cast<const BBC* const *>(void2);
  int delta = p1->bounding_box().left() - p2->bounding_box().left();
  if (delta == 0)
    delta = p1->bounding_box().right() - p2->bounding_box().right();
  if (delta == 0)
    delta = p1->bounding_box().bottom() - p2->bounding_box().bottom();
  if (delta == 0)
    delta = p1->bounding_box().top() - p2->bounding_box().top();
  return delta;
}
template<class BLOB_CHOICE >
int tesseract::SortByRating ( const void *  void1,
const void *  void2 
)

Definition at line 84 of file pieces.cpp.

                                                       {
  // Comparator on elements that point at BLOB_CHOICE objects: the element
  // with the higher rating() sorts first.  Equal ratings now compare as
  // equal (0); previously they returned -1 in both argument orders, which is
  // not a consistent ordering for qsort-style sorting.
  const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1);
  const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2);

  if (p1->rating() < p2->rating())
    return 1;
  if (p2->rating() < p1->rating())
    return -1;
  return 0;
}
template<class BLOB_CHOICE >
int tesseract::SortByUnicharID ( const void *  void1,
const void *  void2 
)

Definition at line 76 of file pieces.cpp.

                                                          {
  // Comparator on elements that point at BLOB_CHOICE objects: ascending
  // unichar_id(), returned as the difference of the two ids.
  const BLOB_CHOICE *lhs =
      *reinterpret_cast<const BLOB_CHOICE * const *>(void1);
  const BLOB_CHOICE *rhs =
      *reinterpret_cast<const BLOB_CHOICE * const *>(void2);
  return lhs->unichar_id() - rhs->unichar_id();
}
template<class BBC >
int tesseract::SortRightToLeft ( const void *  void1,
const void *  void2 
)

Definition at line 390 of file bbgrid.h.

                                                          {
  // Orders by bounding-box right edge, largest first, breaking ties by left
  // edge (largest first), then bottom and top (smallest first).  The
  // arguments point at BBC* elements.
  const BBC* p1 = *reinterpret_cast<const BBC* const *>(void1);
  const BBC* p2 = *reinterpret_cast<const BBC* const *>(void2);
  int delta = p2->bounding_box().right() - p1->bounding_box().right();
  if (delta == 0)
    delta = p2->bounding_box().left() - p1->bounding_box().left();
  if (delta == 0)
    delta = p1->bounding_box().bottom() - p2->bounding_box().bottom();
  if (delta == 0)
    delta = p1->bounding_box().top() - p2->bounding_box().top();
  return delta;
}
int tesseract::SpanUTF8NotWhitespace ( const char *  text)

Definition at line 197 of file normstrngs.cpp.

                                            {
  // Returns the number of bytes at the start of text that precede the first
  // whitespace code point (the whole length if there is none).
  int n_notwhite = 0;
  // Measure the string and build the end iterator once; recomputing them in
  // the loop condition made the scan quadratic in the length of text.
  const int text_len = strlen(text);
  UNICHAR::const_iterator text_end = UNICHAR::end(text, text_len);
  for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_len);
       it != text_end;
       ++it) {
    if (IsWhitespace(*it)) break;
    n_notwhite += it.utf8_len();
  }
  return n_notwhite;
}
int tesseract::SpanUTF8Whitespace ( const char *  text)

Definition at line 186 of file normstrngs.cpp.

                                         {
  // Returns the number of bytes at the start of text that consist solely of
  // whitespace code points.
  int n_white = 0;
  // Measure the string and build the end iterator once; recomputing them in
  // the loop condition made the scan quadratic in the length of text.
  const int text_len = strlen(text);
  UNICHAR::const_iterator text_end = UNICHAR::end(text, text_len);
  for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_len);
       it != text_end;
       ++it) {
    if (!IsWhitespace(*it)) break;
    n_white += it.utf8_len();
  }
  return n_white;
}
void tesseract::StrongEvidenceClassify ( int  debug_level,
GenericVector< RowScratchRegisters > *  rows,
int  row_start,
int  row_end,
ParagraphTheory *  theory 
)

Definition at line 1995 of file paragraphs.cpp.

                                                     {
  // Runs the strong-evidence pipeline over rows[row_start, row_end):
  // recompute margins and reset hypotheses, mark strong evidence, build
  // models from that evidence, then smear the resulting hypotheses.
  // Does nothing if AcceptableRowArgs() rejects the arguments.
  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
    return;

  if (debug_level > 1) {
    tprintf("#############################################\n");
    tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
    tprintf("#############################################\n");
  }

  // Margins are recomputed at the 10th percentile.
  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
  MarkStrongEvidence(rows, row_start, row_end);

  DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);

  // Create paragraph models.  Flush models are not allowed in this pass
  // (allow_flush_models is false).
  ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);

  DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows);

  // At this point, some rows are marked up as paragraphs with model numbers,
  // and some rows are marked up as either LT_START or LT_BODY.  Now let's
  // smear any good paragraph hypotheses forward and backward.
  ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
  smearer.Smear();
}
bool tesseract::StrongModel ( const ParagraphModel model) [inline]

Definition at line 75 of file paragraphs_internal.h.

                                                     {
  // A model counts as strong when it is neither NULL nor one of the two
  // crown placeholders.
  return !(model == NULL || model == kCrownLeft || model == kCrownRight);
}
bool tesseract::TextSupportsBreak ( const RowScratchRegisters &  before,
const RowScratchRegisters &  after 
)

Definition at line 1661 of file paragraphs.cpp.

                                                         {
  // True when the last word of `before` likely ends an idea and the first
  // word of `after` likely starts one.  Which side holds the last / first
  // word is chosen from the ltr flag of `before`.
  const bool ltr = before.ri_->ltr;
  const bool prev_ends = ltr ? before.ri_->rword_likely_ends_idea
                             : before.ri_->lword_likely_ends_idea;
  if (!prev_ends) {
    return false;
  }
  return ltr ? after.ri_->lword_likely_starts_idea
             : after.ri_->rword_likely_starts_idea;
}
Pix * tesseract::TraceBlockOnReducedPix ( BLOCK block,
int  gridsize,
ICOORD  bleft,
int *  left,
int *  bottom 
)

Definition at line 258 of file bbgrid.cpp.

                                                                  {
  // Obtains a reduced-resolution Pix for the block's bounding box from
  // GridReducedPix() and sets a bit in it for every grid cell visited while
  // walking the edges of the block's polygon.  *left and *bottom are passed
  // to GridReducedPix() and then read back here as the grid offsets of the
  // pix.  Returns the pix.
  TBOX box = block->bounding_box();
  Pix* pix = GridReducedPix(box, gridsize, bleft, left, bottom);
  int wpl = pixGetWpl(pix);
  l_uint32* data = pixGetData(pix);
  ICOORDELT_IT it(block->poly_block()->points());
  // The iterator is advanced inside the body, so each pass handles the edge
  // from the current vertex to the next one.
  for (it.mark_cycle_pt(); !it.cycled_list();) {
    ICOORD pos = *it.data();
    it.forward();
    ICOORD next_pos = *it.data();
    ICOORD line_vector = next_pos - pos;
    int major, minor;
    ICOORD major_step, minor_step;
    line_vector.setup_render(&major_step, &minor_step, &major, &minor);
    // Step along the major direction, adding a minor step each time the
    // accumulated minor amount reaches `major`.
    int accumulator = major / 2;
    while (pos != next_pos) {
      // Convert the position to a cell of the reduced pix and mark it.
      int grid_x = (pos.x() - bleft.x()) / gridsize - *left;
      int grid_y = (pos.y() - bleft.y()) / gridsize - *bottom;
      SET_DATA_BIT(data + grid_y * wpl, grid_x);
      pos += major_step;
      accumulator += minor;
      if (accumulator >= major) {
        accumulator -= major;
        pos += minor_step;
      }
    }
  }
  return pix;
}
Pix * tesseract::TraceOutlineOnReducedPix ( C_OUTLINE outline,
int  gridsize,
ICOORD  bleft,
int *  left,
int *  bottom 
)

Definition at line 232 of file bbgrid.cpp.

                                                                    {
  // Obtains a reduced-resolution Pix for the outline's bounding box from
  // GridReducedPix() and sets a bit in it for the grid cell of every point
  // along the outline's path.  Returns the pix.
  TBOX outline_box = outline->bounding_box();
  Pix* reduced = GridReducedPix(outline_box, gridsize, bleft, left, bottom);
  const int words_per_line = pixGetWpl(reduced);
  l_uint32* raster = pixGetData(reduced);
  const int num_steps = outline->pathlength();
  ICOORD cursor = outline->start_pos();
  for (int step = 0; step < num_steps; ++step) {
    // Convert the current path position to a cell of the reduced pix.
    const int col = (cursor.x() - bleft.x()) / gridsize - *left;
    const int row = (cursor.y() - bleft.y()) / gridsize - *bottom;
    SET_DATA_BIT(raster + row * words_per_line, col);
    cursor += outline->step(step);
  }
  return reduced;
}
int tesseract::UnicodeFor ( const UNICHARSET u,
const WERD_CHOICE werd,
int  pos 
)

Definition at line 274 of file paragraphs.cpp.

                                                                      {
  // Returns the first code point of the unichar at position pos of werd, or
  // 0 if u or werd is NULL or pos is not a valid index into werd.
  // Valid indices are [0, length()): the previous check (pos > length())
  // let pos == length() and negative values through to unichar_id().
  if (!u || !werd || pos < 0 || pos >= werd->length())
    return 0;
  return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
}
bool tesseract::UniLikelyListItem ( const UNICHARSET *  u,
const WERD_CHOICE *  werd 
)

Definition at line 357 of file paragraphs.cpp.

                                                                     {
  // Returns true when werd looks like a list-item label: either a single
  // unichar accepted by LikelyListMarkUnicode, or up to three "numeral"
  // segments (romans, digits, or exactly one alpha) that are each followed by
  // punctuation and together consume the whole word.
  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0)))
    return true;

  UnicodeSpanSkipper skipper(u, werd);
  int cursor = 0;
  for (int segments = 0; cursor < werd->length() && segments < 3; ++segments) {
    // At most one leading punctuation mark is tolerated before the numeral.
    int start = skipper.SkipPunc(cursor);
    if (start > cursor + 1) break;

    // Try romans, then digits, then a single letter, in that order.
    int end = skipper.SkipRomans(start);
    if (end == start)
      end = skipper.SkipDigits(start);
    if (end == start) {
      end = skipper.SkipAlpha(start);
      if (end - start != 1)
        break;
    }

    // A numeral must be followed by punctuation to count as a segment
    // boundary; otherwise stop with cursor already moved to the numeral end.
    cursor = skipper.SkipPunc(end);
    if (cursor == end)
      break;
  }
  return cursor == werd->length();
}
void tesseract::UTF32ToUTF8 ( const GenericVector< char32 > &  str32,
STRING *  utf8_str 
)

Definition at line 45 of file normstrngs.cpp.

                                                                       {
  // Rebuilds *utf8_str as the concatenation of the UTF-8 form of every code
  // point in str32. Code points whose utf8_str() comes back NULL are skipped.
  utf8_str->ensure(str32.length());
  utf8_str->assign("", 0);
  const int count = str32.length();
  for (int idx = 0; idx < count; ++idx) {
    UNICHAR code_point(str32[idx]);
    char *encoded = code_point.utf8_str();
    if (encoded == NULL)
      continue;
    (*utf8_str) += encoded;
    // The buffer handed back by utf8_str() is released here with delete[].
    delete[] encoded;
  }
}
void tesseract::UTF8ToUTF32 ( const char *  utf8_str,
GenericVector< char32 > *  str32 
)

Definition at line 31 of file normstrngs.cpp.

                                                                     {
  // Decodes the NUL-terminated string utf8_str into *str32, one entry per
  // decoded character. *str32 is cleared first.
  //
  // Bytes for which UNICHAR::utf8_step() reports a non-positive length are
  // skipped one at a time. Advancing by the reported step in that case would
  // leave the index unchanged and the loop would never terminate.
  str32->clear();
  const int len = strlen(utf8_str);
  str32->reserve(len);
  int ch = 0;
  while (ch < len) {
    const int step = UNICHAR::utf8_step(utf8_str + ch);
    if (step > 0) {
      UNICHAR uni_ch(utf8_str + ch, step);
      (*str32) += uni_ch.first_uni();
      ch += step;
    } else {
      // Not a usable lead byte: drop it and resynchronise on the next byte.
      ++ch;
    }
  }
}
bool tesseract::ValidBodyLine ( const GenericVector< RowScratchRegisters > *  rows,
int  row,
const ParagraphModel *  model 
)

Definition at line 1277 of file paragraphs.cpp.

                                                         {
  // Asks model whether row `row` of *rows has body-line geometry (margins and
  // indents). Models rejected by StrongModel() are reported and yield false.
  if (!StrongModel(model)) {
    tprintf("ValidBodyLine() should only be called with strong models!\n");
    return false;
  }
  // Only index into rows once the model has been accepted.
  const RowScratchRegisters &line = (*rows)[row];
  return model->ValidBodyLine(line.lmargin_, line.lindent_,
                              line.rindent_, line.rmargin_);
}
bool tesseract::ValidFirstLine ( const GenericVector< RowScratchRegisters > *  rows,
int  row,
const ParagraphModel *  model 
)

Definition at line 1266 of file paragraphs.cpp.

                                                          {
  // Asks model whether row `row` of *rows has first-line geometry (margins
  // and indents). Models rejected by StrongModel() are reported and yield
  // false.
  if (!StrongModel(model)) {
    tprintf("ValidFirstLine() should only be called with strong models!\n");
    return false;
  }
  // Only index into rows once the model has been accepted.
  const RowScratchRegisters &line = (*rows)[row];
  return model->ValidFirstLine(line.lmargin_, line.lindent_,
                               line.rindent_, line.rmargin_);
}
bool tesseract::write_info ( FILE *  f,
const FontInfo &  fi 
)

Definition at line 168 of file fontinfo.cpp.

                                             {
  // Writes fi to f as: 32-bit name length, the name bytes (no terminator),
  // then the raw properties field. Returns false as soon as a write comes up
  // short; later fields are then not attempted.
  inT32 name_len = strlen(fi.name);
  const bool wrote_all =
      fwrite(&name_len, sizeof(name_len), 1, f) == 1 &&
      static_cast<int>(fwrite(fi.name, sizeof(*fi.name), name_len, f)) ==
          name_len &&
      fwrite(&fi.properties, sizeof(fi.properties), 1, f) == 1;
  return wrote_all;
}
bool tesseract::write_set ( FILE *  f,
const FontSet &  fs 
)

Definition at line 251 of file fontinfo.cpp.

                                           {
  // Writes fs to f as: the size field, followed by each of the fs.size
  // entries of fs.configs in order. Returns false on the first failed write.
  if (fwrite(&fs.size, sizeof(fs.size), 1, f) != 1)
    return false;
  int index = 0;
  while (index < fs.size) {
    if (fwrite(&fs.configs[index], sizeof(fs.configs[index]), 1, f) != 1)
      return false;
    ++index;
  }
  return true;
}
bool tesseract::write_spacing_info ( FILE *  f,
const FontInfo &  fi 
)

Definition at line 209 of file fontinfo.cpp.

                                                     {
  // Writes the spacing vector of fi to f: a 32-bit entry count (0 when there
  // is no vector), then per entry the two x-gaps, a 32-bit kerning count and,
  // when that count is positive, the serialized kerning tables.
  // A NULL entry is written as two -1 gaps and a kerning count of -1.
  // Returns false on the first failed write.
  inT32 entry_count = 0;
  if (fi.spacing_vec != NULL)
    entry_count = fi.spacing_vec->size();
  if (fwrite(&entry_count, sizeof(entry_count), 1, f) != 1)
    return false;

  inT16 invalid_gap = -1;
  for (int idx = 0; idx < entry_count; ++idx) {
    FontSpacingInfo *info = fi.spacing_vec->get(idx);
    if (info == NULL) {
      // Placeholder record: the same invalid gap is deliberately written
      // twice (before/after), and there is no kerning data to follow.
      inT32 kern_count = -1;
      if (fwrite(&invalid_gap, sizeof(invalid_gap), 1, f) != 1) return false;
      if (fwrite(&invalid_gap, sizeof(invalid_gap), 1, f) != 1) return false;
      if (fwrite(&kern_count, sizeof(kern_count), 1, f) != 1) return false;
      continue;
    }

    inT32 kern_count = info->kerned_x_gaps.size();
    if (fwrite(&(info->x_gap_before), sizeof(info->x_gap_before), 1, f) != 1)
      return false;
    if (fwrite(&(info->x_gap_after), sizeof(info->x_gap_after), 1, f) != 1)
      return false;
    if (fwrite(&kern_count, sizeof(kern_count), 1, f) != 1)
      return false;

    if (kern_count > 0) {
      if (!info->kerned_unichar_ids.Serialize(f)) return false;
      if (!info->kerned_x_gaps.Serialize(f)) return false;
    }
  }
  return true;
}
void tesseract::WriteShapeTable ( const STRING file_prefix,
const ShapeTable shape_table 
)

Definition at line 145 of file commontraining.cpp.

                                                                               {
  // Serializes shape_table into the file named file_prefix followed by
  // kShapeTableFileSuffix. Failures to create or to write the file are
  // reported on stderr; nothing is returned to the caller.
  STRING path = file_prefix;
  path += kShapeTableFileSuffix;
  FILE* out = fopen(path.string(), "wb");
  if (out == NULL) {
    fprintf(stderr, "Error creating shape table: %s\n",
            path.string());
    return;
  }
  if (!shape_table.Serialize(out)) {
    fprintf(stderr, "Error writing shape table: %s\n",
            path.string());
  }
  // The file is closed whether or not serialization succeeded.
  fclose(out);
}
void tesseract::YOutlierPieces ( WERD_RES *  word,
int  rebuilt_blob_index,
int  super_y_bottom,
int  sub_y_top,
ScriptPos *  leading_pos,
int *  num_leading_outliers,
ScriptPos *  trailing_pos,
int *  num_trailing_outliers 
)

Given a recognized blob, see if a contiguous collection of sub-pieces (chopped blobs) starting at its left might qualify as being a subscript or superscript letter based only on y position. Also do this for the right side.

Definition at line 46 of file superscript.cpp.

                                                                         {
  // Any of the four output pointers may be null; null ones are redirected to
  // local scratch variables so the code below can write unconditionally.
  ScriptPos scratch_leading_pos, scratch_trailing_pos;
  int scratch_leading_count, scratch_trailing_count;
  if (!leading_pos) leading_pos = &scratch_leading_pos;
  if (!num_leading_outliers) num_leading_outliers = &scratch_leading_count;
  if (!trailing_pos) trailing_pos = &scratch_trailing_pos;
  if (!num_trailing_outliers) num_trailing_outliers = &scratch_trailing_count;

  // Defaults: no outliers on either side.
  *num_trailing_outliers = 0;
  *num_leading_outliers = 0;
  *trailing_pos = SP_NORMAL;
  *leading_pos = SP_NORMAL;

  int first_piece = LeadingUnicharsToChopped(word, rebuilt_blob_index);
  int piece_count = word->best_state[rebuilt_blob_index];
  ScriptPos prev_pos = SP_NORMAL;
  int run_length = 0;  // Length of the current run of same-position outliers.
  for (int piece = 0; piece < piece_count; piece++) {
    TBOX box = word->chopped_word->blobs[first_piece + piece]->bounding_box();

    // Classify the piece purely by its vertical extent.
    ScriptPos cur_pos;
    if (box.bottom() >= super_y_bottom) {
      cur_pos = SP_SUPERSCRIPT;
    } else if (box.top() <= sub_y_top) {
      cur_pos = SP_SUBSCRIPT;
    } else {
      cur_pos = SP_NORMAL;
    }

    if (cur_pos != SP_NORMAL) {
      // Extend the run if the position is unchanged, otherwise start afresh.
      run_length = (cur_pos == prev_pos) ? run_length + 1 : 1;
    } else {
      // A run that covers every piece seen so far is the leading run.
      if (run_length == piece) {
        *num_leading_outliers = run_length;
        *leading_pos = prev_pos;
      }
      run_length = 0;
    }
    prev_pos = cur_pos;
  }
  // Whatever run is still open after the last piece is the trailing run.
  *num_trailing_outliers = run_length;
  *trailing_pos = prev_pos;
}

Variable Documentation

const int tesseract::case_state_table[6][4]
Initial value:
 { {
                                  
    
                                  
      0, 1, 5, 4
    },
    {                            
      0, 3, 2, 4
    },
    {                            
      0, -1, 2, -1
    },
    {                            
      0, 3, -1, 4
    },
    {                            
      0, -1, -1, 4
    },
    {                            
      5, -1, 2, -1
    },
  }

Definition at line 35 of file context.cpp.

Definition at line 125 of file tablefind.cpp.

const double tesseract::kAlignedFraction = 0.03125

Definition at line 39 of file alignedblob.cpp.

const double tesseract::kAlignedGapFraction = 0.75

Definition at line 43 of file alignedblob.cpp.

Initial value:
 {
  "Left Aligned",
  "Left Ragged",
  "Center",
  "Right Aligned",
  "Right Ragged",
  "Separator"
}

Definition at line 515 of file tabvector.cpp.

const double tesseract::kAllowBlobArea = 0.05

Definition at line 61 of file tablefind.cpp.

const double tesseract::kAllowBlobHeight = 0.3

Definition at line 59 of file tablefind.cpp.

const double tesseract::kAllowBlobWidth = 0.4

Definition at line 60 of file tablefind.cpp.

const double tesseract::kAllowTextArea = 0.8

Definition at line 54 of file tablefind.cpp.

const double tesseract::kAllowTextHeight = 0.5

Definition at line 52 of file tablefind.cpp.

const double tesseract::kAllowTextWidth = 0.6

Definition at line 53 of file tablefind.cpp.

Initial value:
 {
  "'",       
  "`",       
  "\u2018",  
  "\u2019",  
  "\u2032",  
  NULL,      
}

Definition at line 48 of file unicodes.cpp.

const int tesseract::kBasicBufSize = 2048

Definition at line 26 of file pdfrenderer.cpp.

const double tesseract::kBigPartSizeRatio = 1.75

Definition at line 51 of file colpartitiongrid.cpp.

Definition at line 31 of file boxword.cpp.

Definition at line 78 of file strokewidth.cpp.

Definition at line 40 of file tablerecog.cpp.

Definition at line 39 of file tablerecog.cpp.

Definition at line 62 of file tabfind.cpp.

const double tesseract::kCJKAspectRatio = 1.25

Definition at line 72 of file strokewidth.cpp.

const double tesseract::kCJKAspectRatioIncrease = 1.0625

Definition at line 74 of file strokewidth.cpp.

Definition at line 68 of file strokewidth.cpp.

Definition at line 70 of file strokewidth.cpp.

const int tesseract::kCJKRadius = 2

Definition at line 66 of file strokewidth.cpp.

Pixel resolution of column width estimates.

Definition at line 51 of file tabfind.h.

const double tesseract::kCosMaxSkewAngle = 0.866025

Definition at line 81 of file tabfind.cpp.

const int tesseract::kCrackSpacing = 100

Spacing of cracks across the page to break up tall vertical lines.

Definition at line 45 of file linefind.cpp.

const ParagraphModel * tesseract::kCrownLeft = reinterpret_cast<ParagraphModel *>(0xDEAD111F)

Definition at line 45 of file paragraphs.cpp.

const ParagraphModel * tesseract::kCrownRight = reinterpret_cast<ParagraphModel *>(0xDEAD888F)

Definition at line 47 of file paragraphs.cpp.

Default resolution used if input is not believable.

Definition at line 60 of file pagesegmain.cpp.

const double tesseract::kDiacriticXPadRatio = 7.0

Definition at line 81 of file strokewidth.cpp.

const double tesseract::kDiacriticYPadRatio = 1.75

Definition at line 84 of file strokewidth.cpp.

const char tesseract::kDoNotReverse[] = "RRP_DO_NO_REVERSE"

Definition at line 44 of file trie.cpp.

Definition at line 31 of file degradeimage.cpp.

const float tesseract::kFontMergeDistance = 0.025

Definition at line 50 of file mastertrainer.cpp.

const char tesseract::kForceReverse[] = "RRP_FORCE_REVERSE"

Definition at line 46 of file trie.cpp.

Definition at line 58 of file tablerecog.cpp.

const double tesseract::kGoodRowNumberOfColumnsSmall[] = { 2, 2, 2, 2, 2, 3, 3 }

Definition at line 54 of file tablerecog.cpp.

Initial value:
 
    sizeof(kGoodRowNumberOfColumnsSmall) / sizeof(double) - 1

Definition at line 55 of file tablerecog.cpp.

Definition at line 38 of file tabvector.cpp.

Definition at line 40 of file tabvector.cpp.

const int tesseract::kHistogramSize = 256

Definition at line 27 of file otsuthr.h.

Definition at line 57 of file colfind.cpp.

const double tesseract::kHorizontalSpacing = 0.30

Definition at line 33 of file tablerecog.cpp.

Definition at line 74 of file colpartition.cpp.

Definition at line 70 of file colpartition.cpp.

Definition at line 72 of file colpartition.cpp.

Initial value:
 {
  "-",       
  "\u05BE",  
  "\u2010",  
  "\u2011",  
  "\u2012",  
  "\u2013",  
  "\u2014",  
  "\u2015",  
  "\u2212",  
  "\uFE58",  
  "\uFE63",  
  "\uFF0D",  
  NULL,      
}

Definition at line 32 of file unicodes.cpp.

const float tesseract::kInfiniteDist = 999.0f

Definition at line 907 of file mastertrainer.cpp.

Definition at line 110 of file tablefind.cpp.

Definition at line 112 of file tablefind.cpp.

Definition at line 64 of file colpartition.cpp.

Definition at line 88 of file equationdetect.cpp.

const double tesseract::kLineCountReciprocal = 4.0

Definition at line 51 of file tabvector.cpp.

Definition at line 43 of file tablerecog.cpp.

Definition at line 42 of file tablerecog.cpp.

Grid size used by line finder. Not very critical.

Definition at line 47 of file linefind.cpp.

Definition at line 56 of file tabfind.cpp.

Definition at line 107 of file strokewidth.cpp.

Definition at line 109 of file strokewidth.cpp.

const double tesseract::kLineResidueSizeRatio = 1.75

Definition at line 111 of file strokewidth.cpp.

Definition at line 100 of file strokewidth.cpp.

Definition at line 102 of file strokewidth.cpp.

const char * tesseract::kLRM = "\u200E"

Definition at line 27 of file unicodes.cpp.

const double tesseract::kMarginFactor = 1.1

Definition at line 48 of file tablerecog.cpp.

const double tesseract::kMarginOverlapFraction = 0.25

Definition at line 54 of file colfind.cpp.

const float tesseract::kMathDigitDensityTh1 = 0.25

Definition at line 83 of file equationdetect.cpp.

Definition at line 84 of file equationdetect.cpp.

Definition at line 85 of file equationdetect.cpp.

Definition at line 40 of file ambigs.cpp.

const double tesseract::kMaxBaselineError = 0.4375

Definition at line 77 of file colpartition.cpp.

Definition at line 80 of file tablefind.cpp.

const int tesseract::kMaxBlobWidth = 500

Definition at line 43 of file tablefind.cpp.

Definition at line 32 of file recogtraining.cpp.

Definition at line 69 of file tablefind.cpp.

Definition at line 43 of file colpartitiongrid.cpp.

Definition at line 60 of file fixxht.cpp.

Definition at line 62 of file pagesegmain.cpp.

Definition at line 76 of file strokewidth.cpp.

Definition at line 84 of file colpartition.cpp.

Definition at line 88 of file tablefind.cpp.

Definition at line 90 of file strokewidth.cpp.

Definition at line 93 of file strokewidth.cpp.

Definition at line 64 of file colfind.cpp.

Definition at line 47 of file tabvector.cpp.

Definition at line 72 of file tablefind.cpp.

Definition at line 51 of file tabfind.cpp.

const double tesseract::kMaxHorizontalGap = 3.0

Definition at line 64 of file tabfind.cpp.

Definition at line 52 of file colfind.cpp.

Definition at line 116 of file strokewidth.cpp.

Definition at line 44 of file ccnontextdetect.cpp.

Definition at line 35 of file ccnontextdetect.cpp.

Definition at line 58 of file colpartition.cpp.

Definition at line 60 of file colpartition.cpp.

const int tesseract::kMaxLigature = 0xfb4f

Definition at line 46 of file ligature_table.cpp.

Definition at line 53 of file linefind.cpp.

Definition at line 40 of file ccnontextdetect.cpp.

Definition at line 37 of file colpartitiongrid.cpp.

const double tesseract::kMaxNonLineDensity = 0.25

Definition at line 58 of file linefind.cpp.

const int tesseract::kMaxOffsetDist = 32

Definition at line 32 of file intfeaturemap.cpp.

const int tesseract::kMaxPadFactor = 6

Definition at line 34 of file colpartitiongrid.cpp.

Definition at line 134 of file tablefind.cpp.

const double tesseract::kMaxPartitionSpacing = 1.75

Definition at line 70 of file colpartitiongrid.cpp.

Definition at line 46 of file colpartition.cpp.

Definition at line 39 of file tabfind.cpp.

const int tesseract::kMaxRealDistance = 2.0

Definition at line 37 of file detlinefit.cpp.

Definition at line 46 of file imagefind.cpp.

Definition at line 49 of file imagefind.cpp.

Definition at line 81 of file colpartition.cpp.

const double tesseract::kMaxRowSize = 2.5

Definition at line 51 of file tablerecog.cpp.

Definition at line 54 of file colpartition.cpp.

const double tesseract::kMaxSizeRatio = 1.5

Definition at line 56 of file colpartition.cpp.

const int tesseract::kMaxSkewFactor = 15

Definition at line 65 of file alignedblob.cpp.

const double tesseract::kMaxSmallNeighboursPerPix = 1.0 / 32

Definition at line 32 of file ccnontextdetect.cpp.

const double tesseract::kMaxSpacingDrift = 1.0 / 72

Definition at line 48 of file colpartition.cpp.

const double tesseract::kMaxStaveHeight = 1.0

Definition at line 60 of file linefind.cpp.

const double tesseract::kMaxTableCellXheight = 2.0

Definition at line 84 of file tablefind.cpp.

Definition at line 72 of file tabfind.cpp.

const double tesseract::kMaxTopSpacingFraction = 0.25

Definition at line 51 of file colpartition.cpp.

Definition at line 48 of file mastertrainer.cpp.

Definition at line 38 of file tabfind.cpp.

Definition at line 41 of file tablefind.cpp.

Definition at line 144 of file tablefind.cpp.

const double tesseract::kMinAlignedGutter = 0.25

Definition at line 53 of file tabvector.cpp.

Definition at line 55 of file alignedblob.cpp.

const double tesseract::kMinBaselineCoverage = 0.5

Definition at line 79 of file colpartition.cpp.

Definition at line 66 of file tablefind.cpp.

Definition at line 47 of file colpartitiongrid.cpp.

const double tesseract::kMinCaptionGapRatio = 2.0

Definition at line 45 of file colpartitiongrid.cpp.

Definition at line 68 of file colpartition.cpp.

Definition at line 46 of file mastertrainer.cpp.

Definition at line 55 of file imagefind.cpp.

const int tesseract::kMinColumnWidth = 100

Definition at line 49 of file colfind.cpp.

Minimum believable resolution.

Definition at line 58 of file pagesegmain.cpp.

const double tesseract::kMinDiacriticSizeRatio = 1.0625

Definition at line 87 of file strokewidth.cpp.

Definition at line 69 of file tabfind.cpp.

const double tesseract::kMinFilledArea = 0.35

Definition at line 61 of file tablerecog.cpp.

Definition at line 45 of file tabfind.cpp.

const double tesseract::kMinGoodTextPARatio = 1.5

Definition at line 60 of file ccnontextdetect.cpp.

const double tesseract::kMinGutterFraction = 0.5

Definition at line 49 of file tabvector.cpp.

Definition at line 49 of file tabfind.cpp.

const double tesseract::kMinGutterWidthGrid = 0.5

Definition at line 61 of file colfind.cpp.

const double tesseract::kMinImageArea = 0.5

Definition at line 77 of file tabfind.cpp.

Definition at line 51 of file imagefind.cpp.

Definition at line 62 of file colpartition.cpp.

const int tesseract::kMinLigature = 0xfb00

Definition at line 45 of file ligature_table.cpp.

Denominator of resolution makes min pixels to demand line lengths to be.

Definition at line 43 of file linefind.cpp.

Definition at line 41 of file tabfind.cpp.

Definition at line 76 of file tablefind.cpp.

const double tesseract::kMinMusicPixelFraction = 0.75

Definition at line 62 of file linefind.cpp.

const double tesseract::kMinNonNoiseFraction = 0.5

Definition at line 59 of file colfind.cpp.

Definition at line 37 of file trainingsampleset.cpp.

const double tesseract::kMinOverlapWithTable = 0.6

Definition at line 100 of file tablefind.cpp.

Definition at line 140 of file tablefind.cpp.

const double tesseract::kMinPCLengthIncrease = 1.0 / 1024

Definition at line 33 of file intfeaturemap.cpp.

Definition at line 34 of file detlinefit.cpp.

const double tesseract::kMinRaggedGutter = 1.5

Definition at line 55 of file tabvector.cpp.

Definition at line 53 of file alignedblob.cpp.

const int tesseract::kMinRampSize = 1000

Definition at line 35 of file degradeimage.cpp.

const double tesseract::kMinRectangularFraction = 0.125

Definition at line 44 of file imagefind.cpp.

Definition at line 115 of file tablefind.cpp.

Definition at line 66 of file colpartition.cpp.

const double tesseract::kMinTabGradient = 4.0

Definition at line 61 of file alignedblob.cpp.

Definition at line 75 of file tabfind.cpp.

Definition at line 49 of file linefind.cpp.

Definition at line 37 of file tabfind.cpp.

Definition at line 105 of file strokewidth.cpp.

Definition at line 118 of file strokewidth.cpp.

const int tesseract::kNoisePadding = 4

Definition at line 51 of file ccnontextdetect.cpp.

const int tesseract::kNumEndPoints = 3

Definition at line 28 of file detlinefit.cpp.

Definition at line 36 of file tess_lang_model.h.

Definition at line 47 of file ccnontextdetect.cpp.

Definition at line 130 of file tablefind.cpp.

const char * tesseract::kPDF = "\u202C"

Definition at line 30 of file unicodes.cpp.

const double tesseract::kPhotoOffsetFraction = 0.375

Definition at line 54 of file ccnontextdetect.cpp.

const int tesseract::kPrime1 = 17

Definition at line 34 of file trainingsampleset.cpp.

const int tesseract::kPrime2 = 13

Definition at line 35 of file trainingsampleset.cpp.

const double tesseract::kRaggedFraction = 2.5

Definition at line 41 of file alignedblob.cpp.

const double tesseract::kRaggedGapFraction = 1.0

Definition at line 45 of file alignedblob.cpp.

Definition at line 53 of file tabfind.cpp.

Definition at line 35 of file trainingsample.cpp.

const double tesseract::kRatingEpsilon = 1.0 / 32

Definition at line 31 of file errorcounter.cpp.

const double tesseract::kRequiredColumns = 0.7

Definition at line 46 of file tablerecog.cpp.

Definition at line 120 of file tablefind.cpp.

const char tesseract::kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL"

Definition at line 45 of file trie.cpp.

const int tesseract::kRGBRMSColors = 4

Definition at line 36 of file colpartition.h.

const char * tesseract::kRLE = "\u202A"

Definition at line 29 of file unicodes.cpp.

const char * tesseract::kRLM = "\u200F"

Definition at line 28 of file unicodes.cpp.

const double tesseract::kRMSFitScaling = 8.0

Definition at line 53 of file imagefind.cpp.

const float tesseract::kRotationRange = 0.02f

Definition at line 29 of file degradeimage.cpp.

Definition at line 96 of file tablefind.cpp.

const int tesseract::kSaltnPepper = 5

Definition at line 33 of file degradeimage.cpp.

const int tesseract::kSearchRadius = 2

Definition at line 95 of file strokewidth.cpp.

Definition at line 87 of file equationdetect.cpp.

Definition at line 105 of file tablefind.cpp.

Definition at line 45 of file tabvector.cpp.

Definition at line 42 of file tabvector.cpp.

Definition at line 24 of file universalambigs.h.

const float tesseract::kSizeRatioToReject = 2.0

Definition at line 113 of file strokewidth.cpp.

Definition at line 109 of file tablefind.cpp.

Definition at line 73 of file colpartitiongrid.cpp.

const double tesseract::kSmoothFactor = 0.25

Definition at line 58 of file tabfind.cpp.

const double tesseract::kSplitPartitionSize = 2.0

Definition at line 47 of file tablefind.cpp.

const int tesseract::kSquareLimit = 25

Definition at line 32 of file trainingsampleset.cpp.

const int tesseract::kStateCnt = 4

Definition at line 35 of file tess_lang_model.h.

const double tesseract::kStrokeWidthCJK = 2.0

Definition at line 63 of file strokewidth.cpp.

Definition at line 55 of file colpartitiongrid.cpp.

Definition at line 148 of file tablefind.cpp.

Definition at line 62 of file strokewidth.cpp.

Allowed proportional change in stroke width to be the same font.

Definition at line 53 of file colpartitiongrid.cpp.

Allowed constant change in stroke width to be the same font. Really 1.5 pixels.

Definition at line 60 of file strokewidth.cpp.

Definition at line 92 of file tablefind.cpp.

Definition at line 35 of file tabfind.cpp.

const int tesseract::kTestChar = -1

Definition at line 30 of file trainingsampleset.cpp.

const char* tesseract::kTextordDebugPix = "psdebug_pix"

Definition at line 68 of file alignedblob.cpp.

const double tesseract::kThickLengthMultiple = 0.75

Definition at line 56 of file linefind.cpp.

Denominator of resolution makes max pixel width to allow thin lines.

Definition at line 41 of file linefind.cpp.

Definition at line 57 of file colpartitiongrid.cpp.

const float tesseract::kUnclearDensityTh = 0.25

Definition at line 86 of file equationdetect.cpp.

Definition at line 23 of file universalambigs.h.

const char * tesseract::kUTF8LineSeparator = "\u2028"

Definition at line 25 of file unicodes.cpp.

const char * tesseract::kUTF8ParagraphSeparator = "\u2029"

Definition at line 26 of file unicodes.cpp.

const double tesseract::kVerticalSpacing = -0.2

Definition at line 36 of file tablerecog.cpp.

Definition at line 47 of file alignedblob.cpp.

const int tesseract::kVLineGutter = 1

Definition at line 49 of file alignedblob.cpp.

const int tesseract::kVLineMinLength = 500

Definition at line 57 of file alignedblob.cpp.

const int tesseract::kVLineSearchSize = 150

Definition at line 51 of file alignedblob.cpp.

const char* const tesseract::RTLReversePolicyNames[]
Initial value:

Definition at line 48 of file trie.cpp.

"Paint table detection output"

Definition at line 151 of file tablefind.cpp.

"Show table regions"

Definition at line 152 of file tablefind.cpp.

"Fraction of height used as a minimum gap for aligned blobs."

Definition at line 86 of file tabfind.cpp.

"run table detection"

Definition at line 74 of file colfind.cpp.

"Force using vertical text page mode"

Definition at line 48 of file strokewidth.cpp.

"Only run stroke widths"

Definition at line 45 of file strokewidth.cpp.

"Show final block bounds"

Definition at line 73 of file colfind.cpp.

"Show stroke widths"

Definition at line 30 of file colpartitiongrid.cpp.

"Show column bounds"

Definition at line 72 of file colfind.cpp.

"Show tab vectors"

Definition at line 84 of file tabfind.cpp.

"Show partition bounds"

Definition at line 67 of file colfind.cpp.

"Show tab candidates"

Definition at line 83 of file tabfind.cpp.

"Show partition bounds, waiting if >1"

Definition at line 71 of file colfind.cpp.

"Show blobs rejected as noise"

Definition at line 69 of file colfind.cpp.

"Show stroke widths"

Definition at line 44 of file strokewidth.cpp.

"find horizontal lines such as headers in vertical page mode"

Definition at line 50 of file strokewidth.cpp.

"Enable vertical detection"

Definition at line 46 of file strokewidth.cpp.

"Fraction of textlines deemed vertical to use vertical page mode"

Definition at line 52 of file strokewidth.cpp.

"Enables the table recognizer for table layout and filtering."

Definition at line 158 of file tablefind.cpp.

"Debug table marking steps in detail"

Definition at line 154 of file tablefind.cpp.

"Show page stats used in table finding"

Definition at line 156 of file tablefind.cpp.

"Fraction of box matches required to declare a line vertical"

Definition at line 61 of file tabvector.cpp.

"max fraction of mean blob width allowed for vertical gaps in vertical text"

"Max fraction of mean blob width allowed for vertical gaps in vertical text"

Definition at line 58 of file tabvector.cpp.

Definition at line 51 of file ccutil.cpp.

 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines