tesseract
3.03
|
#include <tesseractclass.h>
Public Member Functions | ||||||||||
Tesseract () | ||||||||||
~Tesseract () | ||||||||||
void | Clear () | |||||||||
void | ResetAdaptiveClassifier () | |||||||||
void | ResetDocumentDictionary () | |||||||||
void | SetEquationDetect (EquationDetect *detector) | |||||||||
const FCOORD & | reskew () const | |||||||||
Pix ** | mutable_pix_binary () | |||||||||
Pix * | pix_binary () const | |||||||||
Pix * | pix_grey () const | |||||||||
void | set_pix_grey (Pix *grey_pix) | |||||||||
Pix * | BestPix () const | |||||||||
void | set_pix_thresholds (Pix *thresholds) | |||||||||
int | source_resolution () const | |||||||||
void | set_source_resolution (int ppi) | |||||||||
int | ImageWidth () const | |||||||||
int | ImageHeight () const | |||||||||
Pix * | scaled_color () const | |||||||||
int | scaled_factor () const | |||||||||
void | SetScaledColor (int factor, Pix *color) | |||||||||
const Textord & | textord () const | |||||||||
Textord * | mutable_textord () | |||||||||
bool | right_to_left () const | |||||||||
int | num_sub_langs () const | |||||||||
Tesseract * | get_sub_lang (int index) const | |||||||||
void | SetBlackAndWhitelist () | |||||||||
void | PrepareForPageseg () | |||||||||
void | PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr) | |||||||||
int | SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr) | |||||||||
void | SetupWordScripts (BLOCK_LIST *blocks) | |||||||||
int | AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, Tesseract *osd_tess, OSResults *osr) | |||||||||
ColumnFinder * | SetupPageSegAndDetectOrientation (bool single_column, bool osd, bool only_osd, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix) | |||||||||
void | PrerecAllWordsPar (const GenericVector< WordData > &words) | |||||||||
bool | ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass) | |||||||||
void | SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words) | |||||||||
void | SetupWordPassN (int pass_n, WordData *word) | |||||||||
bool | RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, GenericVector< WordData > *words) | |||||||||
bool | recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses) | |||||||||
void | rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config) | |||||||||
void | bigram_correction_pass (PAGE_RES *page_res) | |||||||||
void | blamer_pass (PAGE_RES *page_res) | |||||||||
void | script_pos_pass (PAGE_RES *page_res) | |||||||||
bool | RetryWithLanguage (const WERD_RES &best_word, WordData *word_data, WERD_RES *word, WordRecognizer recognizer) | |||||||||
void | classify_word_and_language (WordRecognizer recognizer, WordData *word_data) | |||||||||
void | classify_word_pass1 (WordData *word_data, WERD_RES *word) | |||||||||
void | recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box) | |||||||||
void | fix_rep_char (PAGE_RES_IT *page_res_it) | |||||||||
void | ExplodeRepeatedWord (BLOB_CHOICE *best_choice, PAGE_RES_IT *page_res_it) | |||||||||
ACCEPTABLE_WERD_TYPE | acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths) | |||||||||
void | match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block) | |||||||||
void | classify_word_pass2 (WordData *word_data, WERD_RES *word) | |||||||||
void | ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word) | |||||||||
bool | RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row) | |||||||||
bool | TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row) | |||||||||
BOOL8 | recog_interactive (BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
void | set_word_fonts (WERD_RES *word) | |||||||||
void | font_recognition_pass (PAGE_RES *page_res) | |||||||||
BOOL8 | check_debug_pt (WERD_RES *word, int location) | |||||||||
bool | SubAndSuperscriptFix (WERD_RES *word_res) | |||||||||
void | GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold) | |||||||||
WERD_RES * | TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing) | |||||||||
bool | BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const | |||||||||
bool | init_cube_objects (bool load_combiner, TessdataManager *tessdata_manager) | |||||||||
void | run_cube_combiner (PAGE_RES *page_res) | |||||||||
void | cube_word_pass1 (BLOCK *block, ROW *row, WERD_RES *word) | |||||||||
CubeObject * | cube_recognize_word (BLOCK *block, WERD_RES *word) | |||||||||
void | cube_combine_word (CubeObject *cube_obj, WERD_RES *cube_word, WERD_RES *tess_word) | |||||||||
bool | cube_recognize (CubeObject *cube_obj, BLOCK *block, WERD_RES *word) | |||||||||
void | fill_werd_res (const BoxWord &cube_box_word, const char *cube_best_str, WERD_RES *tess_werd_res) | |||||||||
bool | extract_cube_state (CubeObject *cube_obj, int *num_chars, Boxa **char_boxes, CharSamp ***char_samples) | |||||||||
bool | create_cube_box_word (Boxa *char_boxes, int num_chars, TBOX word_box, BoxWord *box_word) | |||||||||
void | output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box) | |||||||||
void | write_results (PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol) | |||||||||
void | set_unlv_suspects (WERD_RES *word) | |||||||||
UNICHAR_ID | get_rep_char (WERD_RES *word) | |||||||||
BOOL8 | acceptable_number_string (const char *s, const char *lengths) | |||||||||
inT16 | count_alphanums (const WERD_CHOICE &word) | |||||||||
inT16 | count_alphas (const WERD_CHOICE &word) | |||||||||
void | read_config_file (const char *filename, SetParamConstraint constraint) | |||||||||
int | init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params) | |||||||||
int | init_tesseract (const char *datapath, const char *language, OcrEngineMode oem) | |||||||||
int | init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params) | |||||||||
void | SetupUniversalFontIds () | |||||||||
int | init_tesseract_lm (const char *arg0, const char *textbase, const char *language) | |||||||||
void | recognize_page (STRING &image_name) | |||||||||
void | end_tesseract () | |||||||||
bool | init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params) | |||||||||
void | ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load) | |||||||||
SVMenuNode * | build_menu_new () | |||||||||
void | pgeditor_main (int width, int height, PAGE_RES *page_res) | |||||||||
void | process_image_event (const SVEvent &event) | |||||||||
BOOL8 | process_cmd_win_event (inT32 cmd_event, char *new_value) | |||||||||
void | debug_word (PAGE_RES *page_res, const TBOX &selection_box) | |||||||||
void | do_re_display (BOOL8(tesseract::Tesseract::*word_painter)(BLOCK *block, ROW *row, WERD_RES *word_res)) | |||||||||
BOOL8 | word_display (BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
BOOL8 | word_bln_display (BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
BOOL8 | word_blank_and_set_display (BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
BOOL8 | word_set_display (BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
BOOL8 | word_dumper (BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
void | blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box) | |||||||||
void | make_reject_map (WERD_RES *word, ROW *row, inT16 pass) | |||||||||
BOOL8 | one_ell_conflict (WERD_RES *word_res, BOOL8 update_map) | |||||||||
inT16 | first_alphanum_index (const char *word, const char *word_lengths) | |||||||||
inT16 | first_alphanum_offset (const char *word, const char *word_lengths) | |||||||||
inT16 | alpha_count (const char *word, const char *word_lengths) | |||||||||
BOOL8 | word_contains_non_1_digit (const char *word, const char *word_lengths) | |||||||||
void | dont_allow_1Il (WERD_RES *word) | |||||||||
inT16 | count_alphanums (WERD_RES *word) | |||||||||
void | flip_0O (WERD_RES *word) | |||||||||
BOOL8 | non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id) | |||||||||
BOOL8 | non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id) | |||||||||
BOOL8 | repeated_nonalphanum_wd (WERD_RES *word, ROW *row) | |||||||||
void | nn_match_word (WERD_RES *word, ROW *row) | |||||||||
void | nn_recover_rejects (WERD_RES *word, ROW *row) | |||||||||
void | set_done (WERD_RES *word, inT16 pass) | |||||||||
inT16 | safe_dict_word (const WERD_RES *werd_res) | |||||||||
void | flip_hyphens (WERD_RES *word) | |||||||||
void | reject_I_1_L (WERD_RES *word) | |||||||||
void | reject_edge_blobs (WERD_RES *word) | |||||||||
void | reject_mostly_rejects (WERD_RES *word) | |||||||||
BOOL8 | word_adaptable (WERD_RES *word, uinT16 mode) | |||||||||
void | recog_word_recursive (WERD_RES *word) | |||||||||
void | recog_word (WERD_RES *word) | |||||||||
void | split_and_recog_word (WERD_RES *word) | |||||||||
void | split_word (WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const | |||||||||
void | join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const | |||||||||
void | match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block) | |||||||||
inT16 | fp_eval_word_spacing (WERD_RES_LIST &word_res_list) | |||||||||
void | dump_words (WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved) | |||||||||
BOOL8 | fixspace_thinks_word_done (WERD_RES *word) | |||||||||
GARBAGE_LEVEL | garbage_word (WERD_RES *word, BOOL8 ok_dict_word) | |||||||||
BOOL8 | potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word) | |||||||||
void | tilde_crunch (PAGE_RES_IT &page_res_it) | |||||||||
void | unrej_good_quality_words (PAGE_RES_IT &page_res_it) | |||||||||
void | doc_and_block_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc) | |||||||||
void | quality_based_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc) | |||||||||
void | convert_bad_unlv_chs (WERD_RES *word_res) | |||||||||
void | tilde_delete (PAGE_RES_IT &page_res_it) | |||||||||
inT16 | word_blob_quality (WERD_RES *word, ROW *row) | |||||||||
void | word_char_quality (WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count) | |||||||||
void | unrej_good_chs (WERD_RES *word, ROW *row) | |||||||||
inT16 | count_outline_errs (char c, inT16 outline_count) | |||||||||
inT16 | word_outline_errs (WERD_RES *word) | |||||||||
BOOL8 | terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level) | |||||||||
CRUNCH_MODE | word_deletable (WERD_RES *word, inT16 &delete_mode) | |||||||||
inT16 | failure_count (WERD_RES *word) | |||||||||
BOOL8 | noise_outlines (TWERD *word) | |||||||||
void | process_selected_words (PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(BLOCK *block, ROW *row, WERD_RES *word_res)) | |||||||||
void | tess_segment_pass_n (int pass_n, WERD_RES *word) | |||||||||
PAGE_RES * | ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list) | |||||||||
PAGE_RES * | SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list) | |||||||||
void | MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
bool | ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text) | |||||||||
bool | ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text) | |||||||||
void | ReSegmentByClassification (PAGE_RES *page_res) | |||||||||
bool | ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids) | |||||||||
bool | FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res) | |||||||||
void | SearchForText (const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation) | |||||||||
void | TidyUp (PAGE_RES *page_res) | |||||||||
void | ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg) | |||||||||
void | CorrectClassifyWords (PAGE_RES *page_res) | |||||||||
void | ApplyBoxTraining (const STRING &filename, PAGE_RES *page_res) | |||||||||
int | CountMisfitTops (WERD_RES *word_res) | |||||||||
float | ComputeCompatibleXheight (WERD_RES *word_res) | |||||||||
FILE * | init_recog_training (const STRING &fname) | |||||||||
void | recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file) | |||||||||
void | ambigs_classify_and_output (WERD_RES *werd_res, ROW_RES *row_res, BLOCK_RES *block_res, const char *label, FILE *output_file) | |||||||||
CubeRecoContext * | GetCubeRecoContext () | |||||||||
eval_word_spacing() | ||||||||||
The basic measure is the number of characters in contextually confirmed words (i.e. the word is done). If all words are contextually confirmed the evaluation is deemed perfect. Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred. The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character on the other side of the space. Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined. The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2. | ||||||||||
BOOL8 | digit_or_numeric_punct (WERD_RES *word, int char_position) | |||||||||
inT16 | eval_word_spacing (WERD_RES_LIST &word_res_list) | |||||||||
fix_sp_fp_word() | ||||||||||
Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words. | ||||||||||
void | fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) | |||||||||
void | fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) | |||||||||
inT16 | worst_noise_blob (WERD_RES *word_res, float *worst_noise_score) | |||||||||
float | blob_noise_score (TBLOB *blob) | |||||||||
void | break_noisiest_blob_word (WERD_RES_LIST &words) | |||||||||
fix_fuzzy_spaces() | ||||||||||
Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.
| ||||||||||
void | fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) | |||||||||
void | fix_fuzzy_spaces (ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res) | |||||||||
tess_add_doc_word | ||||||||||
Add the given word to the document dictionary. | ||||||||||
void | tess_add_doc_word (WERD_CHOICE *word_choice) | |||||||||
tess_acceptable_word | ||||||||||
| ||||||||||
bool | tess_acceptable_word (WERD_RES *word) | |||||||||
Public Attributes | ||||||||||
bool | tessedit_resegment_from_boxes = false | |||||||||
bool | tessedit_resegment_from_line_boxes = false | |||||||||
bool | tessedit_train_from_boxes = false | |||||||||
bool | tessedit_make_boxes_from_boxes = false | |||||||||
bool | tessedit_dump_pageseg_images = false | |||||||||
int | tessedit_pageseg_mode = PSM_SINGLE_BLOCK | |||||||||
int | tessedit_ocr_engine_mode = tesseract::OEM_TESSERACT_ONLY | |||||||||
char * | tessedit_char_blacklist = "" | |||||||||
char * | tessedit_char_whitelist = "" | |||||||||
bool | tessedit_ambigs_training = false | |||||||||
int | pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT | |||||||||
int | ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT | |||||||||
char * | tessedit_write_params_to_file = "" | |||||||||
bool | tessedit_adaption_debug = false | |||||||||
int | bidi_debug = 0 | |||||||||
int | applybox_debug = 1 | |||||||||
int | applybox_page = 0 | |||||||||
char * | applybox_exposure_pattern = ".exp" | |||||||||
bool | applybox_learn_chars_and_char_frags_mode = false | |||||||||
bool | applybox_learn_ngrams_mode = false | |||||||||
bool | tessedit_display_outwords = false | |||||||||
bool | tessedit_training_tess = false | |||||||||
bool | tessedit_dump_choices = false | |||||||||
bool | tessedit_timing_debug = false | |||||||||
bool | tessedit_fix_fuzzy_spaces = true | |||||||||
bool | tessedit_unrej_any_wd = false | |||||||||
bool | tessedit_fix_hyphens = true | |||||||||
bool | tessedit_redo_xheight = true | |||||||||
bool | tessedit_enable_doc_dict = true | |||||||||
bool | tessedit_debug_fonts = false | |||||||||
bool | tessedit_debug_block_rejection = false | |||||||||
bool | tessedit_enable_bigram_correction = true | |||||||||
int | tessedit_bigram_debug = 0 | |||||||||
int | debug_x_ht_level = 0 | |||||||||
bool | debug_acceptable_wds = false | |||||||||
char * | chs_leading_punct = "('`\"" | |||||||||
char * | chs_trailing_punct1 = ").,;:?!" | |||||||||
char * | chs_trailing_punct2 = ")'`\"" | |||||||||
double | quality_rej_pc = 0.08 | |||||||||
double | quality_blob_pc = 0.0 | |||||||||
double | quality_outline_pc = 1.0 | |||||||||
double | quality_char_pc = 0.95 | |||||||||
int | quality_min_initial_alphas_reqd = 2 | |||||||||
int | tessedit_tess_adaption_mode = 0x27 | |||||||||
bool | tessedit_minimal_rej_pass1 = false | |||||||||
bool | tessedit_test_adaption = false | |||||||||
bool | tessedit_matcher_log = false | |||||||||
int | tessedit_test_adaption_mode = 3 | |||||||||
bool | test_pt = false | |||||||||
double | test_pt_x = 99999.99 | |||||||||
double | test_pt_y = 99999.99 | |||||||||
int | paragraph_debug_level = 0 | |||||||||
bool | paragraph_text_based = true | |||||||||
int | cube_debug_level = 1 | |||||||||
char * | outlines_odd = "%| " | |||||||||
char * | outlines_2 = "ij!?%\":;" | |||||||||
bool | docqual_excuse_outline_errs = false | |||||||||
bool | tessedit_good_quality_unrej = true | |||||||||
bool | tessedit_use_reject_spaces = true | |||||||||
double | tessedit_reject_doc_percent = 65.00 | |||||||||
double | tessedit_reject_block_percent = 45.00 | |||||||||
double | tessedit_reject_row_percent = 40.00 | |||||||||
double | tessedit_whole_wd_rej_row_percent = 70.00 | |||||||||
bool | tessedit_preserve_blk_rej_perfect_wds = true | |||||||||
bool | tessedit_preserve_row_rej_perfect_wds = true | |||||||||
bool | tessedit_dont_blkrej_good_wds = false | |||||||||
bool | tessedit_dont_rowrej_good_wds = false | |||||||||
int | tessedit_preserve_min_wd_len = 2 | |||||||||
bool | tessedit_row_rej_good_docs = true | |||||||||
double | tessedit_good_doc_still_rowrej_wd = 1.1 | |||||||||
bool | tessedit_reject_bad_qual_wds = true | |||||||||
bool | tessedit_debug_doc_rejection = false | |||||||||
bool | tessedit_debug_quality_metrics = false | |||||||||
bool | bland_unrej = false | |||||||||
double | quality_rowrej_pc = 1.1 | |||||||||
bool | unlv_tilde_crunching = true | |||||||||
bool | crunch_early_merge_tess_fails = true | |||||||||
bool | crunch_early_convert_bad_unlv_chs = false | |||||||||
double | crunch_terrible_rating = 80.0 | |||||||||
bool | crunch_terrible_garbage = true | |||||||||
double | crunch_poor_garbage_cert = -9.0 | |||||||||
double | crunch_poor_garbage_rate = 60 | |||||||||
double | crunch_pot_poor_rate = 40 | |||||||||
double | crunch_pot_poor_cert = -8.0 | |||||||||
bool | crunch_pot_garbage = true | |||||||||
double | crunch_del_rating = 60 | |||||||||
double | crunch_del_cert = -10.0 | |||||||||
double | crunch_del_min_ht = 0.7 | |||||||||
double | crunch_del_max_ht = 3.0 | |||||||||
double | crunch_del_min_width = 3.0 | |||||||||
double | crunch_del_high_word = 1.5 | |||||||||
double | crunch_del_low_word = 0.5 | |||||||||
double | crunch_small_outlines_size = 0.6 | |||||||||
int | crunch_rating_max = 10 | |||||||||
int | crunch_pot_indicators = 1 | |||||||||
bool | crunch_leave_ok_strings = true | |||||||||
bool | crunch_accept_ok = true | |||||||||
bool | crunch_leave_accept_strings = false | |||||||||
bool | crunch_include_numerals = false | |||||||||
int | crunch_leave_lc_strings = 4 | |||||||||
int | crunch_leave_uc_strings = 4 | |||||||||
int | crunch_long_repetitions = 3 | |||||||||
int | crunch_debug = 0 | |||||||||
int | fixsp_non_noise_limit = 1 | |||||||||
double | fixsp_small_outlines_size = 0.28 | |||||||||
bool | tessedit_prefer_joined_punct = false | |||||||||
int | fixsp_done_mode = 1 | |||||||||
int | debug_fix_space_level = 0 | |||||||||
char * | numeric_punctuation = ".," | |||||||||
int | x_ht_acceptance_tolerance = 8 | |||||||||
int | x_ht_min_change = 8 | |||||||||
int | superscript_debug = 0 | |||||||||
double | superscript_worse_certainty = 2.0 | |||||||||
double | superscript_bettered_certainty = 0.97 | |||||||||
double | superscript_scaledown_ratio = 0.4 | |||||||||
double | subscript_max_y_top = 0.5 | |||||||||
double | superscript_min_y_bottom = 0.3 | |||||||||
bool | tessedit_write_block_separators = false | |||||||||
bool | tessedit_write_rep_codes = false | |||||||||
bool | tessedit_write_unlv = false | |||||||||
bool | tessedit_create_hocr = false | |||||||||
bool | tessedit_create_pdf = false | |||||||||
char * | unrecognised_char = "|" | |||||||||
int | suspect_level = 99 | |||||||||
int | suspect_space_level = 100 | |||||||||
int | suspect_short_words = 2 | |||||||||
bool | suspect_constrain_1Il = false | |||||||||
double | suspect_rating_per_ch = 999.9 | |||||||||
double | suspect_accept_rating = -999.9 | |||||||||
bool | tessedit_minimal_rejection = false | |||||||||
bool | tessedit_zero_rejection = false | |||||||||
bool | tessedit_word_for_word = false | |||||||||
bool | tessedit_zero_kelvin_rejection = false | |||||||||
bool | tessedit_consistent_reps = true | |||||||||
int | tessedit_reject_mode = 0 | |||||||||
bool | tessedit_rejection_debug = false | |||||||||
bool | tessedit_flip_0O = true | |||||||||
double | tessedit_lower_flip_hyphen = 1.5 | |||||||||
double | tessedit_upper_flip_hyphen = 1.8 | |||||||||
bool | rej_trust_doc_dawg = false | |||||||||
bool | rej_1Il_use_dict_word = false | |||||||||
bool | rej_1Il_trust_permuter_type = true | |||||||||
bool | rej_use_tess_accepted = true | |||||||||
bool | rej_use_tess_blanks = true | |||||||||
bool | rej_use_good_perm = true | |||||||||
bool | rej_use_sensible_wd = false | |||||||||
bool | rej_alphas_in_number_perm = false | |||||||||
double | rej_whole_of_mostly_reject_word_fract = 0.85 | |||||||||
int | tessedit_image_border = 2 | |||||||||
char * | ok_repeated_ch_non_alphanum_wds = "-?*\075" | |||||||||
char * | conflict_set_I_l_1 = "Il1[]" | |||||||||
int | min_sane_x_ht_pixels = 8 | |||||||||
bool | tessedit_create_boxfile = false | |||||||||
int | tessedit_page_number = -1 | |||||||||
bool | tessedit_write_images = false | |||||||||
bool | interactive_display_mode = false | |||||||||
char * | file_type = ".tif" | |||||||||
bool | tessedit_override_permuter = true | |||||||||
int | tessdata_manager_debug_level = 0 | |||||||||
char * | tessedit_load_sublangs = "" | |||||||||
bool | tessedit_use_primary_params_model = false | |||||||||
double | min_orientation_margin = 7.0 | |||||||||
bool | textord_tabfind_show_vlines = false | |||||||||
bool | textord_use_cjk_fp_model = FALSE | |||||||||
bool | poly_allow_detailed_fx = false | |||||||||
bool | tessedit_init_config_only = false | |||||||||
bool | textord_equation_detect = false | |||||||||
int | tessedit_parallelize = 0 | |||||||||
int | tessedit_ok_mode = 5 | |||||||||
bool | load_fixed_length_dawgs = true | |||||||||
int | segment_debug = 0 | |||||||||
bool | permute_debug = 0 | |||||||||
double | bestrate_pruning_factor = 2.0 | |||||||||
bool | permute_script_word = 0 | |||||||||
bool | segment_segcost_rating = 0 | |||||||||
double | segment_reward_script = 0.95 | |||||||||
bool | permute_fixed_length_dawg = 0 | |||||||||
bool | permute_chartype_word = 0 | |||||||||
double | segment_reward_chartype = 0.97 | |||||||||
double | segment_reward_ngram_best_choice = 0.99 | |||||||||
bool | ngram_permuter_activated = false | |||||||||
bool | permute_only_top = false | |||||||||
int | language_model_fixed_length_choices_depth = 3 | |||||||||
bool | use_new_state_cost = FALSE | |||||||||
double | heuristic_segcost_rating_base = 1.25 | |||||||||
double | heuristic_weight_rating = 1 | |||||||||
double | heuristic_weight_width = 1000.0 | |||||||||
double | heuristic_weight_seamcut = 0 | |||||||||
double | heuristic_max_char_wh_ratio = 2.0 | |||||||||
bool | enable_new_segsearch = false | |||||||||
double | segsearch_max_fixed_pitch_char_wh_ratio = 2.0 |
Definition at line 153 of file tesseractclass.h.
Definition at line 37 of file tesseractclass.cpp.
: BOOL_MEMBER(tessedit_resegment_from_boxes, false, "Take segmentation and labeling from box file", this->params()), BOOL_MEMBER(tessedit_resegment_from_line_boxes, false, "Conversion of word/line box file to char box file", this->params()), BOOL_MEMBER(tessedit_train_from_boxes, false, "Generate training data from boxed chars", this->params()), BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, "Generate more boxes from boxed chars", this->params()), BOOL_MEMBER(tessedit_dump_pageseg_images, false, "Dump intermediate images made during page segmentation", this->params()), // The default for pageseg_mode is the old behaviour, so as not to // upset anything that relies on that. INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK, "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," " 5=line, 6=word, 7=char" " (Values from PageSegMode enum in publictypes.h)", this->params()), INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY, "Which OCR engine(s) to run (Tesseract, Cube, both)." " Defaults to loading and running only Tesseract" " (no Cube,no combiner)." 
" Values from OcrEngineMode enum in tesseractclass.h)", this->params()), STRING_MEMBER(tessedit_char_blacklist, "", "Blacklist of chars not to recognize", this->params()), STRING_MEMBER(tessedit_char_whitelist, "", "Whitelist of chars to recognize", this->params()), BOOL_MEMBER(tessedit_ambigs_training, false, "Perform training for ambiguities", this->params()), INT_MEMBER(pageseg_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT, "Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation.", this->params()), INT_MEMBER(ocr_devanagari_split_strategy, tesseract::ShiroRekhaSplitter::NO_SPLIT, "Whether to use the top-line splitting process for Devanagari " "documents while performing ocr.", this->params()), STRING_MEMBER(tessedit_write_params_to_file, "", "Write all parameters to the given file.", this->params()), BOOL_MEMBER(tessedit_adaption_debug, false, "Generate and print debug" " information for adaption", this->params()), INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()), INT_MEMBER(applybox_debug, 1, "Debug level", this->params()), INT_MEMBER(applybox_page, 0, "Page number to apply boxes from", this->params()), STRING_MEMBER(applybox_exposure_pattern, ".exp", "Exposure value follows" " this pattern in the image filename. The name of the image" " files are expected to be in the form" " [lang].[fontname].exp[num].tif", this->params()), BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false, "Learn both character fragments (as is done in the" " special low exposure mode) as well as unfragmented" " characters.", this->params()), BOOL_MEMBER(applybox_learn_ngrams_mode, false, "Each bounding box" " is assumed to contain ngrams. 
Only learn the ngrams" " whose outlines overlap horizontally.", this->params()), BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words", this->params()), BOOL_MEMBER(tessedit_training_tess, false, "Call Tess to learn blobs", this->params()), BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices", this->params()), BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", this->params()), BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces", this->params()), BOOL_MEMBER(tessedit_unrej_any_wd, false, "Dont bother with word plausibility", this->params()), BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?", this->params()), BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height", this->params()), BOOL_MEMBER(tessedit_enable_doc_dict, true, "Add words to the document dictionary", this->params()), BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char", this->params()), BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats", this->params()), BOOL_MEMBER(tessedit_enable_bigram_correction, true, "Enable correction based on the word bigram dictionary.", this->params()), INT_MEMBER(tessedit_bigram_debug, 0, "Amount of debug output for bigram correction.", this->params()), INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()), BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk", this->params()), STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation", this->params()), STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation", this->params()), STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation", this->params()), double_MEMBER(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit", this->params()), double_MEMBER(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit", this->params()), double_MEMBER(quality_outline_pc, 1.0, "good_quality_doc lte outline error limit", this->params()), 
double_MEMBER(quality_char_pc, 0.95, "good_quality_doc gte good char limit", this->params()), INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word", this->params()), INT_MEMBER(tessedit_tess_adaption_mode, 0x27, "Adaptation decision algorithm for tess", this->params()), BOOL_MEMBER(tessedit_minimal_rej_pass1, false, "Do minimal rejection on pass 1 output", this->params()), BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria", this->params()), BOOL_MEMBER(tessedit_matcher_log, false, "Log matcher activity", this->params()), INT_MEMBER(tessedit_test_adaption_mode, 3, "Adaptation decision algorithm for tess", this->params()), BOOL_MEMBER(test_pt, false, "Test for point", this->params()), double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()), double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()), INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", this->params()), BOOL_MEMBER(paragraph_text_based, true, "Run paragraph detection on the post-text-recognition " "(more accurate)", this->params()), INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()), STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", this->params()), STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines", this->params()), BOOL_MEMBER(docqual_excuse_outline_errs, false, "Allow outline errs in unrejection?", this->params()), BOOL_MEMBER(tessedit_good_quality_unrej, true, "Reduce rejection on good docs", this->params()), BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?", this->params()), double_MEMBER(tessedit_reject_doc_percent, 65.00, "%rej allowed before rej whole doc", this->params()), double_MEMBER(tessedit_reject_block_percent, 45.00, "%rej allowed before rej whole block", this->params()), double_MEMBER(tessedit_reject_row_percent, 40.00, "%rej allowed before rej whole row", this->params()), double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00, "Number of row rejects in 
whole word rejects" "which prevents whole row rejection", this->params()), BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true, "Only rej partially rejected words in block rejection", this->params()), BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true, "Only rej partially rejected words in row rejection", this->params()), BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, "Use word segmentation quality metric", this->params()), BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, "Use word segmentation quality metric", this->params()), INT_MEMBER(tessedit_preserve_min_wd_len, 2, "Only preserve wds longer than this", this->params()), BOOL_MEMBER(tessedit_row_rej_good_docs, true, "Apply row rejection to good docs", this->params()), double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1, "rej good doc wd if more than this fraction rejected", this->params()), BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, "Reject all bad quality wds", this->params()), BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats", this->params()), BOOL_MEMBER(tessedit_debug_quality_metrics, false, "Output data to debug file", this->params()), BOOL_MEMBER(bland_unrej, false, "unrej potential with no chekcs", this->params()), double_MEMBER(quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit", this->params()), BOOL_MEMBER(unlv_tilde_crunching, true, "Mark v.bad words for tilde crunch", this->params()), BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", this->params()), BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?", this->params()), double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this", this->params()), BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()), double_MEMBER(crunch_poor_garbage_cert, -9.0, "crunch garbage cert lt this", this->params()), double_MEMBER(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this", this->params()), double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL 
crunch rating lt this", this->params()), double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this", this->params()), BOOL_MEMBER(crunch_pot_garbage, true, "POTENTIAL crunch garbage", this->params()), double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this", this->params()), double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this", this->params()), double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this", this->params()), double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this", this->params()), double_MEMBER(crunch_del_min_width, 3.0, "Del if word width lt xht x this", this->params()), double_MEMBER(crunch_del_high_word, 1.5, "Del if word gt xht x this above bl", this->params()), double_MEMBER(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl", this->params()), double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this", this->params()), INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch", this->params()), INT_MEMBER(crunch_pot_indicators, 1, "How many potential indicators needed", this->params()), BOOL_MEMBER(crunch_leave_ok_strings, true, "Dont touch sensible strings", this->params()), BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring", this->params()), BOOL_MEMBER(crunch_leave_accept_strings, false, "Dont pot crunch sensible strings", this->params()), BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures", this->params()), INT_MEMBER(crunch_leave_lc_strings, 4, "Dont crunch words with long lower case strings", this->params()), INT_MEMBER(crunch_leave_uc_strings, 4, "Dont crunch words with long lower case strings", this->params()), INT_MEMBER(crunch_long_repetitions, 3, "Crunch words with long repetitions", this->params()), INT_MEMBER(crunch_debug, 0, "As it says", this->params()), INT_MEMBER(fixsp_non_noise_limit, 1, "How many non-noise blbs either side?", this->params()), double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if 
lt xht x this", this->params()), BOOL_MEMBER(tessedit_prefer_joined_punct, false, "Reward punctation joins", this->params()), INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing", this->params()), INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug", this->params()), STRING_MEMBER(numeric_punctuation, ".,", "Punct. chs expected WITHIN numbers", this->params()), INT_MEMBER(x_ht_acceptance_tolerance, 8, "Max allowed deviation of blob top outside of font data", this->params()), INT_MEMBER(x_ht_min_change, 8, "Min change in xht before actually trying it", this->params()), INT_MEMBER(superscript_debug, 0, "Debug level for sub & superscript fixer", this->params()), double_MEMBER(superscript_worse_certainty, 2.0, "How many times worse " "certainty does a superscript position glyph need to be for " "us to try classifying it as a char with a different " "baseline?", this->params()), double_MEMBER(superscript_bettered_certainty, 0.97, "What reduction in " "badness do we think sufficient to choose a superscript " "over what we'd thought. For example, a value of 0.6 means " "we want to reduce badness of certainty by at least 40%", this->params()), double_MEMBER(superscript_scaledown_ratio, 0.4, "A superscript scaled down more than this is unbelievably " "small. 
For example, 0.3 means we expect the font size to " "be no smaller than 30% of the text line font size.", this->params()), double_MEMBER(subscript_max_y_top, 0.5, "Maximum top of a character measured as a multiple of " "x-height above the baseline for us to reconsider whether " "it's a subscript.", this->params()), double_MEMBER(superscript_min_y_bottom, 0.3, "Minimum bottom of a character measured as a multiple of " "x-height above the baseline for us to reconsider whether " "it's a superscript.", this->params()), BOOL_MEMBER(tessedit_write_block_separators, false, "Write block separators in output", this->params()), BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code", this->params()), BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file", this->params()), BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", this->params()), BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", this->params()), STRING_MEMBER(unrecognised_char, "|", "Output char for unidentified blobs", this->params()), INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()), INT_MEMBER(suspect_space_level, 100, "Min suspect level for rejecting spaces", this->params()), INT_MEMBER(suspect_short_words, 2, "Dont Suspect dict wds longer than this", this->params()), BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected", this->params()), double_MEMBER(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit", this->params()), double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit", this->params()), BOOL_MEMBER(tessedit_minimal_rejection, false, "Only reject tess failures", this->params()), BOOL_MEMBER(tessedit_zero_rejection, false, "Dont reject ANYTHING", this->params()), BOOL_MEMBER(tessedit_word_for_word, false, "Make output have exactly one word per WERD", this->params()), BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, "Dont reject ANYTHING AT ALL", this->params()), 
BOOL_MEMBER(tessedit_consistent_reps, true, "Force all rep chars the same", this->params()), INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params()), BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug", this->params()), BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips", this->params()), double_MEMBER(tessedit_lower_flip_hyphen, 1.5, "Aspect ratio dot/hyphen test", this->params()), double_MEMBER(tessedit_upper_flip_hyphen, 1.8, "Aspect ratio dot/hyphen test", this->params()), BOOL_MEMBER(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector", this->params()), BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test", this->params()), BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Dont double check", this->params()), BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control", this->params()), BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control", this->params()), BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control", this->params()), BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check", this->params()), BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check", this->params()), double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract", this->params()), INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit", this->params()), STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075", "Allow NN to unrej", this->params()), STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set", this->params()), INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this", this->params()), BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes", this->params()), INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages" " , else specifc page to process", this->params()), BOOL_MEMBER(tessedit_write_images, false, "Capture the image from the IPE", this->params()), BOOL_MEMBER(interactive_display_mode, false, "Run 
interactively?", this->params()), STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()), BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word", this->params()), INT_MEMBER(tessdata_manager_debug_level, 0, "Debug level for" " TessdataManager functions.", this->params()), STRING_MEMBER(tessedit_load_sublangs, "", "List of languages to load with this one", this->params()), BOOL_MEMBER(tessedit_use_primary_params_model, false, "In multilingual mode use params model of the" " primary language", this->params()), double_MEMBER(min_orientation_margin, 7.0, "Min acceptable orientation margin", this->params()), BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", this->params()), BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model", this->params()), BOOL_MEMBER(poly_allow_detailed_fx, false, "Allow feature extractors to see the original outline", this->params()), BOOL_INIT_MEMBER(tessedit_init_config_only, false, "Only initialize with the config file. Useful if the " "instance is not going to be used for OCR but say only " "for layout analysis.", this->params()), BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", this->params()), INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", this->params()), // The following parameters were deprecated and removed from their original // locations. The parameters are temporarily kept here to give Tesseract // users a chance to updated their [lang].traineddata and config files // without introducing failures during Tesseract initialization. // TODO(ocr-team): remove these parameters from the code once we are // reasonably sure that Tesseract users have updated their data files. // // BEGIN DEPRECATED PARAMETERS INT_MEMBER(tessedit_ok_mode, 5, "Acceptance decision algorithm", this->params()), BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs" " (e.g. 
for non-space delimited languages)", this->params()), INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process", this->params()), BOOL_MEMBER(permute_debug, 0, "Debug char permutation process", this->params()), double_MEMBER(bestrate_pruning_factor, 2.0, "Multiplying factor of" " current best rate to prune other hypotheses", this->params()), BOOL_MEMBER(permute_script_word, 0, "Turn on word script consistency permuter", this->params()), BOOL_MEMBER(segment_segcost_rating, 0, "incorporate segmentation cost in word rating?", this->params()), double_MEMBER(segment_reward_script, 0.95, "Score multipler for script consistency within a word. " "Being a 'reward' factor, it should be <= 1. " "Smaller value implies bigger reward.", this->params()), BOOL_MEMBER(permute_fixed_length_dawg, 0, "Turn on fixed-length phrasebook search permuter", this->params()), BOOL_MEMBER(permute_chartype_word, 0, "Turn on character type (property) consistency permuter", this->params()), double_MEMBER(segment_reward_chartype, 0.97, "Score multipler for char type consistency within a word. ", this->params()), double_MEMBER(segment_reward_ngram_best_choice, 0.99, "Score multipler for ngram permuter's best choice" " (only used in the Han script path).", this->params()), BOOL_MEMBER(ngram_permuter_activated, false, "Activate character-level n-gram-based permuter", this->params()), BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter", this->params()), INT_MEMBER(language_model_fixed_length_choices_depth, 3, "Depth of blob choice lists to explore" " when fixed length dawgs are on", this->params()), BOOL_MEMBER(use_new_state_cost, FALSE, "use new state cost heuristics for segmentation state" " evaluation", this->params()), double_MEMBER(heuristic_segcost_rating_base, 1.25, "base factor for adding segmentation cost into word rating." 
"It's a multiplying factor, the larger the value above 1, " "the bigger the effect of segmentation cost.", this->params()), double_MEMBER(heuristic_weight_rating, 1.0, "weight associated with char rating in combined cost of" "state", this->params()), double_MEMBER(heuristic_weight_width, 1000.0, "weight associated with width evidence in combined cost of" " state", this->params()), double_MEMBER(heuristic_weight_seamcut, 0.0, "weight associated with seam cut in combined cost of state", this->params()), double_MEMBER(heuristic_max_char_wh_ratio, 2.0, "max char width-to-height ratio allowed in segmentation", this->params()), BOOL_MEMBER(enable_new_segsearch, true, "Enable new segmentation search path.", this->params()), double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0, "Maximum character width-to-height ratio for" " fixed-pitch fonts", this->params()), // END DEPRECATED PARAMETERS backup_config_file_(NULL), pix_binary_(NULL), cube_binary_(NULL), pix_grey_(NULL), pix_thresholds_(NULL), source_resolution_(0), textord_(this), right_to_left_(false), scaled_color_(NULL), scaled_factor_(-1), deskew_(1.0f, 0.0f), reskew_(1.0f, 0.0f), most_recently_used_(this), font_table_size_(0), cube_cntxt_(NULL), tess_cube_combiner_(NULL), equ_detect_(NULL) { }
Definition at line 510 of file tesseractclass.cpp.
{
  // Teardown order is significant and kept as-is: Clear(), then
  // end_tesseract(), then the sub-language instances.
  Clear();
  end_tesseract();
  sub_langs_.delete_data_pointers();

  // Delete cube objects. Deleting a null pointer is a no-op, so no explicit
  // NULL test is required before each delete.
  delete cube_cntxt_;
  cube_cntxt_ = NULL;
  delete tess_cube_combiner_;
  tess_cube_combiner_ = NULL;
}
BOOL8 tesseract::Tesseract::acceptable_number_string | ( | const char * | s, |
const char * | lengths | ||
) |
Definition at line 422 of file output.cpp.
{
  // Returns TRUE when the string s (whose per-character byte lengths are
  // given in the parallel array `lengths`) has the shape of a number:
  //   - an optional single leading '(';
  //   - an optional single leading '$', '.', '+' or '-';
  //   - digits, where a single '.', ',' or '-' is allowed directly after
  //     a digit;
  //   - an optional trailing '%', ')' or "%)" directly after a digit.
  // Any other character makes the function return FALSE.
  BOOL8 prev_digit = FALSE;

  // Skip the optional opening parenthesis. `lengths` must advance together
  // with `s`; otherwise every later lookup of *lengths describes the
  // previous character, which breaks stepping over multi-byte characters.
  if (*lengths == 1 && *s == '(') {
    s++;
    lengths++;
  }

  // Skip one optional leading currency/sign/decimal-point character, again
  // keeping the two cursors aligned.
  if (*lengths == 1 &&
      ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {
    s++;
    lengths++;
  }

  for (; *s != '\0'; s += *(lengths++)) {
    if (unicharset.get_isdigit(s, *lengths))
      prev_digit = TRUE;
    else if (prev_digit &&
             (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
      // A separator is only accepted directly after a digit.
      prev_digit = FALSE;
    else if (prev_digit && *lengths == 1 &&
             (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
      // Final '%' or ')' directly after a digit.
      return TRUE;
    else if (prev_digit &&
             *lengths == 1 && (*s == '%') &&
             (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
             (*(s + *lengths + *(lengths + 1)) == '\0'))
      // Final "%)" directly after a digit.
      return TRUE;
    else
      return FALSE;
  }
  return TRUE;
}
ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string | ( | const UNICHARSET & | char_set, |
const char * | s, | ||
const char * | lengths | ||
) |
Definition at line 1237 of file control.cpp.
{
  // Classifies the string s (with per-character byte lengths in `lengths`)
  // as one of the ACCEPTABLE_WERD_TYPE values. `i` indexes characters (and
  // therefore `lengths`), `offset` is the matching byte position in s.
  int i = 0;
  int offset = 0;
  int leading_punct_count;
  int upper_count = 0;
  int hyphen_pos = -1;
  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;

  // Words of more than 20 characters are reported as unacceptable outright.
  if (strlen (lengths) > 20)
    return word_type;

  /* Single Leading punctuation char*/
  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
    offset += lengths[i++];
  leading_punct_count = i;

  /* Initial cap */
  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
    offset += lengths[i++];
    upper_count++;
  }
  if (upper_count > 1) {
    // Two or more leading upper-case characters: treat as an upper-case word.
    word_type = AC_UPPER_CASE;
  } else {
    /* Lower case word, possibly with an initial cap */
    while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
      offset += lengths[i++];
    }
    // Too few alphabetic characters so far: fall through to the
    // abbreviation check with word_type still AC_UNACCEPTABLE.
    if (i - leading_punct_count < quality_min_initial_alphas_reqd)
      goto not_a_word;
    /*
    Allow a single hyphen in a lower case word
    - dont trust upper case - I've seen several cases of "H" -> "I-I"
    */
    if (lengths[i] == 1 && s[offset] == '-') {
      hyphen_pos = i;
      offset += lengths[i++];
      if (s[offset] != '\0') {
        while ((s[offset] != '\0') &&
               char_set.get_islower(s + offset, lengths[i])) {
          offset += lengths[i++];
        }
        // Require at least two characters after the hyphen.
        if (i < hyphen_pos + 3)
          goto not_a_word;
      }
    } else {
      /* Allow "'s" in NON hyphenated lower case words */
      if (lengths[i] == 1 && (s[offset] == '\'') &&
          lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    if (upper_count > 0)
      word_type = AC_INITIAL_CAP;
    else
      word_type = AC_LOWER_CASE;
  }

  /* Up to two different, constrained trailing punctuation chars */
  if (lengths[i] == 1 && s[offset] != '\0' &&
      STRING(chs_trailing_punct1).contains(s[offset]))
    offset += lengths[i++];
  // The second trailing punctuation char must differ from the character
  // immediately before it.
  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
      s[offset - lengths[i - 1]] != s[offset] &&
      STRING(chs_trailing_punct2).contains (s[offset]))
    offset += lengths[i++];

  // Anything left over means the string did not match the word pattern.
  if (s[offset] != '\0')
    word_type = AC_UNACCEPTABLE;

  not_a_word:

  if (word_type == AC_UNACCEPTABLE) {
    /* Look for abbreviation string */
    // An abbreviation is a run of (letter, '.') pairs, all upper case or all
    // lower case, that consumes the whole string. Scanning restarts at 0.
    i = 0;
    offset = 0;
    if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
      word_type = AC_UC_ABBREV;
      while (s[offset] != '\0' &&
             char_set.get_isupper(s + offset, lengths[i]) &&
             lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    } else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
      word_type = AC_LC_ABBREV;
      while (s[offset] != '\0' &&
             char_set.get_islower(s + offset, lengths[i]) &&
             lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    // Unconsumed characters: not an abbreviation either.
    if (s[offset] != '\0')
      word_type = AC_UNACCEPTABLE;
  }

  return word_type;
}
inT16 tesseract::Tesseract::alpha_count | ( | const char * | word, |
const char * | word_lengths | ||
) |
Definition at line 498 of file reject.cpp.
{
  // Counts how many characters of `word` unicharset reports as alphabetic.
  // `word_lengths[k]` is the byte length of the k-th character of `word`.
  inT16 num_alphas = 0;
  inT16 char_index = 0;
  inT16 byte_pos = 0;

  while (word[byte_pos] != '\0') {
    if (unicharset.get_isalpha (word + byte_pos, word_lengths[char_index]))
      num_alphas++;
    // Step to the first byte of the next character.
    byte_pos += word_lengths[char_index++];
  }
  return num_alphas;
}
void tesseract::Tesseract::ambigs_classify_and_output | ( | WERD_RES * | werd_res, |
ROW_RES * | row_res, | ||
BLOCK_RES * | block_res, | ||
const char * | label, | ||
FILE * | output_file | ||
) |
Definition at line 199 of file recogtraining.cpp.
{
  // Runs pass-1 classification on werd_res and, provided `label` can be
  // encoded by unicharset, prints all paths through the word's ratings
  // matrix to output_file via PrintMatrixPaths.
  fflush(stdout);
  WordData word_data(block_res->block, row_res->row, werd_res);
  SetupWordPassN(1, &word_data);
  classify_word_pass1(&word_data, werd_res);
  WERD_CHOICE *best_choice = werd_res->best_choice;
  ASSERT_HOST(best_choice != NULL);

  // The label must be expressible in the unicharset; the encoding itself
  // is not used beyond this validity check.
  GenericVector<UNICHAR_ID> label_ids;
  const bool label_ok =
      unicharset.encode_string(label, true, &label_ids, NULL, NULL);
  if (label_ok) {
    // Dump all paths through the ratings matrix (which is normally small).
    int matrix_dim = werd_res->ratings->dimension();
    const BLOB_CHOICE** path = new const BLOB_CHOICE*[matrix_dim];
    PrintMatrixPaths(0, matrix_dim, *werd_res->ratings, 0, path,
                     unicharset, label, output_file);
    delete [] path;
  } else {
    tprintf("Not outputting illegal unichar %s\n", label);
  }
}
PAGE_RES * tesseract::Tesseract::ApplyBoxes | ( | const STRING & | fname, |
bool | find_segmentation, | ||
BLOCK_LIST * | block_list | ||
) |
Definition at line 110 of file applybox.cpp.
{
  // Reads every box for page `applybox_page` from the box file belonging to
  // fname and resegments block_list to match:
  //   - find_segmentation == false: SetupApplyBoxes is called first and each
  //     box is applied with ResegmentCharBox;
  //   - find_segmentation == true: each box is applied with ResegmentWordBox,
  //     after which SetupApplyBoxes and ReSegmentByClassification are run.
  // Returns the resulting PAGE_RES after TidyUp.
  int box_count = 0;
  int box_failures = 0;

  FILE* box_file = OpenBoxFile(fname);
  TBOX box;
  GenericVector<TBOX> boxes;
  GenericVector<STRING> texts, full_texts;

  // Running line number of the box file. It is handed to ReadNextBox by
  // pointer and must therefore live across iterations; re-creating it at 0
  // for each call would throw away whatever count ReadNextBox maintains.
  int line_number = 0;
  bool found_box = true;
  while (found_box) {
    STRING text, full_text;
    found_box = ReadNextBox(applybox_page, &line_number, box_file, &text, &box);
    if (found_box) {
      ++box_count;
      MakeBoxFileStr(text.string(), box, applybox_page, &full_text);
    } else {
      full_text = "";
    }
    // Note that an entry is pushed even for the final, failed read. That
    // extra trailing element is what lets the loop below pass boxes[i + 1]
    // for the last real box.
    boxes.push_back(box);
    texts.push_back(text);
    full_texts.push_back(full_text);
  }

  // In word mode, we use the boxes to make a word for each box, but
  // in blob mode we use the existing words and maximally chop them first.
  PAGE_RES* page_res = find_segmentation ?
      NULL : SetupApplyBoxes(boxes, block_list);
  clear_any_old_text(block_list);

  // boxes.size() - 1 skips the trailing sentinel entry.
  for (int i = 0; i < boxes.size() - 1; i++) {
    bool foundit = false;
    if (page_res != NULL) {
      if (i == 0) {
        // The first box has no predecessor.
        foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
                                   full_texts[i].string());
      } else {
        foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
                                   boxes[i + 1], full_texts[i].string());
      }
    } else {
      foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
                                 texts[i].string());
    }
    if (!foundit) {
      box_failures++;
      ReportFailedBox(i, boxes[i], texts[i].string(),
                      "FAILURE! Couldn't find a matching blob");
    }
  }

  if (page_res == NULL) {
    // In word/line mode, we now maximally chop all the words and resegment
    // them with the classifier.
    page_res = SetupApplyBoxes(boxes, block_list);
    ReSegmentByClassification(page_res);
  }
  if (applybox_debug > 0) {
    tprintf("APPLY_BOXES:\n");
    tprintf(" Boxes read from boxfile: %6d\n", box_count);
    if (box_failures > 0)
      tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
  }
  TidyUp(page_res);
  return page_res;
}
void tesseract::Tesseract::ApplyBoxTraining | ( | const STRING & | filename, |
PAGE_RES * | page_res | ||
) |
Definition at line 773 of file applybox.cpp.
int tesseract::Tesseract::AutoPageSeg | ( | PageSegMode | pageseg_mode, |
BLOCK_LIST * | blocks, | ||
TO_BLOCK_LIST * | to_blocks, | ||
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.
Resolution (in ppi) is derived from the input image.
The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.
If single_column is true, then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.
If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired (osd or only_osd), then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).
Definition at line 220 of file pagesegmain.cpp.
{
  // Runs automatic page segmentation for the given pageseg_mode. On success
  // the blocks found replace the contents of *blocks and the matching
  // TO_BLOCKs are delivered through to_blocks. Returns the FindBlocks
  // result (0 if no ColumnFinder was created); a negative value is returned
  // before *blocks is touched.
  if (textord_debug_images) {
    WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
  }

  Pix* photo_mask = NULL;
  Pix* music_mask = NULL;
  // The blocks made by the ColumnFinder. Moved to blocks before return.
  BLOCK_LIST new_blocks;
  TO_BLOCK_LIST scratch_to_blocks;

  // Translate the page segmentation mode into the individual switches that
  // SetupPageSegAndDetectOrientation takes.
  const bool no_column_finding = !PSM_COL_FIND_ENABLED(pageseg_mode);
  const bool run_osd = PSM_OSD_ENABLED(pageseg_mode);
  const bool run_osd_only = (pageseg_mode == PSM_OSD_ONLY);
  ColumnFinder* column_finder = SetupPageSegAndDetectOrientation(
      no_column_finding, run_osd, run_osd_only, blocks, osd_tess, osr,
      &scratch_to_blocks, &photo_mask, &music_mask);

  int status = 0;
  if (column_finder != NULL) {
    TO_BLOCK_IT scratch_it(&scratch_to_blocks);
    TO_BLOCK* input_to_block = scratch_it.data();
    if (music_mask != NULL) {
      // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
      // blocks separately. For now combine with photomask_pix.
      pixOr(photo_mask, photo_mask, music_mask);
    }
    if (equ_detect_) {
      column_finder->SetEquationDetect(equ_detect_);
    }
    status = column_finder->FindBlocks(pageseg_mode, scaled_color_,
                                       scaled_factor_, input_to_block,
                                       photo_mask, pix_thresholds_,
                                       pix_grey_, &new_blocks, to_blocks);
    // Only a successful run updates the stored deskew/reskew vectors.
    if (status >= 0) {
      column_finder->GetDeskewVectors(&deskew_, &reskew_);
    }
    delete column_finder;
  }

  // The masks are released on every path, including failure.
  pixDestroy(&photo_mask);
  pixDestroy(&music_mask);
  if (status < 0) {
    return status;
  }

  // Move the found blocks to the input/output blocks.
  blocks->clear();
  BLOCK_IT dest_it(blocks);
  dest_it.add_list_after(&new_blocks);

  if (textord_debug_images) {
    // The debug image is no longer needed so delete it.
    unlink(AlignedBlob::textord_debug_pix().string());
  }
  return status;
}
bool tesseract::Tesseract::BelievableSuperscript | ( | bool | debug, |
const WERD_RES & | word, | ||
float | certainty_threshold, | ||
int * | left_ok, | ||
int * | right_ok | ||
) | const |
Return whether this is believable superscript or subscript text.
We insist that: + there are no punctuation marks. + there are no italics. + no normal-sized character is smaller than superscript_scaledown_ratio of what it ought to be, and + each character is at least as certain as certainty_threshold.
[in] | debug | If true, spew debug output |
[in] | word | The word whose best_choice we're evaluating |
[in] | certainty_threshold | If any of the characters have less certainty than this, reject. |
[out] | left_ok | How many left-side characters were ok? |
[out] | right_ok | How many right-side characters were ok? |
Definition at line 520 of file superscript.cpp.
{
  // Length of the run of acceptable characters at the start of the word, and
  // of the run currently being extended (which ends up as the trailing run).
  int leading_ok = 0;
  int current_ok = 0;
  float lowest_certainty = 0.0f;
  const WERD_CHOICE &wc = *word.best_choice;
  const UnicityTable<FontInfo>& fonts = get_fontinfo_table();

  for (int i = 0; i < wc.length(); i++) {
    TBLOB *blob = word.rebuild_word->blobs[i];
    UNICHAR_ID unichar_id = wc.unichar_id(i);
    float char_certainty = wc.certainty(i);
    bool bad_certainty = char_certainty < certainty_threshold;
    bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);

    // Start from the word-level font, then refine with the per-blob choice.
    bool is_italic = word.fontinfo && word.fontinfo->is_italic();
    BLOB_CHOICE *choice = word.GetBlobChoice(i);
    if (choice && fonts.size() > 0) {
      // Italic only if the first font is italic and the second font is
      // either absent or italic as well.
      int font_id1 = choice->fontinfo_id();
      bool font1_is_italic =
          font_id1 >= 0 && fonts.get(font_id1).is_italic();
      int font_id2 = choice->fontinfo_id2();
      is_italic = font1_is_italic &&
                  (font_id2 < 0 || fonts.get(font_id2).is_italic());
    }

    // Compare the blob height with the height expected for its class.
    float height_fraction = 1.0f;
    float char_height = blob->bounding_box().height();
    float normal_height = char_height;
    if (wc.unicharset()->top_bottom_useful()) {
      int min_bot, max_bot, min_top, max_top;
      wc.unicharset()->get_top_bottom(unichar_id,
                                      &min_bot, &max_bot,
                                      &min_top, &max_top);
      float hi_height = max_top - max_bot;
      float lo_height = min_top - min_bot;
      normal_height = (hi_height + lo_height) / 2;
      if (normal_height >= kBlnXHeight) {
        // Only ding characters that we have decent information for because
        // they're supposed to be normal sized, not tiny specks or dashes.
        height_fraction = char_height / normal_height;
      }
    }
    bool bad_height = height_fraction < superscript_scaledown_ratio;

    if (debug) {
      if (is_italic) {
        tprintf(" Rejecting: superscript is italic.\n");
      }
      if (is_punc) {
        tprintf(" Rejecting: punctuation present.\n");
      }
      const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
      if (bad_certainty) {
        tprintf(" Rejecting: don't believe character %s with certainty %.2f "
                "which is less than threshold %.2f\n",
                char_str, char_certainty, certainty_threshold);
      }
      if (bad_height) {
        tprintf(" Rejecting: character %s seems too small @ %.2f versus "
                "expected %.2f\n",
                char_str, char_height, normal_height);
      }
    }

    bool rejected = bad_certainty || bad_height || is_punc || is_italic;
    if (!rejected) {
      current_ok++;
    } else {
      // The first rejection ends the leading run; remember its length.
      if (current_ok == i) {
        leading_ok = current_ok;
      }
      current_ok = 0;
    }
    if (char_certainty < lowest_certainty) {
      lowest_certainty = char_certainty;
    }
  }

  if (current_ok == wc.length()) {
    if (debug) {
      tprintf(" Accept: worst revised certainty is %.2f\n", lowest_certainty);
    }
    return true;
  }
  // Outputs are only written when the word is not fully acceptable.
  if (left_ok) *left_ok = leading_ok;
  if (right_ok) *right_ok = current_ok;
  return false;
}
Pix* tesseract::Tesseract::BestPix | ( | ) | const [inline] |
Definition at line 195 of file tesseractclass.h.
{
  // Prefer the grey image when one is set; otherwise use the binary image.
  if (pix_grey_ != NULL) return pix_grey_;
  return pix_binary_;
}
void tesseract::Tesseract::bigram_correction_pass | ( | PAGE_RES * | page_res | ) |
Definition at line 434 of file control.cpp.
{
  // Walks consecutive word pairs of page_res that share a unicharset. When
  // the pair of best choices (superscripts stripped) is not a valid bigram,
  // searches all combinations of the two words' alternative choices for
  // valid bigrams and replaces the best choices with the lowest combined
  // rating pair. Words flagged W_REP_CHAR and parts of combos are skipped.
  PAGE_RES_IT word_it(page_res);

  WERD_RES *w_prev = NULL;
  WERD_RES *w = word_it.word();
  while (1) {
    w_prev = w;
    while (word_it.forward() != NULL &&
           (!word_it.word() || word_it.word()->part_of_combo)) {
      // advance word_it, skipping over parts of combos
    }
    if (!word_it.word()) break;
    w = word_it.word();
    if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
      continue;
    }
    if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
      if (tessedit_bigram_debug) {
        tprintf("Skipping because one of the words is W_REP_CHAR\n");
      }
      continue;
    }
    // Two words sharing the same language model, excellent!
    // overrides_word1[i] / overrides_word2[i] form the i-th valid bigram, so
    // the two vectors always have the same size.
    GenericVector<WERD_CHOICE *> overrides_word1;
    GenericVector<WERD_CHOICE *> overrides_word2;

    STRING orig_w1_str = w_prev->best_choice->unichar_string();
    STRING orig_w2_str = w->best_choice->unichar_string();
    WERD_CHOICE prev_best(w->uch_set);
    {
      int w1start, w1end;
      w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
      prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
    }
    WERD_CHOICE this_best(w->uch_set);
    {
      int w2start, w2end;
      w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
      this_best = w->best_choice->shallow_copy(w2start, w2end);
    }

    if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
      if (tessedit_bigram_debug) {
        tprintf("Top choice \"%s %s\" verified by bigram model.\n",
                orig_w1_str.string(), orig_w2_str.string());
      }
      continue;
    }
    if (tessedit_bigram_debug > 2) {
      tprintf("Examining alt choices for \"%s %s\".\n",
              orig_w1_str.string(), orig_w2_str.string());
    }
    if (tessedit_bigram_debug > 1) {
      if (!w_prev->best_choices.singleton()) {
        w_prev->PrintBestChoices();
      }
      if (!w->best_choices.singleton()) {
        w->PrintBestChoices();
      }
    }

    // Try every (previous choice, current choice) combination and remember
    // the valid bigram with the lowest combined rating.
    float best_rating = 0.0;
    int best_idx = 0;
    WERD_CHOICE_IT prev_it(&w_prev->best_choices);
    for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
      WERD_CHOICE *p1 = prev_it.data();
      WERD_CHOICE strip1(w->uch_set);
      {
        int p1start, p1end;
        p1->GetNonSuperscriptSpan(&p1start, &p1end);
        strip1 = p1->shallow_copy(p1start, p1end);
      }
      WERD_CHOICE_IT w_it(&w->best_choices);
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD_CHOICE *p2 = w_it.data();
        WERD_CHOICE strip2(w->uch_set);
        {
          int p2start, p2end;
          p2->GetNonSuperscriptSpan(&p2start, &p2end);
          strip2 = p2->shallow_copy(p2start, p2end);
        }
        if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
          overrides_word1.push_back(p1);
          overrides_word2.push_back(p2);
          if (overrides_word1.size() == 1 ||
              p1->rating() + p2->rating() < best_rating) {
            best_rating = p1->rating() + p2->rating();
            best_idx = overrides_word1.size() - 1;
          }
        }
      }
    }
    if (overrides_word1.size() >= 1) {
      // Excellent, we have some bigram matches.
      if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice,
                                            *overrides_word1[best_idx]) &&
          EqualIgnoringCaseAndTerminalPunct(*w->best_choice,
                                            *overrides_word2[best_idx])) {
        if (tessedit_bigram_debug > 1) {
          tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
                  "model.\n", orig_w1_str.string(), orig_w2_str.string());
        }
        continue;
      }
      STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
      STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
      if (new_w1_str != orig_w1_str) {
        w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
      }
      if (new_w2_str != orig_w2_str) {
        w->ReplaceBestChoice(overrides_word2[best_idx]);
      }
      if (tessedit_bigram_debug > 0) {
        STRING choices_description;
        // The override vectors are parallel (one entry per valid bigram), so
        // the number of compatible bigrams is the size of either one, not
        // the product of the two sizes.
        int num_bigram_choices = overrides_word1.size();
        if (num_bigram_choices == 1) {
          choices_description = "This was the unique bigram choice.";
        } else {
          if (tessedit_bigram_debug > 1) {
            STRING bigrams_list;
            const int kMaxChoicesToPrint = 20;
            for (int i = 0; i < overrides_word1.size() &&
                 i < kMaxChoicesToPrint; i++) {
              if (i > 0) { bigrams_list += ", "; }
              WERD_CHOICE *p1 = overrides_word1[i];
              WERD_CHOICE *p2 = overrides_word2[i];
              bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
            }
            // Mark the list as truncated. This has to happen after the loop:
            // inside it the index never reaches kMaxChoicesToPrint.
            if (overrides_word1.size() > kMaxChoicesToPrint) {
              bigrams_list += " ...";
            }
            choices_description = "There were many choices: {";
            choices_description += bigrams_list;
            choices_description += "}";
          } else {
            choices_description.add_str_int("There were ", num_bigram_choices);
            choices_description += " compatible bigrams.";
          }
        }
        tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
                orig_w1_str.string(), orig_w2_str.string(),
                new_w1_str.string(), new_w2_str.string(),
                choices_description.string());
      }
    }
  }
}
void tesseract::Tesseract::blamer_pass | ( | PAGE_RES * | page_res | ) |
Definition at line 678 of file control.cpp.
{
  // Nothing to do unless the blamer has been switched on.
  if (!wordrec_run_blamer) return;

  // Give every word a last chance to be blamed and tally the reasons.
  PAGE_RES_IT it(page_res);
  it.restart_page();
  while (it.word() != NULL) {
    WERD_RES *current = it.word();
    BlamerBundle::LastChanceBlame(wordrec_debug_blamer, current);
    page_res->blame_reasons[current->blamer_bundle->incorrect_result_reason()]++;
    it.forward();
  }

  // Report the tally, one line per reason.
  tprintf("Blame reasons:\n");
  for (int reason = 0; reason < IRR_NUM_REASONS; ++reason) {
    const char *reason_name = BlamerBundle::IncorrectReasonName(
        static_cast<IncorrectResultReason>(reason));
    tprintf("%s %d\n", reason_name, page_res->blame_reasons[reason]);
  }

  // Dump the misadaption log when it has any entries.
  if (page_res->misadaption_log.length() > 0) {
    tprintf("Misadaption log:\n");
    for (int entry = 0; entry < page_res->misadaption_log.length(); ++entry) {
      tprintf("%s\n", page_res->misadaption_log[entry].string());
    }
  }
}
void tesseract::Tesseract::blob_feature_display | ( | PAGE_RES * | page_res, |
const TBOX & | selection_box | ||
) |
Definition at line 959 of file pgedit.cpp.
{
  ROW* row;      // row of word
  BLOCK* block;  // block of word
  WERD* pseudo_word = make_pseudo_word(page_res, selection_box, block, row);
  if (pseudo_word == NULL) return;

  // Set the pseudo word up for recognition to obtain its chopped form.
  WERD_RES word_res(pseudo_word);
  word_res.x_height = row->x_height();
  word_res.SetupForRecognition(unicharset, this, BestPix(),
                               tessedit_ocr_engine_mode, NULL,
                               classify_bln_numeric_mode,
                               textord_use_cjk_fp_model,
                               poly_allow_detailed_fx,
                               row, block);

  // Extract both feature sets from the first blob of the chopped word.
  TBLOB* first_blob = word_res.chopped_word->blobs[0];
  INT_FX_RESULT_STRUCT fx_info;
  GenericVector<INT_FEATURE_STRUCT> bl_features;
  GenericVector<INT_FEATURE_STRUCT> cn_features;
  Classify::ExtractFeatures(*first_blob, classify_nonlinear_norm,
                            &bl_features, &cn_features, &fx_info, NULL);

  // Display baseline features.
  ScrollView* bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
  ClearFeatureSpaceWindow(baseline, bl_win);
  for (int idx = 0; idx < bl_features.size(); ++idx) {
    RenderIntFeature(bl_win, &bl_features[idx], ScrollView::GREEN);
  }
  bl_win->Update();

  // Display cn features.
  ScrollView* cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
  ClearFeatureSpaceWindow(character, cn_win);
  for (int idx = 0; idx < cn_features.size(); ++idx) {
    RenderIntFeature(cn_win, &cn_features[idx], ScrollView::GREEN);
  }
  cn_win->Update();

  delete pseudo_word;
}
float tesseract::Tesseract::blob_noise_score | ( | TBLOB * | blob | ) |
Definition at line 764 of file fixspace.cpp.
{
  // The score is the larger side of the biggest outline bounding box,
  // doubled when there are more than 5 outlines and halved when the blob
  // lies high above or low against the baseline offset.
  inT16 num_outlines = 0;
  inT16 biggest_dimension = 0;

  for (TESSLINE* outline = blob->outlines; outline != NULL;
       outline = outline->next) {
    num_outlines++;
    TBOX outline_box = outline->bounding_box();  // BB of outline
    inT16 longer_side;
    if (outline_box.height() > outline_box.width()) {
      longer_side = outline_box.height();
    } else {
      longer_side = outline_box.width();
    }
    if (biggest_dimension < longer_side) {
      biggest_dimension = longer_side;
    }
  }

  if (num_outlines > 5) {
    // penalise LOTS of blobs
    biggest_dimension *= 2;
  }

  TBOX blob_box = blob->bounding_box();
  bool too_high = blob_box.bottom() > kBlnBaselineOffset * 4;
  bool too_low = blob_box.top() < kBlnBaselineOffset / 2;
  if (too_high || too_low) {
    // Lax blob is if high or low
    biggest_dimension /= 2;
  }

  return biggest_dimension;
}
void tesseract::Tesseract::break_noisiest_blob_word | ( | WERD_RES_LIST & | words | ) |
break_noisiest_blob_word() Find the word with the blob which looks like the worst noise. Break the word into two, deleting the noise blob.
Definition at line 619 of file fixspace.cpp.
{
  // Pass 1: locate the word whose noisiest blob has the lowest noise score.
  WERD_RES_IT scan_it(&words);
  WERD_RES_IT worst_word_it;
  float worst_noise_score = 9999;
  int worst_blob_index = -1;  // Noisiest blob of noisiest wd
  float noise_score;          // of wds noisiest blob
  for (scan_it.mark_cycle_pt(); !scan_it.cycled_list(); scan_it.forward()) {
    int blob_index = worst_noise_blob(scan_it.data(), &noise_score);
    if (blob_index > -1 && worst_noise_score > noise_score) {
      worst_noise_score = noise_score;
      worst_blob_index = blob_index;
      worst_word_it = scan_it;
    }
  }
  if (worst_blob_index < 0) {
    words.clear();  // signal termination
    return;
  }

  /* Now split the worst_word_it */
  WERD_RES *word_res = worst_word_it.data();

  /* Move blobs before noise blob to a new bloblist */
  C_BLOB_LIST new_blob_list;
  C_BLOB_IT new_blob_it;
  C_BLOB_IT blob_it;
  new_blob_it.set_to_list(&new_blob_list);
  blob_it.set_to_list(word_res->word->cblob_list());
  for (inT16 moved = 0; moved < worst_blob_index;
       moved++, blob_it.forward()) {
    new_blob_it.add_after_then_move(blob_it.extract());
  }
  inT16 start_of_noise_blob = blob_it.data()->bounding_box().left();
  delete blob_it.extract();  // throw out noise blob

  // The blobs moved above become a new word placed before the remainder.
  WERD *new_word = new WERD(&new_blob_list, word_res->word);
  new_word->set_flag(W_EOL, FALSE);
  word_res->word->set_flag(W_BOL, FALSE);
  word_res->word->set_blanks(1);  // After break

  // Move rejected blobs lying left of the noise blob to the new word too.
  C_BLOB_IT new_rej_cblob_it;
  C_BLOB_IT rej_cblob_it;
  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
  while (!rej_cblob_it.empty() &&
         rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob) {
    new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
    rej_cblob_it.forward();
  }

  WERD_RES* new_word_res = new WERD_RES(new_word);
  new_word_res->combination = TRUE;
  worst_word_it.add_before_then_move(new_word_res);

  word_res->ClearResults();
}
Definition at line 257 of file pgedit.cpp.
{
  // Builds the menu tree: a root with MODES, DISPLAY and OTHER sub-menus.
  SVMenuNode* root = new SVMenuNode();

  SVMenuNode* modes = root->AddChild("MODES");
  modes->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
  modes->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
  modes->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
  modes->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
  modes->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
  modes->AddChild("Recog Words", RECOG_WERDS);
  modes->AddChild("Recog Blobs", RECOG_PSEUDO);
  modes->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);

  SVMenuNode* display = root->AddChild("DISPLAY");
  display->AddChild("Blamer", BLAMER_CMD_EVENT, FALSE);
  display->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, FALSE);
  display->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, FALSE);
  display->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, FALSE);
  display->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, FALSE);
  display->AddChild("Edge Steps", BITMAP_CMD_EVENT, TRUE);
  display->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
  display->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
  display->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
  display->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
  display->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
  display->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
  display->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
  display->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
  display->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);

  SVMenuNode* other = root->AddChild("OTHER");
  other->AddChild("Quit", QUIT_CMD_EVENT);
  other->AddChild("Show Image", IMAGE_CMD_EVENT, FALSE);
  other->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, FALSE);
  other->AddChild("Show Baselines", BASELINES_CMD_EVENT, FALSE);
  other->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
  other->AddChild("Refresh Display", REFRESH_CMD_EVENT);

  return root;
}
BOOL8 tesseract::Tesseract::check_debug_pt | ( | WERD_RES * | word, |
int | location | ||
) |
Definition at line 1341 of file control.cpp.
{
#ifndef SECURE_NAMES
  // No test point configured: never a debug word.
  if (!test_pt) return FALSE;

  // Debug output is off unless this word contains the test point.
  tessedit_rejection_debug.set_value(FALSE);
  debug_x_ht_level.set_value(0);
  if (!word->word->bounding_box().contains(FCOORD(test_pt_x, test_pt_y))) {
    return FALSE;
  }

  if (location < 0) return TRUE;  // For breakpoint use

  tessedit_rejection_debug.set_value(TRUE);
  debug_x_ht_level.set_value(20);
  tprintf("\n\nTESTWD::");

  // Name the processing stage identified by location.
  BOOL8 show_map_detail = FALSE;
  switch (location) {
    case 0:
      tprintf("classify_word_pass1 start\n");
      word->word->print();
      break;
    case 10:
      tprintf("make_reject_map: initial map");
      break;
    case 20:
      tprintf("make_reject_map: after NN");
      break;
    case 30:
      tprintf("classify_word_pass2 - START");
      break;
    case 40:
      tprintf("classify_word_pass2 - Pre Xht");
      break;
    case 50:
      tprintf("classify_word_pass2 - END");
      show_map_detail = TRUE;
      break;
    case 60:
      tprintf("fixspace");
      break;
    case 70:
      tprintf("MM pass START");
      break;
    case 80:
      tprintf("MM pass END");
      break;
    case 90:
      tprintf("After Poor quality rejection");
      break;
    case 100:
      tprintf("unrej_good_quality_words - START");
      break;
    case 110:
      tprintf("unrej_good_quality_words - END");
      break;
    case 120:
      tprintf("Write results pass");
      show_map_detail = TRUE;
      break;
  }

  // Current best choice followed by its reject map.
  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
  word->reject_map.print(debug_fp);
  tprintf("\n");

  if (show_map_detail) {
    // Full reject details, one entry per char of the best choice string.
    tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
    for (inT16 i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
      tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
      word->reject_map[i].full_print(debug_fp);
    }
  }

  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
  return TRUE;
#else
  return FALSE;
#endif
}
void tesseract::Tesseract::classify_word_and_language | ( | WordRecognizer | recognizer, |
WordData * | word_data | ||
) |
Definition at line 806 of file control.cpp.
{
  // Tracks the best result so far. It is either word_data->word or one of
  // the entries of word_data->lang_words.
  WERD_RES* best = word_data->word;
  clock_t start_t = clock();
  const bool debug_on = classify_debug_level || cube_debug_level;
  if (debug_on) {
    tprintf("Processing word with lang %s at:",
            most_recently_used_->lang.string());
    best->word->bounding_box().print();
  }

  const char* result_type = "Initial";
  if (!best->tess_failed && best->done) {
    // If done on pass1, leave it as-is.
    most_recently_used_ = best->tesseract;
    result_type = "Already done";
  } else {
    if (most_recently_used_ != this) {
      // Point to the word for most_recently_used_.
      for (int s = 0; s < sub_langs_.size(); ++s) {
        if (most_recently_used_ == sub_langs_[s]) {
          best = &word_data->lang_words[s];
          break;
        }
      }
    }
    (most_recently_used_->*recognizer)(word_data, best);
    if (!best->tess_failed && best->tess_accepted)
      result_type = "Accepted";
  }

  if (debug_on) {
    tprintf("%s result: %s r=%.4g, c=%.4g, accepted=%d, adaptable=%d"
            " xht=[%g,%g]\n",
            result_type,
            best->best_choice->unichar_string().string(),
            best->best_choice->rating(),
            best->best_choice->certainty(),
            best->tess_accepted, best->tess_would_adapt,
            best->best_choice->min_x_height(),
            best->best_choice->max_x_height());
  }

  if (best->tess_failed || !best->tess_accepted) {
    // Try all the other languages to see if they are any better.
    Tesseract* previous_used = most_recently_used_;
    if (most_recently_used_ != this) {
      if (classify_debug_level) {
        tprintf("Retrying with main-Tesseract, lang: %s\n", lang.string());
      }
      if (word_data->word->tesseract == this) {
        // This is pass1, and we are trying the main language.
        if (RetryWithLanguage(*best, word_data, word_data->word, recognizer)) {
          most_recently_used_ = this;
          best = word_data->word;
        }
      } else {
        // This is pass2, and we are trying the main language again, but it
        // has no word allocated to it, so we must re-initialize it.
        WERD_RES main_word(*word_data->word);
        main_word.InitForRetryRecognition(*word_data->word);
        main_word.SetupForRecognition(unicharset, this, BestPix(),
                                      tessedit_ocr_engine_mode, NULL,
                                      classify_bln_numeric_mode,
                                      textord_use_cjk_fp_model,
                                      poly_allow_detailed_fx,
                                      word_data->row, word_data->block);
        if (RetryWithLanguage(*best, word_data, &main_word, recognizer)) {
          most_recently_used_ = this;
          word_data->word->ConsumeWordResults(&main_word);
          best = word_data->word;
        }
      }
      if (!best->tess_failed && best->tess_accepted)
        return;  // No need to look at the others.
    }

    for (int i = 0; i < sub_langs_.size(); ++i) {
      // The language that produced the rejected result is not retried.
      if (sub_langs_[i] == previous_used) continue;
      if (classify_debug_level) {
        tprintf("Retrying with sub-Tesseract[%d] lang: %s\n",
                i, sub_langs_[i]->lang.string());
      }
      if (sub_langs_[i]->RetryWithLanguage(*best, word_data,
                                           &word_data->lang_words[i],
                                           recognizer)) {
        most_recently_used_ = sub_langs_[i];
        best = &word_data->lang_words[i];
        if (!best->tess_failed && best->tess_accepted)
          break;  // No need to look at the others.
      }
    }
  }

  if (best != word_data->word) {
    // Move the result for the best language to the main word.
    word_data->word->ConsumeWordResults(best);
  }

  clock_t ocr_t = clock();
  if (tessedit_timing_debug) {
    tprintf("%s (ocr took %.2f sec)\n",
            best->best_choice->unichar_string().string(),
            static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
  }
}
void tesseract::Tesseract::classify_word_pass1 | ( | WordData * | word_data, |
WERD_RES * | word | ||
) |
classify_word_pass1
Baseline normalize the word and pass it to Tess.
Definition at line 916 of file control.cpp.
{
  ROW* word_row = word_data->row;
  BLOCK* word_block = word_data->block;

  // Remember the previous word's best choice, when there is a previous word.
  if (word_data->prev_word != NULL) {
    prev_word_best_choice_ = word_data->prev_word->word->best_choice;
  } else {
    prev_word_best_choice_ = NULL;
  }

  // If we only intend to run cube - run it and return.
  if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
    cube_word_pass1(word_block, word_row, word);
    return;
  }

  match_word_pass_n(1, word, word_row, word_block);
  if (word->tess_failed || word->word->flag(W_REP_CHAR)) return;

  word->tess_would_adapt = AdaptableWord(word);
  if (word_adaptable(word, tessedit_tess_adaption_mode)) {
    // Send word to adaptive classifier for training.
    word->BestChoiceToCorrectText();
    LearnWord(NULL, word);
    // Mark misadaptions if running blamer.
    if (word->blamer_bundle != NULL) {
      word->blamer_bundle->SetMisAdaptionDebug(word->best_choice,
                                               wordrec_debug_blamer);
    }
  }

  // Unambiguous words go into the document dictionary when it is enabled.
  if (tessedit_enable_doc_dict && !word->IsAmbiguous())
    tess_add_doc_word(word->best_choice);
}
void tesseract::Tesseract::classify_word_pass2 | ( | WordData * | word_data, |
WERD_RES * | word | ||
) |
classify_word_pass2
Control what to do with the word in pass 2
Definition at line 1023 of file control.cpp.
{
  // Return if we do not want to run Tesseract.
  const bool runs_tesseract =
      tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY ||
      tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED;
  if (!runs_tesseract) return;

  ROW* word_row = word_data->row;
  BLOCK* word_block = word_data->block;
  if (word_data->prev_word != NULL) {
    prev_word_best_choice_ = word_data->prev_word->word->best_choice;
  } else {
    prev_word_best_choice_ = NULL;
  }

  set_global_subloc_code(SUBLOC_NORM);
  check_debug_pt(word, 30);

  // Re-match words not finished earlier (or every word when training).
  if (!word->done || tessedit_training_tess) {
    word->caps_height = 0.0;
    if (word->x_height == 0.0f) {
      word->x_height = word_row->x_height();
    }
    match_word_pass_n(2, word, word_row, word_block);
    check_debug_pt(word, 40);
  }

  SubAndSuperscriptFix(word);

  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
    const bool can_fix_xheight =
        unicharset.top_bottom_useful() &&
        unicharset.script_has_xheight() &&
        word_block->classify_rotation().y() == 0.0f;
    if (can_fix_xheight) {
      // Use the tops and bottoms since they are available.
      TrainedXheightFix(word, word_block, word_row);
    }
    set_global_subloc_code(SUBLOC_NORM);
  }

#ifndef GRAPHICS_DISABLED
  if (tessedit_display_outwords) {
    // Plot the rebuilt word and zoom the window onto its bounding box.
    if (fx_win == NULL) {
      create_fx_win();
    }
    clear_fx_win();
    word->rebuild_word->plot(fx_win);
    TBOX word_box = word->rebuild_word->bounding_box();
    fx_win->ZoomToRectangle(word_box.left(), word_box.top(),
                            word_box.right(), word_box.bottom());
    ScrollView::Update();
  }
#endif

  set_global_subloc_code(SUBLOC_NORM);
  check_debug_pt(word, 50);
}
void tesseract::Tesseract::Clear | ( | ) |
Definition at line 525 of file tesseractclass.cpp.
{
  // Release every image held by this instance.
  pixDestroy(&pix_binary_);
  pixDestroy(&cube_binary_);
  pixDestroy(&pix_grey_);
  pixDestroy(&pix_thresholds_);
  pixDestroy(&scaled_color_);

  // Reset both skew vectors to (1, 0).
  deskew_ = FCOORD(1.0f, 0.0f);
  reskew_ = FCOORD(1.0f, 0.0f);

  splitter_.Clear();
  scaled_factor_ = -1;

  // Clear every sub-language instance as well.
  for (int lang_idx = 0; lang_idx < sub_langs_.size(); ++lang_idx) {
    sub_langs_[lang_idx]->Clear();
  }
}
float tesseract::Tesseract::ComputeCompatibleXheight | ( | WERD_RES * | word_res | ) |
Definition at line 95 of file fixxht.cpp.
{
  // Histogram of candidate x-heights, weighted by how badly each blob's top
  // misses the range expected for its class.
  STATS top_stats(0, MAX_UINT8);
  int num_blobs = word_res->rebuild_word->NumBlobs();
  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
    TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
    UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
    // Only letters and digits take part in the vote.
    if (!(unicharset.get_isalpha(class_id) ||
          unicharset.get_isdigit(class_id))) {
      continue;
    }

    int top = blob->bounding_box().top();
    // Clip the top to the limit of normalized feature space.
    if (top >= INT_FEAT_RANGE)
      top = INT_FEAT_RANGE - 1;
    int bottom = blob->bounding_box().bottom();
    int min_bottom, max_bottom, min_top, max_top;
    unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
                              &min_top, &max_top);
    // Chars with a wild top range would mess up the result so ignore them.
    if (max_top - min_top > kMaxCharTopRange)
      continue;

    int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
                          top - (max_top + x_ht_acceptance_tolerance));
    int height = top - kBlnBaselineOffset;
    if (debug_x_ht_level >= 20) {
      tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d : ",
              unicharset.id_to_unichar(class_id),
              height, min_bottom, max_bottom, min_top, max_top,
              bottom, top);
    }

    // Use only chars that fit in the expected bottom range, and where
    // the range of tops is sensibly near the xheight.
    bool votes = min_bottom <= bottom + x_ht_acceptance_tolerance &&
                 bottom - x_ht_acceptance_tolerance <= max_bottom &&
                 min_top > kBlnBaselineOffset &&
                 max_top - kBlnBaselineOffset >= kBlnXHeight &&
                 misfit_dist > 0;
    if (votes) {
      // Compute the x-height position using proportionality between the
      // actual height and expected height.
      int min_xht = DivRounded(height * kBlnXHeight,
                               max_top - kBlnBaselineOffset);
      int max_xht = DivRounded(height * kBlnXHeight,
                               min_top - kBlnBaselineOffset);
      if (debug_x_ht_level >= 20) {
        tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
      }
      // The range of expected heights gets a vote equal to the distance
      // of the actual top from the expected top.
      for (int y = min_xht; y <= max_xht; ++y)
        top_stats.add(y, misfit_dist);
    } else if (debug_x_ht_level >= 20) {
      tprintf(" already OK\n");
    }
  }

  // No votes at all: there is nothing to suggest.
  if (top_stats.get_total() == 0)
    return 0.0f;

  // The new xheight is just the median vote, which is then scaled out
  // of BLN space back to pixel space to get the x-height in pixel space.
  float new_xht = top_stats.median();
  if (debug_x_ht_level >= 20) {
    tprintf("Median xht=%f\n", new_xht);
    tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
            new_xht, new_xht / word_res->denorm.y_scale());
  }

  // The xheight must change by at least x_ht_min_change to be used.
  if (fabs(new_xht - kBlnXHeight) < x_ht_min_change)
    return 0.0f;
  return new_xht / word_res->denorm.y_scale();
}
void tesseract::Tesseract::convert_bad_unlv_chs | ( | WERD_RES * | word_res | ) |
Definition at line 664 of file docqual.cpp.
{
  // Ids of the characters involved: '~' becomes '-' and '^' becomes ' '.
  UNICHAR_ID dash_id = word_res->uch_set->unichar_to_id("-");
  UNICHAR_ID space_id = word_res->uch_set->unichar_to_id(" ");
  UNICHAR_ID tilde_id = word_res->uch_set->unichar_to_id("~");
  UNICHAR_ID pow_id = word_res->uch_set->unichar_to_id("^");

  for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
    // The two checks are deliberately independent: the second one looks at
    // the id left behind by the first.
    if (word_res->best_choice->unichar_id(pos) == tilde_id) {
      word_res->best_choice->set_unichar_id(dash_id, pos);
      if (word_res->reject_map[pos].accepted()) {
        word_res->reject_map[pos].setrej_unlv_rej();
      }
    }
    if (word_res->best_choice->unichar_id(pos) == pow_id) {
      word_res->best_choice->set_unichar_id(space_id, pos);
      if (word_res->reject_map[pos].accepted()) {
        word_res->reject_map[pos].setrej_unlv_rej();
      }
    }
  }
}
bool tesseract::Tesseract::ConvertStringToUnichars | ( | const char * | utf8, |
GenericVector< UNICHAR_ID > * | class_ids | ||
) |
Definition at line 521 of file applybox.cpp.
{
  // Consume utf8 one space-delimited token at a time. Each token must map
  // to a unichar id; the first one that does not aborts the conversion.
  while (*utf8 != '\0') {
    const char* token_end = strchr(utf8, ' ');
    if (token_end == NULL) {
      token_end = utf8 + strlen(utf8);
    }
    int step = token_end - utf8;
    UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
    if (class_id == INVALID_UNICHAR_ID) {
      return false;
    }
    // Swallow the run of spaces that follows the token.
    while (utf8[step] == ' ') {
      ++step;
    }
    class_ids->push_back(class_id);
    utf8 += step;
  }
  return true;
}
void tesseract::Tesseract::CorrectClassifyWords | ( | PAGE_RES * | page_res | ) |
Definition at line 749 of file applybox.cpp.
{
  PAGE_RES_IT pr_it(page_res);
  WERD_RES *word_res = pr_it.word();
  while (word_res != NULL) {
    // Build a choice from the ground truth text recorded for the word.
    WERD_CHOICE* truth_choice = new WERD_CHOICE(word_res->uch_set,
                                                word_res->correct_text.size());
    for (int i = 0; i < word_res->correct_text.size(); ++i) {
      // The part before the first space is the real ground truth, and the
      // rest is the bounding box location and page number.
      GenericVector<STRING> tokens;
      word_res->correct_text[i].split(' ', &tokens);
      UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
      truth_choice->append_unichar_id_space_allocated(
          char_id, word_res->best_state[i], 0.0f, 0.0f);
    }
    // Replace whatever choices the word had with the ground truth choice.
    word_res->ClearWordChoices();
    word_res->LogNewRawChoice(truth_choice);
    word_res->LogNewCookedChoice(1, false, truth_choice);

    word_res = pr_it.forward();
  }
}
inT16 tesseract::Tesseract::count_alphanums | ( | const WERD_CHOICE & | word | ) |
Definition at line 411 of file output.cpp.
{
  // Number of positions in word holding a letter or a digit.
  int total = 0;
  for (int pos = 0; pos < word.length(); ++pos) {
    bool alnum = word.unicharset()->get_isalpha(word.unichar_id(pos)) ||
                 word.unicharset()->get_isdigit(word.unichar_id(pos));
    if (alnum) {
      ++total;
    }
  }
  return total;
}
Definition at line 561 of file reject.cpp.
{
  // Number of accepted positions whose best-choice character is a letter
  // or a digit.
  const WERD_CHOICE *best_choice = word_res->best_choice;
  int total = 0;
  for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
    if (!word_res->reject_map[pos].accepted()) {
      continue;
    }
    if (word_res->uch_set->get_isalpha(best_choice->unichar_id(pos)) ||
        word_res->uch_set->get_isdigit(best_choice->unichar_id(pos))) {
      ++total;
    }
  }
  return total;
}
inT16 tesseract::Tesseract::count_alphas | ( | const WERD_CHOICE & | word | ) |
Definition at line 401 of file output.cpp.
{
  // Number of positions in word holding a letter.
  int total = 0;
  for (int pos = 0; pos < word.length(); ++pos) {
    if (!word.unicharset()->get_isalpha(word.unichar_id(pos))) {
      continue;
    }
    ++total;
  }
  return total;
}
inT16 tesseract::Tesseract::count_outline_errs | ( | char | c, |
inT16 | outline_count | ||
) |
Definition at line 129 of file docqual.cpp.
{
  // Characters listed in outlines_odd are never counted as errors.
  if (STRING (outlines_odd).contains (c)) {
    return 0;  // Dont use this char
  }

  // Characters listed in outlines_2 should have two outlines, all others one.
  int expected_outline_count = 1;
  if (STRING (outlines_2).contains (c)) {
    expected_outline_count = 2;
  }
  return abs (outline_count - expected_outline_count);
}
int tesseract::Tesseract::CountMisfitTops | ( | WERD_RES * | word_res | ) |
Definition at line 63 of file fixxht.cpp.
{
  // Counts letter/digit blobs whose top lies outside the range expected for
  // their class, allowing x_ht_acceptance_tolerance on either side.
  int misfit_count = 0;
  int num_blobs = word_res->rebuild_word->NumBlobs();
  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
    TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
    UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
    if (!(unicharset.get_isalpha(class_id) ||
          unicharset.get_isdigit(class_id))) {
      continue;
    }

    int top = blob->bounding_box().top();
    if (top >= INT_FEAT_RANGE) {
      top = INT_FEAT_RANGE - 1;
    }
    int min_bottom, max_bottom, min_top, max_top;
    unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
                              &min_top, &max_top);
    // Classes whose expected top range is too wide are skipped.
    if (max_top - min_top > kMaxCharTopRange) {
      continue;
    }

    bool bad = top < min_top - x_ht_acceptance_tolerance ||
               top > max_top + x_ht_acceptance_tolerance;
    if (bad) {
      ++misfit_count;
    }
    if (debug_x_ht_level >= 1) {
      tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
              unicharset.id_to_unichar(class_id),
              bad ? "Misfit" : "OK", top, min_top, max_top,
              static_cast<int>(x_ht_acceptance_tolerance));
    }
  }
  return misfit_count;
}
bool tesseract::Tesseract::create_cube_box_word | ( | Boxa * | char_boxes, |
int | num_chars, | ||
TBOX | word_box, | ||
BoxWord * | box_word | ||
) |
Definition at line 116 of file cube_control.cpp.
{
  // Converts the first num_chars boxes of char_boxes into TBOXes (via
  // char_box_to_tbox, relative to word_box) and inserts them into box_word.
  // Returns false when box_word is NULL or when a character box cannot be
  // fetched from char_boxes; returns true otherwise.
  if (!box_word) {
    if (cube_debug_level > 0) {
      tprintf("Cube WARNING (create_cube_box_word): Invalid box_word.\n");
    }
    return false;
  }

  // Find the x-coordinate of left-most char_box, which could be
  // nonzero if the word image was padded before recognition took place.
  int x_offset = -1;
  for (int i = 0; i < num_chars; ++i) {
    Box* char_box = boxaGetBox(char_boxes, i, L_CLONE);
    // boxaGetBox returns NULL on error (e.g. NULL boxa or bad index); the
    // box must not be dereferenced in that case.
    if (char_box == NULL) {
      if (cube_debug_level > 0) {
        tprintf("Cube WARNING (create_cube_box_word): Could not get "
                "char box %d.\n", i);
      }
      return false;
    }
    if (x_offset < 0 || char_box->x < x_offset) {
      x_offset = char_box->x;
    }
    boxDestroy(&char_box);
  }

  for (int i = 0; i < num_chars; ++i) {
    Box* char_box = boxaGetBox(char_boxes, i, L_CLONE);
    if (char_box == NULL) {
      if (cube_debug_level > 0) {
        tprintf("Cube WARNING (create_cube_box_word): Could not get "
                "char box %d.\n", i);
      }
      return false;
    }
    TBOX tbox = char_box_to_tbox(char_box, word_box, x_offset);
    boxDestroy(&char_box);
    box_word->InsertBox(i, tbox);
  }
  return true;
}
void tesseract::Tesseract::cube_combine_word | ( | CubeObject * | cube_obj, |
WERD_RES * | cube_word, | ||
WERD_RES * | tess_word | ||
) |
Definition at line 283 of file cube_control.cpp.
{
  // Chooses between the tesseract and cube results for one word using the
  // combiner probability. On return tess_word holds the winning result.
  const float tess_win_prob =
      tess_cube_combiner_->CombineResults(tess_word, cube_obj);
  WERD_CHOICE* tess_choice = tess_word->best_choice;
  WERD_CHOICE* cube_choice = cube_word->best_choice;
  const bool debugging = cube_debug_level || classify_debug_level;

  if (debugging) {
    tprintf("Combiner prob = %g vs threshold %g\n",
            tess_win_prob, cube_cntxt_->Params()->CombinerClassifierThresh());
  }

  // At or above the combiner threshold tesseract wins: its result is left
  // in place, as the combiner knows nothing about how correct the answer is.
  if (tess_win_prob >= cube_cntxt_->Params()->CombinerClassifierThresh()) {
    const bool engines_agree =
        tess_choice->unichar_string() == cube_choice->unichar_string();
    if (engines_agree) {
      // Both engines produced the same text, so improve the scores.
      tess_choice->set_rating(tess_choice->rating() / 2);
      tess_choice->set_certainty(tess_choice->certainty() / 2);
    }
    return;
  }

  // Cube wins.
  // The language combiner works better with tesseract scores throughout,
  // so copy them onto the cube choice before it replaces the tess result.
  cube_choice->set_rating(tess_choice->rating());
  cube_choice->set_certainty(tess_choice->certainty());
  if (debugging) {
    tprintf("Cube INFO: tesseract result replaced by cube: %s -> %s\n",
            tess_choice->unichar_string().string(),
            cube_choice->unichar_string().string());
  }
  tess_word->ConsumeWordResults(cube_word);
}
bool tesseract::Tesseract::cube_recognize | ( | CubeObject * | cube_obj, |
BLOCK * | block, | ||
WERD_RES * | word | ||
) |
Definition at line 326 of file cube_control.cpp.
{
  // Runs cube on the word held by cube_obj and stores cube's best result
  // in word.
  //   cube_obj: recognizer already set up for the word image.
  //   block:    not used by this function.
  //   word:     receives the cube result; on any failure it is set up as a
  //             fake word via SetupFake().
  // Returns true on success, false if cube produced nothing, its state could
  // not be extracted, or the character boxes could not be converted.

  // Run cube
  WordAltList *cube_alt_list = cube_obj->RecognizeWord();
  if (!cube_alt_list || cube_alt_list->AltCount() <= 0) {
    if (cube_debug_level > 0) {
      tprintf("Cube returned nothing for word at:");
      word->word->bounding_box().print();
    }
    word->SetupFake(unicharset);
    return false;
  }

  // Get cube's best result and its probability, mapped to tesseract's
  // certainty range
  char_32 *cube_best_32 = cube_alt_list->Alt(0);
  double cube_prob = CubeUtils::Cost2Prob(cube_alt_list->AltCost(0));
  float cube_certainty = convert_prob_to_tess_certainty(cube_prob);
  string cube_best_str;
  CubeUtils::UTF32ToUTF8(cube_best_32, &cube_best_str);

  // Retrieve Cube's character bounding boxes and CharSamples,
  // corresponding to the most recent call to RecognizeWord().
  Boxa *char_boxes = NULL;
  CharSamp **char_samples = NULL;
  int num_chars = 0;
  // The failure path must be taken whether or not debugging is enabled;
  // only the warning itself depends on cube_debug_level.
  if (!extract_cube_state(cube_obj, &num_chars, &char_boxes, &char_samples)) {
    if (cube_debug_level > 0) {
      tprintf("Cube WARNING (Tesseract::cube_recognize): Cannot extract "
              "cube state.\n");
    }
    word->SetupFake(unicharset);
    return false;
  }

  // Convert cube's character bounding boxes to a BoxWord.
  BoxWord cube_box_word;
  TBOX tess_word_box = word->word->bounding_box();
  if (word->denorm.block() != NULL)
    tess_word_box.rotate(word->denorm.block()->re_rotation());
  bool box_word_success = create_cube_box_word(char_boxes, num_chars,
                                               tess_word_box,
                                               &cube_box_word);
  boxaDestroy(&char_boxes);
  if (!box_word_success) {
    if (cube_debug_level > 0) {
      tprintf("Cube WARNING (Tesseract::cube_recognize): Could not "
              "create cube BoxWord\n");
    }
    // Release the sample array here too; only the array is freed, exactly
    // as on the success path below.
    delete [] char_samples;
    word->SetupFake(unicharset);
    return false;
  }

  // Fill tesseract result's fields with cube results
  fill_werd_res(cube_box_word, cube_best_str.c_str(), word);

  // Create cube's best choice.
  BLOB_CHOICE** choices = new BLOB_CHOICE*[num_chars];
  for (int i = 0; i < num_chars; ++i) {
    UNICHAR_ID uch_id =
        cube_cntxt_->CharacterSet()->UnicharID(char_samples[i]->StrLabel());
    choices[i] = new BLOB_CHOICE(uch_id, 0.0, cube_certainty, -1, -1,
                                 0, 0, 0, 0, BCC_STATIC_CLASSIFIER);
  }
  word->FakeClassifyWord(num_chars, choices);
  // within a word, cube recognizes the word in reading order.
  word->best_choice->set_unichars_in_script_order(true);
  delete [] choices;
  delete [] char_samples;

  // Some sanity checks
  ASSERT_HOST(word->best_choice->length() == word->reject_map.length());

  if (cube_debug_level || classify_debug_level) {
    tprintf("Cube result: %s r=%g, c=%g\n",
            word->best_choice->unichar_string().string(),
            word->best_choice->rating(),
            word->best_choice->certainty());
  }
  return true;
}
CubeObject * tesseract::Tesseract::cube_recognize_word | ( | BLOCK * | block, |
WERD_RES * | word | ||
) |
Definition at line 246 of file cube_control.cpp.
{
  // Creates a CubeObject covering the word's bounding box in cube_binary_
  // and runs cube_recognize() on it. Returns the new CubeObject on success.
  // On failure the word is set up as a fake result (here or inside
  // cube_recognize) and NULL is returned.
  if (cube_binary_ == NULL || cube_cntxt_ == NULL) {
    if (cube_debug_level > 0 && cube_binary_ == NULL)
      tprintf("Tesseract::run_cube(): NULL binary image.\n");
    word->SetupFake(unicharset);
    return NULL;
  }

  TBOX word_box = word->word->bounding_box();
  const bool block_is_rotated =
      block != NULL && (block->re_rotation().x() != 1.0f ||
                        block->re_rotation().y() != 0.0f);
  if (block_is_rotated) {
    // TODO(rays) We have to rotate the bounding box to get the true coords.
    // This will be achieved in the future via DENORM.
    // In the mean time, cube can't process this word.
    if (cube_debug_level > 0) {
      tprintf("Cube can't process rotated word at:");
      word_box.print();
    }
    word->SetupFake(unicharset);
    return NULL;
  }

  // The top coordinate is converted by subtracting it from the image height.
  CubeObject* recognizer = new tesseract::CubeObject(
      cube_cntxt_, cube_binary_,
      word_box.left(), pixGetHeight(cube_binary_) - word_box.top(),
      word_box.width(), word_box.height());
  if (cube_recognize(recognizer, block, word))
    return recognizer;

  delete recognizer;
  return NULL;
}
void tesseract::Tesseract::cube_word_pass1 | ( | BLOCK * | block, |
ROW * | row, | ||
WERD_RES * | word | ||
) |
Definition at line 235 of file cube_control.cpp.
{
  // Runs cube on the word and discards the recognizer object; the result
  // is left in word by cube_recognize_word(). The row argument is not used.
  // cube_recognize_word() may return NULL, which is safe to delete.
  delete cube_recognize_word(block, word);
}
void tesseract::Tesseract::debug_word | ( | PAGE_RES * | page_res, |
const TBOX & | selection_box | ||
) |
debug_word
Process the whole image, but load word_config_ for the selected word(s).
Definition at line 642 of file pgedit.cpp.
{
  // Resets the adaptive classifier, then re-runs recognition over page_res,
  // passing selection_box as the target box and word_config_ as the config
  // to load for it.
  ResetAdaptiveClassifier();
  const char* selected_word_config = word_config_.string();
  recog_all_words(page_res, NULL, &selection_box, selected_word_config, 0);
}
BOOL8 tesseract::Tesseract::digit_or_numeric_punct | ( | WERD_RES * | word, |
int | char_position | ||
) |
Definition at line 345 of file fixspace.cpp.
{
  // Returns true if the unichar at char_position in the word's best choice
  // is a digit, or if the best choice came from the number permuter and the
  // first byte of that unichar is listed in numeric_punctuation.
  int index = 0;
  int byte_offset = 0;
  // Walk the per-unichar byte lengths to find where char_position starts.
  while (index < char_position) {
    byte_offset += word->best_choice->unichar_lengths()[index];
    ++index;
  }

  if (word->uch_set->get_isdigit(
          word->best_choice->unichar_string().string() + byte_offset,
          word->best_choice->unichar_lengths()[index])) {
    return true;
  }
  return word->best_choice->permuter() == NUMBER_PERM &&
         STRING(numeric_punctuation).contains(
             word->best_choice->unichar_string().string()[byte_offset]);
}
void tesseract::Tesseract::do_re_display | ( | BOOL8(tesseract::Tesseract::*)(BLOCK *block, ROW *row, WERD_RES *word_res) | word_painter | ) |
Redisplay page
Definition at line 308 of file pgedit.cpp.
{
  // Clears image_win, optionally redraws the binary image, then calls
  // word_painter on every word of current_page_res. Baselines and block
  // outlines are plotted when the corresponding display flags are set and
  // the iterator has moved to a new row / block.
  PAGE_RES_IT page_it(current_page_res);
  int next_block_number = 1;

  image_win->Clear();
  if (display_image != 0)
    image_win->Image(pix_binary_, 0, 0);

  WERD_RES* current_word = page_it.word();
  while (current_word != NULL) {
    (this->*word_painter)(page_it.block()->block, page_it.row()->row,
                          current_word);
    if (display_baselines && page_it.row() != page_it.prev_row()) {
      page_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
    }
    if (display_blocks && page_it.block() != page_it.prev_block()) {
      page_it.block()->block->plot(image_win, next_block_number++,
                                   ScrollView::RED);
    }
    current_word = page_it.forward();
  }
  image_win->Update();
}
void tesseract::Tesseract::doc_and_block_rejection | ( | PAGE_RES_IT & | page_res_it, |
BOOL8 | good_quality_doc | ||
) |
Definition at line 236 of file docqual.cpp.
// Applies document-, block- and row-level rejection to the page.
//   page_res_it:      iterator over the page results; it is restarted and
//                     advanced by this function.
//   good_quality_doc: when set (and tessedit_row_rej_good_docs is off), words
//                     on a rejected row are only rejected if their rejected
//                     fraction exceeds tessedit_good_doc_still_rowrej_wd.
// Flow:
//   1. If the page's rejected-character percentage exceeds
//      tessedit_reject_doc_percent, reject_whole_page() is called and
//      nothing else is done.
//   2. Otherwise each block is tested against tessedit_reject_block_percent.
//      In a rejected block every word is marked with rej_word_block_rej()
//      unless the tessedit_preserve_blk_rej_perfect_wds /
//      tessedit_dont_blkrej_good_wds options spare it.
//   3. In a block that is not rejected, each row is tested against
//      tessedit_reject_row_percent and tessedit_whole_wd_rej_row_percent;
//      words on a rejected row are marked with rej_word_row_rej(), subject to
//      the corresponding row-level preserve options.
// When tessedit_use_reject_spaces is set, a rejected word that follows a
// rejected word on the same row with a space() of 1 also gets reject_spaces.
// NOTE(review): the page-level test divides by page_res->char_count without
// a zero check (the block and row tests do check char_count > 0); this is a
// floating-point division -- confirm callers never pass an empty page.
{ inT16 block_no = 0; inT16 row_no = 0; BLOCK_RES *current_block; ROW_RES *current_row; BOOL8 rej_word; BOOL8 prev_word_rejected; inT16 char_quality = 0; inT16 accepted_char_quality; if (page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count > tessedit_reject_doc_percent) { reject_whole_page(page_res_it); if (tessedit_debug_doc_rejection) { tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count, page_res_it.page_res->rej_count); } } else { if (tessedit_debug_doc_rejection) { tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", page_res_it.page_res->char_count, page_res_it.page_res->rej_count); } /* Walk blocks testing for block rejection */ page_res_it.restart_page(); WERD_RES* word; while ((word = page_res_it.word()) != NULL) { current_block = page_res_it.block(); block_no = current_block->block->index(); if (current_block->char_count > 0 && (current_block->rej_count * 100.0 / current_block->char_count) > tessedit_reject_block_percent) { if (tessedit_debug_block_rejection) { tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", block_no, current_block->char_count, current_block->rej_count); } prev_word_rejected = FALSE; while ((word = page_res_it.word()) != NULL && (page_res_it.block() == current_block)) { if (tessedit_preserve_blk_rej_perfect_wds) { rej_word = word->reject_map.reject_count() > 0 || word->reject_map.length () < tessedit_preserve_min_wd_len; if (rej_word && tessedit_dont_blkrej_good_wds && word->reject_map.length() >= tessedit_preserve_min_wd_len && acceptable_word_string( *word->uch_set, word->best_choice->unichar_string().string(), word->best_choice->unichar_lengths().string()) != AC_UNACCEPTABLE) { word_char_quality(word, page_res_it.row()->row, &char_quality, &accepted_char_quality); rej_word = char_quality != word->reject_map.length(); } } else { rej_word = TRUE; } if (rej_word) { /* Reject spacing if both current and prev words are rejected. NOTE - this is NOT restricted to FUZZY spaces. 
- When tried this generated more space errors. */ if (tessedit_use_reject_spaces && prev_word_rejected && page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) word->reject_spaces = TRUE; word->reject_map.rej_word_block_rej(); } prev_word_rejected = rej_word; page_res_it.forward(); } } else { if (tessedit_debug_block_rejection) { tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", block_no, page_res_it.block()->char_count, page_res_it.block()->rej_count); } /* Walk rows in block testing for row rejection */ row_no = 0; while ((word = page_res_it.word()) != NULL && page_res_it.block() == current_block) { current_row = page_res_it.row(); row_no++; /* Reject whole row if: fraction of chars on row which are rejected exceed a limit AND fraction rejects which occur in WHOLE WERD rejects is LESS THAN a limit */ if (current_row->char_count > 0 && (current_row->rej_count * 100.0 / current_row->char_count) > tessedit_reject_row_percent && (current_row->whole_word_rej_count * 100.0 / current_row->rej_count) < tessedit_whole_wd_rej_row_percent) { if (tessedit_debug_block_rejection) { tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", row_no, current_row->char_count, current_row->rej_count); } prev_word_rejected = FALSE; while ((word = page_res_it.word()) != NULL && page_res_it.row () == current_row) { /* Preserve words on good docs unless they are mostly rejected*/ if (!tessedit_row_rej_good_docs && good_quality_doc) { rej_word = word->reject_map.reject_count() / static_cast<float>(word->reject_map.length()) > tessedit_good_doc_still_rowrej_wd; } else if (tessedit_preserve_row_rej_perfect_wds) { /* Preserve perfect words anyway */ rej_word = word->reject_map.reject_count() > 0 || word->reject_map.length () < tessedit_preserve_min_wd_len; if (rej_word && tessedit_dont_rowrej_good_wds && word->reject_map.length() >= tessedit_preserve_min_wd_len && acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().string(), 
word->best_choice->unichar_lengths().string()) != AC_UNACCEPTABLE) { word_char_quality(word, page_res_it.row()->row, &char_quality, &accepted_char_quality); rej_word = char_quality != word->reject_map.length(); } } else { rej_word = TRUE; } if (rej_word) { /* Reject spacing if both current and prev words are rejected. NOTE - this is NOT restricted to FUZZY spaces. - When tried this generated more space errors. */ if (tessedit_use_reject_spaces && prev_word_rejected && page_res_it.prev_row() == page_res_it.row() && word->word->space () == 1) word->reject_spaces = TRUE; word->reject_map.rej_word_row_rej(); } prev_word_rejected = rej_word; page_res_it.forward(); } } else { if (tessedit_debug_block_rejection) { tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", row_no, current_row->char_count, current_row->rej_count); } while (page_res_it.word() != NULL && page_res_it.row() == current_row) page_res_it.forward(); } } } } } }
void tesseract::Tesseract::dont_allow_1Il | ( | WERD_RES * | word | ) |
Definition at line 529 of file reject.cpp.
{
  // If the only accepted alphanumeric characters in the word come from the
  // conflict set (conflict_set_I_l_1), reject those characters. As soon as
  // one accepted letter or digit outside the set is seen, the word is left
  // untouched.
  const int char_count = word->reject_map.length();
  const char *text = word->best_choice->unichar_string().string();
  const char *char_lengths = word->best_choice->unichar_lengths().string();
  STRING conflict_chars = STRING(conflict_set_I_l_1);

  // Pass 1: look for accepted conflict characters and accepted others.
  BOOL8 saw_accepted_conflict = FALSE;
  int pos = 0;
  int byte_offset = 0;
  while (pos < char_count) {
    if (word->reject_map[pos].accepted()) {
      if (conflict_chars.contains(text[byte_offset])) {
        saw_accepted_conflict = TRUE;
      } else if (word->uch_set->get_isalpha(text + byte_offset,
                                            char_lengths[pos]) ||
                 word->uch_set->get_isdigit(text + byte_offset,
                                            char_lengths[pos])) {
        return;  // >=1 non 1Il ch accepted
      }
    }
    byte_offset += word->best_choice->unichar_lengths()[pos];
    ++pos;
  }
  if (!saw_accepted_conflict)
    return;  // Nothing to worry about

  // Pass 2: reject every accepted conflict character.
  pos = 0;
  byte_offset = 0;
  while (pos < char_count) {
    if (conflict_chars.contains(text[byte_offset]) &&
        word->reject_map[pos].accepted()) {
      word->reject_map[pos].setrej_postNN_1Il();
    }
    byte_offset += word->best_choice->unichar_lengths()[pos];
    ++pos;
  }
}
void tesseract::Tesseract::dump_words | ( | WERD_RES_LIST & | perm, |
inT16 | score, | ||
inT16 | mode, | ||
BOOL8 | improved | ||
) |
Definition at line 451 of file fixspace.cpp.
{
  // Debug output for the space-fixing search; does nothing unless
  // debug_fix_space_level > 0.
  //   perm:     word list to print.
  //   score:    score printed next to the list.
  //   mode:     1 = extracted, 2 = tested, 3 = returned. Mode 1 also stores
  //             the list text in stats_.dump_words_str.
  //   improved: at debug level 1, print the list only if set.
  if (!(debug_fix_space_level > 0))
    return;

  WERD_RES_IT word_it(&perm);
  if (mode == 1) {
    // Remember the original words for a later "FIX SPACING" report.
    stats_.dump_words_str = "";
    for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
      if (word_it.data()->part_of_combo)
        continue;
      stats_.dump_words_str += word_it.data()->best_choice->unichar_string();
      stats_.dump_words_str += ' ';
    }
  }

#ifndef SECURE_NAMES
  // Print a header, then (if any header condition applied) the word list.
  bool print_word_list = false;
  if (debug_fix_space_level > 1) {
    switch (mode) {
      case 1:
        tprintf("EXTRACTED (%d): \"", score);
        break;
      case 2:
        tprintf("TESTED (%d): \"", score);
        break;
      case 3:
        tprintf("RETURNED (%d): \"", score);
        break;
    }
    print_word_list = true;
  } else if (improved) {
    tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
    print_word_list = true;
  }

  if (print_word_list) {
    for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
      if (word_it.data()->part_of_combo)
        continue;
      tprintf("%s/%1d ",
              word_it.data()->best_choice->unichar_string().string(),
              (int)word_it.data()->best_choice->permuter());
    }
    tprintf("\"\n");
  }
#endif
}
void tesseract::Tesseract::end_tesseract | ( | ) |
Definition at line 464 of file tessedit.cpp.
// Ends recognition by delegating to end_recog(); no other work is done here.
{ end_recog(); }
inT16 tesseract::Tesseract::eval_word_spacing | ( | WERD_RES_LIST & | word_res_list | ) |
Definition at line 241 of file fixspace.cpp.
{
  // Scores a candidate spacing of the words in word_res_list.
  // Returns PERFECT_WERDS if every counted word is "done" according to
  // fixspace_thinks_word_done(); otherwise returns a score built from:
  //   - the length of each done word whose junction with the previous word
  //     is not a suspicious 1/I/l-next-to-digit join,
  //   - +1 for every pair of adjacent '1' characters inside a word,
  //   - +1 for every joined punctuation character when
  //     tessedit_prefer_joined_punct is set.
  // Words flagged part_of_combo are skipped when advancing.
  WERD_RES_IT word_res_it(&word_res_list);
  inT16 total_score = 0;
  inT16 word_count = 0;
  inT16 done_word_count = 0;
  inT16 word_len;
  inT16 i;
  inT16 offset;
  WERD_RES *word;                 // current word
  inT16 prev_word_score = 0;
  BOOL8 prev_word_done = FALSE;
  BOOL8 prev_char_1 = FALSE;      // prev ch a "1/I/l"?
  BOOL8 prev_char_digit = FALSE;  // prev ch 2..9 or 0
  BOOL8 current_char_1 = FALSE;
  BOOL8 current_word_ok_so_far;
  STRING punct_chars = "!\"`',.:;";
  BOOL8 prev_char_punct = FALSE;
  BOOL8 current_char_punct = FALSE;
  BOOL8 word_done = FALSE;

  do {
    word = word_res_it.data();
    word_done = fixspace_thinks_word_done(word);
    word_count++;
    if (word->tess_failed) {
      total_score += prev_word_score;
      if (prev_word_done)
        done_word_count++;
      prev_word_score = 0;
      prev_char_1 = FALSE;
      prev_char_digit = FALSE;
      prev_word_done = FALSE;
    } else {
      /*
        Can we add the prev word score and potentially count this word?
        Yes IF it didnt end in a 1 when the first char of this word is a digit
          AND it didnt end in a digit when the first char of this word is a 1
      */
      word_len = word->reject_map.length();
      current_word_ok_so_far = FALSE;
      if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
            (prev_char_digit && (
                (word_done &&
                 word->best_choice->unichar_lengths().string()[0] == 1 &&
                 word->best_choice->unichar_string()[0] == '1') ||
                (!word_done && STRING(conflict_set_I_l_1).contains(
                      word->best_choice->unichar_string()[0])))))) {
        total_score += prev_word_score;
        if (prev_word_done)
          done_word_count++;
        current_word_ok_so_far = word_done;
      }

      if (current_word_ok_so_far) {
        prev_word_done = TRUE;
        prev_word_score = word_len;
      } else {
        prev_word_done = FALSE;
        prev_word_score = 0;
      }

      /* Add 1 to total score for every joined 1 regardless of context and
         rejtn */
      // word_len counts unichars, while unichar_string() is indexed in
      // bytes, so step through the string by each unichar's byte length
      // (as the punctuation loop below does) instead of indexing it by i.
      for (i = 0, offset = 0, prev_char_1 = FALSE; i < word_len;
           offset += word->best_choice->unichar_lengths()[i++]) {
        current_char_1 = word->best_choice->unichar_string()[offset] == '1';
        if (prev_char_1 || (current_char_1 && (i > 0)))
          total_score++;
        prev_char_1 = current_char_1;
      }

      /* Add 1 to total score for every joined punctuation regardless of
         context and rejtn */
      if (tessedit_prefer_joined_punct) {
        for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
             offset += word->best_choice->unichar_lengths()[i++]) {
          current_char_punct =
              punct_chars.contains(word->best_choice->unichar_string()[offset]);
          if (prev_char_punct || (current_char_punct && i > 0))
            total_score++;
          prev_char_punct = current_char_punct;
        }
      }
      prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
      // Find the byte offset of the last unichar of the word.
      for (i = 0, offset = 0; i < word_len - 1;
           offset += word->best_choice->unichar_lengths()[i++]);
      prev_char_1 =
          ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
           || (!word_done && STRING(conflict_set_I_l_1).contains(
                   word->best_choice->unichar_string()[offset])));
    }
    /* Find next word */
    do {
      word_res_it.forward();
    } while (word_res_it.data()->part_of_combo);
  } while (!word_res_it.at_first());

  // Account for the final word, whose score was still pending.
  total_score += prev_word_score;
  if (prev_word_done)
    done_word_count++;
  if (done_word_count == word_count)
    return PERFECT_WERDS;
  else
    return total_score;
}
void tesseract::Tesseract::ExplodeRepeatedWord | ( | BLOB_CHOICE * | best_choice, |
PAGE_RES_IT * | page_res_it | ||
) |
Definition at line 1204 of file control.cpp.
{
  // Replaces the current word of page_res_it by one single-blob word per
  // blob of the original, each classified with a copy of best_choice, and
  // then deletes the original word.
  WERD_RES *source_res = page_res_it->word();
  ASSERT_HOST(best_choice != NULL);

  WERD* source_word = source_res->word;
  C_BLOB_IT blob_it(source_word->cblob_list());
  // Each pass extracts the current blob, so the list shrinks until empty.
  while (!blob_it.empty()) {
    const bool is_first = blob_it.at_first();
    const bool is_last = blob_it.at_last();
    WERD* single_blob_word = source_word->ConstructFromSingleBlob(
        is_first, is_last, blob_it.extract());
    // Note that blamer_bundle (truth information) is not copied, which is
    // desirable, since the newly inserted words would not have the original
    // bounding box corresponding to the one recorded in truth fields.
    WERD_RES* new_res =
        page_res_it->InsertSimpleCloneWord(*source_res, single_blob_word);
    // Setup the single char WERD_RES
    const bool setup_ok = new_res->SetupForRecognition(
        *source_res->uch_set, this, BestPix(), tessedit_ocr_engine_mode, NULL,
        false, textord_use_cjk_fp_model, poly_allow_detailed_fx,
        page_res_it->row()->row, page_res_it->block()->block);
    if (setup_ok) {
      new_res->CloneChoppedToRebuild();
      BLOB_CHOICE* choice_copy = new BLOB_CHOICE(*best_choice);
      new_res->FakeClassifyWord(1, &choice_copy);
    }
    blob_it.forward();
  }
  page_res_it->DeleteCurrentWord();
}
bool tesseract::Tesseract::extract_cube_state | ( | CubeObject * | cube_obj, |
int * | num_chars, | ||
Boxa ** | char_boxes, | ||
CharSamp *** | char_samples | ||
) |
Definition at line 65 of file cube_control.cpp.
{
  // Retrieves the per-character samples and bounding boxes of cube's best
  // path by backtracking through the beam search.
  //   cube_obj:     recognizer to query.
  //   num_chars:    output, filled in by BackTrack().
  //   char_boxes:   output, filled in by BackTrack().
  //   char_samples: output, the array returned by BackTrack().
  // Returns false if cube_obj, its search object or its beam search object
  // is missing, or if BackTrack() returns NULL; true otherwise.
  const bool warn = cube_debug_level > 0;

  if (!cube_obj) {
    if (warn) {
      tprintf("Cube WARNING (extract_cube_state): Invalid cube object "
              "passed to extract_cube_state\n");
    }
    return false;
  }

  // Note that the CubeObject accessors return either the deslanted or
  // regular objects search object or beam search object, whichever
  // was used in the last call to Recognize()
  CubeSearchObject* search = cube_obj->SrchObj();
  if (!search) {
    if (warn) {
      tprintf("Cube WARNING (Extract_cube_state): Could not retrieve "
              "cube's search object in extract_cube_state.\n");
    }
    return false;
  }

  BeamSearch *beam = cube_obj->BeamObj();
  if (!beam) {
    if (warn) {
      tprintf("Cube WARNING (Extract_cube_state): Could not retrieve "
              "cube's beam search object in extract_cube_state.\n");
    }
    return false;
  }

  // Backtrack from the best node to collect samples and boxes.
  int best_node = beam->BestPresortedNodeIndex();
  *char_samples = beam->BackTrack(search, best_node, num_chars, NULL,
                                  char_boxes);
  return *char_samples != NULL;
}
inT16 tesseract::Tesseract::failure_count | ( | WERD_RES * | word | ) |
Definition at line 970 of file docqual.cpp.
{
  // Returns the number of space characters in the word's best-choice string.
  int space_total = 0;
  const char *cursor = word->best_choice->unichar_string().string();
  while (*cursor != '\0') {
    if (*cursor == ' ')
      ++space_total;
    ++cursor;
  }
  return space_total;
}
void tesseract::Tesseract::fill_werd_res | ( | const BoxWord & | cube_box_word, |
const char * | cube_best_str, | ||
WERD_RES * | tess_werd_res | ||
) |
Definition at line 413 of file cube_control.cpp.
{
  // Copies cube's result into a tesseract WERD_RES:
  //   cube_box_word: character boxes; a copy, clipped to the original word,
  //                  replaces tess_werd_res->box_word.
  //   cube_best_str: text stored on the underlying WERD.
  //   tess_werd_res: result to fill; its acceptance flags are derived from
  //                  tess_acceptable_word().
  WERD* target_word = tess_werd_res->word;

  // Replace the old boxes with a clipped copy of cube's boxes.
  delete tess_werd_res->box_word;
  tess_werd_res->box_word = new BoxWord(cube_box_word);
  tess_werd_res->box_word->ClipToOriginalWord(tess_werd_res->denorm.block(),
                                              target_word);

  // Fill text and remaining fields
  target_word->set_text(cube_best_str);
  tess_werd_res->tess_failed = FALSE;
  tess_werd_res->tess_accepted = tess_acceptable_word(tess_werd_res);

  // There is no output word, so we can't call AdaptableWord, but then I
  // don't think we need to. Fudge the result with accepted.
  tess_werd_res->tess_would_adapt = tess_werd_res->tess_accepted;

  // Set word to done, i.e., ignore all of tesseract's tests for rejection
  tess_werd_res->done = tess_werd_res->tess_accepted;
}
bool tesseract::Tesseract::FindSegmentation | ( | const GenericVector< UNICHAR_ID > & | target_text, |
WERD_RES * | word_res | ||
) |
Definition at line 545 of file applybox.cpp.
{
  // Searches for a grouping of the word's blobs whose classifications spell
  // target_text exactly. On success word_res->best_state holds the grouping
  // and word_res->correct_text the target characters, and true is returned.
  // If no match is found, the original segmentation implied by seam_array
  // is used when it has the same length as target_text; otherwise
  // best_state is cleared and false is returned.

  // Classify every run of 1..kMaxGroupSize consecutive blobs.
  const int num_blobs = word_res->box_word->length();
  GenericVector<BLOB_CHOICE_LIST*>* piece_choices =
      new GenericVector<BLOB_CHOICE_LIST*>[num_blobs];
  for (int first = 0; first < num_blobs; ++first) {
    for (int span = 1; span <= kMaxGroupSize && first + span <= num_blobs;
         ++span) {
      BLOB_CHOICE_LIST* ratings = classify_piece(
          word_res->seam_array, first, first + span - 1, "Applybox",
          word_res->chopped_word, word_res->blamer_bundle);
      if (applybox_debug > 2) {
        tprintf("%d+%d:", first, span);
        print_ratings_list("Segment:", ratings, unicharset);
      }
      piece_choices[first].push_back(ratings);
    }
  }

  // Search the segmentation graph for the target text. Must be an exact
  // match. Using wildcards makes it difficult to find the correct
  // segmentation even when it is there.
  word_res->best_state.clear();
  GenericVector<int> search_segmentation;
  float best_rating = 0.0f;
  SearchForText(piece_choices, 0, num_blobs, target_text, 0, 0.0f,
                &search_segmentation, &best_rating, &word_res->best_state);
  for (int first = 0; first < num_blobs; ++first)
    piece_choices[first].delete_data_pointers();
  delete [] piece_choices;

  if (word_res->best_state.empty()) {
    // Build the original segmentation and if it is the same length as the
    // truth, assume it will do.
    int run_length = 1;
    for (int index = 0; index < word_res->seam_array.size(); ++index) {
      SEAM* seam = word_res->seam_array[index];
      if (seam->split1 != NULL) {
        ++run_length;
        continue;
      }
      word_res->best_state.push_back(run_length);
      run_length = 1;
    }
    word_res->best_state.push_back(run_length);
    if (word_res->best_state.size() != target_text.size()) {
      word_res->best_state.clear();  // No good. Original segmentation bad size.
      return false;
    }
  }

  // Record the truth text, one string per target unichar.
  word_res->correct_text.clear();
  for (int index = 0; index < target_text.size(); ++index) {
    word_res->correct_text.push_back(
        STRING(unicharset.id_to_unichar(target_text[index])));
  }
  return true;
}
inT16 tesseract::Tesseract::first_alphanum_index | ( | const char * | word, |
const char * | word_lengths | ||
) |
Definition at line 472 of file reject.cpp.
{
  // Returns the unichar index of the first letter or digit in word, or -1 if
  // there is none. word_lengths[k] is the byte length of the k-th unichar.
  inT16 char_index = 0;
  inT16 byte_offset = 0;
  while (word[byte_offset] != '\0') {
    if (unicharset.get_isalpha(word + byte_offset, word_lengths[char_index]) ||
        unicharset.get_isdigit(word + byte_offset, word_lengths[char_index])) {
      return char_index;
    }
    byte_offset += word_lengths[char_index];
    ++char_index;
  }
  return -1;
}
inT16 tesseract::Tesseract::first_alphanum_offset | ( | const char * | word, |
const char * | word_lengths | ||
) |
Definition at line 485 of file reject.cpp.
{
  // Returns the byte offset of the first letter or digit in word, or -1 if
  // there is none. word_lengths[k] is the byte length of the k-th unichar.
  inT16 char_index = 0;
  inT16 byte_offset = 0;
  while (word[byte_offset] != '\0') {
    if (unicharset.get_isalpha(word + byte_offset, word_lengths[char_index]) ||
        unicharset.get_isdigit(word + byte_offset, word_lengths[char_index])) {
      return byte_offset;
    }
    byte_offset += word_lengths[char_index];
    ++char_index;
  }
  return -1;
}
void tesseract::Tesseract::fix_fuzzy_space_list | ( | WERD_RES_LIST & | best_perm, |
ROW * | row, | ||
BLOCK * | block | ||
) |
Definition at line 146 of file fixspace.cpp.
{
  // Searches word permutations for a better-scoring spacing of best_perm.
  //   best_perm: in/out; replaced by a deep copy of the best permutation
  //              found whenever one scores higher than the current best.
  //   row/block: passed through to match_current_words().
  // Scores come from eval_word_spacing(); the search stops when a
  // permutation scores PERFECT_WERDS or the working list becomes empty.
  inT16 best_score;
  WERD_RES_LIST current_perm;
  inT16 current_score;
  BOOL8 improved = FALSE;

  best_score = eval_word_spacing(best_perm);  // default score
  dump_words(best_perm, best_score, 1, improved);

  if (best_score != PERFECT_WERDS)
    initialise_search(best_perm, current_perm);

  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
    match_current_words(current_perm, row, block);
    current_score = eval_word_spacing(current_perm);
    dump_words(current_perm, current_score, 2, improved);
    if (current_score > best_score) {
      best_perm.clear();
      // Address-of current_perm: the extracted text had "&curren" decoded
      // as an HTML entity, leaving the invalid token "¤t_perm".
      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
      best_score = current_score;
      improved = TRUE;
    }
    if (current_score < PERFECT_WERDS)
      transform_to_next_perm(current_perm);
  }
  dump_words(best_perm, best_score, 3, improved);
}
void tesseract::Tesseract::fix_fuzzy_spaces | ( | ETEXT_DESC * | monitor, |
inT32 | word_count, | ||
PAGE_RES * | page_res | ||
) |
Definition at line 49 of file fixspace.cpp.
// Walks every block and row of page_res and repairs word spacing.
//   monitor:    optional progress/cancel structure; when non-NULL its
//               ocr_alive and progress fields are updated per word and the
//               function returns early if the deadline is exceeded or the
//               cancel callback returns true.
//   word_count: total word count, used only as the progress denominator.
//   page_res:   page results whose word lists are modified in place.
// Per row:
//   - A word that is not a combination and is not followed by a word
//     flagged W_FUZZY_NON or W_FUZZY_SP is passed to fix_sp_fp_word().
//   - Otherwise the run of words joined by such fuzzy gaps is moved into a
//     sublist and passed to fix_fuzzy_space_list(), then spliced back. The
//     run is skipped if any word in it has an empty cblob list.
//   - The last word of the row is passed to fix_sp_fp_word().
// NOTE(review): progress is computed as 90 + 5 * word_index / word_count
// using integer division; this assumes word_count is non-zero whenever
// monitor is non-NULL -- confirm against the caller.
{ BLOCK_RES_IT block_res_it; ROW_RES_IT row_res_it; WERD_RES_IT word_res_it_from; WERD_RES_IT word_res_it_to; WERD_RES *word_res; WERD_RES_LIST fuzzy_space_words; inT16 new_length; BOOL8 prevent_null_wd_fixsp; // DONT process blobless wds inT32 word_index; // current word block_res_it.set_to_list(&page_res->block_res_list); word_index = 0; for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) { row_res_it.set_to_list(&block_res_it.data()->row_res_list); for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) { word_res_it_from.set_to_list(&row_res_it.data()->word_res_list); while (!word_res_it_from.at_last()) { word_res = word_res_it_from.data(); while (!word_res_it_from.at_last() && !(word_res->combination || word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) || word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) { fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block); word_res = word_res_it_from.forward(); word_index++; if (monitor != NULL) { monitor->ocr_alive = TRUE; monitor->progress = 90 + 5 * word_index / word_count; if (monitor->deadline_exceeded() || (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) return; } } if (!word_res_it_from.at_last()) { word_res_it_to = word_res_it_from; prevent_null_wd_fixsp = word_res->word->cblob_list()->empty(); if (check_debug_pt(word_res, 60)) debug_fix_space_level.set_value(10); word_res_it_to.forward(); word_index++; if (monitor != NULL) { monitor->ocr_alive = TRUE; monitor->progress = 90 + 5 * word_index / word_count; if (monitor->deadline_exceeded() || (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) return; } while (!word_res_it_to.at_last () && (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) || word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) { if (check_debug_pt(word_res, 60)) debug_fix_space_level.set_value(10); 
if (word_res->word->cblob_list()->empty()) prevent_null_wd_fixsp = TRUE; word_res = word_res_it_to.forward(); } if (check_debug_pt(word_res, 60)) debug_fix_space_level.set_value(10); if (word_res->word->cblob_list()->empty()) prevent_null_wd_fixsp = TRUE; if (prevent_null_wd_fixsp) { word_res_it_from = word_res_it_to; } else { fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to); fix_fuzzy_space_list(fuzzy_space_words, row_res_it.data()->row, block_res_it.data()->block); new_length = fuzzy_space_words.length(); word_res_it_from.add_list_before(&fuzzy_space_words); for (; !word_res_it_from.at_last() && new_length > 0; new_length--) { word_res_it_from.forward(); } } if (test_pt) debug_fix_space_level.set_value(0); } fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block); // Last word in row } } } }
void tesseract::Tesseract::fix_noisy_space_list | ( | WERD_RES_LIST & | best_perm, |
ROW * | row, | ||
BLOCK * | block | ||
) |
Definition at line 573 of file fixspace.cpp.
{
  // Repeatedly breaks the noisiest blob out of a copy of the word in
  // best_perm and keeps the highest-scoring result.
  //   best_perm: in/out; replaced by a deep copy of the best permutation
  //              found whenever one scores higher than the current best.
  //   row/block: passed through to match_current_words().
  // Scores come from fp_eval_word_spacing(); the search stops when a
  // permutation scores PERFECT_WERDS or the working list becomes empty.
  inT16 best_score;
  WERD_RES_IT best_perm_it(&best_perm);
  WERD_RES_LIST current_perm;
  // Address-of current_perm: the extracted text had "&curren" decoded as an
  // HTML entity, leaving the invalid token "¤t_perm".
  WERD_RES_IT current_perm_it(&current_perm);
  WERD_RES *old_word_res;
  inT16 current_score;
  BOOL8 improved = FALSE;

  best_score = fp_eval_word_spacing(best_perm);  // default score

  dump_words(best_perm, best_score, 1, improved);

  old_word_res = best_perm_it.data();
  // Even deep_copy doesn't copy the underlying WERD unless its combination
  // flag is true!.
  old_word_res->combination = TRUE;   // Kludge to force deep copy
  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
  old_word_res->combination = FALSE;  // Undo kludge

  break_noisiest_blob_word(current_perm);

  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
    match_current_words(current_perm, row, block);
    current_score = fp_eval_word_spacing(current_perm);
    dump_words(current_perm, current_score, 2, improved);
    if (current_score > best_score) {
      best_perm.clear();
      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
      best_score = current_score;
      improved = TRUE;
    }
    if (current_score < PERFECT_WERDS) {
      break_noisiest_blob_word(current_perm);
    }
  }
  dump_words(best_perm, best_score, 3, improved);
}
void tesseract::Tesseract::fix_rep_char | ( | PAGE_RES_IT * | page_res_it | ) |
fix_rep_char(): The word is a run of one repeated character (a leader). Find the repeated character, create the appropriate single-word or multi-word sequence according to the size of the spaces between blobs, and correct the classifications where some of the characters disagree with the majority.
Definition at line 1152 of file control.cpp.
{
  // Handles a repeated-character word: finds the most frequent non-space
  // character, then either explodes the word into one word per blob (when
  // the blob gaps are large relative to the x-height) or corrects the
  // existing classifications to that character.
  WERD_RES *word_res = page_res_it->word();
  const WERD_CHOICE &word = *(word_res->best_choice);

  // Find the frequency of each unique character in the word.
  UNICHAR_ID space_id = word_res->uch_set->unichar_to_id(" ");
  SortHelper<UNICHAR_ID> char_counts(word.length());
  for (int pos = 0; pos < word.length(); ++pos) {
    if (word.unichar_id(pos) == space_id)
      continue;
    char_counts.Add(word.unichar_id(pos), 1);
  }

  // Find the most frequent result.
  UNICHAR_ID most_common_id = INVALID_UNICHAR_ID;
  int most_common_count = char_counts.MaxCount(&most_common_id);

  // Find the best exemplar of a classifier result for most_common_id.
  BLOB_CHOICE* best_choice = FindBestMatchingChoice(most_common_id, word_res);
  if (best_choice == NULL) {
    tprintf("Failed to find a choice for %s, occurring %d times\n",
            word_res->uch_set->debug_str(most_common_id).string(),
            most_common_count);
    return;
  }
  word_res->done = TRUE;

  // Measure the mean space.
  int gap_sum = 0;
  int num_gaps = 0;
  WERD* werd = word_res->word;
  C_BLOB_IT blob_it(werd->cblob_list());
  C_BLOB* left_blob = blob_it.data();
  blob_it.forward();
  while (!blob_it.at_first()) {
    C_BLOB* right_blob = blob_it.data();
    gap_sum += right_blob->bounding_box().left() -
               left_blob->bounding_box().right();
    ++num_gaps;
    left_blob = right_blob;
    blob_it.forward();
  }

  const bool wide_gaps =
      gap_sum > word_res->x_height * num_gaps * kRepcharGapThreshold;
  if (wide_gaps) {
    // Needs spaces between.
    ExplodeRepeatedWord(best_choice, page_res_it);
  } else {
    // Just correct existing classification.
    CorrectRepcharChoices(best_choice, word_res);
    word_res->reject_map.initialise(word.length());
  }
}
void tesseract::Tesseract::fix_sp_fp_word | ( | WERD_RES_IT & | word_res_it, |
ROW * | row, | ||
BLOCK * | block | ||
) |
Definition at line 539 of file fixspace.cpp.
{
  // Tries to improve a single fixed-pitch word by handing it, on its own
  // list, to fix_noisy_space_list() and splicing the result back in place.
  WERD_RES *current = word_res_it.data();

  // Only words flagged W_DONT_CHOP that are neither repeated-char words nor
  // (part of) a combination are processed.
  if (current->word->flag(W_REP_CHAR) ||
      current->combination ||
      current->part_of_combo ||
      !current->word->flag(W_DONT_CHOP)) {
    return;
  }

  float unused_score;
  inT16 noisy_blob = worst_noise_blob(current, &unused_score);
  if (noisy_blob < 0) {
    return;
  }

  if (debug_fix_space_level > 1) {
    tprintf("FP fixspace working on \"%s\"\n",
            current->best_choice->unichar_string().string());
  }
  current->word->rej_cblob_list()->sort(c_blob_comparator);

  // Move the word onto a private list, fix it there, then put the resulting
  // word(s) back before the iterator's current position.
  WERD_RES_LIST pieces;
  WERD_RES_IT pieces_it(&pieces);
  pieces_it.add_after_stay_put(word_res_it.extract());
  fix_noisy_space_list(pieces, row, block);
  inT16 remaining = pieces.length();
  word_res_it.add_list_before(&pieces);

  // Step forward once per extra piece, stopping early at the end of the list.
  while (!word_res_it.at_last() && remaining > 1) {
    word_res_it.forward();
    remaining--;
  }
}
Definition at line 507 of file fixspace.cpp.
{
  // A word already marked done needs no further checks.
  if (word->done)
    return TRUE;

  /*
    Use all the standard pass 2 conditions for mode 5 in set_done() in
    reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT
    CARE WHETHER WE HAVE of/at on/an etc.
  */
  if (!(fixsp_done_mode > 0))
    return FALSE;

  // Acceptance test selected by fixsp_done_mode.
  if (!(word->tess_accepted ||
        (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
        fixsp_done_mode == 3))
    return FALSE;

  // A space in the result string disqualifies the word.
  if (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)
    return FALSE;

  // Finally the choice must have come from one of these permuters.
  if (word->best_choice->permuter() == SYSTEM_DAWG_PERM)
    return TRUE;
  if (word->best_choice->permuter() == FREQ_DAWG_PERM)
    return TRUE;
  if (word->best_choice->permuter() == USER_DAWG_PERM)
    return TRUE;
  if (word->best_choice->permuter() == NUMBER_PERM)
    return TRUE;
  return FALSE;
}
void tesseract::Tesseract::flip_0O | ( | WERD_RES * | word | ) |
Definition at line 676 of file reject.cpp.
{
  // Rewrites ambiguous '0' / 'O' characters in word_res->best_choice based on
  // their neighbours (upper-case letters favour 'O', digits favour '0').
  // Does nothing when tessedit_flip_0O is off, when any upper-case/digit blob
  // fails the vertical-extent check below, or when either character is
  // missing from or disabled in the unicharset.
  // NOTE(review): the body names its parameter word_res although the
  // signature row above shows it as `word` - confirm against the definition.
  WERD_CHOICE *best_choice = word_res->best_choice;
  int i;
  TBOX out_box;

  if (!tessedit_flip_0O)
    return;

  int num_blobs = word_res->rebuild_word->NumBlobs();
  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
    TBLOB* blob = word_res->rebuild_word->blobs[i];
    if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
        word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
      out_box = blob->bounding_box();
      if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
          (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
        return;  // Beware words with sub/superscripts
    }
  }
  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
  if (unichar_0 == INVALID_UNICHAR_ID ||
      !word_res->uch_set->get_enabled(unichar_0) ||
      unichar_O == INVALID_UNICHAR_ID ||
      !word_res->uch_set->get_enabled(unichar_O)) {
    return;  // 0 or O are not present/enabled in unicharset
  }
  // Starts at 1 because every rule below inspects the previous character.
  for (i = 1; i < best_choice->length(); ++i) {
    if (best_choice->unichar_id(i) == unichar_0 ||
        best_choice->unichar_id(i) == unichar_O) {
      /* A0A */
      if ((i+1) < best_choice->length() &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
        best_choice->set_unichar_id(unichar_O, i);
      }
      /* A00A */
      if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+1) < best_choice->length() &&
          (best_choice->unichar_id(i+1) == unichar_0 ||
           best_choice->unichar_id(i+1) == unichar_O) &&
          (i+2) < best_choice->length() &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
        best_choice->set_unichar_id(unichar_O, i);
        i++;
      }
      /* AA0<non digit or end of word> */
      if ((i > 1) &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (((i+1) < best_choice->length() &&
            !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
            !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
            !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
           (i == best_choice->length() - 1))) {
        best_choice->set_unichar_id(unichar_O, i);
      }
      /* 9O9 */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+1) < best_choice->length() &&
          non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
        best_choice->set_unichar_id(unichar_0, i);
      }
      /* 9OOO */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+2) < best_choice->length() &&
          (best_choice->unichar_id(i+1) == unichar_0 ||
           best_choice->unichar_id(i+1) == unichar_O) &&
          (best_choice->unichar_id(i+2) == unichar_0 ||
           best_choice->unichar_id(i+2) == unichar_O)) {
        best_choice->set_unichar_id(unichar_0, i);
        best_choice->set_unichar_id(unichar_0, i+1);
        best_choice->set_unichar_id(unichar_0, i+2);
        i += 2;
      }
      /* 9OO<non upper> */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+2) < best_choice->length() &&
          (best_choice->unichar_id(i+1) == unichar_0 ||
           best_choice->unichar_id(i+1) == unichar_O) &&
          !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
        best_choice->set_unichar_id(unichar_0, i);
        best_choice->set_unichar_id(unichar_0, i+1);
        i++;
      }
      /* 9O<non upper> */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+1) < best_choice->length() &&
          !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
        best_choice->set_unichar_id(unichar_0, i);
      }
      /* 9[.,]OOO.. */
      if ((i > 1) &&
          (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
           word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
          (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
           best_choice->unichar_id(i-2) == unichar_O)) {
        if (best_choice->unichar_id(i-2) == unichar_O) {
          best_choice->set_unichar_id(unichar_0, i-2);
        }
        // Convert the whole run of 0/O that follows the separator.
        while (i < best_choice->length() &&
               (best_choice->unichar_id(i) == unichar_O ||
                best_choice->unichar_id(i) == unichar_0)) {
          best_choice->set_unichar_id(unichar_0, i);
          i++;
        }
        i--;  // Compensate for the ++i of the enclosing for loop.
      }
    }
  }
}
void tesseract::Tesseract::flip_hyphens | ( | WERD_RES * | word | ) |
Definition at line 619 of file reject.cpp.
{
  // Uses the width/height ratio of isolated, sufficiently wide blobs to
  // convert "." results into "-" and to adjust the reject map entries of
  // "." and "-" results. Disabled when tessedit_lower_flip_hyphen <= 1.
  // NOTE(review): the body names its parameter word_res although the
  // signature row above shows it as `word` - confirm against the definition.
  WERD_CHOICE *best_choice = word_res->best_choice;
  int i;
  int prev_right = -9999;
  int next_left;
  TBOX out_box;
  float aspect_ratio;

  if (tessedit_lower_flip_hyphen <= 1)
    return;

  int num_blobs = word_res->rebuild_word->NumBlobs();
  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
    TBLOB* blob = word_res->rebuild_word->blobs[i];
    out_box = blob->bounding_box();
    if (i + 1 == num_blobs)
      next_left = 9999;
    else
      next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
    // Dont touch small or touching blobs - it is too dangerous.
    if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
        (out_box.left() > prev_right) && (out_box.right() < next_left)) {
      aspect_ratio = out_box.width() / (float) out_box.height();
      if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
        if (aspect_ratio >= tessedit_upper_flip_hyphen &&
            word_res->uch_set->contains_unichar_id(unichar_dash) &&
            word_res->uch_set->get_enabled(unichar_dash)) {
          /* Certain HYPHEN */
          best_choice->set_unichar_id(unichar_dash, i);
          if (word_res->reject_map[i].rejected())
            word_res->reject_map[i].setrej_hyphen_accept();
        }
        if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
            word_res->reject_map[i].accepted())
          // Suspected HYPHEN
          word_res->reject_map[i].setrej_hyphen ();
      } else if (best_choice->unichar_id(i) == unichar_dash) {
        if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
            (word_res->reject_map[i].rejected()))
          // Certain HYPHEN
          word_res->reject_map[i].setrej_hyphen_accept();

        if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
            (word_res->reject_map[i].accepted()))
          // Suspected HYPHEN
          word_res->reject_map[i].setrej_hyphen();
      }
    }
    prev_right = out_box.right();
  }
}
void tesseract::Tesseract::font_recognition_pass | ( | PAGE_RES * | page_res | ) |
font_recognition_pass
Smooth the fonts for the document.
Definition at line 1540 of file control.cpp.
{
  // Three passes over the page: (1) accumulate per-font counts from every
  // word's fontinfo / fontinfo2, (2) locate a FontInfo pointer for the modal
  // font, (3) overwrite the font of words whose own font evidence is weak.
  PAGE_RES_IT page_res_it(page_res);
  WERD_RES *word;                          // current word
  STATS doc_fonts(0, font_table_size_);    // font counters

  // Gather font id statistics.
  for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
    word = page_res_it.word();
    if (word->fontinfo != NULL) {
      doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
    }
    if (word->fontinfo2 != NULL) {
      doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
    }
  }
  inT16 doc_font;                 // modal font
  inT8 doc_font_count;            // modal font
  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
  if (doc_font_count == 0)
    return;
  // Get the modal font pointer.
  const FontInfo* modal_font = NULL;
  for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
    word = page_res_it.word();
    if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) {
      modal_font = word->fontinfo;
      break;
    }
    if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) {
      modal_font = word->fontinfo2;
      break;
    }
  }
  ASSERT_HOST(modal_font != NULL);

  // Assign modal font to weak words.
  for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
    word = page_res_it.word();
    int length = word->best_choice->length();

    // 1st choices got 2 pts, so we need to halve the score for the mode.
    int count = (word->fontinfo_id_count + 1) / 2;
    // A word keeps its own font only if every character agrees, or (for
    // words longer than 3) at least three quarters of them do.
    if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
      word->fontinfo = modal_font;
      // Counts only get 1 as it came from the doc.
      word->fontinfo_id_count = 1;
      word->italic = modal_font->is_italic() ? 1 : -1;
      word->bold = modal_font->is_bold() ? 1 : -1;
    }
  }
}
inT16 tesseract::Tesseract::fp_eval_word_spacing | ( | WERD_RES_LIST & | word_res_list | ) |
Definition at line 834 of file fixspace.cpp.
{
  // Scores a list of words for fixed-pitch spacing repair. Only words that
  // are done, tess_accepted, from a system/freq/user dawg permuter, or pass
  // safe_dict_word() contribute: each accepted character adds 1, each space
  // or small noisy blob subtracts 1. The result is clamped at 0.
  WERD_RES_IT word_it(&word_res_list);
  WERD_RES *word;
  inT16 score = 0;
  inT16 i;
  float small_limit = kBlnXHeight * fixsp_small_outlines_size;

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    word = word_it.data();
    if (word->rebuild_word == NULL)
      continue;  // Can't handle cube words.
    if (word->done ||
        word->tess_accepted ||
        word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
        word->best_choice->permuter() == FREQ_DAWG_PERM ||
        word->best_choice->permuter() == USER_DAWG_PERM ||
        safe_dict_word(word) > 0) {
      int num_blobs = word->rebuild_word->NumBlobs();
      UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
      for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
        TBLOB* blob = word->rebuild_word->blobs[i];
        if (word->best_choice->unichar_id(i) == space ||
            blob_noise_score(blob) < small_limit) {
          score -= 1;  // penalise possibly erroneous non-space
        } else if (word->reject_map[i].accepted()) {
          score++;
        }
      }
    }
  }
  if (score < 0)
    score = 0;
  return score;
}
GARBAGE_LEVEL tesseract::Tesseract::garbage_word | ( | WERD_RES * | word, |
BOOL8 | ok_dict_word | ||
) |
Definition at line 684 of file docqual.cpp.
{
  // Classifies a word as G_NEVER_CRUNCH, G_OK, G_DODGY or G_TERRIBLE from the
  // character make-up of its best_choice string: runs of upper/lower case,
  // isolated letters and digits, spaces (counted as tess rejects) and other
  // characters (counted as bad).
  enum STATES
  {
    JUNK,
    FIRST_UPPER,
    FIRST_LOWER,
    FIRST_NUM,
    SUBSEQUENT_UPPER,
    SUBSEQUENT_LOWER,
    SUBSEQUENT_NUM
  };
  // str / lengths always point at the START of the word. The scan below uses
  // its own cursors so that the whole-word checks after the loop
  // (acceptable_word_string, strpbrk) see the full string rather than the
  // terminating '\0' the scan stops on.
  const char *str = word->best_choice->unichar_string().string();
  const char *lengths = word->best_choice->unichar_lengths().string();
  STATES state = JUNK;
  int len = 0;
  int isolated_digits = 0;
  int isolated_alphas = 0;
  int bad_char_count = 0;
  int tess_rejs = 0;
  int dodgy_chars = 0;
  int ok_chars;
  UNICHAR_ID last_char = -1;
  int alpha_repetition_count = 0;
  int longest_alpha_repetition_count = 0;
  int longest_lower_run_len = 0;
  int lower_string_count = 0;
  int longest_upper_run_len = 0;
  int upper_string_count = 0;
  int total_alpha_count = 0;
  int total_digit_count = 0;

  const char *ch = str;          // current unichar within the word
  const char *ch_len = lengths;  // byte length of the current unichar
  for (; *ch != '\0'; ch += *(ch_len++)) {
    len++;
    if (word->uch_set->get_isupper(ch, *ch_len)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_UPPER:
        case FIRST_UPPER:
          state = SUBSEQUENT_UPPER;
          upper_string_count++;
          if (longest_upper_run_len < upper_string_count)
            longest_upper_run_len = upper_string_count;
          if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
            alpha_repetition_count++;
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
          } else {
            last_char = word->uch_set->unichar_to_id(ch, *ch_len);
            alpha_repetition_count = 1;
          }
          break;
        case FIRST_NUM:
          isolated_digits++;
          // fall through
        default:
          state = FIRST_UPPER;
          last_char = word->uch_set->unichar_to_id(ch, *ch_len);
          alpha_repetition_count = 1;
          upper_string_count = 1;
          break;
      }
    } else if (word->uch_set->get_islower(ch, *ch_len)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_LOWER:
        case FIRST_LOWER:
          state = SUBSEQUENT_LOWER;
          lower_string_count++;
          if (longest_lower_run_len < lower_string_count)
            longest_lower_run_len = lower_string_count;
          if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
            alpha_repetition_count++;
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
          } else {
            last_char = word->uch_set->unichar_to_id(ch, *ch_len);
            alpha_repetition_count = 1;
          }
          break;
        case FIRST_NUM:
          isolated_digits++;
          // fall through
        default:
          state = FIRST_LOWER;
          last_char = word->uch_set->unichar_to_id(ch, *ch_len);
          alpha_repetition_count = 1;
          lower_string_count = 1;
          break;
      }
    } else if (word->uch_set->get_isdigit(ch, *ch_len)) {
      total_digit_count++;
      switch (state) {
        case FIRST_NUM:
          state = SUBSEQUENT_NUM;
          // fall through
        case SUBSEQUENT_NUM:
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          isolated_alphas++;
          // fall through
        default:
          state = FIRST_NUM;
          break;
      }
    } else {
      // A single-byte space counts as a tess reject; anything else is bad.
      if (*ch_len == 1 && *ch == ' ')
        tess_rejs++;
      else
        bad_char_count++;
      switch (state) {
        case FIRST_NUM:
          isolated_digits++;
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          isolated_alphas++;
          // fall through
        default:
          break;
      }
      state = JUNK;
    }
  }

  // Account for a word that ends on an isolated digit or letter.
  switch (state) {
    case FIRST_NUM:
      isolated_digits++;
      break;
    case FIRST_UPPER:
    case FIRST_LOWER:
      isolated_alphas++;
      // fall through
    default:
      break;
  }

  if (crunch_include_numerals) {
    total_alpha_count += total_digit_count - isolated_digits;
  }

  if (crunch_leave_ok_strings && len >= 4 &&
      2 * (total_alpha_count - isolated_alphas) > len &&
      longest_alpha_repetition_count < crunch_long_repetitions) {
    if ((crunch_accept_ok &&
         acceptable_word_string(*word->uch_set, str, lengths) !=
             AC_UNACCEPTABLE) ||
        longest_lower_run_len > crunch_leave_lc_strings ||
        longest_upper_run_len > crunch_leave_uc_strings)
      return G_NEVER_CRUNCH;
  }
  if (word->reject_map.length() > 1 &&
      strpbrk(str, " ") == NULL &&
      (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
       word->best_choice->permuter() == FREQ_DAWG_PERM ||
       word->best_choice->permuter() == USER_DAWG_PERM ||
       word->best_choice->permuter() == NUMBER_PERM ||
       acceptable_word_string(*word->uch_set, str, lengths) !=
           AC_UNACCEPTABLE ||
       ok_dict_word))
    return G_OK;

  ok_chars = len - bad_char_count - isolated_digits -
             isolated_alphas - tess_rejs;

  if (crunch_debug > 3) {
    tprintf("garbage_word: \"%s\"\n",
            word->best_choice->unichar_string().string());
    tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
            len,
            bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
  }
  if (bad_char_count == 0 &&
      tess_rejs == 0 &&
      (len > isolated_digits + isolated_alphas || len <= 2))
    return G_OK;

  if (tess_rejs > ok_chars ||
      (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
    return G_TERRIBLE;

  if (len > 4) {
    dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
                  isolated_alphas;
    if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
      return G_DODGY;
    else
      return G_OK;
  } else {
    dodgy_chars = 2 * tess_rejs + bad_char_count;
    if ((len == 4 && dodgy_chars > 2) ||
        (len == 3 && dodgy_chars > 2) ||
        dodgy_chars >= len)
      return G_DODGY;
    else
      return G_OK;
  }
}
Definition at line 286 of file output.cpp.
{  // what char is repeated?
  // Returns the unichar id at the first position the reject map does not
  // mark as rejected; if every position is rejected, returns the id of
  // unrecognised_char instead.
  int i;
  for (i = 0; ((i < word->reject_map.length()) &&
               (word->reject_map[i].rejected())); ++i) {
    // Empty body: the loop only advances i past rejected positions.
  }

  if (i < word->reject_map.length()) {
    return word->best_choice->unichar_id(i);
  } else {
    return word->uch_set->unichar_to_id(unrecognised_char.string());
  }
}
Tesseract* tesseract::Tesseract::get_sub_lang | ( | int | index | ) | const [inline] |
Definition at line 237 of file tesseractclass.h.
{
// Returns the entry of sub_langs_ at position index. No range check is done
// here; presumably callers keep index below num_sub_langs() - verify.
return sub_langs_[index];
}
CubeRecoContext* tesseract::Tesseract::GetCubeRecoContext | ( | ) | [inline] |
Definition at line 1043 of file tesseractclass.h.
// Plain accessor: hands back the cube_cntxt_ member unchanged.
{ return cube_cntxt_; }
void tesseract::Tesseract::GetSubAndSuperscriptCandidates | ( | const WERD_RES * | word, |
int * | num_rebuilt_leading, | ||
ScriptPos * | leading_pos, | ||
float * | leading_certainty, | ||
int * | num_rebuilt_trailing, | ||
ScriptPos * | trailing_pos, | ||
float * | trailing_certainty, | ||
float * | avg_certainty, | ||
float * | unlikely_threshold | ||
) |
Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts or subscripts, so that SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though, there's really no hope.
[in] | word | The word to examine. |
[out] | num_rebuilt_leading | the number of rebuilt blobs at the start of the word which are all up or down and seem badly classified. |
[out] | leading_pos | "super" or "sub" (for debugging) |
[out] | leading_certainty | the worst certainty in the leading blobs. |
[out] | num_rebuilt_trailing | the number of rebuilt blobs at the end of the word which are all up or down and seem badly classified. |
[out] | trailing_pos | "super" or "sub" (for debugging) |
[out] | trailing_certainty | the worst certainty in the trailing blobs. |
[out] | avg_certainty | the average certainty of "normal" blobs in the word. |
[out] | unlikely_threshold | the threshold (on certainty) we used to select "bad enough" outlier characters. |
Definition at line 253 of file superscript.cpp.
{
  // All outputs are reset first, so an early return leaves them at
  // zero / SP_NORMAL apart from whatever has been computed up to that point.
  *avg_certainty = *unlikely_threshold = 0.0f;
  *num_rebuilt_leading = *num_rebuilt_trailing = 0;
  *leading_certainty = *trailing_certainty = 0.0f;

  // Vertical limits: a blob whose bottom is at or above super_y_bottom is
  // treated as superscript, one whose top is at or below sub_y_top as
  // subscript.
  int super_y_bottom =
      kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
  int sub_y_top =
      kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;

  // Step one: Get an average certainty for "normally placed" characters.

  // Counts here are of blobs in the rebuild_word / unichars in best_choice.
  *leading_pos = *trailing_pos = SP_NORMAL;
  int leading_outliers = 0;
  int trailing_outliers = 0;
  int num_normal = 0;
  float normal_certainty_total = 0.0f;
  float worst_normal_certainty = 0.0f;
  ScriptPos last_pos = SP_NORMAL;
  int num_blobs = word->rebuild_word->NumBlobs();
  for (int b = 0; b < num_blobs; ++b) {
    TBOX box = word->rebuild_word->blobs[b]->bounding_box();
    ScriptPos pos = SP_NORMAL;
    if (box.bottom() >= super_y_bottom) {
      pos = SP_SUPERSCRIPT;
    } else if (box.top() <= sub_y_top) {
      pos = SP_SUBSCRIPT;
    }
    if (pos == SP_NORMAL) {
      if (word->best_choice->unichar_id(b) != 0) {
        float char_certainty = word->best_choice->certainty(b);
        if (char_certainty < worst_normal_certainty) {
          worst_normal_certainty = char_certainty;
        }
        num_normal++;
        normal_certainty_total += char_certainty;
      }
      // If every blob so far was an outlier of one kind, that run is the
      // leading run.
      if (trailing_outliers == b) {
        leading_outliers = trailing_outliers;
        *leading_pos = last_pos;
      }
      trailing_outliers = 0;
    } else {
      if (last_pos == pos) {
        trailing_outliers++;
      } else {
        trailing_outliers = 1;
      }
    }
    last_pos = pos;
  }
  *trailing_pos = last_pos;
  if (num_normal >= 3) {  // throw out the worst as an outlier.
    num_normal--;
    normal_certainty_total -= worst_normal_certainty;
  }
  if (num_normal > 0) {
    *avg_certainty = normal_certainty_total / num_normal;
    *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
  }
  if (num_normal == 0 ||
      (leading_outliers == 0 && trailing_outliers == 0)) {
    return;
  }

  // Step two: Try to split off bits of the word that are both outliers
  //           and have much lower certainty than average
  // Calculate num_leading and leading_certainty.
  for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
       *num_rebuilt_leading < leading_outliers;
       (*num_rebuilt_leading)++) {
    float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
    if (char_certainty > *unlikely_threshold) {
      break;
    }
    if (char_certainty < *leading_certainty) {
      *leading_certainty = char_certainty;
    }
  }

  // Calculate num_trailing and trailing_certainty.
  for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
       *num_rebuilt_trailing < trailing_outliers;
       (*num_rebuilt_trailing)++) {
    int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
    float char_certainty = word->best_choice->certainty(blob_idx);
    if (char_certainty > *unlikely_threshold) {
      break;
    }
    if (char_certainty < *trailing_certainty) {
      *trailing_certainty = char_certainty;
    }
  }
}
int tesseract::Tesseract::ImageHeight | ( | ) | const [inline] |
Definition at line 211 of file tesseractclass.h.
{
// Height of the image, taken from pix_binary_ via pixGetHeight().
return pixGetHeight(pix_binary_);
}
int tesseract::Tesseract::ImageWidth | ( | ) | const [inline] |
Definition at line 208 of file tesseractclass.h.
{
// Width of the image, taken from pix_binary_ via pixGetWidth().
return pixGetWidth(pix_binary_);
}
bool tesseract::Tesseract::init_cube_objects | ( | bool | load_combiner, |
TessdataManager * | tessdata_manager | ||
) |
Definition at line 154 of file cube_control.cpp.
{
  // Creates cube_cntxt_ and, when load_combiner is set, tess_cube_combiner_.
  // Both members must be NULL on entry. Returns false (with both members
  // released and reset to NULL) if either object cannot be set up.
  ASSERT_HOST(cube_cntxt_ == NULL);
  ASSERT_HOST(tess_cube_combiner_ == NULL);

  // Create the cube context object
  cube_cntxt_ = CubeRecoContext::Create(this, tessdata_manager, &unicharset);
  if (cube_cntxt_ == NULL) {
    if (cube_debug_level > 0) {
      tprintf("Cube WARNING (Tesseract::init_cube_objects()): Failed to "
              "instantiate CubeRecoContext\n");
    }
    return false;
  }

  // Create the combiner object and load the combiner net for target languages.
  if (load_combiner) {
    tess_cube_combiner_ = new tesseract::TesseractCubeCombiner(cube_cntxt_);
    if (!tess_cube_combiner_ || !tess_cube_combiner_->LoadCombinerNet()) {
      // Roll back everything created above so the object is left unset.
      delete cube_cntxt_;
      cube_cntxt_ = NULL;
      if (tess_cube_combiner_ != NULL) {
        delete tess_cube_combiner_;
        tess_cube_combiner_ = NULL;
      }
      if (cube_debug_level > 0)
        tprintf("Cube ERROR (Failed to instantiate TesseractCubeCombiner\n");
      return false;
    }
  }
  return true;
}
FILE * tesseract::Tesseract::init_recog_training | ( | const STRING & | fname | ) |
Definition at line 36 of file recogtraining.cpp.
{
  // Prepares for a recognition-training run and returns the output file:
  // fname with everything from its last '.' replaced by ".txt", opened via
  // open_file() in "a+" mode.
  if (tessedit_ambigs_training) {
    tessedit_tess_adaption_mode.set_value(0);    // turn off adaption
    tessedit_enable_doc_dict.set_value(0);       // turn off document dictionary
    // Explore all segmentations.
    getDict().stopper_no_acceptable_choices.set_value(1);
  }

  STRING output_fname = fname;
  // Cut the name at its last '.', if it has one, before appending ".txt".
  const char *lastdot = strrchr(output_fname.string(), '.');
  if (lastdot != NULL)
    output_fname[lastdot - output_fname.string()] = '\0';
  output_fname += ".txt";
  FILE *output_file = open_file(output_fname.string(), "a+");
  return output_file;
}
int tesseract::Tesseract::init_tesseract | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language, | ||
OcrEngineMode | oem, | ||
char ** | configs, | ||
int | configs_size, | ||
const GenericVector< STRING > * | vars_vec, | ||
const GenericVector< STRING > * | vars_values, | ||
bool | set_only_init_params | ||
) |
Definition at line 279 of file tessedit.cpp.
{
  // Parses the language string, loads the first loadable language into this
  // object and every further one into a new Tesseract stored in sub_langs_.
  // Returns -1 if no language could be loaded, 0 otherwise.
  // NOTE(review): the last argument is called set_only_non_debug_params
  // here, while the signature row above shows set_only_init_params -
  // confirm which name the declaration and definition actually use.
  GenericVector<STRING> langs_to_load;
  GenericVector<STRING> langs_not_to_load;
  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);

  sub_langs_.delete_data_pointers();
  sub_langs_.clear();
  // Find the first loadable lang and load into this.
  // Add any languages that this language requires
  bool loaded_primary = false;
  // Load the rest into sub_langs_.
  // langs_to_load may grow while iterating: each loaded language can append
  // the languages named in its tessedit_load_sublangs.
  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
    if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
      const char *lang_str = langs_to_load[lang_index].string();
      Tesseract *tess_to_init;
      if (!loaded_primary) {
        tess_to_init = this;
      } else {
        tess_to_init = new Tesseract;
      }

      int result = tess_to_init->init_tesseract_internal(
          arg0, textbase, lang_str, oem, configs, configs_size,
          vars_vec, vars_values, set_only_non_debug_params);

      if (!loaded_primary) {
        if (result < 0) {
          tprintf("Failed loading language '%s'\n", lang_str);
        } else {
          if (tessdata_manager_debug_level)
            tprintf("Loaded language '%s' as main language\n", lang_str);
          ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
                              &langs_to_load, &langs_not_to_load);
          loaded_primary = true;
        }
      } else {
        if (result < 0) {
          tprintf("Failed loading language '%s'\n", lang_str);
          delete tess_to_init;
        } else {
          if (tessdata_manager_debug_level)
            tprintf("Loaded language '%s' as secondary language\n", lang_str);
          sub_langs_.push_back(tess_to_init);
          // Add any languages that this language requires
          ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
                              &langs_to_load, &langs_not_to_load);
        }
      }
    }
  }
  if (!loaded_primary) {
    tprintf("Tesseract couldn't load any languages!\n");
    return -1;  // Couldn't load any language!
  }
  if (!sub_langs_.empty()) {
    // In multilingual mode word ratings have to be directly comparable,
    // so use the same language model weights for all languages:
    // use the primary language's params model if
    // tessedit_use_primary_params_model is set,
    // otherwise use default language model weights.
    if (tessedit_use_primary_params_model) {
      for (int s = 0; s < sub_langs_.size(); ++s) {
        sub_langs_[s]->language_model_->getParamsModel().Copy(
            this->language_model_->getParamsModel());
      }
      tprintf("Using params model of the primary language\n");
      if (tessdata_manager_debug_level) {
        this->language_model_->getParamsModel().Print();
      }
    } else {
      this->language_model_->getParamsModel().Clear();
      for (int s = 0; s < sub_langs_.size(); ++s) {
        sub_langs_[s]->language_model_->getParamsModel().Clear();
      }
      tprintf("Using default language params\n");
    }
  }

  SetupUniversalFontIds();
  return 0;
}
int tesseract::Tesseract::init_tesseract | ( | const char * | datapath, |
const char * | language, | ||
OcrEngineMode | oem | ||
) | [inline] |
Definition at line 407 of file tesseractclass.h.
// Convenience overload: forwards to the nine-argument init_tesseract(),
// passing datapath as the first argument, NULL for textbase, configs and
// both variable vectors, 0 for configs_size and false for the final flag.
{ return init_tesseract(datapath, NULL, language, oem, NULL, 0, NULL, NULL, false); }
int tesseract::Tesseract::init_tesseract_internal | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language, | ||
OcrEngineMode | oem, | ||
char ** | configs, | ||
int | configs_size, | ||
const GenericVector< STRING > * | vars_vec, | ||
const GenericVector< STRING > * | vars_values, | ||
bool | set_only_init_params | ||
) |
Definition at line 382 of file tessedit.cpp.
{
  // Loads the language data and then, unless only the config was requested,
  // runs program_editup() with flags derived from the OCR engine mode.
  // Returns -1 if the language data could not be loaded, 0 otherwise.
  // NOTE(review): the last argument is called set_only_non_debug_params
  // here, while the signature row above shows set_only_init_params -
  // confirm which name the declaration and definition actually use.
  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
                                configs_size, vars_vec, vars_values,
                                set_only_non_debug_params)) {
    return -1;
  }
  if (tessedit_init_config_only) {
    tessdata_manager.End();
    return 0;
  }
  // If only Cube will be used, skip loading Tesseract classifier's
  // pre-trained templates.
  bool init_tesseract_classifier =
      (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY ||
       tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED);
  // If only Cube will be used and if it has its own Unicharset,
  // skip initializing permuter and loading Tesseract Dawgs.
  bool init_dict =
      !(tessedit_ocr_engine_mode == OEM_CUBE_ONLY &&
        tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET));
  program_editup(textbase, init_tesseract_classifier, init_dict);
  tessdata_manager.End();
  return 0;  // Normal exit
}
bool tesseract::Tesseract::init_tesseract_lang_data | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language, | ||
OcrEngineMode | oem, | ||
char ** | configs, | ||
int | configs_size, | ||
const GenericVector< STRING > * | vars_vec, | ||
const GenericVector< STRING > * | vars_values, | ||
bool | set_only_init_params | ||
) |
Definition at line 81 of file tessedit.cpp.
{
  // Opens <datadir><lang>.<kTrainedDataSuffix> through tessdata_manager and
  // loads, in order: the language config, the caller's config files, the
  // caller's variable overrides, the unicharset, the ambiguity tables, the
  // Cube objects (if the engine mode needs them) and the params model.
  // Returns false as soon as a required component fails to load. Note that a
  // variable override that cannot be set terminates the process via exit(1).
  // NOTE(review): the last argument is called set_only_non_debug_params
  // here, while the signature row above shows set_only_init_params -
  // confirm which name the declaration and definition actually use.

  // Set the basename, compute the data directory.
  main_setup(arg0, textbase);

  // Set the language data path prefix
  lang = language != NULL ? language : "eng";
  language_data_path_prefix = datadir;
  language_data_path_prefix += lang;
  language_data_path_prefix += ".";

  // Initialize TessdataManager.
  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
  if (!tessdata_manager.Init(tessdata_path.string(),
                             tessdata_manager_debug_level)) {
    return false;
  }

  // If a language specific config file (lang.config) exists, load it in.
  if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
    ParamUtils::ReadParamsFromFp(
        tessdata_manager.GetDataFilePtr(),
        tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG),
        SET_PARAM_CONSTRAINT_NONE, this->params());
    if (tessdata_manager_debug_level) {
      tprintf("Loaded language config file\n");
    }
  }

  SetParamConstraint set_params_constraint = set_only_non_debug_params ?
      SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
  // Load tesseract variables from config files. This is done after loading
  // language-specific variables from [lang].traineddata file, so that custom
  // config files can override values in [lang].traineddata file.
  for (int i = 0; i < configs_size; ++i) {
    read_config_file(configs[i], set_params_constraint);
  }

  // Set params specified in vars_vec (done after setting params from config
  // files, so that params in vars_vec can override those from files).
  if (vars_vec != NULL && vars_values != NULL) {
    for (int i = 0; i < vars_vec->size(); ++i) {
      if (!ParamUtils::SetParam((*vars_vec)[i].string(),
                                (*vars_values)[i].string(),
                                set_params_constraint, this->params())) {
        tprintf("Error setting param %s\n", (*vars_vec)[i].string());
        exit(1);
      }
    }
  }

  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
    FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
    if (params_file != NULL) {
      ParamUtils::PrintParams(params_file, this->params());
      fclose(params_file);
      if (tessdata_manager_debug_level > 0) {
        tprintf("Wrote parameters to %s\n",
                tessedit_write_params_to_file.string());
      }
    } else {
      tprintf("Failed to open %s for writing params.\n",
              tessedit_write_params_to_file.string());
    }
  }

  // Determine which ocr engine(s) should be loaded and used for recognition.
  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
  if (tessdata_manager_debug_level) {
    tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
            static_cast<int>(tessedit_ocr_engine_mode));
  }

  // If we are only loading the config file (and so not planning on doing any
  // recognition) then there's nothing else do here.
  if (tessedit_init_config_only) {
    if (tessdata_manager_debug_level) {
      tprintf("Returning after loading config file\n");
    }
    return true;
  }

  // Load the unicharset
  if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
      !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
    return false;
  }
  if (unicharset.size() > MAX_NUM_CLASSES) {
    tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
    return false;
  }
  if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
  right_to_left_ = unicharset.major_right_to_left();

  // Setup initial unichar ambigs table and read universal ambigs.
  UNICHARSET encoder_unicharset;
  encoder_unicharset.CopyFrom(unicharset);
  unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);

  if (!tessedit_ambigs_training &&
      tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
    TFile ambigs_file;
    ambigs_file.Open(tessdata_manager.GetDataFilePtr(),
                     tessdata_manager.GetEndOffset(TESSDATA_AMBIGS) + 1);
    unichar_ambigs.LoadUnicharAmbigs(
        encoder_unicharset,
        &ambigs_file,
        ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
    if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
  }

  // Load Cube objects if necessary.
  if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
    ASSERT_HOST(init_cube_objects(false, &tessdata_manager));
    if (tessdata_manager_debug_level)
      tprintf("Loaded Cube w/out combiner\n");
  } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
    ASSERT_HOST(init_cube_objects(true, &tessdata_manager));
    if (tessdata_manager_debug_level)
      tprintf("Loaded Cube with combiner\n");
  }

  // Init ParamsModel.
  // Load pass1 and pass2 weights (for now these two sets are the same, but in
  // the future separate sets of weights can be generated).
  for (int p = ParamsModel::PTRAIN_PASS1;
       p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
    language_model_->getParamsModel().SetPass(
        static_cast<ParamsModel::PassEnum>(p));
    if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) {
      if (!language_model_->getParamsModel().LoadFromFp(
          lang.string(), tessdata_manager.GetDataFilePtr(),
          tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) {
        return false;
      }
    }
  }
  if (tessdata_manager_debug_level) language_model_->getParamsModel().Print();

  return true;
}
int tesseract::Tesseract::init_tesseract_lm | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language | ||
) |
Definition at line 453 of file tessedit.cpp.
{
  // Initialise the language data for the Tesseract-only engine mode.
  // A failure there is reported to the caller as -1.
  const bool lang_data_ok = init_tesseract_lang_data(
      arg0, textbase, language, OEM_TESSERACT_ONLY, NULL, 0, NULL, NULL,
      false);
  if (!lang_data_ok) {
    return -1;
  }

  // Load the dictionary with the cache returned by Dict::GlobalDawgCache(),
  // then finish with the tessdata manager.
  getDict().Load(Dict::GlobalDawgCache());
  tessdata_manager.End();
  return 0;
}
void tesseract::Tesseract::join_words | ( | WERD_RES * | word, |
WERD_RES * | word2, | ||
BlamerBundle * | orig_bb | ||
) | const |
Definition at line 240 of file tfacepp.cpp.
{
  // Bounding boxes of the two blobs that meet at the join: the last blob of
  // word and the first blob of word2. They are read before the lists merge.
  TBOX left_edge_box = word->chopped_word->blobs.back()->bounding_box();
  TBOX right_edge_box = word2->chopped_word->blobs[0]->bounding_box();

  // Tack the word2 outputs onto the end of the word outputs.
  word->chopped_word->blobs += word2->chopped_word->blobs;
  word->rebuild_word->blobs += word2->rebuild_word->blobs;
  word2->chopped_word->blobs.clear();
  word2->rebuild_word->blobs.clear();

  // Location used for the seam that separates the two joined pieces.
  TPOINT join_pt;
  join_pt.x = (left_edge_box.right() + right_edge_box.left()) / 2;
  join_pt.y = (left_edge_box.top() + left_edge_box.bottom() +
               right_edge_box.top() + right_edge_box.bottom()) / 4;

  // Move the word2 seams onto the end of the word1 seam_array.
  // Since the seam list is one element short, an empty seam marking the
  // end of the last blob in the first word is needed first.
  word->seam_array.push_back(new SEAM(0.0f, join_pt, NULL, NULL, NULL));
  word->seam_array += word2->seam_array;
  word2->seam_array.truncate(0);

  // Fix widths and gaps.
  word->blob_widths += word2->blob_widths;
  word->blob_gaps += word2->blob_gaps;

  // Fix the ratings matrix: the combined dimension must be the sum of both.
  const int word1_dim = word->ratings->dimension();
  const int word2_dim = word2->ratings->dimension();
  word->ratings->AttachOnCorner(word2->ratings);
  ASSERT_HOST(word->ratings->dimension() == word1_dim + word2_dim);
  word->best_state += word2->best_state;

  // Append the word choices.
  *word->raw_choice += *word2->raw_choice;

  // How many alt choices from each should we try to get?
  const int kAltsPerPiece = 2;
  // When do we start throwing away extra alt choices?
  const int kTooManyAltChoices = 100;

  // Construct the cartesian product of the best_choices of word(1) and word2.
  WERD_CHOICE_LIST joined_choices;
  WERD_CHOICE_IT joined_it(&joined_choices);
  WERD_CHOICE_IT choices1_it(&word->best_choices);
  WERD_CHOICE_IT choices2_it(&word2->best_choices);
  const int num_word1_choices = word->best_choices.length();
  int total_joined_choices = num_word1_choices;

  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
  // word2 choices, and put them in the joined_choices list. The 1st word2
  // choice gets added to the original word1 choices in-place after we have
  // finished with them.
  int index2 = 1;
  for (choices2_it.forward(); !choices2_it.at_first();
       choices2_it.forward(), ++index2) {
    if (total_joined_choices >= kTooManyAltChoices && index2 > kAltsPerPiece) {
      break;
    }
    int index1 = 0;
    for (choices1_it.move_to_first(); index1 < num_word1_choices;
         ++index1, choices1_it.forward()) {
      if (total_joined_choices >= kTooManyAltChoices &&
          index1 > kAltsPerPiece) {
        break;
      }
      WERD_CHOICE *combined = new WERD_CHOICE(*choices1_it.data());
      *combined += *choices2_it.data();
      joined_it.add_after_then_move(combined);
      ++total_joined_choices;
    }
  }

  // Now that we've filled in as many alternates as we want, paste the best
  // choice for word2 onto the original word alt_choices.
  choices1_it.move_to_first();
  choices2_it.move_to_first();
  for (choices1_it.mark_cycle_pt(); !choices1_it.cycled_list();
       choices1_it.forward()) {
    *choices1_it.data() += *choices2_it.data();
  }
  choices1_it.move_to_last();
  choices1_it.add_list_after(&joined_choices);

  // Restore the pointer to original blamer bundle and combine blamer
  // information recorded in the splits.
  if (orig_bb != NULL) {
    orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
                        wordrec_debug_blamer);
    delete word->blamer_bundle;
    word->blamer_bundle = orig_bb;
  }

  word->SetupBoxWord();
  word->reject_map.initialise(word->box_word->length());
  // word2 is deleted here, so the caller must not use it afterwards.
  delete word2;
}
void tesseract::Tesseract::make_reject_map | ( | WERD_RES * | word, |
ROW * | row, | ||
inT16 | pass | ||
) |
void tesseract::Tesseract::match_current_words | ( | WERD_RES_LIST & | words, |
ROW * | row, | ||
BLOCK * | block | ||
) |
Definition at line 197 of file fixspace.cpp.
{
  // Since we are not using PAGE_RES to iterate over words, we need to update
  // prev_word_best_choice_ before calling classify_word_pass2().
  prev_word_best_choice_ = NULL;

  WERD_RES_IT word_it(&words);
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    WERD_RES *current = word_it.data();
    // Only classify words that are not part of a combination and that have
    // no box_word yet.
    const bool needs_classifying =
        !current->part_of_combo && current->box_word == NULL;
    if (needs_classifying) {
      WordData word_data(block, row, current);
      SetupWordPassN(2, &word_data);
      classify_word_and_language(&Tesseract::classify_word_pass2, &word_data);
    }
    prev_word_best_choice_ = current->best_choice;
  }
}
void tesseract::Tesseract::match_word_pass_n | ( | int | pass_n, |
WERD_RES * | word, | ||
ROW * | row, | ||
BLOCK * | block | ||
) |
match_word_pass_n
Baseline normalize the word and pass it to Tess.
Definition at line 1077 of file control.cpp.
{
  // Nothing to do for a word that has already failed.
  if (word->tess_failed)
    return;

  tess_segment_pass_n(pass_n, word);

  // Post-process only words that segmented successfully and are not
  // flagged as repeated-character words.
  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
    word->fix_quotes();
    if (tessedit_fix_hyphens)
      word->fix_hyphens();

    /* Dont trust fix_quotes! - though I think I've fixed the bug */
    const bool lengths_agree =
        word->best_choice->length() == word->box_word->length();
    if (!lengths_agree) {
      tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
              " #Blobs=%d\n",
              word->best_choice->debug_string().string(),
              word->best_choice->length(),
              word->box_word->length());
    }

    word->tess_accepted = tess_acceptable_word(word);

    // Also sets word->done flag
    make_reject_map(word, row, pass_n);
  }

  set_word_fonts(word);
  ASSERT_HOST(word->raw_choice != NULL);
}
void tesseract::Tesseract::MaximallyChopWord | ( | const GenericVector< TBOX > & | boxes, |
BLOCK * | block, | ||
ROW * | row, | ||
WERD_RES * | word_res | ||
) |
Definition at line 241 of file applybox.cpp.
{
  const bool setup_ok = word_res->SetupForRecognition(
      unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL,
      classify_bln_numeric_mode, textord_use_cjk_fp_model,
      poly_allow_detailed_fx, row, block);
  if (!setup_ok) {
    // Setup failed: just clone the chopped word to the rebuild word and stop.
    word_res->CloneChoppedToRebuild();
    return;
  }

  if (chop_debug) {
    tprintf("Maximally chopping word at:");
    word_res->word->bounding_box().print();
  }

  GenericVector<BLOB_CHOICE*> blob_choices;
  ASSERT_HOST(!word_res->chopped_word->blobs.empty());

  // The rating and certainty are not quite arbitrary. Since
  // select_blob_to_chop uses the worst certainty to choose, they all have
  // to be different, so starting with MAX_INT8, subtract 1/8 for each blob
  // in here, and then divide by e each time they are chopped, which
  // should guarantee a set of unequal values for the whole tree of blobs
  // produced, however much chopping is required. The chops are thus only
  // limited by the ability of the chopper to find suitable chop points,
  // and not by the value of the certainties.
  float rating = static_cast<float>(MAX_INT8);
  for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
    BLOB_CHOICE* fake_choice =
        new BLOB_CHOICE(0, rating, -rating, -1, -1, 0, 0, 0, 0, BCC_FAKE);
    blob_choices.push_back(fake_choice);
    rating -= 0.125f;
  }

  const double e = exp(1.0);  // The base of natural logs.
  int blob_number;
  int right_chop_index = 0;
  if (!assume_fixed_pitch_char_segment) {
    // We only chop if the language is not fixed pitch like CJK.
    for (SEAM* seam = chop_one_blob(boxes, blob_choices, word_res,
                                    &blob_number);
         seam != NULL;
         seam = chop_one_blob(boxes, blob_choices, word_res, &blob_number)) {
      word_res->InsertSeam(blob_number, seam);

      // The left part keeps the existing choice with its rating divided by e.
      BLOB_CHOICE* left_choice = blob_choices[blob_number];
      rating = left_choice->rating() / e;
      left_choice->set_rating(rating);
      left_choice->set_certainty(-rating);

      // combine confidence w/ serial #
      BLOB_CHOICE* right_choice =
          new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating,
                          -1, -1, 0, 0, 0, 0, BCC_FAKE);
      blob_choices.insert(right_choice, blob_number + 1);
    }
  }

  word_res->CloneChoppedToRebuild();
  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
}
Pix** tesseract::Tesseract::mutable_pix_binary | ( | ) | [inline] |
Definition at line 174 of file tesseractclass.h.
// Calls Clear() first, then returns the address of the pix_binary_ member,
// giving the caller write access to that pointer.
{ Clear(); return &pix_binary_; }
Textord* tesseract::Tesseract::mutable_textord | ( | ) | [inline] |
Definition at line 227 of file tesseractclass.h.
// Returns the address of the textord_ member (non-const access).
{
return &textord_;
}
void tesseract::Tesseract::nn_match_word | ( | WERD_RES * | word, |
ROW * | row | ||
) |
void tesseract::Tesseract::nn_recover_rejects | ( | WERD_RES * | word, |
ROW * | row | ||
) |
BOOL8 tesseract::Tesseract::noise_outlines | ( | TWERD * | word | ) |
Definition at line 982 of file docqual.cpp.
{
  // An outline counts as "small" when its larger bounding-box dimension is
  // below this threshold.
  float size_threshold = kBlnXHeight * crunch_small_outlines_size;

  inT16 total_outlines = 0;
  inT16 small_outlines = 0;
  for (int b = 0; b < word->NumBlobs(); ++b) {
    TBLOB* blob = word->blobs[b];
    for (TESSLINE* outline = blob->outlines; outline != NULL;
         outline = outline->next) {
      total_outlines++;
      TBOX box = outline->bounding_box();  // BB of outline
      inT16 max_dimension =
          (box.height() > box.width()) ? box.height() : box.width();
      if (max_dimension < size_threshold)
        small_outlines++;
    }
  }

  // True when every outline is small (also true when there are no outlines).
  return small_outlines >= total_outlines;
}
BOOL8 tesseract::Tesseract::non_0_digit | ( | const UNICHARSET & | ch_set, |
UNICHAR_ID | unichar_id | ||
) |
Definition at line 792 of file reject.cpp.
{
  // True for any digit of ch_set other than "0".
  const bool is_digit = ch_set.get_isdigit(unichar_id);
  return is_digit && !ch_set.eq(unichar_id, "0");
}
BOOL8 tesseract::Tesseract::non_O_upper | ( | const UNICHARSET & | ch_set, |
UNICHAR_ID | unichar_id | ||
) |
Definition at line 788 of file reject.cpp.
{
  // True for any upper-case character of ch_set other than "O".
  const bool is_upper = ch_set.get_isupper(unichar_id);
  return is_upper && !ch_set.eq(unichar_id, "O");
}
int tesseract::Tesseract::num_sub_langs | ( | ) | const [inline] |
Definition at line 234 of file tesseractclass.h.
// Returns the number of entries currently held in sub_langs_.
{ return sub_langs_.size(); }
BOOL8 tesseract::Tesseract::one_ell_conflict | ( | WERD_RES * | word_res, |
BOOL8 | update_map | ||
) |
Definition at line 295 of file reject.cpp.
{
  // word points into the best choice's string, so the in-place character
  // edits made below through unichar_string() are visible through it.
  const char *word = word_res->best_choice->unichar_string().string();
  const char *lengths = word_res->best_choice->unichar_lengths().string();
  inT16 word_len = strlen(lengths);  // its length

  /*
    If there are no occurrences of the conflict set characters then the word
    is OK.
  */
  if (strpbrk(word, conflict_set_I_l_1.string()) == NULL)
    return FALSE;

  /*
    There is a conflict if there are NO other (confirmed) alphanumerics apart
    from those in the conflict set.
  */
  BOOL8 non_conflict_set_char = FALSE;  // non conf set a/n?
  {
    inT16 i = 0;
    inT16 offset = 0;
    while ((i < word_len) && !non_conflict_set_char) {
      non_conflict_set_char =
          (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
           word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
          !STRING(conflict_set_I_l_1).contains(word[offset]);
      offset += lengths[i++];
    }
  }
  if (!non_conflict_set_char) {
    if (update_map)
      reject_I_1_L(word_res);
    return TRUE;
  }

  /*
    If the word is accepted by a dawg permuter, and the first alpha character
    is "I" or "l", check to see if the alternative is also a dawg word. If it
    is, then there is a potential error otherwise the word is ok.
  */
  const BOOL8 dict_perm_type =
      (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
      (word_res->best_choice->permuter() == USER_DAWG_PERM) ||
      (rej_trust_doc_dawg &&
       (word_res->best_choice->permuter() == DOC_DAWG_PERM)) ||
      (word_res->best_choice->permuter() == FREQ_DAWG_PERM);
  const int dict_word_type = dict_word(*(word_res->best_choice));
  const BOOL8 dict_word_ok =
      (dict_word_type > 0) &&
      (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));

  inT16 first_alphanum_index_;
  inT16 first_alphanum_offset_;

  if ((rej_1Il_use_dict_word && dict_word_ok) ||
      (rej_1Il_trust_permuter_type && dict_perm_type) ||
      (dict_perm_type && dict_word_ok)) {
    first_alphanum_index_ = first_alphanum_index(word, lengths);
    first_alphanum_offset_ = first_alphanum_offset(word, lengths);

    if (lengths[first_alphanum_index_] == 1 &&
        word[first_alphanum_offset_] == 'I') {
      // Try the word with a leading 'l', then restore the 'I' either way.
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
      const bool alternative_is_word = safe_dict_word(word_res) > 0;
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
      if (alternative_is_word) {
        if (update_map)
          word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
        return TRUE;
      }
      return FALSE;
    }

    if (lengths[first_alphanum_index_] == 1 &&
        word[first_alphanum_offset_] == 'l') {
      // Try the word with a leading 'I', then restore the 'l' either way.
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
      const bool alternative_is_word = safe_dict_word(word_res) > 0;
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
      if (alternative_is_word) {
        if (update_map)
          word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
        return TRUE;
      }
      return FALSE;
    }
    return FALSE;
  }

  /*
    NEW 1Il code. The old code relied on permuter types too much. In fact,
    tess will use TOP_CHOICE permute for good things like "palette".
    In this code the string is examined independently to see if it looks like
    a well formed word.
  */

  /*
    REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
    dictionary word.
  */
  first_alphanum_index_ = first_alphanum_index(word, lengths);
  first_alphanum_offset_ = first_alphanum_offset(word, lengths);
  if (lengths[first_alphanum_index_] == 1 &&
      word[first_alphanum_offset_] == 'l') {
    // Note: when the flipped word is a dictionary word, the flipped
    // character is left in place on return.
    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
    if (safe_dict_word(word_res) > 0)
      return FALSE;
    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
  } else if (lengths[first_alphanum_index_] == 1 &&
             word[first_alphanum_offset_] == 'I') {
    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
    if (safe_dict_word(word_res) > 0)
      return FALSE;
    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
  }

  /*
    For strings containing digits:
      If there are no alphas OR the numeric permuter liked the word,
        reject any non 1 conflict chs
      Else reject all conflict chs
  */
  if (word_contains_non_1_digit(word, lengths)) {
    const BOOL8 allow_1s =
        (alpha_count(word, lengths) == 0) ||
        (word_res->best_choice->permuter() == NUMBER_PERM);

    BOOL8 conflict = FALSE;
    inT16 i = 0;
    inT16 offset = 0;
    while (word[offset] != '\0') {
      if ((!allow_1s || (word[offset] != '1')) &&
          STRING(conflict_set_I_l_1).contains(word[offset])) {
        if (update_map)
          word_res->reject_map[i].setrej_1Il_conflict();
        conflict = TRUE;
      }
      offset += word_res->best_choice->unichar_lengths()[i++];
    }
    return conflict;
  }

  /*
    For anything else. See if it conforms to an acceptable word type. If so,
    treat accordingly.
  */
  const ACCEPTABLE_WERD_TYPE word_type =
      acceptable_word_string(*word_res->uch_set, word, lengths);
  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
    first_alphanum_index_ = first_alphanum_index(word, lengths);
    first_alphanum_offset_ = first_alphanum_offset(word, lengths);
    if (STRING(conflict_set_I_l_1).contains(word[first_alphanum_offset_])) {
      if (update_map)
        word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
      return TRUE;
    }
    return FALSE;
  }
  if (word_type == AC_UPPER_CASE) {
    return FALSE;
  }
  if (update_map)
    reject_I_1_L(word_res);
  return TRUE;
}
void tesseract::Tesseract::output_pass | ( | PAGE_RES_IT & | page_res_it, |
const TBOX * | target_word_box | ||
) |
Definition at line 69 of file output.cpp.
{
  BLOCK_RES *block_of_last_word = NULL;

  for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
    check_debug_pt(page_res_it.word(), 120);

    // When a target box is given, skip every word whose centre lies
    // outside it.
    if (target_word_box) {
      TBOX current_word_box = page_res_it.word()->word->bounding_box();
      FCOORD center_pt(
          (current_word_box.right() + current_word_box.left()) / 2,
          (current_word_box.bottom() + current_word_box.top()) / 2);
      if (!target_word_box->contains(center_pt))
        continue;
    }

    if (tessedit_write_block_separators &&
        block_of_last_word != page_res_it.block()) {
      block_of_last_word = page_res_it.block();
    }

    // Force an end of line at the last word and, when block separators are
    // being written, at every change of block.
    const BOOL8 force_eol =
        (tessedit_write_block_separators &&
         (page_res_it.block() != page_res_it.next_block())) ||
        (page_res_it.next_word() == NULL);

    // Word and block that follow the current word, or NULL at the end.
    WERD *nextword = (page_res_it.next_word() != NULL)
                         ? page_res_it.next_word()->word
                         : NULL;
    BLOCK *nextblock = (page_res_it.next_block() != NULL)
                           ? page_res_it.next_block()->block
                           : NULL;

    // regardless of tilde crunching
    write_results(page_res_it,
                  determine_newline_type(page_res_it.word()->word,
                                         page_res_it.block()->block,
                                         nextword, nextblock),
                  force_eol);
  }
}
void tesseract::Tesseract::ParseLanguageString | ( | const char * | lang_str, |
GenericVector< STRING > * | to_load, | ||
GenericVector< STRING > * | not_to_load | ||
) |
Definition at line 243 of file tessedit.cpp.
{
  // Splits lang_str on '+' into individual language codes. A code prefixed
  // with '~' is appended to not_to_load, any other code to to_load. Codes
  // already present in the chosen vector are not added again.
  //
  // Empty codes (from a trailing '+', a lone '~' or "~+") are skipped, so
  // an empty string is never added to either vector.
  STRING remains(lang_str);
  while (remains.length() > 0) {
    // Find the start of the lang code and which vector to add to.
    const char* start = remains.string();
    while (*start == '+')
      ++start;
    GenericVector<STRING>* target = to_load;
    if (*start == '~') {
      target = not_to_load;
      ++start;
    }

    // Find the index of the end of the lang code in string start: the next
    // '+' if there is one, otherwise the end of the string.
    const char* plus = strchr(start, '+');
    const int end = (plus != NULL) ? static_cast<int>(plus - start)
                                   : static_cast<int>(strlen(start));
    STRING lang_code(start);
    lang_code.truncate_at(end);

    // Everything after this code is handled by the next iteration. start is
    // always at or past the beginning of remains, so the loop makes progress
    // even when the code is empty.
    STRING next(start + end);
    remains = next;

    if (end == 0)
      continue;  // Nothing between the separators: not a language code.

    // Check whether lang_code is already in the target vector and add.
    if (!IsStrInList(lang_code, *target)) {
      if (tessdata_manager_debug_level)
        tprintf("Adding language '%s' to list\n", lang_code.string());
      target->push_back(lang_code);
    }
  }
}
void tesseract::Tesseract::pgeditor_main | ( | int | width, |
int | height, | ||
PAGE_RES * | page_res | ||
) |
Top-level editor operation: set up a new window and a corresponding event handler.
Definition at line 338 of file pgedit.cpp.
{
  current_page_res = page_res;
  // Nothing to edit when the page has no block results.
  if (page_res->block_res_list.empty())
    return;

  recog_done = false;
  stillRunning = true;

  // Create the image window and draw the words with edge steps turned on.
  build_image_window(width, height);
  word_display_mode.turn_on_bit(DF_EDGE_STEP);
  do_re_display(&tesseract::Tesseract::word_set_display);
#ifndef GRAPHICS_DISABLED
  pe = new ParamsEditor(this, image_win);
#endif

  // The handler lives on this stack frame. It is registered for the duration
  // of AwaitEvent() below and replaced with NULL before returning.
  PGEventHandler event_handler(this);
  image_win->AddEventHandler(&event_handler);
  image_win->AddMessageBox();

  SVMenuNode* menu_root = build_menu_new();
  menu_root->BuildMenu(image_win);
  image_win->SetVisible(true);

  image_win->AwaitEvent(SVET_DESTROY);
  image_win->AddEventHandler(NULL);
}
Pix* tesseract::Tesseract::pix_binary | ( | ) | const [inline] |
Definition at line 178 of file tesseractclass.h.
// Returns the pix_binary_ member pointer as-is (no copy or clone is made
// here).
{
return pix_binary_;
}
Pix* tesseract::Tesseract::pix_grey | ( | ) | const [inline] |
Definition at line 181 of file tesseractclass.h.
// Returns the pix_grey_ member pointer as-is (no copy or clone is made here).
{
return pix_grey_;
}
BOOL8 tesseract::Tesseract::potential_word_crunch | ( | WERD_RES * | word, |
GARBAGE_LEVEL | garbage_level, | ||
BOOL8 | ok_dict_word | ||
) |
Definition at line 546 of file docqual.cpp.
{
  const char *str = word->best_choice->unichar_string().string();
  const char *lengths = word->best_choice->unichar_lengths().string();
  int poor_indicator_count = 0;

  // The certainty test below only applies to "crunchable" words: always
  // when crunch_leave_accept_strings is off, otherwise only to very short
  // words or unacceptable strings that are not OK dictionary words.
  BOOL8 word_crunchable =
      !crunch_leave_accept_strings ||
      word->reject_map.length() < 3 ||
      (acceptable_word_string(*word->uch_set, str, lengths) ==
           AC_UNACCEPTABLE &&
       !ok_dict_word);

  // Rating per character, with the length capped at 10.
  // NOTE(review): the length is not checked for 0 before the division;
  // confirm callers never pass a word with an empty reject_map.
  int adjusted_len = word->reject_map.length();
  if (adjusted_len > 10)
    adjusted_len = 10;
  float rating_per_ch = word->best_choice->rating() / adjusted_len;

  // Indicator 1: poor rating per character.
  if (rating_per_ch > crunch_pot_poor_rate) {
    if (crunch_debug > 2) {
      tprintf("Potential poor rating on \"%s\"\n",
              word->best_choice->unichar_string().string());
    }
    poor_indicator_count++;
  }

  // Indicator 2: poor certainty on a crunchable word.
  if (word_crunchable &&
      word->best_choice->certainty() < crunch_pot_poor_cert) {
    if (crunch_debug > 2) {
      tprintf("Potential poor cert on \"%s\"\n",
              word->best_choice->unichar_string().string());
    }
    poor_indicator_count++;
  }

  // Indicator 3: the caller already classed the word as garbage.
  if (garbage_level != G_OK) {
    if (crunch_debug > 2) {
      tprintf("Potential garbage on \"%s\"\n",
              word->best_choice->unichar_string().string());
    }
    poor_indicator_count++;
  }

  return poor_indicator_count >= crunch_pot_indicators;
}
void tesseract::Tesseract::PrepareForPageseg | ( | ) |
Definition at line 573 of file tesseractclass.cpp.
{
  typedef ShiroRekhaSplitter::SplitStrategy Strategy;

  textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model);

  // Replace cube_binary_ with a clone of the current binary image.
  pixDestroy(&cube_binary_);
  cube_binary_ = pixClone(pix_binary());

  // Find the max splitter strategy over all langs.
  Strategy max_pageseg_strategy = static_cast<Strategy>(
      static_cast<inT32>(pageseg_devanagari_split_strategy));
  for (int i = 0; i < sub_langs_.size(); ++i) {
    Tesseract* sub_lang = sub_langs_[i];
    const Strategy pageseg_strategy = static_cast<Strategy>(
        static_cast<inT32>(sub_lang->pageseg_devanagari_split_strategy));
    if (pageseg_strategy > max_pageseg_strategy)
      max_pageseg_strategy = pageseg_strategy;

    // Clone the cube image to all the sub langs too.
    pixDestroy(&sub_lang->cube_binary_);
    sub_lang->cube_binary_ = pixClone(pix_binary());
    pixDestroy(&sub_lang->pix_binary_);
    sub_lang->pix_binary_ = pixClone(pix_binary());
  }

  // Perform shiro-rekha (top-line) splitting and replace the current image by
  // the newly splitted image.
  splitter_.set_orig_pix(pix_binary());
  splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
  if (!splitter_.Split(true))
    return;

  ASSERT_HOST(splitter_.splitted_image());
  pixDestroy(&pix_binary_);
  pix_binary_ = pixClone(splitter_.splitted_image());
}
void tesseract::Tesseract::PrepareForTessOCR | ( | BLOCK_LIST * | block_list, |
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
Definition at line 609 of file tesseractclass.cpp.
{
  typedef ShiroRekhaSplitter::SplitStrategy Strategy;

  // Find the max splitter strategy over all langs.
  Strategy max_ocr_strategy = static_cast<Strategy>(
      static_cast<inT32>(ocr_devanagari_split_strategy));
  for (int i = 0; i < sub_langs_.size(); ++i) {
    const Strategy ocr_strategy = static_cast<Strategy>(
        static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy));
    if (ocr_strategy > max_ocr_strategy)
      max_ocr_strategy = ocr_strategy;
  }

  // Utilize the segmentation information available.
  splitter_.set_segmentation_block_list(block_list);
  splitter_.set_ocr_split_strategy(max_ocr_strategy);

  // Run the splitter for OCR
  const bool split_for_ocr = splitter_.Split(false);

  // Restore pix_binary to the binarized original pix for future reference.
  ASSERT_HOST(splitter_.orig_pix());
  pixDestroy(&pix_binary_);
  pix_binary_ = pixClone(splitter_.orig_pix());

  // If the pageseg and ocr strategies are different, refresh the block list
  // (from the last SegmentImage call) with blobs from the real image to be
  // used for OCR.
  if (splitter_.HasDifferentSplitStrategies()) {
    BLOCK block("", TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_),
                pixGetHeight(pix_binary_));
    Pix* pix_for_ocr;
    if (split_for_ocr) {
      pix_for_ocr = splitter_.splitted_image();
    } else {
      pix_for_ocr = splitter_.orig_pix();
    }
    extract_edges(pix_for_ocr, &block);
    splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
  }

  // The splitter isn't needed any more after this, so save memory by
  // clearing.
  splitter_.Clear();
}
void tesseract::Tesseract::PrerecAllWordsPar | ( | const GenericVector< WordData > & | words | ) |
Definition at line 36 of file par_control.cpp.
{
  // Prepare all the blobs.
  GenericVector<BlobData> blobs;
  for (int w = 0; w < words.size(); ++w) {
    const WordData& word_data = words[w];
    // Only words that have a ratings matrix whose (0, 0) entry is still
    // empty are queued.
    if (word_data.word->ratings == NULL ||
        word_data.word->ratings->get(0, 0) != NULL) {
      continue;
    }
    const int num_main_blobs = word_data.word->chopped_word->NumBlobs();
    for (int b = 0; b < num_main_blobs; ++b) {
      blobs.push_back(BlobData(b, this, *word_data.word));
    }
    // Each entry of lang_words is paired with the sub-language at the same
    // index.
    for (int s = 0; s < word_data.lang_words.size(); ++s) {
      const WERD_RES& lang_word = word_data.lang_words[s];
      for (int b = 0; b < lang_word.chopped_word->NumBlobs(); ++b) {
        blobs.push_back(BlobData(b, sub_langs_[s], lang_word));
      }
    }
  }

  // Pre-classify all the blobs.
  if (tessedit_parallelize > 1) {
#pragma omp parallel for num_threads(10)
    for (int b = 0; b < blobs.size(); ++b) {
      *blobs[b].choices =
          blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL);
    }
  } else {
    // TODO(AMD) parallelize this.
    for (int b = 0; b < blobs.size(); ++b) {
      *blobs[b].choices =
          blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL);
    }
  }
}
BOOL8 tesseract::Tesseract::process_cmd_win_event | ( | inT32 | cmd_event, |
char * | new_value | ||
) |
Definition at line 398 of file pgedit.cpp.
{
  // Handles one command event from the editor's command window.
  //   cmd_event: the event code (one of the *_CMD_EVENT / mode values).
  //   new_value: the value sent with the event. Toggle events test whether
  //              its first character is 'T'.
  // Returns TRUE only for QUIT_CMD_EVENT, FALSE otherwise.
  char msg[160];
  BOOL8 exit_requested = FALSE;  // Renamed from "exit" to avoid hiding ::exit.

  color_mode = CM_RAINBOW;

  // Run recognition on the full page if needed.
  switch (cmd_event) {
    case BLAMER_CMD_EVENT:
    case SHOW_SUBSCRIPT_CMD_EVENT:
    case SHOW_SUPERSCRIPT_CMD_EVENT:
    case SHOW_ITALIC_CMD_EVENT:
    case SHOW_BOLD_CMD_EVENT:
    case SHOW_UNDERLINE_CMD_EVENT:
    case SHOW_FIXEDPITCH_CMD_EVENT:
    case SHOW_SERIF_CMD_EVENT:
    case SHOW_SMALLCAPS_CMD_EVENT:
    case SHOW_DROPCAPS_CMD_EVENT:
      if (!recog_done) {
        recog_all_words(current_page_res, NULL, NULL, NULL, 0);
        recog_done = true;
      }
      break;
    default:
      break;
  }

  switch (cmd_event) {
    case NULL_CMD_EVENT:
      break;

    // Events that simply select the current interaction mode.
    case CHANGE_DISP_CMD_EVENT:
    case DUMP_WERD_CMD_EVENT:
    case SHOW_POINT_CMD_EVENT:
    case SHOW_BLN_WERD_CMD_EVENT:
    case RECOG_WERDS:
    case RECOG_PSEUDO:
    case SHOW_BLOB_FEATURES:
      mode = static_cast<CMD_EVENTS>(cmd_event);
      break;
    case DEBUG_WERD_CMD_EVENT:
      mode = DEBUG_WERD_CMD_EVENT;
      word_config_ = image_win->ShowInputDialog("Config File Name");
      break;

    // Toggles of individual word display flags.
    case BOUNDING_BOX_CMD_EVENT:
      if (new_value[0] == 'T')
        word_display_mode.turn_on_bit(DF_BOX);
      else
        word_display_mode.turn_off_bit(DF_BOX);
      mode = CHANGE_DISP_CMD_EVENT;
      break;
    case BLAMER_CMD_EVENT:
      if (new_value[0] == 'T')
        word_display_mode.turn_on_bit(DF_BLAMER);
      else
        word_display_mode.turn_off_bit(DF_BLAMER);
      do_re_display(&tesseract::Tesseract::word_display);
      mode = CHANGE_DISP_CMD_EVENT;
      break;
    case CORRECT_TEXT_CMD_EVENT:
      if (new_value[0] == 'T')
        word_display_mode.turn_on_bit(DF_TEXT);
      else
        word_display_mode.turn_off_bit(DF_TEXT);
      mode = CHANGE_DISP_CMD_EVENT;
      break;
    case POLYGONAL_CMD_EVENT:
      if (new_value[0] == 'T')
        word_display_mode.turn_on_bit(DF_POLYGONAL);
      else
        word_display_mode.turn_off_bit(DF_POLYGONAL);
      mode = CHANGE_DISP_CMD_EVENT;
      break;
    case BL_NORM_CMD_EVENT:
      if (new_value[0] == 'T')
        word_display_mode.turn_on_bit(DF_BN_POLYGONAL);
      else
        word_display_mode.turn_off_bit(DF_BN_POLYGONAL);
      mode = CHANGE_DISP_CMD_EVENT;
      break;
    case BITMAP_CMD_EVENT:
      if (new_value[0] == 'T')
        word_display_mode.turn_on_bit(DF_EDGE_STEP);
      else
        word_display_mode.turn_off_bit(DF_EDGE_STEP);
      mode = CHANGE_DISP_CMD_EVENT;
      break;

    // Redisplay requests and page-level display toggles.
    case UNIFORM_DISP_CMD_EVENT:
      do_re_display(&tesseract::Tesseract::word_set_display);
      break;
    case IMAGE_CMD_EVENT:
      display_image = (new_value[0] == 'T');
      do_re_display(&tesseract::Tesseract::word_display);
      break;
    case BLOCKS_CMD_EVENT:
      display_blocks = (new_value[0] == 'T');
      do_re_display(&tesseract::Tesseract::word_display);
      break;
    case BASELINES_CMD_EVENT:
      display_baselines = (new_value[0] == 'T');
      do_re_display(&tesseract::Tesseract::word_display);
      break;

    // Colour-mode selections; each one redraws the words.
    case SHOW_SUBSCRIPT_CMD_EVENT:
      color_mode = CM_SUBSCRIPT;
      do_re_display(&tesseract::Tesseract::word_display);
      break;
    case SHOW_SUPERSCRIPT_CMD_EVENT:
      color_mode = CM_SUPERSCRIPT;
      do_re_display(&tesseract::Tesseract::word_display);
      break;
    case SHOW_ITALIC_CMD_EVENT:
      color_mode = CM_ITALIC;
      do_re_display(&tesseract::Tesseract::word_display);
      break;
    case SHOW_BOLD_CMD_EVENT:
      color_mode = CM_BOLD;
      do_re_display(&tesseract::Tesseract::word_display);
      break;
    case SHOW_UNDERLINE_CMD_EVENT:
      color_mode = CM_UNDERLINE;
      do_re_display(&tesseract::Tesseract::word_display);
      break;
    case SHOW_FIXEDPITCH_CMD_EVENT:
      color_mode = CM_FIXEDPITCH;
      do_re_display(&tesseract::Tesseract::word_display);
      break;
    case SHOW_SERIF_CMD_EVENT:
      color_mode = CM_SERIF;
      do_re_display(&tesseract::Tesseract::word_display);
      break;
    case SHOW_SMALLCAPS_CMD_EVENT:
      color_mode = CM_SMALLCAPS;
      do_re_display(&tesseract::Tesseract::word_display);
      break;
    case SHOW_DROPCAPS_CMD_EVENT:
      color_mode = CM_DROPCAPS;
      do_re_display(&tesseract::Tesseract::word_display);
      break;

    case REFRESH_CMD_EVENT:
      do_re_display(&tesseract::Tesseract::word_display);
      break;
    case QUIT_CMD_EVENT:
      exit_requested = TRUE;
      ScrollView::Exit();
      break;

    default:
      // new_value has no length limit, so bound the write to the size of
      // msg instead of using an unbounded sprintf.
      snprintf(msg, sizeof(msg), "Unrecognised event " INT32FORMAT "(%s)",
               cmd_event, new_value);
      image_win->AddMessage(msg);
      break;
  }
  return exit_requested;
}
void tesseract::Tesseract::process_image_event | ( | const SVEvent & | event | ) |
User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.
Definition at line 566 of file pgedit.cpp.
{
  // The following variable should remain static, since it is used by
  // debug editor, which uses a single Tesseract instance.
  static ICOORD down;
  ICOORD up;
  TBOX selection_box;
  char msg[80];

  // Only selection events are acted upon; every other event is ignored.
  if (event.type != SVET_SELECTION)
    return;

  // Build the selection box from the two opposite corners in the event.
  down.set_x(event.x + event.x_size);
  down.set_y(event.y + event.y_size);
  if (mode == SHOW_POINT_CMD_EVENT)
    show_point(current_page_res, event.x, event.y);

  up.set_x(event.x);
  up.set_y(event.y);
  selection_box = TBOX(down, up);

  // Apply the operation selected by the current mode to the selection.
  switch (mode) {
    case CHANGE_DISP_CMD_EVENT:
      process_selected_words(
          current_page_res, selection_box,
          &tesseract::Tesseract::word_blank_and_set_display);
      break;
    case DUMP_WERD_CMD_EVENT:
      process_selected_words(current_page_res, selection_box,
                             &tesseract::Tesseract::word_dumper);
      break;
    case SHOW_BLN_WERD_CMD_EVENT:
      process_selected_words(current_page_res, selection_box,
                             &tesseract::Tesseract::word_bln_display);
      break;
    case DEBUG_WERD_CMD_EVENT:
      debug_word(current_page_res, selection_box);
      break;
    case SHOW_POINT_CMD_EVENT:
      break;  // ignore up event
    case RECOG_WERDS:
      image_win->AddMessage("Recogging selected words");
      this->process_selected_words(current_page_res, selection_box,
                                   &Tesseract::recog_interactive);
      break;
    case RECOG_PSEUDO:
      image_win->AddMessage("Recogging selected blobs");
      recog_pseudo_word(current_page_res, selection_box);
      break;
    case SHOW_BLOB_FEATURES:
      blob_feature_display(current_page_res, selection_box);
      break;
    default:
      sprintf(msg, "Mode %d not yet implemented", mode);
      image_win->AddMessage(msg);
      break;
  }
}
void tesseract::Tesseract::process_selected_words | ( | PAGE_RES * | page_res, |
TBOX & | selection_box, | ||
BOOL8(tesseract::Tesseract::*)(BLOCK *block, ROW *row, WERD_RES *word_res) | word_processor | ||
) |
Definition at line 30 of file pagewalk.cpp.
{
  // Visit every word on the page and call word_processor on those whose
  // bounding box overlaps selection_box. Stop as soon as it returns false.
  PAGE_RES_IT page_res_it(page_res);
  while (page_res_it.word() != NULL) {
    WERD* word = page_res_it.word()->word;
    if (word->bounding_box().overlap(selection_box)) {
      const BOOL8 keep_going =
          (this->*word_processor)(page_res_it.block()->block,
                                  page_res_it.row()->row,
                                  page_res_it.word());
      if (!keep_going)
        return;
    }
    page_res_it.forward();
  }
}
bool tesseract::Tesseract::ProcessTargetWord | ( | const TBOX & | word_box, |
const TBOX & | target_word_box, | ||
const char * | word_config, | ||
int | pass | ||
) |
Definition at line 128 of file control.cpp.
{
  // With a word_config: switches parameters to word_config while processing
  // a word that majorly overlaps target_word_box, and restores the backed-up
  // parameters on the first word that does not. Always returns true in this
  // case.
  // Without a word_config: returns false on passes after the first for words
  // that do not majorly overlap target_word_box, true otherwise.
  if (word_config != NULL) {
    if (word_box.major_overlap(target_word_box)) {
      if (backup_config_file_ == NULL) {
        // Entering the target word: save the current parameters so they can
        // be restored later, then load the word-specific config.
        backup_config_file_ = kBackUpConfigFile;
        FILE* config_fp = fopen(backup_config_file_, "wb");
        if (config_fp == NULL) {
          // The backup file could not be opened. Report it instead of
          // passing a NULL FILE* to PrintParams()/fclose().
          tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
        } else {
          ParamUtils::PrintParams(config_fp, params());
          fclose(config_fp);
        }
        ParamUtils::ReadParamsFile(word_config,
                                   SET_PARAM_CONSTRAINT_DEBUG_ONLY,
                                   params());
      }
    } else {
      if (backup_config_file_ != NULL) {
        // Leaving the target word: restore the saved parameters.
        ParamUtils::ReadParamsFile(backup_config_file_,
                                   SET_PARAM_CONSTRAINT_DEBUG_ONLY,
                                   params());
        backup_config_file_ = NULL;
      }
    }
  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
    return false;
  }
  return true;
}
void tesseract::Tesseract::quality_based_rejection | ( | PAGE_RES_IT & | page_res_it, |
BOOL8 | good_quality_doc | ||
) |
Definition at line 141 of file docqual.cpp.
{
  // Good-quality documents may first have their good-quality words
  // unrejected, if that option is enabled.
  if (tessedit_good_quality_unrej && good_quality_doc) {
    unrej_good_quality_words(page_res_it);
  }

  doc_and_block_rejection(page_res_it, good_quality_doc);

  // Tilde crunching and deletion only run when the option is switched on.
  if (!unlv_tilde_crunching)
    return;
  tilde_crunch(page_res_it);
  tilde_delete(page_res_it);
}
void tesseract::Tesseract::read_config_file | ( | const char * | filename, |
SetParamConstraint | constraint | ||
) |
Definition at line 50 of file tessedit.cpp.
{
  // Directories under datadir that are searched, in order, for the config
  // file. If it is in neither, filename is used as given.
  static const char* const kConfigDirs[] = {"configs/", "tessconfigs/"};
  const int kNumConfigDirs = 2;

  STRING path;
  bool found = false;
  for (int d = 0; d < kNumConfigDirs && !found; ++d) {
    path = datadir;
    path += kConfigDirs[d];
    path += filename;
    // The file is opened only to test that it exists and is readable.
    FILE* fp = fopen(path.string(), "rb");
    if (fp != NULL) {
      fclose(fp);
      found = true;
    }
  }
  if (!found) {
    path = filename;
  }

  ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
}
bool tesseract::Tesseract::recog_all_words | ( | PAGE_RES * | page_res, |
ETEXT_DESC * | monitor, | ||
const TBOX * | target_word_box, | ||
const char * | word_config, | ||
int | dopasses | ||
) |
Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if 1, just pass 1; if 2, passes 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s); otherwise, on pass 2 and beyond ONLY the target words are processed (Jetsoft modification). Returns false if we cancelled prematurely.
page_res | page structure |
monitor | progress monitor |
word_config | word_config file |
target_word_box | specifies just to extract a rectangle |
dopasses | 0 - all, 1 just pass 1, 2 passes 2 and higher |
Definition at line 283 of file control.cpp.
{
  // Drives all recognition passes over page_res. dopasses selects which
  // passes run (0 = all, 1 = pass 1 only, 2 = pass 2 and higher). Returns
  // false if RecogAllWordsPassN reported an early stop, true otherwise.
  PAGE_RES_IT page_res_it(page_res);

  if (tessedit_minimal_rej_pass1) {
    tessedit_test_adaption.set_value(TRUE);
    tessedit_minimal_rejection.set_value(TRUE);
  }

  if (dopasses == 0 || dopasses == 1) {
    page_res_it.restart_page();
    // ****************** Pass 1 *******************

    // Clear adaptive classifier at the beginning of the page if it is full.
    // This is done only at the beginning of the page to ensure that the
    // classifier is not reset at an arbitrary point while processing the
    // page, which would cripple Passes 2+ if the reset happens towards the
    // end of Pass 1 on a page with very difficult text.
    // TODO(daria): preemptively clear the classifier if it is almost full.
    if (AdaptiveClassifierIsFull()) {
      ResetAdaptiveClassifierInternal();
    }
    // Now check the sub-langs as well.
    for (int lang = 0; lang < sub_langs_.size(); ++lang) {
      if (sub_langs_[lang]->AdaptiveClassifierIsFull()) {
        sub_langs_[lang]->ResetAdaptiveClassifierInternal();
      }
    }

    // Set up all words ready for recognition, so that if parallelism is on
    // all the input and output classes are ready to run the classifier.
    GenericVector<WordData> words;
    SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
    if (tessedit_parallelize) {
      PrerecAllWordsPar(words);
    }

    stats_.word_count = words.size();
    stats_.dict_words = 0;
    stats_.doc_blob_quality = 0;
    stats_.doc_outline_errs = 0;
    stats_.doc_char_quality = 0;
    stats_.good_char_count = 0;
    stats_.doc_good_char_quality = 0;

    most_recently_used_ = this;
    // Run pass 1 word recognition.
    if (!RecogAllWordsPassN(1, monitor, &words)) {
      return false;
    }

    // Pass 1 post-processing.
    for (; page_res_it.word() != NULL; page_res_it.forward()) {
      WERD_RES* word = page_res_it.word();
      if (word->word->flag(W_REP_CHAR)) {
        fix_rep_char(&page_res_it);
        continue;
      }
      // Count dict words.
      if (word->best_choice->permuter() == USER_DAWG_PERM) {
        ++(stats_.dict_words);
      }
      // Update misadaption log (we only need to do it on pass 1, since
      // adaption only happens on this pass).
      if (word->blamer_bundle != NULL &&
          word->blamer_bundle->misadaption_debug().length() > 0) {
        page_res->misadaption_log.push_back(
            word->blamer_bundle->misadaption_debug());
      }
    }
  }

  if (dopasses == 1) {
    return true;
  }

  // ****************** Pass 2 *******************
  if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption) {
    page_res_it.restart_page();
    GenericVector<WordData> words;
    SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
    if (tessedit_parallelize) {
      PrerecAllWordsPar(words);
    }
    most_recently_used_ = this;
    // Run pass 2 word recognition.
    if (!RecogAllWordsPassN(2, monitor, &words)) {
      return false;
    }
    // Pass 2 post-processing.
    for (; page_res_it.word() != NULL; page_res_it.forward()) {
      WERD_RES* word = page_res_it.word();
      if (word->word->flag(W_REP_CHAR) && !word->done) {
        fix_rep_char(&page_res_it);
      }
    }
  }

  // The next passes can only be run if tesseract has been used, as cube
  // doesn't set all the necessary outputs in WERD_RES.
  if (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY ||
      tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
    // ****************** Pass 3 *******************
    // Fix fuzzy spaces.
    set_global_loc_code(LOC_FUZZY_SPACE);
    if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces &&
        !tessedit_word_for_word && !right_to_left()) {
      fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
    }

    // ****************** Pass 4 *******************
    if (tessedit_enable_bigram_correction) {
      bigram_correction_pass(page_res);
    }

    // ****************** Pass 5,6 *******************
    rejection_passes(page_res, monitor, target_word_box, word_config);

    // ****************** Pass 7 *******************
    // Cube combiner.
    // If cube is loaded and its combiner is present, run it.
    if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
      run_cube_combiner(page_res);
    }

    // ****************** Pass 8 *******************
    font_recognition_pass(page_res);

    // ****************** Pass 9 *******************
    // Check the correctness of the final results.
    blamer_pass(page_res);
  }
  script_pos_pass(page_res);

  // Write results pass.
  set_global_loc_code(LOC_WRITE_RESULTS);
  // This is now redundant, but retained commented so show how to obtain
  // bounding boxes and style information.

  // changed by jetsoft
  // needed for dll to output memory structure
  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) {
    output_pass(page_res_it, target_word_box);
  }
  // end jetsoft

  PageSegMode pageseg_mode = static_cast<PageSegMode>(
      static_cast<int>(tessedit_pageseg_mode));
  textord_.CleanupSingleRowResult(pageseg_mode, page_res);

  if (monitor != NULL) {
    monitor->progress = 100;
  }
  return true;
}
BOOL8 tesseract::Tesseract::recog_interactive | ( | BLOCK * | block, |
ROW * | row, | ||
WERD_RES * | word_res | ||
) |
recog_interactive
Recognize a single word in interactive mode.
block | block |
row | row of word |
word_res | word to recognise |
Definition at line 96 of file control.cpp.
{
  // Classifies one word with the pass-2 classifier and, when
  // tessedit_debug_quality_metrics is set, prints its quality metrics.
  // Always returns TRUE.
  WordData word_data(block, row, word_res);
  SetupWordPassN(2, &word_data);
  classify_word_and_language(&Tesseract::classify_word_pass2, &word_data);

  if (!tessedit_debug_quality_metrics) {
    return TRUE;
  }

  inT16 char_qual;
  inT16 good_char_qual;
  word_char_quality(word_res, row, &char_qual, &good_char_qual);
  tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
          word_res->reject_map.length(),
          word_blob_quality(word_res, row),
          word_outline_errs(word_res),
          char_qual, good_char_qual);
  return TRUE;
}
void tesseract::Tesseract::recog_pseudo_word | ( | PAGE_RES * | page_res, |
TBOX & | selection_box | ||
) |
Definition at line 71 of file control.cpp.
{
  // Builds a pseudo word from the selection box and, if one was produced,
  // recognizes it interactively. The pseudo word is deleted afterwards.
  ROW* pseudo_row;      // row of word
  BLOCK* pseudo_block;  // block of word
  WERD* pseudo_word =
      make_pseudo_word(page_res, selection_box, pseudo_block, pseudo_row);
  if (pseudo_word == NULL) {
    return;
  }
  WERD_RES word_res(pseudo_word);
  recog_interactive(pseudo_block, pseudo_row, &word_res);
  delete pseudo_word;
}
void tesseract::Tesseract::recog_training_segmented | ( | const STRING & | fname, |
PAGE_RES * | page_res, | ||
volatile ETEXT_DESC * | monitor, | ||
FILE * | output_file | ||
) |
Definition at line 83 of file recogtraining.cpp.
{
  // Walks the page words and the entries of the matching .box file in
  // lock-step, and sends each word whose box agrees with a box-file entry
  // (within kMaxBoxEdgeDiff on every edge) to ambigs_classify_and_output.
  STRING box_fname = fname;
  const char* last_dot = strrchr(box_fname.string(), '.');
  if (last_dot != NULL) {
    box_fname[last_dot - box_fname.string()] = '\0';
  }
  box_fname += ".box";
  // read_next_box() will close box_file
  FILE* box_file = open_file(box_fname.string(), "r");

  PAGE_RES_IT page_res_it;
  page_res_it.page_res = page_res;
  page_res_it.restart_page();
  STRING label;

  // Process all the words on this page.
  TBOX tbox;  // tesseract-identified box
  TBOX bbox;  // box from the box file
  int line_number = 0;
  int examined_words = 0;
  bool keep_going = true;
  while (keep_going) {
    // Both sources are advanced unconditionally (note the non-short-circuit
    // &=).
    keep_going = read_t(&page_res_it, &tbox);
    keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label,
                              &bbox);
    // Align bottom left points of the TBOXes.
    while (keep_going &&
           !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
      if (bbox.bottom() < tbox.bottom()) {
        keep_going = read_t(&page_res_it, &tbox);
      } else {
        keep_going = ReadNextBox(applybox_page, &line_number, box_file,
                                 &label, &bbox);
      }
    }
    while (keep_going &&
           !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
      if (bbox.left() > tbox.left()) {
        keep_going = read_t(&page_res_it, &tbox);
      } else {
        keep_going = ReadNextBox(applybox_page, &line_number, box_file,
                                 &label, &bbox);
      }
    }
    // OCR the word if top right points of the TBOXes are similar.
    if (keep_going &&
        NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
        NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
      ambigs_classify_and_output(page_res_it.prev_word(),
                                 page_res_it.prev_row(),
                                 page_res_it.prev_block(),
                                 label.string(), output_file);
      examined_words++;
    }
  }

  // Set up scripts on all of the words that did not get sent to
  // ambigs_classify_and_output. They all should have, but if all the
  // werd_res's don't get uch_sets, tesseract will crash when you try
  // to iterate over them. :-(
  int total_words = 0;
  for (page_res_it.restart_page(); page_res_it.block() != NULL;
       page_res_it.forward()) {
    if (!page_res_it.word()) {
      continue;
    }
    if (page_res_it.word()->uch_set == NULL) {
      page_res_it.word()->SetupFake(unicharset);
    }
    total_words++;
  }
  if (examined_words < 0.85 * total_words) {
    tprintf("TODO(antonova): clean up recog_training_segmented; "
            " It examined only a small fraction of the ambigs image.\n");
  }
  tprintf("recog_training_segmented: examined %d / %d words.\n",
          examined_words, total_words);
}
void tesseract::Tesseract::recog_word | ( | WERD_RES * | word | ) |
Definition at line 46 of file tfacepp.cpp.
{
  // Recognizes a single word, checks the result for internal consistency,
  // optionally overrides the permuter type from a dictionary check, and
  // finally sets word->tess_failed.
  if (wordrec_skip_no_truth_words &&
      (word->blamer_bundle == NULL ||
       word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
    if (classify_debug_level) {
      tprintf("No truth for word - skipping\n");
    }
    word->tess_failed = true;
    return;
  }

  ASSERT_HOST(!word->chopped_word->blobs.empty());
  recog_word_recursive(word);
  word->SetupBoxWord();

  // The best choice must have exactly one unichar per output box.
  if (word->best_choice->length() != word->box_word->length()) {
    tprintf("recog_word ASSERT FAIL String:\"%s\"; "
            "Strlen=%d; #Blobs=%d\n",
            word->best_choice->debug_string().string(),
            word->best_choice->length(), word->box_word->length());
  }
  ASSERT_HOST(word->best_choice->length() == word->box_word->length());

  // Check that the ratings matrix size matches the sum of all the
  // segmentation states.
  if (!word->StatesAllValid()) {
    tprintf("Not all words have valid states relative to ratings matrix!!");
    word->DebugWordChoices(true, NULL);
    ASSERT_HOST(word->StatesAllValid());
  }

  if (tessedit_override_permuter) {
    /* Override the permuter type if a straight dictionary check disagrees. */
    uinT8 perm_type = word->best_choice->permuter();
    const bool already_dict_perm = perm_type == SYSTEM_DAWG_PERM ||
                                   perm_type == FREQ_DAWG_PERM ||
                                   perm_type == USER_DAWG_PERM;
    if (!already_dict_perm) {
      uinT8 real_dict_perm_type = dict_word(*word->best_choice);
      const bool dict_says_word = real_dict_perm_type == SYSTEM_DAWG_PERM ||
                                  real_dict_perm_type == FREQ_DAWG_PERM ||
                                  real_dict_perm_type == USER_DAWG_PERM;
      if (dict_says_word &&
          alpha_count(word->best_choice->unichar_string().string(),
                      word->best_choice->unichar_lengths().string()) > 0) {
        word->best_choice->set_permuter(real_dict_perm_type);  // use dict perm
      }
    }
    if (tessedit_rejection_debug &&
        perm_type != word->best_choice->permuter()) {
      tprintf("Permuter Type Flipped from %d to %d\n",
              perm_type, word->best_choice->permuter());
    }
  }

  // Factored out from control.cpp
  ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));
  // A missing, empty or all-space result counts as a failure.
  const bool failed =
      word->best_choice == NULL || word->best_choice->length() == 0 ||
      static_cast<int>(strspn(word->best_choice->unichar_string().string(),
                              " ")) == word->best_choice->length();
  word->tess_failed = failed;
  if (failed) {
    word->reject_map.initialise(word->box_word->length());
    word->reject_map.rej_word_tess_failure();
  }
}
void tesseract::Tesseract::recog_word_recursive | ( | WERD_RES * | word | ) |
Definition at line 110 of file tfacepp.cpp.
{
  // Recognizes the word directly when it has at most MAX_UNDIVIDED_LENGTH
  // blobs; otherwise hands it to split_and_recog_word. After a direct
  // recognition, best_choice is made consistent with the output blob count.
  const int input_blobs = word->chopped_word->NumBlobs();  // no of blobs
  if (input_blobs > MAX_UNDIVIDED_LENGTH) {
    split_and_recog_word(word);
    return;
  }
  cc_recog(word);
  const int output_blobs = word->rebuild_word->NumBlobs();  // blobs in output

  // Do sanity checks and minor fixes on best_choice.
  if (word->best_choice->length() > output_blobs) {
    word->best_choice->make_bad();  // should never happen
    tprintf("recog_word: Discarded long string \"%s\""
            " (%d characters vs %d blobs)\n",
            word->best_choice->unichar_string().string(),
            word->best_choice->length(), output_blobs);
    tprintf("Word is at:");
    word->word->bounding_box().print();
  }
  if (word->best_choice->length() < output_blobs) {
    // Pad with spaces until there is one unichar per blob.
    UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
    do {
      word->best_choice->append_unichar_id(space_id, 1, 0.0,
                                           word->best_choice->certainty());
    } while (word->best_choice->length() < output_blobs);
  }
}
bool tesseract::Tesseract::RecogAllWordsPassN | ( | int | pass_n, |
ETEXT_DESC * | monitor, | ||
GenericVector< WordData > * | words | ||
) |
Definition at line 223 of file control.cpp.
{
  // Runs the pass_n classifier over every entry of words, updating the
  // optional monitor as it goes. Returns false if the monitor signalled a
  // deadline or cancel; in that case the remaining words are set up as fakes.
  //
  // TODO(rays) Before this loop can be parallelized (it would yield a massive
  // speed-up) all remaining member globals need to be converted to local/heap
  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
  // added. The results will be significantly different with adaption on, and
  // deterioration will need investigation.
  for (int w = 0; w < words->size(); ++w) {
    WordData* word_data = &(*words)[w];
    if (monitor != NULL) {
      monitor->ocr_alive = TRUE;
      monitor->progress = (pass_n == 1) ? 30 + 50 * w / words->size()
                                        : 80 + 10 * w / words->size();
      const bool stop_requested =
          monitor->deadline_exceeded() ||
          (monitor->cancel != NULL &&
           (*monitor->cancel)(monitor->cancel_this, words->size()));
      if (stop_requested) {
        // Timeout. Fake out the rest of the words.
        for (; w < words->size(); ++w) {
          (*words)[w].word->SetupFake(unicharset);
        }
        return false;
      }
    }
    if (word_data->word->tess_failed) {
      continue;
    }

    WordRecognizer recognizer = &Tesseract::classify_word_pass2;
    if (pass_n == 1) {
      recognizer = &Tesseract::classify_word_pass1;
    }
    classify_word_and_language(recognizer, word_data);

    if (tessedit_dump_choices) {
      word_dumper(NULL, word_data->row, word_data->word);
      tprintf("Pass%d: %s [%s]\n", pass_n,
              word_data->word->best_choice->unichar_string().string(),
              word_data->word->best_choice->debug_string().string());
    }
  }
  return true;
}
void tesseract::Tesseract::recognize_page | ( | STRING & | image_name | ) |
void tesseract::Tesseract::reject_edge_blobs | ( | WERD_RES * | word | ) |
Definition at line 266 of file reject.cpp.
{
  // Marks as edge rejects the blobs of a word that lie within
  // tessedit_image_border of any image edge. Words whose own bounding box
  // is clear of the border are left untouched.
  TBOX word_box = word->word->bounding_box();
  // Use the box_word as it is already denormed back to image coordinates.
  int blobcount = word->box_word->length();

  const bool word_near_edge =
      word_box.left() < tessedit_image_border ||
      word_box.bottom() < tessedit_image_border ||
      word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
      word_box.top() + tessedit_image_border > ImageHeight() - 1;
  if (!word_near_edge) {
    return;
  }

  ASSERT_HOST(word->reject_map.length() == blobcount);
  for (int b = 0; b < blobcount; b++) {
    TBOX blob_box = word->box_word->BlobBox(b);
    const bool blob_near_edge =
        blob_box.left() < tessedit_image_border ||
        blob_box.bottom() < tessedit_image_border ||
        blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
        blob_box.top() + tessedit_image_border > ImageHeight() - 1;
    if (blob_near_edge) {
      word->reject_map[b].setrej_edge_char();  // Close to edge
    }
  }
}
void tesseract::Tesseract::reject_I_1_L | ( | WERD_RES * | word | ) |
Definition at line 194 of file reject.cpp.
{
  // Flags every character of the best choice whose first byte is in
  // conflict_set_I_l_1 as a 1Il-conflict reject.
  // char_index counts unichars; byte_offset is the position of the current
  // unichar in the unichar string, advanced by the per-unichar lengths.
  inT16 char_index = 0;
  inT16 byte_offset = 0;
  while (word->best_choice->unichar_string()[byte_offset] != '\0') {
    if (STRING(conflict_set_I_l_1)
            .contains(word->best_choice->unichar_string()[byte_offset])) {
      // rej 1Il conflict
      word->reject_map[char_index].setrej_1Il_conflict();
    }
    byte_offset += word->best_choice->unichar_lengths()[char_index];
    char_index += 1;
  }
}
void tesseract::Tesseract::reject_mostly_rejects | ( | WERD_RES * | word | ) |
Definition at line 576 of file reject.cpp.
{
  /* Reject the whole of the word if the fraction of rejects exceeds a limit */
  const bool mostly_rejected =
      static_cast<float>(word->reject_map.reject_count()) /
          word->reject_map.length() >=
      rej_whole_of_mostly_reject_word_fract;
  if (mostly_rejected) {
    word->reject_map.rej_word_mostly_rej();
  }
}
void tesseract::Tesseract::rejection_passes | ( | PAGE_RES * | page_res, |
ETEXT_DESC * | monitor, | ||
const TBOX * | target_word_box, | ||
const char * | word_config | ||
) |
Definition at line 582 of file control.cpp.
{
  // Pass 5 gathers per-word reject and quality statistics into stats_ and
  // page_res; pass 6 runs quality_based_rejection using a document-quality
  // verdict derived from those statistics.
  //   page_res        - page results to walk.
  //   monitor         - optional progress monitor; may be NULL.
  //   target_word_box - optional box limiting which words are counted.
  //   word_config     - forwarded to ProcessTargetWord.
  PAGE_RES_IT page_res_it(page_res);

  // ****************** Pass 5 *******************
  // Gather statistics on rejects.
  int word_index = 0;
  for (; !tessedit_test_adaption && page_res_it.word() != NULL;
       page_res_it.forward()) {
    set_global_loc_code(LOC_MM_ADAPT);
    WERD_RES* word = page_res_it.word();
    word_index++;
    if (monitor != NULL) {
      monitor->ocr_alive = TRUE;
      // Guard the integer division against a zero word count; integer
      // division by zero is undefined behaviour.
      // NOTE(review): presumably word_count can still be 0 here when pass 1
      // did not run for this page - confirm against the callers.
      if (stats_.word_count > 0) {
        monitor->progress = 95 + 5 * word_index / stats_.word_count;
      }
    }
    if (word->rebuild_word == NULL) {
      // Word was not processed by tesseract.
      continue;
    }
    check_debug_pt(word, 70);

    // changed by jetsoft
    // specific to its needs to extract one word when need
    if (target_word_box &&
        !ProcessTargetWord(word->word->bounding_box(), *target_word_box,
                           word_config, 4)) {
      continue;
    }
    // end jetsoft

    page_res_it.rej_stat_word();
    int chars_in_word = word->reject_map.length();
    int rejects_in_word = word->reject_map.reject_count();

    int blob_quality = word_blob_quality(word, page_res_it.row()->row);
    stats_.doc_blob_quality += blob_quality;
    int outline_errs = word_outline_errs(word);
    stats_.doc_outline_errs += outline_errs;
    inT16 all_char_quality;
    inT16 accepted_all_char_quality;
    word_char_quality(word, page_res_it.row()->row,
                      &all_char_quality, &accepted_all_char_quality);
    stats_.doc_char_quality += all_char_quality;
    // Only dictionary-permuted words feed the good-character statistics.
    uinT8 permuter_type = word->best_choice->permuter();
    if ((permuter_type == SYSTEM_DAWG_PERM) ||
        (permuter_type == FREQ_DAWG_PERM) ||
        (permuter_type == USER_DAWG_PERM)) {
      stats_.good_char_count += chars_in_word - rejects_in_word;
      stats_.doc_good_char_quality += accepted_all_char_quality;
    }
    check_debug_pt(word, 80);
    if (tessedit_reject_bad_qual_wds &&
        (blob_quality == 0) && (outline_errs >= chars_in_word)) {
      word->reject_map.rej_word_bad_quality();
    }
    check_debug_pt(word, 90);
  }

  if (tessedit_debug_quality_metrics) {
    tprintf
      ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
       " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
       page_res->char_count, page_res->rej_count,
       page_res->rej_count / static_cast<float>(page_res->char_count),
       stats_.doc_blob_quality,
       stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
       stats_.doc_outline_errs,
       stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
       stats_.doc_char_quality,
       stats_.doc_char_quality / static_cast<float>(page_res->char_count),
       stats_.doc_good_char_quality,
       (stats_.good_char_count > 0) ?
       (stats_.doc_good_char_quality /
        static_cast<float>(stats_.good_char_count)) : 0.0);
  }

  // The document counts as good quality only if all four per-character
  // ratios are on the right side of their thresholds.
  BOOL8 good_quality_doc =
      ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
       quality_rej_pc) &&
      (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
       quality_blob_pc) &&
      (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
       quality_outline_pc) &&
      (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
       quality_char_pc);

  // ****************** Pass 6 *******************
  // Do whole document or whole block rejection pass
  if (!tessedit_test_adaption) {
    set_global_loc_code(LOC_DOC_BLK_REJ);
    quality_based_rejection(page_res_it, good_quality_doc);
  }
}
BOOL8 tesseract::Tesseract::repeated_nonalphanum_wd | ( | WERD_RES * | word, |
ROW * | row | ||
) |
Definition at line 585 of file reject.cpp.
{
  // Returns TRUE when the best choice has more than one unichar, every
  // unichar is the same, its first byte is in ok_repeated_ch_non_alphanum_wds,
  // and both character-quality counts equal the number of unichars.
  if (word->best_choice->unichar_lengths().length() <= 1) {
    return FALSE;
  }
  if (!STRING(ok_repeated_ch_non_alphanum_wds)
           .contains(word->best_choice->unichar_string()[0])) {
    return FALSE;
  }

  // Every unichar must match the first one.
  UNICHAR_ID first_id = word->best_choice->unichar_id(0);
  for (int pos = 1; pos < word->best_choice->length(); ++pos) {
    if (word->best_choice->unichar_id(pos) != first_id) {
      return FALSE;
    }
  }

  inT16 char_quality;
  inT16 accepted_char_quality;
  word_char_quality(word, row, &char_quality, &accepted_char_quality);

  if (word->best_choice->unichar_lengths().length() != char_quality) {
    return FALSE;
  }
  if (char_quality != accepted_char_quality) {
    return FALSE;
  }
  return TRUE;
}
void tesseract::Tesseract::ReportFailedBox | ( | int | boxfile_lineno, |
TBOX | box, | ||
const char * | box_ch, | ||
const char * | err_msg | ||
) |
void tesseract::Tesseract::ReportXhtFixResult | ( | bool | accept_new_word, |
float | new_x_ht, | ||
WERD_RES * | word, | ||
WERD_RES * | new_word | ||
) |
Definition at line 948 of file control.cpp.
{
  // Debug report of an x-height fix attempt: prints the old and new best
  // choices with their reject maps, followed by a one-line verdict.
  WERD_CHOICE* old_choice = word->best_choice;
  tprintf("New XHT Match:%s = %s ",
          old_choice->unichar_string().string(),
          old_choice->debug_string().string());
  word->reject_map.print(debug_fp);

  WERD_CHOICE* new_choice = new_word->best_choice;
  tprintf(" -> %s = %s ",
          new_choice->unichar_string().string(),
          new_choice->debug_string().string());
  new_word->reject_map.print(debug_fp);

  const char* old_xht_kind = word->guessed_x_ht ? "GUESS" : "CERT";
  const char* new_xht_kind = new_word->guessed_x_ht ? "GUESS" : "CERT";
  const char* doubt = new_x_ht > 0.1 ? "STILL DOUBT" : "OK";
  const char* verdict = accept_new_word ? "ACCEPTED" : "";
  tprintf(" %s->%s %s %s\n", old_xht_kind, new_xht_kind, doubt, verdict);
}
void tesseract::Tesseract::ReSegmentByClassification | ( | PAGE_RES * | page_res | ) |
Definition at line 495 of file applybox.cpp.
{
  // For every word that carries text, converts the text to unichar ids and
  // looks for a segmentation that matches it. Words for which either step
  // fails are deleted from the page results.
  PAGE_RES_IT pr_it(page_res);
  for (; pr_it.word() != NULL; pr_it.forward()) {
    WERD_RES* word_res = pr_it.word();
    WERD* word = word_res->word;
    if (word->text() == NULL || word->text()[0] == '\0') {
      continue;  // Ignore words that have no text.
    }
    // Convert the correct text to a vector of UNICHAR_ID
    GenericVector<UNICHAR_ID> target_text;
    if (!ConvertStringToUnichars(word->text(), &target_text)) {
      tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
              word->text());
      pr_it.DeleteCurrentWord();
    } else if (!FindSegmentation(target_text, word_res)) {
      tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
              word->text());
      pr_it.DeleteCurrentWord();
    }
  }
}
bool tesseract::Tesseract::ResegmentCharBox | ( | PAGE_RES * | page_res, |
const TBOX * | prev_box, | ||
const TBOX & | box, | ||
const TBOX & | next_box, | ||
const char * | correct_text | ||
) |
Definition at line 326 of file applybox.cpp.
{
  // Searches the page for a run of unclaimed blobs that matches box better
  // than next_box, merges that run into a single box-word entry and records
  // correct_text for it. Returns true on the first successful merge, false
  // if no run is found or the run overlaps its neighbours too much.
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
  }
  PAGE_RES_IT page_res_it(page_res);
  for (WERD_RES* word_res = page_res_it.word(); word_res != NULL;
       word_res = page_res_it.forward()) {
    if (!word_res->box_word->bounding_box().major_overlap(box)) {
      continue;
    }
    if (applybox_debug > 1) {
      tprintf("Checking word box:");
      word_res->box_word->bounding_box().print();
    }
    int word_len = word_res->box_word->length();
    for (int start = 0; start < word_len; ++start) {
      // Grow a run of consecutive blobs beginning at start.
      TBOX char_box = TBOX();
      int blob_count = 0;
      while (start + blob_count < word_len) {
        TBOX blob_box = word_res->box_word->BlobBox(start + blob_count);
        if (!blob_box.major_overlap(box)) {
          break;
        }
        if (word_res->correct_text[start + blob_count].length() > 0) {
          break;  // Blob is claimed already.
        }
        double current_box_miss_metric = BoxMissMetric(blob_box, box);
        double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
        if (applybox_debug > 2) {
          tprintf("Checking blob:");
          blob_box.print();
          tprintf("Current miss metric = %g, next = %g\n",
                  current_box_miss_metric, next_box_miss_metric);
        }
        if (current_box_miss_metric > next_box_miss_metric) {
          break;  // Blob is a better match for next box.
        }
        char_box += blob_box;
        ++blob_count;
      }
      if (blob_count <= 0) {
        continue;
      }

      if (applybox_debug > 1) {
        tprintf("Index [%d, %d) seem good.\n", start, start + blob_count);
      }
      if (!char_box.almost_equal(box, 3) &&
          (box.x_gap(next_box) < -3 ||
           (prev_box != NULL && prev_box->x_gap(box) < -3))) {
        return false;
      }
      // We refine just the box_word, best_state and correct_text here.
      // The rebuild_word is made in TidyUp.
      // blob_count blobs are put together to match the box. Merge the
      // box_word boxes, save the blob_count in the state and the text.
      word_res->box_word->MergeBoxes(start, start + blob_count);
      word_res->best_state[start] = blob_count;
      word_res->correct_text[start] = correct_text;
      if (applybox_debug > 2) {
        tprintf("%d Blobs match: blob box:", blob_count);
        word_res->box_word->BlobBox(start).print();
        tprintf("Matches box:");
        box.print();
        tprintf("With next box:");
        next_box.print();
      }
      // Eliminated best_state and correct_text entries for the consumed
      // blobs.
      for (int consumed = 1; consumed < blob_count; ++consumed) {
        word_res->best_state.remove(start + 1);
        word_res->correct_text.remove(start + 1);
      }
      // Assume that no box spans multiple source words, so we are done with
      // this box.
      if (applybox_debug > 1) {
        tprintf("Best state = ");
        for (int j = 0; j < word_res->best_state.size(); ++j) {
          tprintf("%d ", word_res->best_state[j]);
        }
        tprintf("\n");
        tprintf("Correct text = [[ ");
        for (int j = 0; j < word_res->correct_text.size(); ++j) {
          tprintf("%s ", word_res->correct_text[j].string());
        }
        tprintf("]]\n");
      }
      return true;
    }
  }
  if (applybox_debug > 0) {
    tprintf("FAIL!\n");
  }
  return false;  // Failure.
}
bool tesseract::Tesseract::ResegmentWordBox | ( | BLOCK_LIST * | block_list, |
const TBOX & | box, | ||
const TBOX & | next_box, | ||
const char * | correct_text | ||
) |
Definition at line 424 of file applybox.cpp.
{
  // Collects every blob that matches box better than next_box, from words
  // that have no text yet, into one new word labelled with correct_text.
  // Returns true if at least one blob was moved.
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
  }
  WERD* new_word = NULL;
  BLOCK_IT block_it(block_list);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
       block_it.forward()) {
    BLOCK* block = block_it.data();
    if (!box.major_overlap(block->bounding_box())) {
      continue;
    }
    ROW_IT row_it(block->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      ROW* row = row_it.data();
      if (!box.major_overlap(row->bounding_box())) {
        continue;
      }
      WERD_IT word_it(row->word_list());
      for (word_it.mark_cycle_pt(); !word_it.cycled_list();
           word_it.forward()) {
        WERD* word = word_it.data();
        if (applybox_debug > 2) {
          tprintf("Checking word:");
          word->bounding_box().print();
        }
        if (word->text() != NULL && word->text()[0] != '\0') {
          continue;  // Ignore words that are already done.
        }
        if (!box.major_overlap(word->bounding_box())) {
          continue;
        }
        C_BLOB_IT blob_it(word->cblob_list());
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
             blob_it.forward()) {
          C_BLOB* blob = blob_it.data();
          TBOX blob_box = blob->bounding_box();
          if (!blob_box.major_overlap(box)) {
            continue;
          }
          double current_box_miss_metric = BoxMissMetric(blob_box, box);
          double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
          if (applybox_debug > 2) {
            tprintf("Checking blob:");
            blob_box.print();
            tprintf("Current miss metric = %g, next = %g\n",
                    current_box_miss_metric, next_box_miss_metric);
          }
          if (current_box_miss_metric > next_box_miss_metric) {
            continue;  // Blob is a better match for next box.
          }
          if (applybox_debug > 2) {
            tprintf("Blob match: blob:");
            blob_box.print();
            tprintf("Matches box:");
            box.print();
            tprintf("With next box:");
            next_box.print();
          }
          if (new_word == NULL) {
            // Make a new word with a single blob.
            new_word = word->shallow_copy();
            new_word->set_text(correct_text);
            word_it.add_to_end(new_word);
          }
          // Move the blob out of its current word into the new word.
          C_BLOB_IT new_blob_it(new_word->cblob_list());
          new_blob_it.add_to_end(blob_it.extract());
        }
      }
    }
  }
  const bool found = (new_word != NULL);
  if (!found && applybox_debug > 0) {
    tprintf("FAIL!\n");
  }
  return found;
}
Definition at line 545 of file tesseractclass.cpp.
{
  // Resets the adaptive classifier of this instance, then of every entry
  // in sub_langs_.
  ResetAdaptiveClassifierInternal();
  int lang = 0;
  while (lang < sub_langs_.size()) {
    sub_langs_[lang]->ResetAdaptiveClassifierInternal();
    ++lang;
  }
}
Definition at line 553 of file tesseractclass.cpp.
{
  // Resets the document dictionary of this instance, then of every entry
  // in sub_langs_.
  getDict().ResetDocumentDictionary();
  int lang = 0;
  while (lang < sub_langs_.size()) {
    sub_langs_[lang]->getDict().ResetDocumentDictionary();
    ++lang;
  }
}
const FCOORD& tesseract::Tesseract::reskew | ( | ) | const [inline] |
Definition at line 170 of file tesseractclass.h.
{
// Read-only accessor: returns a reference to the reskew_ member.
return reskew_;
}
bool tesseract::Tesseract::RetryWithLanguage | ( | const WERD_RES & | best_word, |
WordData * | word_data, | ||
WERD_RES * | word, | ||
WordRecognizer | recognizer | ||
) |
Definition at line 770 of file control.cpp.
{
  // Runs recognizer on word and reports whether the result beats best_word
  // according to NewWordBetter. Emits debug output when either
  // classify_debug_level or cube_debug_level is set.
  if (classify_debug_level || cube_debug_level) {
    tprintf("Retrying word using lang %s, oem %d\n",
            lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
  }
  // Run the recognizer on the word.
  // Initial version is a bit of a hack based on better certainty and rating
  // (to reduce false positives from cube) or a dictionary vs non-dictionary
  // word.
  (this->*recognizer)(word_data, word);
  bool new_is_better = NewWordBetter(best_word, *word,
                                     classify_max_rating_ratio,
                                     classify_max_certainty_margin);
  if (!(classify_debug_level || cube_debug_level)) {
    return new_is_better;
  }

  const char* verdict = new_is_better ? "IS" : "NOT";
  if (word->best_choice == NULL) {
    tprintf("NULL result %s better!\n", verdict);
  } else {
    tprintf("New result %s better:%s, r=%g, c=%g\n",
            verdict,
            word->best_choice->unichar_string().string(),
            word->best_choice->rating(),
            word->best_choice->certainty());
  }
  return new_is_better;
}
bool tesseract::Tesseract::right_to_left | ( | ) | const [inline] |
Definition at line 231 of file tesseractclass.h.
{
// Read-only accessor: returns the current value of the right_to_left_ member.
return right_to_left_;
}
void tesseract::Tesseract::run_cube_combiner | ( | PAGE_RES * | page_res | ) |
Definition at line 193 of file cube_control.cpp.
{
  // For each word in a text block whose certainty is below the combiner run
  // threshold, re-recognizes the word with cube and combines the cube result
  // with the existing one. Does nothing without a page or a combiner.
  if (page_res == NULL || tess_cube_combiner_ == NULL) {
    return;
  }
  PAGE_RES_IT page_res_it(page_res);
  // Iterate through the word results and call cube on each word.
  for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
    BLOCK* block = page_res_it.block()->block;
    const bool non_text_block =
        block->poly_block() != NULL && !block->poly_block()->IsText();
    if (non_text_block) {
      continue;  // Don't deal with non-text blocks.
    }
    WERD_RES* word = page_res_it.word();
    // Skip cube entirely if tesseract's certainty is greater than threshold.
    int combiner_run_thresh = convert_prob_to_tess_certainty(
        cube_cntxt_->Params()->CombinerRunThresh());
    if (word->best_choice->certainty() >= combiner_run_thresh) {
      continue;
    }
    // Use the same language as Tesseract used for the word.
    Tesseract* lang_tess = word->tesseract;

    // Setup a trial WERD_RES in which to classify with cube.
    WERD_RES cube_word;
    cube_word.InitForRetryRecognition(*word);
    cube_word.SetupForRecognition(lang_tess->unicharset, this, BestPix(),
                                  OEM_CUBE_ONLY, NULL, false, false, false,
                                  page_res_it.row()->row,
                                  page_res_it.block()->block);
    CubeObject* cube_obj = lang_tess->cube_recognize_word(
        page_res_it.block()->block, &cube_word);
    if (cube_obj != NULL) {
      lang_tess->cube_combine_word(cube_obj, &cube_word, word);
    }
    delete cube_obj;
  }
}
bool tesseract::Tesseract::RunOldFixXht | ( | WERD_RES * | word, |
BLOCK * | block, | ||
ROW * | row | ||
) |
inT16 tesseract::Tesseract::safe_dict_word | ( | const WERD_RES * | werd_res | ) |
Definition at line 610 of file reject.cpp.
{
  // Looks the best choice up with the dict_word of the Tesseract instance
  // attached to werd_res. A DOC_DAWG_PERM result is reported as 0; any other
  // result is returned unchanged.
  const WERD_CHOICE& best = *werd_res->best_choice;
  int dict_word_type = werd_res->tesseract->dict_word(best);
  if (dict_word_type == DOC_DAWG_PERM) {
    return 0;
  }
  return dict_word_type;
}
Pix* tesseract::Tesseract::scaled_color | ( | ) | const [inline] |
Definition at line 214 of file tesseractclass.h.
{
// Read-only accessor: returns the scaled_color_ member.
return scaled_color_;
}
int tesseract::Tesseract::scaled_factor | ( | ) | const [inline] |
Definition at line 217 of file tesseractclass.h.
{
// Read-only accessor: returns the scaled_factor_ member.
return scaled_factor_;
}
void tesseract::Tesseract::script_pos_pass | ( | PAGE_RES * | page_res | ) |
Definition at line 702 of file control.cpp.
{
  // Final per-word pass: flags small-caps words and sets script positions on
  // every word except repeated-character words.
  //   page_res - page results whose words are updated in place.
  PAGE_RES_IT page_res_it(page_res);
  for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
    WERD_RES* word = page_res_it.word();
    if (word->word->flag(W_REP_CHAR)) {
      // The loop header already advances the iterator. Calling forward()
      // here as well would advance twice and silently skip the word that
      // follows each repeated-character word.
      continue;
    }
    float x_height = page_res_it.block()->block->x_height();
    float word_x_height = word->x_height;
    // Fall back to the middle of the best choice's allowed x-height range
    // when the word's own x-height lies outside it.
    if (word_x_height < word->best_choice->min_x_height() ||
        word_x_height > word->best_choice->max_x_height()) {
      word_x_height = (word->best_choice->min_x_height() +
                       word->best_choice->max_x_height()) / 2.0f;
    }
    // Test for small caps. Word capheight must be close to block xheight,
    // and word must contain no lower case letters, and at least one upper
    // case.
    double small_cap_xheight = x_height * kXHeightCapRatio;
    double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
    if (word->uch_set->script_has_xheight() &&
        small_cap_xheight - small_cap_delta <= word_x_height &&
        word_x_height <= small_cap_xheight + small_cap_delta) {
      // Scan for upper/lower.
      int num_upper = 0;
      int num_lower = 0;
      for (int i = 0; i < word->best_choice->length(); ++i) {
        if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
          ++num_upper;
        else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
          ++num_lower;
      }
      if (num_upper > 0 && num_lower == 0)
        word->small_caps = true;
    }
    word->SetScriptPositions();
  }
}
void tesseract::Tesseract::SearchForText | ( | const GenericVector< BLOB_CHOICE_LIST * > * | choices, |
int | choices_pos, | ||
int | choices_length, | ||
const GenericVector< UNICHAR_ID > & | target_text, | ||
int | text_index, | ||
float | rating, | ||
GenericVector< int > * | segmentation, | ||
float * | best_rating, | ||
GenericVector< int > * | best_segmentation | ||
) |
Definition at line 607 of file applybox.cpp.
{
  // Recursive search: for each candidate segment length at choices_pos, look
  // for a blob choice that matches target_text[text_index] either directly or
  // through a 1-1 entry of the dangerous-ambiguity table, then either record a
  // complete match or recurse on the remaining text.
  const UnicharAmbigsVector& ambig_table =
      getDict().getUnicharAmbigs().dang_ambigs();
  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
    // Rating of matching choice or worst choice if no match.
    float choice_rating = 0.0f;
    // Find the corresponding best BLOB_CHOICE.
    BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      BLOB_CHOICE* candidate = choice_it.data();
      choice_rating = candidate->rating();
      const UNICHAR_ID class_id = candidate->unichar_id();
      if (class_id == target_text[text_index]) {
        break;  // Direct match.
      }
      // Search ambigs table.
      if (class_id >= ambig_table.size() || ambig_table[class_id] == NULL) {
        continue;
      }
      AmbigSpec_IT spec_it(ambig_table[class_id]);
      for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
           spec_it.forward()) {
        const AmbigSpec *spec = spec_it.data();
        // We'll only do 1-1.
        if (spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
            spec->correct_ngram_id == target_text[text_index])
          break;
      }
      if (!spec_it.cycled_list())
        break;  // Found an ambig.
    }
    if (choice_it.cycled_list())
      continue;  // No match.

    segmentation->push_back(length);
    const bool at_end_of_choices = choices_pos + length == choices_length;
    const bool at_end_of_text = text_index + 1 == target_text.size();
    if (at_end_of_choices && at_end_of_text) {
      // This is a complete match. If the rating is good record a new best.
      if (applybox_debug > 2) {
        tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
                rating + choice_rating, *best_rating, segmentation->size(),
                best_segmentation->size());
      }
      if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
        *best_segmentation = *segmentation;
        *best_rating = rating + choice_rating;
      }
    } else if (choices_pos + length < choices_length &&
               text_index + 1 < target_text.size()) {
      // Both blobs and text remain: continue with the next target character.
      if (applybox_debug > 3) {
        tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
                target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]),
                choice_it.data()->unichar_id() == target_text[text_index]
                    ? "Match" : "Ambig",
                choices_pos, length);
      }
      SearchForText(choices, choices_pos + length, choices_length, target_text,
                    text_index + 1, rating + choice_rating, segmentation,
                    best_rating, best_segmentation);
      if (applybox_debug > 3) {
        tprintf("End recursion for %d=%s\n", target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]));
      }
    }
    // Undo the push_back above before trying the next length.
    segmentation->truncate(segmentation->size() - 1);
  }
}
int tesseract::Tesseract::SegmentPage | ( | const STRING * | input_file, |
BLOCK_LIST * | blocks, | ||
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be NULL. On return the blocks list owns all the constructed page layout.
Definition at line 109 of file pagesegmain.cpp.
{
  ASSERT_HOST(pix_binary_ != NULL);
  const int width = pixGetWidth(pix_binary_);
  const int height = pixGetHeight(pix_binary_);
  // Get page segmentation mode.
  PageSegMode pageseg_mode = static_cast<PageSegMode>(
      static_cast<int>(tessedit_pageseg_mode));

  // If a UNLV zone file can be found, use that instead of segmentation.
  const bool try_unlv_file = !PSM_COL_FIND_ENABLED(pageseg_mode) &&
                             input_file != NULL && input_file->length() > 0;
  if (try_unlv_file) {
    STRING name = *input_file;
    // Cut the name at its last '.' before handing it to read_unlv_file.
    const char* lastdot = strrchr(name.string(), '.');
    if (lastdot != NULL)
      name[lastdot - name.string()] = '\0';
    read_unlv_file(name, width, height, blocks);
  }

  if (!blocks->empty()) {
    // UNLV file present. Use PSM_SINGLE_BLOCK.
    pageseg_mode = PSM_SINGLE_BLOCK;
  } else {
    // No UNLV file present. Work according to the PageSegMode.
    // First make a single block covering the whole image.
    BLOCK_IT block_it(blocks);
    BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
    block->set_right_to_left(right_to_left());
    block_it.add_to_end(block);
  }

  int auto_page_seg_ret_val = 0;
  TO_BLOCK_LIST to_blocks;
  const bool use_auto_page_seg = PSM_OSD_ENABLED(pageseg_mode) ||
                                 PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
                                 PSM_SPARSE(pageseg_mode);
  if (use_auto_page_seg) {
    auto_page_seg_ret_val =
        AutoPageSeg(pageseg_mode, blocks, &to_blocks, osd_tess, osr);
    if (pageseg_mode == PSM_OSD_ONLY)
      return auto_page_seg_ret_val;
    // To create blobs from the image region bounds uncomment this line:
    //  to_blocks.clear();  // Uncomment to go back to the old mode.
  } else {
    // No automatic segmentation: reset both skew vectors to (1, 0).
    deskew_ = FCOORD(1.0f, 0.0f);
    reskew_ = FCOORD(1.0f, 0.0f);
    if (pageseg_mode == PSM_CIRCLE_WORD) {
      // Swap in the cleaned image when RemoveEnclosingCircle produces one.
      Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
      if (pixcleaned != NULL) {
        pixDestroy(&pix_binary_);
        pix_binary_ = pixcleaned;
      }
    }
  }

  if (auto_page_seg_ret_val < 0) {
    return -1;
  }

  if (blocks->empty()) {
    if (textord_debug_tabfind)
      tprintf("Empty page\n");
    return 0;  // AutoPageSeg found an empty page.
  }

  const bool splitting =
      pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
  const bool cjk_mode = textord_use_cjk_fp_model;

  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
                       pix_thresholds_, pix_grey_, splitting || cjk_mode,
                       blocks, &to_blocks);
  return auto_page_seg_ret_val;
}
void tesseract::Tesseract::set_done | ( | WERD_RES * | word, |
inT16 | pass | ||
) |
void tesseract::Tesseract::set_pix_grey | ( | Pix * | grey_pix | ) | [inline] |
Definition at line 184 of file tesseractclass.h.
{
  // Destroy the currently held grey image before storing the new pointer.
  pixDestroy(&pix_grey_);
  pix_grey_ = grey_pix;
}
void tesseract::Tesseract::set_pix_thresholds | ( | Pix * | thresholds | ) | [inline] |
Definition at line 198 of file tesseractclass.h.
{
  // Destroy the currently held thresholds image before storing the new pointer.
  pixDestroy(&pix_thresholds_);
  pix_thresholds_ = thresholds;
}
void tesseract::Tesseract::set_source_resolution | ( | int | ppi | ) | [inline] |
Definition at line 205 of file tesseractclass.h.
{
  // Stores the value that source_resolution() reports.
  source_resolution_ = ppi;
}
void tesseract::Tesseract::set_unlv_suspects | ( | WERD_RES * | word_res | ) |
Definition at line 308 of file output.cpp.
{
  // Selectively un-rejects characters of word_res according to suspect_level:
  //   0      accept every rejected character;
  //   >= 3   leave the reject map untouched;
  //   1, 2   apply the rules below (level 2 stops before the last two rules).
  const int len = word_res->reject_map.length();
  const WERD_CHOICE &word = *(word_res->best_choice);
  const UNICHARSET &uchset = *word.unicharset();
  int i;

  if (suspect_level == 0) {
    for (i = 0; i < len; i++) {
      if (word_res->reject_map[i].rejected())
        word_res->reject_map[i].setrej_minimal_rej_accept();
    }
    return;
  }

  if (suspect_level >= 3)
    return;  // Use defaults

  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject */

  if (safe_dict_word(word_res) &&
      (count_alphas(word) > suspect_short_words)) {
    /* Unreject alphas in dictionary words */
    for (i = 0; i < len; ++i) {
      if (word_res->reject_map[i].rejected() &&
          uchset.get_isalpha(word.unichar_id(i)))
        word_res->reject_map[i].setrej_minimal_rej_accept();
    }
  }

  const float rating_per_ch = word.rating() / word_res->reject_map.length();

  if (rating_per_ch >= suspect_rating_per_ch)
    return;  // Dont touch bad ratings

  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
    /* Unreject any Tess Acceptable word - but NOT tess reject chs */
    for (i = 0; i < len; ++i) {
      if (word_res->reject_map[i].rejected() &&
          (!uchset.eq(word.unichar_id(i), " ")))
        word_res->reject_map[i].setrej_minimal_rej_accept();
    }
  }

  // Document-, block- and row-level rejections are accepted.
  for (i = 0; i < len; i++) {
    if (!word_res->reject_map[i].rejected())
      continue;
    if (word_res->reject_map[i].flag(R_DOC_REJ))
      word_res->reject_map[i].setrej_minimal_rej_accept();
    if (word_res->reject_map[i].flag(R_BLOCK_REJ))
      word_res->reject_map[i].setrej_minimal_rej_accept();
    if (word_res->reject_map[i].flag(R_ROW_REJ))
      word_res->reject_map[i].setrej_minimal_rej_accept();
  }

  if (suspect_level == 2)
    return;

  // Level 1 only from here on.
  if (!suspect_constrain_1Il ||
      (word_res->reject_map.length() <= suspect_short_words)) {
    for (i = 0; i < len; i++) {
      if (!word_res->reject_map[i].rejected())
        continue;
      if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
           word_res->reject_map[i].flag(R_POSTNN_1IL)))
        word_res->reject_map[i].setrej_minimal_rej_accept();

      if (!suspect_constrain_1Il &&
          word_res->reject_map[i].flag(R_MM_REJECT))
        word_res->reject_map[i].setrej_minimal_rej_accept();
    }
  }

  const bool acceptable_text =
      acceptable_word_string(*word_res->uch_set,
                             word.unichar_string().string(),
                             word.unichar_lengths().string()) !=
          AC_UNACCEPTABLE ||
      acceptable_number_string(word.unichar_string().string(),
                               word.unichar_lengths().string());
  if (acceptable_text) {
    if (word_res->reject_map.length() > suspect_short_words) {
      for (i = 0; i < len; i++) {
        if (word_res->reject_map[i].rejected() &&
            (!word_res->reject_map[i].perm_rejected() ||
             word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
             word_res->reject_map[i].flag(R_POSTNN_1IL) ||
             word_res->reject_map[i].flag(R_MM_REJECT))) {
          word_res->reject_map[i].setrej_minimal_rej_accept();
        }
      }
    }
  }
}
void tesseract::Tesseract::set_word_fonts | ( | WERD_RES * | word | ) |
set_word_fonts
Get the fonts for the word.
Definition at line 1453 of file control.cpp.
{
  // Don't try to set the word fonts for a cube word, as the configs
  // will be meaningless.
  if (word->chopped_word == NULL) return;
  ASSERT_HOST(word->best_choice != NULL);

  const int fontinfo_size = get_fontinfo_table().size();
  const int fontset_size = get_fontset_table().size();
  if (fontinfo_size == 0 || fontset_size == 0) return;

  STATS fonts(0, fontinfo_size);  // font counters
  BLOB_CHOICE_IT choice_it;       // choice iterator

  word->italic = 0;
  word->bold = 0;
  if (!word->best_choice_fontinfo_ids.empty()) {
    word->best_choice_fontinfo_ids.clear();
  }

  // Compute the modal font for the word
  for (inT32 index = 0; index < word->best_choice->length(); ++index) {
    const UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index);
    choice_it.set_to_list(word->GetBlobChoices(index));
    if (tessedit_debug_fonts) {
      tprintf("Examining fonts in %s\n",
              word->best_choice->debug_string().string());
    }
    // Only the first blob choice carrying the same unichar id as the word's
    // best choice contributes votes.
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      const UNICHAR_ID blob_ch_id = choice_it.data()->unichar_id();
      if (blob_ch_id != word_ch_id) continue;

      if (tessedit_debug_fonts) {
        tprintf("%s font %s (%d) font2 %s (%d)\n",
                word->uch_set->id_to_unichar(blob_ch_id),
                choice_it.data()->fontinfo_id() < 0 ? "unknown" :
                    fontinfo_table_.get(choice_it.data()->fontinfo_id()).name,
                choice_it.data()->fontinfo_id(),
                choice_it.data()->fontinfo_id2() < 0 ? "unknown" :
                    fontinfo_table_.get(choice_it.data()->fontinfo_id2()).name,
                choice_it.data()->fontinfo_id2());
      }
      // 1st choice font gets 2 pts, 2nd choice 1 pt.
      if (choice_it.data()->fontinfo_id() >= 0) {
        fonts.add(choice_it.data()->fontinfo_id(), 2);
      }
      if (choice_it.data()->fontinfo_id2() >= 0) {
        fonts.add(choice_it.data()->fontinfo_id2(), 1);
      }
      break;
    }
  }

  // Two successive find_modal_font calls give the first and second choice.
  inT16 font_id1, font_id2;
  find_modal_font(&fonts, &font_id1, &word->fontinfo_id_count);
  find_modal_font(&fonts, &font_id2, &word->fontinfo_id2_count);
  word->fontinfo = NULL;
  if (font_id1 >= 0) word->fontinfo = &fontinfo_table_.get(font_id1);
  word->fontinfo2 = NULL;
  if (font_id2 >= 0) word->fontinfo2 = &fontinfo_table_.get(font_id2);

  // All the blobs get the word's best choice font.
  for (int i = 0; i < word->best_choice->length(); ++i) {
    word->best_choice_fontinfo_ids.push_back(font_id1);
  }

  if (word->fontinfo_id_count <= 0) return;

  FontInfo fi = fontinfo_table_.get(font_id1);
  if (tessedit_debug_fonts) {
    if (word->fontinfo_id2_count > 0) {
      tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
              fi.name, word->fontinfo_id_count,
              fontinfo_table_.get(font_id2).name,
              word->fontinfo_id2_count);
    } else {
      tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
              fi.name, word->fontinfo_id_count);
    }
  }
  // 1st choices got 2 pts, so we need to halve the score for the mode.
  word->italic = (fi.is_italic() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
  word->bold = (fi.is_bold() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2;
}
void tesseract::Tesseract::SetBlackAndWhitelist | ( | ) |
Definition at line 560 of file tesseractclass.cpp.
{
  // Set the white and blacklists (if any)
  unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
                                     tessedit_char_whitelist.string());
  // Black and white lists should apply to all loaded classifiers, so every
  // sub-language gets the same two lists.
  const int num_sub_langs = sub_langs_.size();
  for (int lang = 0; lang < num_sub_langs; ++lang) {
    sub_langs_[lang]->unicharset.set_black_and_whitelist(
        tessedit_char_blacklist.string(), tessedit_char_whitelist.string());
  }
}
void tesseract::Tesseract::SetEquationDetect | ( | EquationDetect * | detector | ) |
Definition at line 539 of file tesseractclass.cpp.
{
  // Stores the detector and, when one is supplied, points it back at this
  // Tesseract instance. A NULL detector simply clears equ_detect_; callers
  // that read equ_detect_ test it for NULL before use, so NULL must not be
  // dereferenced here.
  equ_detect_ = detector;
  if (equ_detect_ != NULL) {
    equ_detect_->SetLangTesseract(this);
  }
}
void tesseract::Tesseract::SetScaledColor | ( | int | factor, |
Pix * | color | ||
) | [inline] |
Definition at line 220 of file tesseractclass.h.
{
  // Stores the factor/image pair returned by scaled_factor() and
  // scaled_color(). Any previously stored image is not destroyed here.
  scaled_factor_ = factor;
  scaled_color_ = color;
}
void tesseract::Tesseract::SetupAllWordsPassN | ( | int | pass_n, |
const TBOX * | target_word_box, | ||
const char * | word_config, | ||
PAGE_RES * | page_res, | ||
GenericVector< WordData > * | words | ||
) |
Definition at line 158 of file control.cpp.
{
  // Prepare all the words: collect every word of the page (or only those
  // selected by ProcessTargetWord when a target box is given) into *words.
  PAGE_RES_IT page_res_it(page_res);
  page_res_it.restart_page();
  while (page_res_it.word() != NULL) {
    if (pass_n == 1)
      page_res_it.word()->SetupFake(unicharset);
    const bool selected =
        target_word_box == NULL ||
        ProcessTargetWord(page_res_it.word()->word->bounding_box(),
                          *target_word_box, word_config, 1);
    if (selected) {
      words->push_back(WordData(page_res_it));
    }
    page_res_it.forward();
  }
  // Setup all the words for recognition with polygonal approximation, and
  // link each entry to its predecessor in the vector.
  for (int w = 0; w < words->size(); ++w) {
    SetupWordPassN(pass_n, &(*words)[w]);
    if (w == 0) continue;
    (*words)[w].prev_word = &(*words)[w - 1];
  }
}
PAGE_RES * tesseract::Tesseract::SetupApplyBoxes | ( | const GenericVector< TBOX > & | boxes, |
BLOCK_LIST * | block_list | ||
) |
Definition at line 196 of file applybox.cpp.
{
  const double median_xheight = MedianXHeight(block_list);
  const double max_deviation = kMaxXHeightDeviationFraction * median_xheight;

  // Strip all fuzzy space markers to simplify the PAGE_RES. In the same sweep,
  // rows whose x-height strays too far from the median are reset to the
  // median, and words without any blobs are deleted.
  BLOCK_IT b_it(block_list);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    ROW_IT r_it(b_it.data()->row_list());
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
      ROW* row = r_it.data();
      const float diff = fabs(row->x_height() - median_xheight);
      if (diff > max_deviation) {
        if (applybox_debug) {
          tprintf("row xheight=%g, but median xheight = %g\n",
                  row->x_height(), median_xheight);
        }
        row->set_x_height(static_cast<float>(median_xheight));
      }
      WERD_IT w_it(row->word_list());
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD* word = w_it.data();
        if (word->cblob_list()->empty()) {
          delete w_it.extract();
          continue;
        }
        word->set_flag(W_FUZZY_SP, false);
        word->set_flag(W_FUZZY_NON, false);
      }
    }
  }

  // Build the PAGE_RES and run MaximallyChopWord on each of its words.
  PAGE_RES* page_res = new PAGE_RES(block_list, NULL);
  PAGE_RES_IT pr_it(page_res);
  for (WERD_RES* word_res = pr_it.word(); word_res != NULL;
       word_res = pr_it.word()) {
    MaximallyChopWord(boxes, pr_it.block()->block,
                      pr_it.row()->row, word_res);
    pr_it.forward();
  }
  return page_res;
}
ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation | ( | bool | single_column, |
bool | osd, | ||
bool | only_osd, | ||
BLOCK_LIST * | blocks, | ||
Tesseract * | osd_tess, | ||
OSResults * | osr, | ||
TO_BLOCK_LIST * | to_blocks, | ||
Pix ** | photo_mask_pix, | ||
Pix ** | music_mask_pix | ||
) |
Sets up auto page segmentation, determines the orientation, and corrects it. Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a NULL pointer that will be filled on return with the leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a UNLV zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.
Definition at line 287 of file pagesegmain.cpp.
{
  int vertical_x = 0;
  int vertical_y = 1;
  TabVector_LIST v_lines;
  TabVector_LIST h_lines;
  ICOORD bleft(0, 0);

  ASSERT_HOST(pix_binary_ != NULL);
  if (tessedit_dump_pageseg_images) {
    pixWrite("tessinput.png", pix_binary_, IFF_PNG);
  }
  // Leptonica is used to find the rule/separator lines in the input.
  LineFinder::FindAndRemoveLines(source_resolution_,
                                 textord_tabfind_show_vlines, pix_binary_,
                                 &vertical_x, &vertical_y, music_mask_pix,
                                 &v_lines, &h_lines);
  if (tessedit_dump_pageseg_images)
    pixWrite("tessnolines.png", pix_binary_, IFF_PNG);
  // Leptonica is used to find a mask of the photo regions in the input.
  *photo_mask_pix = ImageFind::FindImages(pix_binary_);
  if (tessedit_dump_pageseg_images)
    pixWrite("tessnoimages.png", pix_binary_, IFF_PNG);
  if (single_column)
    v_lines.clear();

  // The rest of the algorithm uses the usual connected components.
  textord_.find_components(pix_binary_, blocks, to_blocks);

  TO_BLOCK_IT to_block_it(to_blocks);
  // There must be exactly one input block.
  // TODO(rays) handle new textline finding with a UNLV zone file.
  ASSERT_HOST(to_blocks->singleton());
  TO_BLOCK* to_block = to_block_it.data();
  TBOX blkbox = to_block->block->bounding_box();
  ColumnFinder* finder = NULL;

  // A ColumnFinder is only created when line_size is at least 2; otherwise
  // NULL is returned.
  if (to_block->line_size >= 2) {
    finder = new ColumnFinder(static_cast<int>(to_block->line_size),
                              blkbox.botleft(), blkbox.topright(),
                              source_resolution_, textord_use_cjk_fp_model,
                              &v_lines, &h_lines, vertical_x, vertical_y);

    finder->SetupAndFilterNoise(*photo_mask_pix, to_block);

    if (equ_detect_) {
      equ_detect_->LabelSpecialText(to_block);
    }

    BLOBNBOX_CLIST osd_blobs;
    // osd_orientation is the number of 90 degree rotations to make the
    // characters upright. (See osdetect.h for precise definition.)
    // We want the text lines horizontal, (vertical text indicates vertical
    // textlines) which may conflict (eg vertically written CJK).
    int osd_orientation = 0;
    bool vertical_text = finder->IsVerticallyAlignedText(to_block, &osd_blobs);
    if (osd && osd_tess != NULL && osr != NULL) {
      os_detect_blobs(&osd_blobs, osr, osd_tess);
      if (only_osd) {
        delete finder;
        return NULL;
      }
      osd_orientation = osr->best_result.orientation_id;
      double osd_score = osr->orientations[osd_orientation];
      // The margin is the smallest gap between the best orientation's score
      // and any other orientation's, capped at twice min_orientation_margin.
      double osd_margin = min_orientation_margin * 2;
      for (int i = 0; i < 4; ++i) {
        if (i != osd_orientation &&
            osd_score - osr->orientations[i] < osd_margin) {
          osd_margin = osd_score - osr->orientations[i];
        }
      }
      int best_script_id = osr->best_result.script_id;
      const char* best_script_str =
          osd_tess->unicharset.get_script_from_script_id(best_script_id);
      bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
          best_script_id == osd_tess->unicharset.hiragana_sid() ||
          best_script_id == osd_tess->unicharset.katakana_sid() ||
          strcmp("Japanese", best_script_str) == 0 ||
          strcmp("Korean", best_script_str) == 0 ||
          strcmp("Hangul", best_script_str) == 0;
      if (cjk) {
        finder->set_cjk_script(true);
      }
      if (osd_margin < min_orientation_margin) {
        // The margin is weak.
        if (!cjk && !vertical_text && osd_orientation == 2) {
          // upside down latin text is improbable with such a weak margin.
          tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
                  "Don't rotate.\n", osd_margin);
          osd_orientation = 0;
        } else {
          // Arguments must follow the format order: margin (%.2f), blob
          // count (%d), orientation (%d). Passing the int blob count for
          // %.2f and the double margin for %d is undefined behaviour.
          tprintf("OSD: Weak margin (%.2f) for %d blob text block, "
                  "but using orientation anyway: %d\n",
                  osd_margin, osd_blobs.length(), osd_orientation);
        }
      }
    }
    osd_blobs.shallow_clear();
    finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
  }

  return finder;
}
void tesseract::Tesseract::SetupUniversalFontIds | ( | ) |
Definition at line 432 of file tessedit.cpp.
{
  // Note that we can get away with bitwise copying FontInfo in
  // all_fonts, as it is a temporary structure and we avoid setting the
  // delete callback.
  UnicityTable<FontInfo> all_fonts;
  all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo));

  const int num_sub_langs = sub_langs_.size();

  // Create the universal ID table from this instance's fonts followed by the
  // fonts of every sub-language.
  CollectFonts(get_fontinfo_table(), &all_fonts);
  for (int lang = 0; lang < num_sub_langs; ++lang) {
    CollectFonts(sub_langs_[lang]->get_fontinfo_table(), &all_fonts);
  }

  // Assign ids from the table to each font table.
  AssignIds(all_fonts, &get_fontinfo_table());
  for (int lang = 0; lang < num_sub_langs; ++lang) {
    AssignIds(all_fonts, &sub_langs_[lang]->get_fontinfo_table());
  }

  font_table_size_ = all_fonts.size();
}
void tesseract::Tesseract::SetupWordPassN | ( | int | pass_n, |
WordData * | word | ||
) |
Definition at line 183 of file control.cpp.
{
  // Main language: set the word up on pass 1, or on later passes when it is
  // not yet done or tessedit_training_tess is set.
  const bool setup_main_word =
      pass_n == 1 || !word->word->done || tessedit_training_tess;
  if (setup_main_word) {
    if (pass_n == 2) {
      // TODO(rays) Should we do this on pass1 too?
      word->word->caps_height = 0.0;
      if (word->word->x_height == 0.0f)
        word->word->x_height = word->row->x_height();
    }
    // Cube doesn't get setup for pass2.
    if (pass_n != 2 || tessedit_ocr_engine_mode != OEM_CUBE_ONLY) {
      word->word->SetupForRecognition(
          unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL,
          classify_bln_numeric_mode, textord_use_cjk_fp_model,
          poly_allow_detailed_fx, word->row, word->block);
    }
  }

  if (sub_langs_.empty()) return;

  if (word->lang_words.size() != sub_langs_.size()) {
    // Setup the words for all the sub-languages now.
    WERD_RES empty;
    word->lang_words.init_to_size(sub_langs_.size(), empty);
  }
  // Each sub-language word is re-initialised from the main word and set up
  // with that sub-language's own settings.
  for (int s = 0; s < sub_langs_.size(); ++s) {
    Tesseract* lang_t = sub_langs_[s];
    const bool setup_lang_word =
        pass_n == 1 ||
        (lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY &&
         (!word->lang_words[s].done || lang_t->tessedit_training_tess));
    if (!setup_lang_word) continue;
    word->lang_words[s].InitForRetryRecognition(*word->word);
    word->lang_words[s].SetupForRecognition(
        lang_t->unicharset, lang_t, BestPix(),
        lang_t->tessedit_ocr_engine_mode, NULL,
        lang_t->classify_bln_numeric_mode,
        lang_t->textord_use_cjk_fp_model,
        lang_t->poly_allow_detailed_fx, word->row, word->block);
  }
}
void tesseract::Tesseract::SetupWordScripts | ( | BLOCK_LIST * | blocks | ) |
int tesseract::Tesseract::source_resolution | ( | ) | const [inline] |
Definition at line 202 of file tesseractclass.h.
{
  // Read-only accessor for the value stored by set_source_resolution().
  return source_resolution_;
}
void tesseract::Tesseract::split_and_recog_word | ( | WERD_RES * | word | ) |
Definition at line 144 of file tfacepp.cpp.
{
  // Find the biggest blob gap in the chopped_word.
  int widest_gap = -MAX_INT32;
  int split_index = 0;
  const int num_blobs = word->chopped_word->NumBlobs();
  for (int b = 1; b < num_blobs; ++b) {
    TBOX left_box = word->chopped_word->blobs[b - 1]->bounding_box();
    TBOX right_box = word->chopped_word->blobs[b]->bounding_box();
    const int gap = right_box.left() - left_box.right();
    if (gap > widest_gap) {
      widest_gap = gap;
      split_index = b;
    }
  }
  // With fewer than two blobs the loop never runs and this assertion fires.
  ASSERT_HOST(split_index > 0);

  WERD_RES *right_half = NULL;
  BlamerBundle *orig_bb = NULL;
  split_word(word, split_index, &right_half, &orig_bb);

  // Recognize the first part of the word.
  recog_word_recursive(word);
  // Recognize the second part of the word.
  recog_word_recursive(right_half);

  // Merge the two recognised halves back into word.
  join_words(word, right_half, orig_bb);
}
void tesseract::Tesseract::split_word | ( | WERD_RES * | word, |
int | split_pt, | ||
WERD_RES ** | right_piece, | ||
BlamerBundle ** | orig_blamer_bundle | ||
) | const |
Definition at line 182 of file tfacepp.cpp.
{
  ASSERT_HOST(split_pt > 0 && split_pt < word->chopped_word->NumBlobs());

  // Save a copy of the blamer bundle so we can try to reconstruct it below.
  BlamerBundle *orig_bb = NULL;
  if (word->blamer_bundle) {
    orig_bb = new BlamerBundle(*word->blamer_bundle);
  }

  WERD_RES *word2 = new WERD_RES(*word);

  // blow away the copied chopped_word, as we want to work with
  // the blobs from the input chopped_word so seam_arrays can be merged.
  TWERD *chopped = word->chopped_word;
  TWERD *chopped2 = new TWERD;
  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
  // Blobs from split_pt onwards go to the right piece; the left piece keeps
  // the first split_pt blobs.
  for (int blob = split_pt; blob < chopped->NumBlobs(); ++blob) {
    chopped2->blobs.push_back(chopped->blobs[blob]);
  }
  chopped->blobs.truncate(split_pt);
  // Detach chopped from word before ClearResults() runs; it is reattached
  // immediately afterwards.
  word->chopped_word = NULL;
  delete word2->chopped_word;
  word2->chopped_word = NULL;

  const UNICHARSET &unicharset = *word->uch_set;
  word->ClearResults();
  word2->ClearResults();
  word->chopped_word = chopped;
  word2->chopped_word = chopped2;
  word->SetupBasicsFromChoppedWord(unicharset);
  word2->SetupBasicsFromChoppedWord(unicharset);

  // Try to adjust the blamer bundle.
  if (orig_bb != NULL) {
    // TODO(rays) Looks like a leak to me.
    // orig_bb should take, rather than copy.
    word->blamer_bundle = new BlamerBundle();
    word2->blamer_bundle = new BlamerBundle();
    orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
                         word2->chopped_word->blobs[0]->bounding_box().left(),
                         wordrec_debug_blamer,
                         word->blamer_bundle, word2->blamer_bundle);
  }

  // Both outputs are returned through the out-parameters.
  *right_piece = word2;
  *orig_blamer_bundle = orig_bb;
}
bool tesseract::Tesseract::SubAndSuperscriptFix | ( | WERD_RES * | word | ) |
Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.
This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.
Definition at line 101 of file superscript.cpp.
{
  // Returns true when a revised word from TrySuperscriptSplits was judged
  // good and consumed into word; false otherwise.
  if (word->tess_failed || word->word->flag(W_REP_CHAR) ||
      !word->best_choice) {
    return false;
  }
  int num_leading, num_trailing;
  ScriptPos sp_leading, sp_trailing;
  float leading_certainty, trailing_certainty;
  float avg_certainty, unlikely_threshold;

  // Calculate the number of whole suspicious characters at the edges.
  GetSubAndSuperscriptCandidates(
      word, &num_leading, &sp_leading, &leading_certainty,
      &num_trailing, &sp_trailing, &trailing_certainty,
      &avg_certainty, &unlikely_threshold);

  const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
  const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";

  int num_blobs = word->best_choice->length();

  // Calculate the remainder (partial characters) at the edges.
  // This accounts for us having classified the best version of
  // a word as [speaker?'] when it was instead [speaker.^{21}]
  // (that is we accidentally thought the 2 was attached to the period).
  int num_remainder_leading = 0, num_remainder_trailing = 0;
  if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
    int super_y_bottom =
        kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
    int sub_y_top =
        kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
    int last_word_char = num_blobs - 1 - num_trailing;
    float last_char_certainty = word->best_choice->certainty(last_word_char);
    if (word->best_choice->unichar_id(last_word_char) != 0 &&
        last_char_certainty <= unlikely_threshold) {
      ScriptPos rpos;
      YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
                     NULL, NULL, &rpos, &num_remainder_trailing);
      if (num_trailing > 0 && rpos != sp_trailing)
        num_remainder_trailing = 0;
      if (num_remainder_trailing > 0 &&
          last_char_certainty < trailing_certainty) {
        trailing_certainty = last_char_certainty;
      }
    }
    bool another_blob_available = (num_remainder_trailing == 0) ||
        num_leading + num_trailing + 1 < num_blobs;
    // Kept as float, like last_char_certainty above: storing the certainty
    // in an int truncates it toward zero and distorts both the threshold
    // test and the value copied into leading_certainty.
    float first_char_certainty = word->best_choice->certainty(num_leading);
    if (another_blob_available &&
        word->best_choice->unichar_id(num_leading) != 0 &&
        first_char_certainty <= unlikely_threshold) {
      ScriptPos lpos;
      YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
                     &lpos, &num_remainder_leading, NULL, NULL);
      if (num_leading > 0 && lpos != sp_leading)
        num_remainder_leading = 0;
      if (num_remainder_leading > 0 &&
          first_char_certainty < leading_certainty) {
        leading_certainty = first_char_certainty;
      }
    }
  }

  // If nothing to do, bail now.
  if (num_leading + num_trailing +
      num_remainder_leading + num_remainder_trailing == 0) {
    return false;
  }

  if (superscript_debug >= 1) {
    tprintf("Candidate for superscript detection: %s (",
            word->best_choice->unichar_string().string());
    if (num_leading || num_remainder_leading) {
      tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading,
              leading_pos);
    }
    if (num_trailing || num_remainder_trailing) {
      tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
              trailing_pos);
    }
    tprintf(")\n");
  }
  if (superscript_debug >= 3) {
    word->best_choice->print();
  }
  if (superscript_debug >= 2) {
    tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ",
            avg_certainty, unlikely_threshold);
    if (num_leading)
      tprintf("Orig. leading (min): %.2f ", leading_certainty);
    if (num_trailing)
      tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
    tprintf("\n");
  }

  // We've now calculated the number of rebuilt blobs we want to carve off.
  // However, split_word() works from TBLOBs in chopped_word, so we need to
  // convert to those.
  int num_chopped_leading =
      LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
  int num_chopped_trailing =
      TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;

  int retry_leading = 0;
  int retry_trailing = 0;
  bool is_good = false;
  WERD_RES *revised = TrySuperscriptSplits(
      num_chopped_leading, leading_certainty, sp_leading,
      num_chopped_trailing, trailing_certainty, sp_trailing,
      word, &is_good, &retry_leading, &retry_trailing);
  if (is_good) {
    word->ConsumeWordResults(revised);
  } else if (retry_leading || retry_trailing) {
    // One retry, using the retry counts reported by the first attempt and
    // starting from the revised word.
    int retry_chopped_leading =
        LeadingUnicharsToChopped(revised, retry_leading);
    int retry_chopped_trailing =
        TrailingUnicharsToChopped(revised, retry_trailing);
    WERD_RES *revised2 = TrySuperscriptSplits(
        retry_chopped_leading, leading_certainty, sp_leading,
        retry_chopped_trailing, trailing_certainty, sp_trailing,
        revised, &is_good, &retry_leading, &retry_trailing);
    if (is_good) {
      word->ConsumeWordResults(revised2);
    }
    delete revised2;
  }
  delete revised;
  return is_good;
}
BOOL8 tesseract::Tesseract::terrible_word_crunch | ( | WERD_RES * | word, |
GARBAGE_LEVEL | garbage_level | ||
) |
Definition at line 508 of file docqual.cpp.
{
  // crunch_mode records which rule fired (1..5); 0 means none did. It is only
  // used for the debug message and to choose the return value.
  int crunch_mode = 0;

  const bool blank_word =
      (word->best_choice->unichar_string().length() == 0) ||
      (strspn(word->best_choice->unichar_string().string(), " ") ==
       word->best_choice->unichar_string().length());
  if (blank_word) {
    // Empty text, or text made up of spaces only.
    crunch_mode = 1;
  } else {
    // Rating per character, with the divisor capped at crunch_rating_max.
    int adjusted_len = word->reject_map.length();
    if (adjusted_len > crunch_rating_max)
      adjusted_len = crunch_rating_max;
    const float rating_per_ch = word->best_choice->rating() / adjusted_len;

    if (rating_per_ch > crunch_terrible_rating) {
      crunch_mode = 2;
    } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
      crunch_mode = 3;
    } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) &&
               (garbage_level != G_OK)) {
      crunch_mode = 4;
    } else if ((rating_per_ch > crunch_poor_garbage_rate) &&
               (garbage_level != G_OK)) {
      crunch_mode = 5;
    }
  }

  if (crunch_mode <= 0)
    return FALSE;

  if (crunch_debug > 2) {
    tprintf("Terrible_word_crunch (%d) on \"%s\"\n",
            crunch_mode, word->best_choice->unichar_string().string());
  }
  return TRUE;
}
bool tesseract::Tesseract::tess_acceptable_word | ( | WERD_RES * | word | ) |
Definition at line 69 of file tessbox.cpp.
{
  // Thin wrapper: the decision is delegated to the dictionary.
  return getDict().AcceptableResult(word);
}
void tesseract::Tesseract::tess_add_doc_word | ( | WERD_CHOICE * | word_choice | ) |
Definition at line 79 of file tessbox.cpp.
{
  // Thin wrapper: forwards the choice to the dictionary's document word list.
  getDict().add_document_word(*word_choice);
}
void tesseract::Tesseract::tess_segment_pass_n | ( | int | pass_n, |
WERD_RES * | word | ||
) |
Definition at line 39 of file tessbox.cpp.
{
  int saved_enable_assoc = 0;
  int saved_chop_enable = 0;

  // For words flagged W_DONT_CHOP, wordrec_enable_assoc and chop_enable are
  // switched off for the duration of the recognition and restored afterwards.
  if (word->word->flag(W_DONT_CHOP)) {
    saved_enable_assoc = wordrec_enable_assoc;
    saved_chop_enable = chop_enable;
    wordrec_enable_assoc.set_value(0);
    chop_enable.set_value(0);
  }

  // Any pass number other than 1 is treated as pass 2.
  if (pass_n == 1) {
    set_pass1();
  } else {
    set_pass2();
  }

  recog_word(word);
  // Make sure the word has a best_choice even if recognition produced none.
  if (word->best_choice == NULL) {
    word->SetupFake(*word->uch_set);
  }

  if (word->word->flag(W_DONT_CHOP)) {
    wordrec_enable_assoc.set_value(saved_enable_assoc);
    chop_enable.set_value(saved_chop_enable);
  }
}
const Textord& tesseract::Tesseract::textord | ( | ) | const [inline] |
Definition at line 224 of file tesseractclass.h.
{
  // Read-only access to the textord_ member, returned by const reference.
  return textord_;
}
void tesseract::Tesseract::TidyUp | ( | PAGE_RES * | page_res | ) |
Definition at line 684 of file applybox.cpp.
{
  // Gives every word with at least one labelled blob a fake best choice,
  // deletes words with no labelled blob, then rebuilds the box words and the
  // begin/end-of-line flags of the words that remain.
  int ok_blob_count = 0;
  int bad_blob_count = 0;
  int ok_word_count = 0;
  int unlabelled_words = 0;
  PAGE_RES_IT pr_it(page_res);
  WERD_RES* word_res;
  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
    int ok_in_word = 0;
    int blob_count = word_res->correct_text.size();
    WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
    word_choice->set_permuter(TOP_CHOICE_PERM);
    for (int c = 0; c < blob_count; ++c) {
      if (word_res->correct_text[c].length() > 0) {
        ++ok_in_word;
      }
      // Since we only need a fake word_res->best_choice, the actual
      // unichar_ids do not matter. Which is fortunate, since TidyUp()
      // can be called while training Tesseract, at the stage where
      // unicharset is not meaningful yet.
      word_choice->append_unichar_id_space_allocated(
          INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
    }
    if (ok_in_word > 0) {
      // Count the kept word so that the summary below reports a real number
      // instead of always printing 0.
      ++ok_word_count;
      ok_blob_count += ok_in_word;
      bad_blob_count += word_res->correct_text.size() - ok_in_word;
      word_res->LogNewRawChoice(word_choice);
      word_res->LogNewCookedChoice(1, false, word_choice);
    } else {
      ++unlabelled_words;
      if (applybox_debug > 0) {
        tprintf("APPLY_BOXES: Unlabelled word at :");
        word_res->word->bounding_box().print();
      }
      pr_it.DeleteCurrentWord();
      // The fake choice was never handed to the word on this path, so it
      // has to be released here to avoid leaking it.
      delete word_choice;
    }
  }
  pr_it.restart_page();
  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
    // Denormalize back to a BoxWord.
    word_res->RebuildBestState();
    word_res->SetupBoxWord();
    word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
    word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
  }
  if (applybox_debug > 0) {
    tprintf("   Found %d good blobs.\n", ok_blob_count);
    if (bad_blob_count > 0) {
      tprintf("   Leaving %d unlabelled blobs in %d words.\n",
              bad_blob_count, ok_word_count);
    }
    if (unlabelled_words > 0)
      tprintf("   %d remaining unlabelled words deleted.\n", unlabelled_words);
  }
}
void tesseract::Tesseract::tilde_crunch | ( | PAGE_RES_IT & | page_res_it | ) |
Definition at line 422 of file docqual.cpp.
{
  // Walks every word on the page and sets unlv_crunch_mode = CR_KEEP_SPACE on
  // words judged "terrible", plus "potential" crunch candidates adjacent to a
  // terrible word. Words in non-text poly blocks are skipped entirely.
  PAGE_RES_IT copy_it;  // Remembers the first word of a run of potentials.
  BOOL8 prev_potential_marked = FALSE;
  BOOL8 found_terrible_word = FALSE;

  page_res_it.restart_page();
  for (; page_res_it.word() != NULL; page_res_it.forward()) {
    POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
    if (pb != NULL && !pb->IsText()) {
      continue;
    }
    WERD_RES* word = page_res_it.word();

    if (crunch_early_convert_bad_unlv_chs)
      convert_bad_unlv_chs(word);
    if (crunch_early_merge_tess_fails)
      word->merge_tess_fails();

    if (word->reject_map.accept_count() != 0) {
      // A word with accepted characters ends any pending run.
      found_terrible_word = FALSE;
      prev_potential_marked = FALSE;
      continue;
    }

    BOOL8 ok_dict_word = safe_dict_word(word);
    GARBAGE_LEVEL garbage_level = garbage_word(word, ok_dict_word);
    const bool may_crunch = (garbage_level != G_NEVER_CRUNCH);

    if (may_crunch && terrible_word_crunch(word, garbage_level)) {
      if (crunch_debug > 0) {
        tprintf("T CRUNCHING: \"%s\"\n",
                word->best_choice->unichar_string().string());
      }
      word->unlv_crunch_mode = CR_KEEP_SPACE;
      if (prev_potential_marked) {
        // Crunch the remembered potentials leading up to this word as well.
        while (copy_it.word() != word) {
          if (crunch_debug > 0) {
            tprintf("P1 CRUNCHING: \"%s\"\n",
                    copy_it.word()->best_choice->unichar_string().string());
          }
          copy_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
          copy_it.forward();
        }
        prev_potential_marked = FALSE;
      }
      found_terrible_word = TRUE;
    } else if (may_crunch &&
               potential_word_crunch(word, garbage_level, ok_dict_word)) {
      if (found_terrible_word) {
        if (crunch_debug > 0) {
          tprintf("P2 CRUNCHING: \"%s\"\n",
                  word->best_choice->unichar_string().string());
        }
        word->unlv_crunch_mode = CR_KEEP_SPACE;
      } else if (!prev_potential_marked) {
        // Start of a run of potentials: remember where it began.
        copy_it = page_res_it;
        prev_potential_marked = TRUE;
        if (crunch_debug > 1) {
          tprintf("P3 CRUNCHING: \"%s\"\n",
                  word->best_choice->unichar_string().string());
        }
      }
    } else {
      // Neither terrible nor potential: forget earlier potential crunches.
      found_terrible_word = FALSE;
      prev_potential_marked = FALSE;
      if (crunch_debug > 2) {
        tprintf("NO CRUNCH: \"%s\"\n",
                word->best_choice->unichar_string().string());
      }
    }
  }
}
void tesseract::Tesseract::tilde_delete | ( | PAGE_RES_IT & | page_res_it | ) |
Definition at line 594 of file docqual.cpp.
{
  // Walks every word on the page and stores word_deletable()'s verdict in
  // unlv_crunch_mode for runs of deletable words that start at a W_BOL word
  // or end at a W_EOL word. Runs that touch neither line end are left alone.
  PAGE_RES_IT copy_it;  // First word of a deletable run not starting at BOL.
  BOOL8 deleting_from_bol = FALSE;
  BOOL8 marked_delete_point = FALSE;

  page_res_it.restart_page();
  for (; page_res_it.word() != NULL; page_res_it.forward()) {
    WERD_RES* word = page_res_it.word();
    inT16 debug_delete_mode;
    CRUNCH_MODE delete_mode = word_deletable(word, debug_delete_mode);

    if (delete_mode == CR_NONE) {
      // Not deletable: forget earlier potential crunches.
      deleting_from_bol = FALSE;
      marked_delete_point = FALSE;
    } else if (word->word->flag(W_BOL) || deleting_from_bol) {
      if (crunch_debug > 0) {
        tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n",
                debug_delete_mode,
                word->best_choice->unichar_string().string());
      }
      word->unlv_crunch_mode = delete_mode;
      deleting_from_bol = TRUE;
    } else if (word->word->flag(W_EOL)) {
      if (marked_delete_point) {
        // Apply the deletion to every word from the marked point up to here.
        while (copy_it.word() != word) {
          inT16 x_debug_delete_mode;
          CRUNCH_MODE x_delete_mode =
              word_deletable(copy_it.word(), x_debug_delete_mode);
          if (crunch_debug > 0) {
            tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n",
                    x_debug_delete_mode,
                    copy_it.word()->best_choice->unichar_string().string());
          }
          copy_it.word()->unlv_crunch_mode = x_delete_mode;
          copy_it.forward();
        }
      }
      if (crunch_debug > 0) {
        tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n",
                debug_delete_mode,
                word->best_choice->unichar_string().string());
      }
      word->unlv_crunch_mode = delete_mode;
      deleting_from_bol = FALSE;
      marked_delete_point = FALSE;
    } else if (!marked_delete_point) {
      // Mid-line deletable word: remember where the run started.
      copy_it = page_res_it;
      marked_delete_point = TRUE;
    }

    /*
      The following step has been left till now as the tess fails are used to
      determine if the word is deletable.
    */
    if (!crunch_early_merge_tess_fails)
      word->merge_tess_fails();
  }
}
bool tesseract::Tesseract::TrainedXheightFix | ( | WERD_RES * | word, |
BLOCK * | block, | ||
ROW * | row | ||
) |
Definition at line 969 of file control.cpp.
{
  // Re-recognizes the word with the x-height from ComputeCompatibleXheight()
  // and adopts the new result only if it has fewer misfit tops and a better
  // rating or certainty. Returns true when the word's results were replaced.
  const int original_misfits = CountMisfitTops(word);
  if (original_misfits == 0)
    return false;

  const float new_x_ht = ComputeCompatibleXheight(word);
  // Written as a negated >= so the comparison matches the original exactly.
  if (!(new_x_ht >= kMinRefitXHeightFraction * word->x_height))
    return false;

  WERD_RES new_x_ht_word(word->word);
  if (word->blamer_bundle != NULL) {
    new_x_ht_word.blamer_bundle = new BlamerBundle();
    new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
  }
  new_x_ht_word.x_height = new_x_ht;
  new_x_ht_word.caps_height = 0.0;
  new_x_ht_word.SetupForRecognition(
      unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL,
      classify_bln_numeric_mode, textord_use_cjk_fp_model,
      poly_allow_detailed_fx, row, block);
  match_word_pass_n(2, &new_x_ht_word, row, block);

  bool accept_new_x_ht = false;
  if (!new_x_ht_word.tess_failed) {
    const int new_misfits = CountMisfitTops(&new_x_ht_word);
    if (debug_x_ht_level >= 1) {
      tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
              original_misfits, word->x_height,
              new_misfits, new_x_ht);
      tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
              word->best_choice->rating(), word->best_choice->certainty(),
              new_x_ht_word.best_choice->rating(),
              new_x_ht_word.best_choice->certainty());
    }
    // The misfits must improve and either the rating or certainty.
    accept_new_x_ht = new_misfits < original_misfits &&
                      (new_x_ht_word.best_choice->certainty() >
                           word->best_choice->certainty() ||
                       new_x_ht_word.best_choice->rating() <
                           word->best_choice->rating());
    if (debug_x_ht_level >= 1) {
      ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
    }
  }

  if (!accept_new_x_ht)
    return false;
  word->ConsumeWordResults(&new_x_ht_word);
  return true;
}
WERD_RES * tesseract::Tesseract::TrySuperscriptSplits | ( | int | num_chopped_leading, |
float | leading_certainty, | ||
ScriptPos | leading_pos, | ||
int | num_chopped_trailing, | ||
float | trailing_certainty, | ||
ScriptPos | trailing_pos, | ||
WERD_RES * | word, | ||
bool * | is_good, | ||
int * | retry_rebuild_leading, | ||
int * | retry_rebuild_trailing | ||
) |
Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.
[in] | num_chopped_leading | how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript) |
[in] | leading_certainty | the (minimum) certainty had by the characters in the original leading section. |
[in] | leading_pos | "super" or "sub" (for debugging) |
[in] | num_chopped_trailing | how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript) |
[in] | trailing_certainty | the (minimum) certainty had by the characters in the original trailing section. |
[in] | trailing_pos | "super" or "sub" (for debugging) |
[in] | word | the word to try to chop up. |
[out] | is_good | do we believe our result? |
[out] | retry_rebuild_leading,retry_rebuild_trailing | If non-zero, and !is_good, then the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars. |
Definition at line 382 of file superscript.cpp.
{
  // Splits a copy of the word into up to three pieces (prefix / core /
  // suffix), recognizes prefix and suffix with the y-position penalty
  // multipliers set to zero, and, unless both are hopeless, recognizes the
  // core and joins the pieces back into one WERD_RES that is returned.
  const int num_chopped = word->chopped_word->NumBlobs();
  const int num_chopped_core =
      num_chopped - num_chopped_leading - num_chopped_trailing;

  *retry_rebuild_leading = *retry_rebuild_trailing = 0;

  // Chop apart the word into up to three pieces.
  BlamerBundle *bb0 = NULL;
  BlamerBundle *bb1 = NULL;
  WERD_RES *prefix = NULL;
  WERD_RES *core = NULL;
  WERD_RES *suffix = NULL;
  if (num_chopped_leading > 0) {
    prefix = new WERD_RES(*word);
    split_word(prefix, num_chopped_leading, &core, &bb0);
  } else {
    core = new WERD_RES(*word);
  }
  if (num_chopped_trailing > 0) {
    int split_pt = num_chopped_core;
    split_word(core, split_pt, &suffix, &bb1);
  }

  // Recognize the pieces in turn.
  int saved_cp_multiplier = classify_class_pruner_multiplier;
  int saved_im_multiplier = classify_integer_matcher_multiplier;
  if (prefix) {
    // Turn off Tesseract's y-position penalties for the leading superscript.
    classify_class_pruner_multiplier.set_value(0);
    classify_integer_matcher_multiplier.set_value(0);

    // Adjust our expectations about the baseline for this prefix.
    if (superscript_debug >= 3) {
      tprintf(" recognizing first %d chopped blobs\n",
              num_chopped_leading);
    }
    recog_word_recursive(prefix);
    if (superscript_debug >= 2) {
      tprintf(" The leading bits look like %s %s\n",
              ScriptPosToString(leading_pos),
              prefix->best_choice->unichar_string().string());
    }

    // Restore the normal y-position penalties.
    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
  }

  if (superscript_debug >= 3) {
    tprintf(" recognizing middle %d chopped blobs\n", num_chopped_core);
  }

  if (suffix) {
    // Turn off Tesseract's y-position penalties for the trailing superscript.
    classify_class_pruner_multiplier.set_value(0);
    classify_integer_matcher_multiplier.set_value(0);

    if (superscript_debug >= 3) {
      tprintf(" recognizing last %d chopped blobs\n",
              num_chopped_trailing);
    }
    recog_word_recursive(suffix);
    if (superscript_debug >= 2) {
      tprintf(" The trailing bits look like %s %s\n",
              ScriptPosToString(trailing_pos),
              suffix->best_choice->unichar_string().string());
    }

    // Restore the normal y-position penalties.
    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
  }

  // Evaluate whether we think the results are believably better
  // than what we already had.
  const bool debug_believable = superscript_debug >= 1;
  bool good_prefix = !prefix || BelievableSuperscript(
      debug_believable, *prefix,
      superscript_bettered_certainty * leading_certainty,
      retry_rebuild_leading, NULL);
  bool good_suffix = !suffix || BelievableSuperscript(
      debug_believable, *suffix,
      superscript_bettered_certainty * trailing_certainty,
      NULL, retry_rebuild_trailing);

  *is_good = good_prefix && good_suffix;
  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
    // None of it is any good. Quit now.
    // NOTE(review): bb0/bb1 filled in by split_word() are not released on
    // this path; whether they are owned here is not visible from this
    // function -- confirm against split_word()/join_words().
    delete core;
    delete prefix;
    delete suffix;
    return NULL;
  }
  recog_word_recursive(core);

  // Now paste the results together into core.
  if (suffix) {
    suffix->SetAllScriptPositions(trailing_pos);
    join_words(core, suffix, bb1);
  }
  if (prefix) {
    prefix->SetAllScriptPositions(leading_pos);
    join_words(prefix, core, bb0);
    core = prefix;
    prefix = NULL;
  }

  if (superscript_debug >= 1) {
    tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
            core->best_choice->unichar_string().string());
  }
  return core;
}
void tesseract::Tesseract::unrej_good_chs | ( | WERD_RES * | word, |
ROW * | row | ||
) |
Definition at line 118 of file docqual.cpp.
{
  // Runs DocQualCallbacks::AcceptIfGoodQuality over the blobs of rebuild_word
  // matched against bln_boxes. Does nothing if either is missing or there are
  // no blobs. The row argument is not used here.
  const bool has_blobs = word->bln_boxes != NULL &&
                         word->rebuild_word != NULL &&
                         !word->rebuild_word->blobs.empty();
  if (!has_blobs)
    return;

  DocQualCallbacks cb(word);
  word->bln_boxes->ProcessMatchedBlobs(
      *word->rebuild_word,
      NewPermanentTessCallback(&cb, &DocQualCallbacks::AcceptIfGoodQuality));
}
void tesseract::Tesseract::unrej_good_quality_words | ( | PAGE_RES_IT & | page_res_it | ) |
Definition at line 164 of file docqual.cpp.
{
  // Pass 1: un-reject characters, either blindly (bland_unrej) or only in
  // rows whose reject ratio is at most quality_rowrej_pc; other rows are
  // skipped wholesale.
  // Pass 2: zero the page/block/row reject statistics and recompute them
  // word by word through rej_stat_word().
  ROW_RES *current_row;
  BLOCK_RES *current_block;

  page_res_it.restart_page();
  while (page_res_it.word() != NULL) {
    check_debug_pt(page_res_it.word(), 100);
    if (bland_unrej) {
      WERD_RES *word = page_res_it.word();
      for (int i = 0; i < word->reject_map.length(); i++) {
        if (word->reject_map[i].accept_if_good_quality())
          word->reject_map[i].setrej_quality_accept();
      }
      page_res_it.forward();
    } else if ((page_res_it.row()->char_count > 0) &&
               ((page_res_it.row()->rej_count /
                 static_cast<float>(page_res_it.row()->char_count)) <=
                quality_rowrej_pc)) {
      WERD_RES *word = page_res_it.word();
      if (word->reject_map.quality_recoverable_rejects() &&
          (tessedit_unrej_any_wd ||
           acceptable_word_string(
               *word->uch_set,
               word->best_choice->unichar_string().string(),
               word->best_choice->unichar_lengths().string()) !=
               AC_UNACCEPTABLE)) {
        unrej_good_chs(word, page_res_it.row()->row);
      }
      page_res_it.forward();
    } else {
      /* Skip to end of dodgy row */
      current_row = page_res_it.row();
      while ((page_res_it.word() != NULL) &&
             (page_res_it.row() == current_row))
        page_res_it.forward();
    }
    check_debug_pt(page_res_it.word(), 110);
  }

  page_res_it.restart_page();
  page_res_it.page_res->char_count = 0;
  page_res_it.page_res->rej_count = 0;
  current_block = NULL;
  current_row = NULL;
  for (; page_res_it.word() != NULL; page_res_it.forward()) {
    if (current_block != page_res_it.block()) {
      // Entering a new block: reset its counters.
      current_block = page_res_it.block();
      current_block->char_count = 0;
      current_block->rej_count = 0;
    }
    if (current_row != page_res_it.row()) {
      // Entering a new row: reset its counters.
      current_row = page_res_it.row();
      current_row->char_count = 0;
      current_row->rej_count = 0;
      current_row->whole_word_rej_count = 0;
    }
    page_res_it.rej_stat_word();
  }
}
BOOL8 tesseract::Tesseract::word_adaptable | ( | WERD_RES * | word, |
uinT16 | mode | ||
) |
Definition at line 46 of file adaptions.cpp.
{
  // Decides whether a word may be used for adaption. `mode` is a bit set
  // indexed by the MODES enum below: ADAPTABLE_WERD and ACCEPTABLE_WERD select
  // which flags can make the word eligible; the CHECK_* bits each add a veto.
  // Returns FALSE when mode is 0, when no selected flag is set, or when any
  // enabled check fails; otherwise returns the accumulated status.
  if (tessedit_adaption_debug) {
    // The original guarded only the string against a NULL best_choice and
    // then dereferenced it for rating/certainty; guard all three so the debug
    // trace cannot crash on a word without a choice.
    const WERD_CHOICE* debug_choice = word->best_choice;
    tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
            debug_choice == NULL ? "" :
                debug_choice->unichar_string().string(),
            debug_choice == NULL ? 0.0f : debug_choice->rating(),
            debug_choice == NULL ? 0.0f : debug_choice->certainty());
  }

  BOOL8 status = FALSE;
  BITS16 flags(mode);

  enum MODES
  {
    ADAPTABLE_WERD,
    ACCEPTABLE_WERD,
    CHECK_DAWGS,
    CHECK_SPACES,
    CHECK_ONE_ELL_CONFLICT,
    CHECK_AMBIG_WERD
  };

  /*
  0: NO adaption
  */
  if (mode == 0) {
    if (tessedit_adaption_debug) tprintf("adaption disabled\n");
    return FALSE;
  }

  if (flags.bit (ADAPTABLE_WERD)) {
    status |= word->tess_would_adapt;  // result of Classify::AdaptableWord()
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_would_adapt bit is false\n");
    }
  }

  if (flags.bit (ACCEPTABLE_WERD)) {
    status |= word->tess_accepted;
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_accepted bit is false\n");
    }
  }

  if (!status) {                  // If not set then
    return FALSE;                // ignore other checks
  }

  if (flags.bit (CHECK_DAWGS) &&
    (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
    (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
    (word->best_choice->permuter () != USER_DAWG_PERM) &&
    (word->best_choice->permuter () != NUMBER_PERM)) {
    if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
    return FALSE;
  }

  if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE)) {
    if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
    return FALSE;
  }

  if (flags.bit (CHECK_SPACES) &&
    (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
    if (tessedit_adaption_debug) tprintf("word contains spaces\n");
    return FALSE;
  }

  if (flags.bit (CHECK_AMBIG_WERD) &&
      word->best_choice->dangerous_ambig_found()) {
    if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
    return FALSE;
  }

  if (tessedit_adaption_debug) {
    tprintf("returning status %d\n", status);
  }
  return status;
}
BOOL8 tesseract::Tesseract::word_blank_and_set_display | ( | BLOCK * | block, |
ROW * | row, | ||
WERD_RES * | word_res | ||
) |
Definition at line 717 of file pgedit.cpp.
{
  // Paints the word's bounding box black (fill and border) in image_win, then
  // hands over to word_set_display() and returns its result.
  TBOX word_box = word_res->word->bounding_box();
  word_box.plot(image_win, ScrollView::BLACK, ScrollView::BLACK);
  return word_set_display(block, row, word_res);
}
BOOL8 tesseract::Tesseract::word_bln_display | ( | BLOCK * | block, |
ROW * | row, | ||
WERD_RES * | word_res | ||
) |
Normalize word and display in word window
Definition at line 730 of file pgedit.cpp.
{
  // Draws the word's blobs, normalized through word_res->denorm, in the
  // baseline-normalized word window. Sets the word up for recognition first
  // if it has no chopped_word yet. Always returns TRUE.
  TWERD *bln_word = word_res->chopped_word;
  if (bln_word == NULL) {
    word_res->SetupForRecognition(unicharset, this, BestPix(),
                                  tessedit_ocr_engine_mode, NULL,
                                  classify_bln_numeric_mode,
                                  textord_use_cjk_fp_model,
                                  poly_allow_detailed_fx,
                                  row, block);
    bln_word = word_res->chopped_word;
  }

  bln_word_window_handle()->Clear();
  display_bln_lines(bln_word_window_handle(), ScrollView::CYAN,
                    1.0, 0.0f, -1000.0f, 1000.0f);

  // Plot each blob in the next colour of the WERD colour cycle.
  C_BLOB_IT blob_it(word_res->word->cblob_list());
  ScrollView::Color blob_color = WERD::NextColor(ScrollView::BLACK);
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    blob_it.data()->plot_normed(word_res->denorm, blob_color,
                                ScrollView::BROWN, bln_word_window_handle());
    blob_color = WERD::NextColor(blob_color);
  }

  bln_word_window_handle()->Update();
  return TRUE;
}
inT16 tesseract::Tesseract::word_blob_quality | ( | WERD_RES * | word, |
ROW * | row | ||
) |
Definition at line 66 of file docqual.cpp.
{
  // Returns the match_count gathered by DocQualCallbacks::CountMatchingBlobs
  // over the word's matched blobs, or 0 when bln_boxes or rebuild_word is
  // missing or there are no blobs. The row argument is not used here.
  const bool has_blobs = word->bln_boxes != NULL &&
                         word->rebuild_word != NULL &&
                         !word->rebuild_word->blobs.empty();
  if (!has_blobs)
    return 0;

  DocQualCallbacks cb(word);
  word->bln_boxes->ProcessMatchedBlobs(
      *word->rebuild_word,
      NewPermanentTessCallback(&cb, &DocQualCallbacks::CountMatchingBlobs));
  return cb.match_count;
}
void tesseract::Tesseract::word_char_quality | ( | WERD_RES * | word, |
ROW * | row, | ||
inT16 * | match_count, | ||
inT16 * | accepted_match_count | ||
) |
Definition at line 98 of file docqual.cpp.
{
  // Reports, through the two out-parameters, the match_count and
  // accepted_match_count gathered by DocQualCallbacks::CountAcceptedBlobs over
  // the word's matched blobs. The row argument is not used here.
  if (word->bln_boxes == NULL ||
      word->rebuild_word == NULL || word->rebuild_word->blobs.empty()) {
    // Nothing to match: report zero counts rather than leaving the caller's
    // variables unwritten (word_blob_quality() likewise returns 0 here).
    *match_count = 0;
    *accepted_match_count = 0;
    return;
  }

  DocQualCallbacks cb(word);
  word->bln_boxes->ProcessMatchedBlobs(
      *word->rebuild_word,
      NewPermanentTessCallback(&cb, &DocQualCallbacks::CountAcceptedBlobs));
  *match_count = cb.match_count;
  *accepted_match_count = cb.accepted_match_count;
}
BOOL8 tesseract::Tesseract::word_contains_non_1_digit | ( | const char * | word, |
const char * | word_lengths | ||
) |
Definition at line 512 of file reject.cpp.
{
  // Scans the word one unichar at a time (word_lengths[k] is the byte length
  // of the k-th unichar) and returns TRUE as soon as it meets a digit other
  // than the single-byte character '1'. Returns FALSE otherwise.
  inT16 index = 0;
  inT16 offset = 0;
  while (word[offset] != '\0') {
    const char step = word_lengths[index];
    const bool is_lone_one = (step == 1 && word[offset] == '1');
    if (unicharset.get_isdigit(word + offset, step) && !is_lone_one)
      return TRUE;
    offset += step;
    ++index;
  }
  return FALSE;
}
CRUNCH_MODE tesseract::Tesseract::word_deletable | ( | WERD_RES * | word, |
inT16 & | delete_mode | ||
) |
Definition at line 899 of file docqual.cpp.
{
  // Classifies a crunched word as CR_NONE, CR_DELETE or CR_LOOSE_SPACE and
  // stores a numeric reason code in delete_mode (0 means "keep"). The checks
  // run in a fixed order and the first one that fires decides the result.
  const int word_len = word->reject_map.length();
  TBOX box;  // Bounding box of rebuild_word; stays default if there is none.

  if (word->unlv_crunch_mode == CR_NONE) {
    delete_mode = 0;
    return CR_NONE;
  }

  if (word_len == 0) {
    delete_mode = 1;
    return CR_DELETE;
  }

  if (word->rebuild_word != NULL) {
    // Cube leaves rebuild_word NULL.
    box = word->rebuild_word->bounding_box();
    if (box.height() < crunch_del_min_ht * kBlnXHeight) {
      delete_mode = 4;
      return CR_DELETE;
    }

    if (noise_outlines(word->rebuild_word)) {
      delete_mode = 5;
      return CR_DELETE;
    }
  }

  if ((failure_count(word) * 1.5) > word_len) {
    delete_mode = 2;
    return CR_LOOSE_SPACE;
  }

  if (word->best_choice->certainty() < crunch_del_cert) {
    delete_mode = 7;
    return CR_LOOSE_SPACE;
  }

  const float rating_per_ch = word->best_choice->rating() / word_len;
  if (rating_per_ch > crunch_del_rating) {
    delete_mode = 8;
    return CR_LOOSE_SPACE;
  }

  // NOTE(review): when rebuild_word is NULL the geometric tests below run on
  // a default-constructed TBOX -- confirm that this is intended.
  if (box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
    delete_mode = 9;
    return CR_LOOSE_SPACE;
  }

  if (box.bottom() >
      kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
    delete_mode = 10;
    return CR_LOOSE_SPACE;
  }

  if (box.height() > crunch_del_max_ht * kBlnXHeight) {
    delete_mode = 11;
    return CR_LOOSE_SPACE;
  }

  if (box.width() < crunch_del_min_width * kBlnXHeight) {
    delete_mode = 3;
    return CR_LOOSE_SPACE;
  }

  delete_mode = 0;
  return CR_NONE;
}
BOOL8 tesseract::Tesseract::word_display | ( | BLOCK * | block, |
ROW * | row, | ||
WERD_RES * | word_res | ||
) |
word_display() Word Processor
Display a word according to its display modes
Definition at line 762 of file pgedit.cpp.
{
  // Draws one word in image_win. When color_mode is not CM_RAINBOW and a
  // box_word exists, each blob box is drawn green, or red if it has the
  // property selected by color_mode. Otherwise the word is drawn according to
  // its display flags (box, edge steps, polygonal approximation, text and
  // blamer information), falling back to the plain bounding box.
  WERD* word = word_res->word;
  BOOL8 displayed_something = FALSE;

  if (color_mode != CM_RAINBOW && word_res->box_word != NULL) {
    BoxWord* box_word = word_res->box_word;
    WERD_CHOICE* best_choice = word_res->best_choice;
    int length = box_word->length();
    if (word_res->fontinfo == NULL) return false;
    const FontInfo& font_info = *word_res->fontinfo;
    for (int i = 0; i < length; ++i) {
      ScrollView::Color color = ScrollView::GREEN;
      switch (color_mode) {
        case CM_SUBSCRIPT:
          if (best_choice->BlobPosition(i) == SP_SUBSCRIPT)
            color = ScrollView::RED;
          break;
        case CM_SUPERSCRIPT:
          if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT)
            color = ScrollView::RED;
          break;
        case CM_ITALIC:
          if (font_info.is_italic())
            color = ScrollView::RED;
          break;
        case CM_BOLD:
          if (font_info.is_bold())
            color = ScrollView::RED;
          break;
        case CM_FIXEDPITCH:
          if (font_info.is_fixed_pitch())
            color = ScrollView::RED;
          break;
        case CM_SERIF:
          if (font_info.is_serif())
            color = ScrollView::RED;
          break;
        case CM_SMALLCAPS:
          if (word_res->small_caps)
            color = ScrollView::RED;
          break;
        case CM_DROPCAPS:
          if (best_choice->BlobPosition(i) == SP_DROPCAP)
            color = ScrollView::RED;
          break;
          // TODO(rays) underline is currently completely unsupported.
        case CM_UNDERLINE:
        default:
          break;
      }
      image_win->Pen(color);
      TBOX box = box_word->BlobBox(i);
      image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
    }
    return true;
  }

  /*
    Note the double coercions of(COLOUR)((inT32)editor_image_word_bb_color)
    etc. are to keep the compiler happy.
  */
  // Bounding boxes of the word and of each of its blobs.
  if (word->display_flag(DF_BOX)) {
    word->bounding_box().plot(image_win,
     (ScrollView::Color)((inT32)
      editor_image_word_bb_color),
     (ScrollView::Color)((inT32)
      editor_image_word_bb_color));

    ScrollView::Color c = (ScrollView::Color)
       ((inT32) editor_image_blob_bb_color);
    image_win->Pen(c);
    C_BLOB_IT c_it;  // cblob iterator
    c_it.set_to_list(word->cblob_list());
    for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
      c_it.data()->bounding_box().plot(image_win);
    displayed_something = TRUE;
  }

  // Edge steps, in rainbow colors.
  if (word->display_flag(DF_EDGE_STEP)) {
    word->plot(image_win);
    displayed_something = TRUE;
  }

  // Polygonal approximation: needs a temporary converted copy of the word.
  if (word->display_flag(DF_POLYGONAL)) {
    TWERD* tword = TWERD::PolygonalCopy(poly_allow_detailed_fx, word);
    tword->plot(image_win);
    delete tword;
    displayed_something = TRUE;
  }

  // Display correct text and blamer information.
  STRING text;
  STRING blame;
  if (word->display_flag(DF_TEXT) && word->text() != NULL) {
    text = word->text();
  }
  if (word->display_flag(DF_BLAMER) &&
      !(word_res->blamer_bundle != NULL &&
        word_res->blamer_bundle->incorrect_result_reason() == IRR_CORRECT)) {
    text = "";
    const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
    if (blamer_bundle == NULL) {
      text += "NULL";
    } else {
      text = blamer_bundle->TruthString();
    }
    text += " -> ";
    STRING best_choice_str;
    if (word_res->best_choice == NULL) {
      best_choice_str = "NULL";
    } else {
      word_res->best_choice->string_and_lengths(&best_choice_str, NULL);
    }
    text += best_choice_str;
    IncorrectResultReason reason = (blamer_bundle == NULL) ?
        IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
    ASSERT_HOST(reason < IRR_NUM_REASONS);
    blame += " [";
    blame += BlamerBundle::IncorrectReasonName(reason);
    blame += "]";
  }
  if (text.length() > 0) {
    TBOX word_bb = word->bounding_box();  // word bounding box
    image_win->Pen(ScrollView::RED);
    int word_height = word_bb.height();   // ht of word BB
    int text_height = 0.50 * word_height;
    if (text_height > 20) text_height = 20;
    image_win->TextAttributes("Arial", text_height, false, false, false);
    // Horizontal offset from the bottom-left corner of the box.
    float shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
    image_win->Text(word_bb.left() + shift,
                    word_bb.bottom() + 0.25 * word_height, text.string());
    if (blame.length() > 0) {
      image_win->Text(word_bb.left() + shift,
                      word_bb.bottom() + 0.25 * word_height - text_height,
                      blame.string());
    }

    displayed_something = TRUE;
  }

  if (!displayed_something)      // display BBox anyway
    word->bounding_box().plot(image_win,
     (ScrollView::Color)((inT32) editor_image_word_bb_color),
     (ScrollView::Color)((inT32)
      editor_image_word_bb_color));
  return TRUE;
}
BOOL8 tesseract::Tesseract::word_dumper | ( | BLOCK * | block, |
ROW * | row, | ||
WERD_RES * | word_res | ||
) |
Dump members to the debug window
Definition at line 922 of file pgedit.cpp.
{
  // Prints the block (when one is given), the row and the word through
  // tprintf, plus the blamer debug string when wordrec_debug_blamer is set
  // and the word's blamer bundle is not IRR_CORRECT. Always returns TRUE.
  if (block != NULL) {
    tprintf("\nBlock data...\n");
    block->print(NULL, FALSE);
  }

  tprintf("\nRow data...\n");
  row->print(NULL);

  tprintf("\nWord data...\n");
  word_res->word->print();

  const bool show_blamer =
      word_res->blamer_bundle != NULL && wordrec_debug_blamer &&
      word_res->blamer_bundle->incorrect_result_reason() != IRR_CORRECT;
  if (show_blamer) {
    tprintf("Current blamer debug: %s\n",
            word_res->blamer_bundle->debug().string());
  }
  return TRUE;
}
Definition at line 78 of file docqual.cpp.
{
  // Sums count_outline_errs() over the blobs of rebuild_word, pairing the
  // k-th blob with the k-th char of the best choice's unichar string.
  // Returns 0 when there is no rebuild_word.
  // NOTE(review): the signature row is missing from this page; the body
  // presumably belongs to word_outline_errs(WERD_RES* word) -- confirm.
  inT16 err_count = 0;
  if (word->rebuild_word == NULL)
    return err_count;

  inT16 char_index = 0;
  for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b, char_index++) {
    TBLOB* blob = word->rebuild_word->blobs[b];
    err_count += count_outline_errs(
        word->best_choice->unichar_string()[char_index],
        blob->NumOutlines());
  }
  return err_count;
}
BOOL8 tesseract::Tesseract::word_set_display | ( | BLOCK * | block, |
ROW * | row, | ||
WERD_RES * | word_res | ||
) |
word_set_display() Word processor
Display word according to current display mode settings
Definition at line 945 of file pgedit.cpp.
{
  // Copies each bit of word_display_mode onto the matching display flag of
  // the word, then draws it with word_display() and returns that result.
  WERD* werd = word_res->word;

  werd->set_display_flag(DF_BOX,
                         word_display_mode.bit(DF_BOX));
  werd->set_display_flag(DF_TEXT,
                         word_display_mode.bit(DF_TEXT));
  werd->set_display_flag(DF_POLYGONAL,
                         word_display_mode.bit(DF_POLYGONAL));
  werd->set_display_flag(DF_EDGE_STEP,
                         word_display_mode.bit(DF_EDGE_STEP));
  werd->set_display_flag(DF_BN_POLYGONAL,
                         word_display_mode.bit(DF_BN_POLYGONAL));
  werd->set_display_flag(DF_BLAMER,
                         word_display_mode.bit(DF_BLAMER));

  return word_display(block, row, word_res);
}
inT16 tesseract::Tesseract::worst_noise_blob | ( | WERD_RES * | word_res, |
float * | worst_noise_score | ||
) |
Definition at line 684 of file fixspace.cpp.
{
  // Finds the blob with the lowest noise score that lies far enough from both
  // ends of the word: at least fixsp_non_noise_limit non-noise blobs must
  // come before and after it. Returns that blob's index and stores its score
  // in *worst_noise_score, or returns -1 when no blob qualifies (word too
  // short, no rebuild_word, or no score below the small-outline limit).
  float noise_score[512];
  int i;
  int min_noise_blob;            // 1st contender
  int max_noise_blob;            // last contender
  int non_noise_count;
  int worst_noise_blob;          // Worst blob
  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
  float non_noise_limit = kBlnXHeight * 0.8;

  if (word_res->rebuild_word == NULL)
    return -1;  // Can't handle cube words.

  // Normalised.
  int blob_count = word_res->box_word->length();
  ASSERT_HOST(blob_count <= 512);
  if (blob_count < 5)
    return -1;                   // too short to split

  /* Get the noise scores for all blobs */

  #ifndef SECURE_NAMES
  if (debug_fix_space_level > 5)
    tprintf("FP fixspace Noise metrics for \"%s\": ",
            word_res->best_choice->unichar_string().string());
  #endif

  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
    TBLOB* blob = word_res->rebuild_word->blobs[i];
    if (word_res->reject_map[i].accepted())
      noise_score[i] = non_noise_limit;
    else
      noise_score[i] = blob_noise_score(blob);

    if (debug_fix_space_level > 5)
      tprintf("%1.1f ", noise_score[i]);
  }
  // If rebuild_word has fewer blobs than box_word, the scans below would
  // read uninitialised scores. Score the remainder as non-noise so the result
  // is deterministic.
  for (; i < blob_count; i++)
    noise_score[i] = non_noise_limit;
  if (debug_fix_space_level > 5)
    tprintf("\n");

  /* Now find the worst one which is far enough away from the end of the word */

  // Scan from the left until enough non-noise blobs have been seen.
  non_noise_count = 0;
  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;

  min_noise_blob = i;

  // Same scan from the right.
  non_noise_count = 0;
  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
       i--) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;

  max_noise_blob = i;

  if (min_noise_blob > max_noise_blob)
    return -1;

  // Only scores strictly below small_limit can be selected.
  *worst_noise_score = small_limit;
  worst_noise_blob = -1;
  for (i = min_noise_blob; i <= max_noise_blob; i++) {
    if (noise_score[i] < *worst_noise_score) {
      worst_noise_blob = i;
      *worst_noise_score = noise_score[i];
    }
  }
  return worst_noise_blob;
}
void tesseract::Tesseract::write_results | ( | PAGE_RES_IT & | page_res_it, |
char | newline_type, | ||
BOOL8 | force_eol | ||
) |
Definition at line 133 of file output.cpp.
{
  // Updates the stats_ output-state flags for the current word and, for words
  // that are not tilde-crunched, merges a leading adjacent blob where needed,
  // runs set_unlv_suspects() and applies the zero/minimal rejection
  // overrides to the word's reject map.
  WERD_RES *word = page_res_it.word();
  const UNICHARSET &uchset = *word->uch_set;
  BOOL8 need_reject = FALSE;
  UNICHAR_ID space = uchset.unichar_to_id(" ");

  // Crunched or empty word, with no override of tilde crunching in force.
  if ((word->unlv_crunch_mode != CR_NONE ||
       word->best_choice->length() == 0) &&
      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
    if ((word->unlv_crunch_mode != CR_DELETE) &&
        (!stats_.tilde_crunch_written ||
         ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
          (word->word->space() > 0) &&
          !word->word->flag(W_FUZZY_NON) &&
          !word->word->flag(W_FUZZY_SP)))) {
      if (!word->word->flag(W_BOL) &&
          (word->word->space() > 0) &&
          !word->word->flag(W_FUZZY_NON) &&
          !word->word->flag(W_FUZZY_SP)) {
        stats_.last_char_was_tilde = false;
      }
      need_reject = TRUE;
    }
    if ((need_reject && !stats_.last_char_was_tilde) ||
        (force_eol && stats_.write_results_empty_block)) {
      /* Write a reject char - mark as rejected unless zero_rejection mode */
      stats_.last_char_was_tilde = TRUE;
      stats_.tilde_crunch_written = true;
      stats_.last_char_was_newline = false;
      stats_.write_results_empty_block = false;
    }

    if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) ||
        force_eol) {
      stats_.tilde_crunch_written = false;
      stats_.last_char_was_newline = true;
      stats_.last_char_was_tilde = false;
    }

    if (force_eol)
      stats_.write_results_empty_block = true;
    return;
  }

  /* NORMAL PROCESSING of non tilde crunched words */

  stats_.tilde_crunch_written = false;
  stats_.last_char_was_newline = (newline_type != 0);
  stats_.write_results_empty_block = force_eol;  // about to write a real word

  if (unlv_tilde_crunching &&
      stats_.last_char_was_tilde &&
      (word->word->space() == 0) &&
      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
      (word->best_choice->unichar_id(0) == space)) {
    /* Prevent adjacent tilde across words - we know that adjacent tildes
       within words have been removed */
    word->MergeAdjacentBlobs(0);
  }

  if (newline_type ||
      (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
    stats_.last_char_was_tilde = false;
  } else if (word->reject_map.length() > 0) {
    // The trailing-tilde state follows the word's last output character.
    stats_.last_char_was_tilde =
        (word->best_choice->unichar_id(word->reject_map.length() - 1) ==
         space);
  } else if (word->word->space() > 0) {
    stats_.last_char_was_tilde = false;
  }
  /* else it is unchanged as there are no output chars */

  ASSERT_HOST(word->best_choice->length() == word->reject_map.length());

  set_unlv_suspects(word);
  check_debug_pt(word, 120);
  if (tessedit_rejection_debug) {
    tprintf("Dict word: \"%s\": %d\n",
            word->best_choice->debug_string().string(),
            dict_word(*(word->best_choice)));
  }

  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
    if (tessedit_zero_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (int i = 0; i < word->best_choice->length(); ++i) {
        if (word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
    if (tessedit_minimal_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (int i = 0; i < word->best_choice->length(); ++i) {
        if ((word->best_choice->unichar_id(i) != space) &&
            word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
  }
}
"Debug level"
Definition at line 738 of file tesseractclass.h.
char* tesseract::Tesseract::applybox_exposure_pattern = ".exp" |
"Exposure value follows this pattern in the image" " filename. The names of the image files are expected" " to be in the form [lang].[fontname].exp[num].tif"
Definition at line 743 of file tesseractclass.h.
"Learn both character fragments (as is done in the" " special low exposure mode) as well as unfragmented" " characters."
Definition at line 747 of file tesseractclass.h.
bool tesseract::Tesseract::applybox_learn_ngrams_mode = false |
"Each bounding box is assumed to contain ngrams. Only" " learn the ngrams whose outlines overlap horizontally."
Definition at line 750 of file tesseractclass.h.
"Page number to apply boxes from"
Definition at line 739 of file tesseractclass.h.
double tesseract::Tesseract::bestrate_pruning_factor = 2.0 |
"Multiplying factor of" " current best rate to prune other hypotheses"
Definition at line 985 of file tesseractclass.h.
int tesseract::Tesseract::bidi_debug = 0 |
"Debug level for BiDi"
Definition at line 737 of file tesseractclass.h.
bool tesseract::Tesseract::bland_unrej = false |
"unrej potential with no checks"
Definition at line 831 of file tesseractclass.h.
char* tesseract::Tesseract::chs_leading_punct = "('`\"" |
"Leading punctuation"
Definition at line 771 of file tesseractclass.h.
char* tesseract::Tesseract::chs_trailing_punct1 = ").,;:?!" |
"1st Trailing punctuation"
Definition at line 772 of file tesseractclass.h.
char* tesseract::Tesseract::chs_trailing_punct2 = ")'`\"" |
"2nd Trailing punctuation"
Definition at line 773 of file tesseractclass.h.
char* tesseract::Tesseract::conflict_set_I_l_1 = "Il1[]" |
"Il1 conflict set"
Definition at line 942 of file tesseractclass.h.
bool tesseract::Tesseract::crunch_accept_ok = true |
"Use acceptability in okstring"
Definition at line 858 of file tesseractclass.h.
"As it says"
Definition at line 867 of file tesseractclass.h.
double tesseract::Tesseract::crunch_del_cert = -10.0 |
"POTENTIAL crunch cert lt this"
Definition at line 847 of file tesseractclass.h.
double tesseract::Tesseract::crunch_del_high_word = 1.5 |
"Del if word gt xht x this above bl"
Definition at line 852 of file tesseractclass.h.
double tesseract::Tesseract::crunch_del_low_word = 0.5 |
"Del if word gt xht x this below bl"
Definition at line 853 of file tesseractclass.h.
double tesseract::Tesseract::crunch_del_max_ht = 3.0 |
"Del if word ht gt xht x this"
Definition at line 849 of file tesseractclass.h.
double tesseract::Tesseract::crunch_del_min_ht = 0.7 |
"Del if word ht lt xht x this"
Definition at line 848 of file tesseractclass.h.
double tesseract::Tesseract::crunch_del_min_width = 3.0 |
"Del if word width lt xht x this"
Definition at line 850 of file tesseractclass.h.
double tesseract::Tesseract::crunch_del_rating = 60 |
"POTENTIAL crunch rating lt this"
Definition at line 846 of file tesseractclass.h.
"Take out ~^ early?"
Definition at line 837 of file tesseractclass.h.
"Before word crunch?"
Definition at line 836 of file tesseractclass.h.
bool tesseract::Tesseract::crunch_include_numerals = false |
"Fiddle alpha figures"
Definition at line 861 of file tesseractclass.h.
bool tesseract::Tesseract::crunch_leave_accept_strings = false |
"Dont pot crunch sensible strings"
Definition at line 860 of file tesseractclass.h.
"Dont crunch words with long lower case strings"
Definition at line 863 of file tesseractclass.h.
bool tesseract::Tesseract::crunch_leave_ok_strings = true |
"Dont touch sensible strings"
Definition at line 857 of file tesseractclass.h.
"Dont crunch words with long lower case strings"
Definition at line 865 of file tesseractclass.h.
"Crunch words with long repetitions"
Definition at line 866 of file tesseractclass.h.
double tesseract::Tesseract::crunch_poor_garbage_cert = -9.0 |
"crunch garbage cert lt this"
Definition at line 841 of file tesseractclass.h.
double tesseract::Tesseract::crunch_poor_garbage_rate = 60 |
"crunch garbage rating lt this"
Definition at line 842 of file tesseractclass.h.
bool tesseract::Tesseract::crunch_pot_garbage = true |
"POTENTIAL crunch garbage"
Definition at line 845 of file tesseractclass.h.
"How many potential indicators needed"
Definition at line 856 of file tesseractclass.h.
double tesseract::Tesseract::crunch_pot_poor_cert = -8.0 |
"POTENTIAL crunch cert lt this"
Definition at line 844 of file tesseractclass.h.
double tesseract::Tesseract::crunch_pot_poor_rate = 40 |
"POTENTIAL crunch rating lt this"
Definition at line 843 of file tesseractclass.h.
"For adj length in rating per ch"
Definition at line 855 of file tesseractclass.h.
double tesseract::Tesseract::crunch_small_outlines_size = 0.6 |
"Small if lt xht x this"
Definition at line 854 of file tesseractclass.h.
bool tesseract::Tesseract::crunch_terrible_garbage = true |
"As it says"
Definition at line 839 of file tesseractclass.h.
double tesseract::Tesseract::crunch_terrible_rating = 80.0 |
"crunch rating lt this"
Definition at line 838 of file tesseractclass.h.
"Print cube debug info."
Definition at line 795 of file tesseractclass.h.
bool tesseract::Tesseract::debug_acceptable_wds = false |
"Dump word pass/fail chk"
Definition at line 770 of file tesseractclass.h.
"Contextual fixspace debug"
Definition at line 873 of file tesseractclass.h.
"Reestimate debug"
Definition at line 769 of file tesseractclass.h.
bool tesseract::Tesseract::docqual_excuse_outline_errs = false |
"Allow outline errs in unrejection?"
Definition at line 799 of file tesseractclass.h.
bool tesseract::Tesseract::enable_new_segsearch = false |
"Enable new segmentation search path."
Definition at line 1025 of file tesseractclass.h.
char* tesseract::Tesseract::file_type = ".tif" |
"Filename extension"
Definition at line 949 of file tesseractclass.h.
"What constitutes done for spacing"
Definition at line 872 of file tesseractclass.h.
"How many non-noise blbs either side?"
Definition at line 869 of file tesseractclass.h.
double tesseract::Tesseract::fixsp_small_outlines_size = 0.28 |
"Small if lt xht x this"
Definition at line 870 of file tesseractclass.h.
double tesseract::Tesseract::heuristic_max_char_wh_ratio = 2.0 |
"max char width-to-height ratio allowed in segmentation"
Definition at line 1023 of file tesseractclass.h.
double tesseract::Tesseract::heuristic_segcost_rating_base = 1.25 |
"base factor for adding segmentation cost into word rating. " "It's a multiplying factor, the larger the value above 1, " "the bigger the effect of segmentation cost."
Definition at line 1014 of file tesseractclass.h.
"weight associated with char rating in combined cost of state"
Definition at line 1016 of file tesseractclass.h.
"weight associated with seam cut in combined cost of state"
Definition at line 1021 of file tesseractclass.h.
double tesseract::Tesseract::heuristic_weight_width = 1000.0 |
"weight associated with width evidence in combined cost of" " state"
Definition at line 1019 of file tesseractclass.h.
bool tesseract::Tesseract::interactive_display_mode = false |
"Run interactively?"
Definition at line 948 of file tesseractclass.h.
"Depth of blob choice lists to explore" " when fixed length dawgs are on"
Definition at line 1008 of file tesseractclass.h.
bool tesseract::Tesseract::load_fixed_length_dawgs = true |
"Load fixed length" " dawgs (e.g. for non-space delimited languages)"
Definition at line 981 of file tesseractclass.h.
double tesseract::Tesseract::min_orientation_margin = 7.0 |
"Min acceptable orientation margin"
Definition at line 960 of file tesseractclass.h.
"Reject any x-ht lt or eq than this"
Definition at line 943 of file tesseractclass.h.
bool tesseract::Tesseract::ngram_permuter_activated = false |
"Activate character-level n-gram-based permuter"
Definition at line 1004 of file tesseractclass.h.
char* tesseract::Tesseract::numeric_punctuation = ".," |
"Punct. chs expected WITHIN numbers"
Definition at line 875 of file tesseractclass.h.
"Whether to use the top-line splitting process for Devanagari " "documents while performing ocr."
Definition at line 732 of file tesseractclass.h.
char* tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds = "-?*\075" |
"Allow NN to unrej"
Definition at line 941 of file tesseractclass.h.
char* tesseract::Tesseract::outlines_2 = "ij!?%\":;" |
"Non standard number of outlines"
Definition at line 797 of file tesseractclass.h.
char* tesseract::Tesseract::outlines_odd = "%| " |
"Non standard number of outlines"
Definition at line 796 of file tesseractclass.h.
int tesseract::Tesseract::pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT |
"Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation."
Definition at line 728 of file tesseractclass.h.
"Print paragraph debug info."
Definition at line 791 of file tesseractclass.h.
bool tesseract::Tesseract::paragraph_text_based = true |
"Run paragraph detection on the post-text-recognition " "(more accurate)"
Definition at line 794 of file tesseractclass.h.
"Turn on character type (property) consistency permuter"
Definition at line 997 of file tesseractclass.h.
bool tesseract::Tesseract::permute_debug = 0 |
"char permutation debug"
Definition at line 983 of file tesseractclass.h.
"Turn on fixed-length phrasebook search permuter"
Definition at line 995 of file tesseractclass.h.
bool tesseract::Tesseract::permute_only_top = false |
"Run only the top choice permuter"
Definition at line 1005 of file tesseractclass.h.
"Turn on word script consistency permuter"
Definition at line 987 of file tesseractclass.h.
bool tesseract::Tesseract::poly_allow_detailed_fx = false |
"Allow feature extractors to see the original outline"
Definition at line 964 of file tesseractclass.h.
double tesseract::Tesseract::quality_blob_pc = 0.0 |
"good_quality_doc gte good blobs limit"
Definition at line 775 of file tesseractclass.h.
double tesseract::Tesseract::quality_char_pc = 0.95 |
"good_quality_doc gte good char limit"
Definition at line 778 of file tesseractclass.h.
"alphas in a good word"
Definition at line 779 of file tesseractclass.h.
double tesseract::Tesseract::quality_outline_pc = 1.0 |
"good_quality_doc lte outline error limit"
Definition at line 777 of file tesseractclass.h.
double tesseract::Tesseract::quality_rej_pc = 0.08 |
"good_quality_doc lte rejection limit"
Definition at line 774 of file tesseractclass.h.
double tesseract::Tesseract::quality_rowrej_pc = 1.1 |
"good_quality_doc gte good char limit"
Definition at line 833 of file tesseractclass.h.
"Dont double check"
Definition at line 932 of file tesseractclass.h.
bool tesseract::Tesseract::rej_1Il_use_dict_word = false |
"Use dictword test"
Definition at line 931 of file tesseractclass.h.
bool tesseract::Tesseract::rej_alphas_in_number_perm = false |
"Extend permuter check"
Definition at line 937 of file tesseractclass.h.
bool tesseract::Tesseract::rej_trust_doc_dawg = false |
"Use DOC dawg in 1Il conf. detector"
Definition at line 930 of file tesseractclass.h.
bool tesseract::Tesseract::rej_use_good_perm = true |
"Individual rejection control"
Definition at line 935 of file tesseractclass.h.
bool tesseract::Tesseract::rej_use_sensible_wd = false |
"Extend permuter check"
Definition at line 936 of file tesseractclass.h.
bool tesseract::Tesseract::rej_use_tess_accepted = true |
"Individual rejection control"
Definition at line 933 of file tesseractclass.h.
bool tesseract::Tesseract::rej_use_tess_blanks = true |
"Individual rejection control"
Definition at line 934 of file tesseractclass.h.
"if >this fract"
Definition at line 938 of file tesseractclass.h.
"Debug the whole segmentation process"
Definition at line 982 of file tesseractclass.h.
double tesseract::Tesseract::segment_reward_chartype = 0.97 |
"Score multiplier for char type consistency within a word. "
Definition at line 999 of file tesseractclass.h.
double tesseract::Tesseract::segment_reward_ngram_best_choice = 0.99 |
"Score multiplier for ngram permuter's best choice" " (only used in the Han script path)."
Definition at line 1002 of file tesseractclass.h.
double tesseract::Tesseract::segment_reward_script = 0.95 |
"Score multiplier for script consistency within a word. " "Being a 'reward' factor, it should be <= 1. " "Smaller value implies bigger reward."
Definition at line 993 of file tesseractclass.h.
"incorporate segmentation cost in word rating?"
Definition at line 989 of file tesseractclass.h.
"Maximum character width-to-height ratio for " "fixed pitch fonts"
Definition at line 1028 of file tesseractclass.h.
double tesseract::Tesseract::subscript_max_y_top = 0.5 |
"Maximum top of a character measured as a multiple of x-height " "above the baseline for us to reconsider whether it's a " "subscript."
Definition at line 894 of file tesseractclass.h.
double tesseract::Tesseract::superscript_bettered_certainty = 0.97 |
"What reduction in " "badness do we think sufficient to choose a superscript over " "what we'd thought. For example, a value of 0.6 means we want " "to reduce badness of certainty by 40%"
Definition at line 886 of file tesseractclass.h.
"Debug level for sub & superscript fixer"
Definition at line 879 of file tesseractclass.h.
double tesseract::Tesseract::superscript_min_y_bottom = 0.3 |
"Minimum bottom of a character measured as a multiple of " "x-height above the baseline for us to reconsider whether it's " "a superscript."
Definition at line 898 of file tesseractclass.h.
double tesseract::Tesseract::superscript_scaledown_ratio = 0.4 |
"A superscript scaled down more than this is unbelievably " "small. For example, 0.3 means we expect the font size to " "be no smaller than 30% of the text line font size."
Definition at line 890 of file tesseractclass.h.
double tesseract::Tesseract::superscript_worse_certainty = 2.0 |
"How many times worse " "certainty does a superscript position glyph need to be for us " "to try classifying it as a char with a different baseline?"
Definition at line 882 of file tesseractclass.h.
double tesseract::Tesseract::suspect_accept_rating = -999.9 |
"Accept good rating limit"
Definition at line 915 of file tesseractclass.h.
bool tesseract::Tesseract::suspect_constrain_1Il = false |
"UNLV keep 1Il chars rejected"
Definition at line 913 of file tesseractclass.h.
int tesseract::Tesseract::suspect_level = 99 |
"Suspect marker level"
Definition at line 908 of file tesseractclass.h.
double tesseract::Tesseract::suspect_rating_per_ch = 999.9 |
"Dont touch bad rating limit"
Definition at line 914 of file tesseractclass.h.
"Dont Suspect dict wds longer than this"
Definition at line 912 of file tesseractclass.h.
"Min suspect level for rejecting spaces"
Definition at line 910 of file tesseractclass.h.
"Debug level for TessdataManager functions."
Definition at line 952 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_adaption_debug = false |
"Generate and print debug information for adaption"
Definition at line 736 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_ambigs_training = false |
"Perform training for ambiguities"
Definition at line 724 of file tesseractclass.h.
"Amount of debug output for bigram " "correction."
Definition at line 768 of file tesseractclass.h.
"Blacklist of chars not to recognize"
Definition at line 720 of file tesseractclass.h.
"Whitelist of chars to recognize"
Definition at line 722 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_consistent_reps = true |
"Force all rep chars the same"
Definition at line 922 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_create_boxfile = false |
"Output text with boxes"
Definition at line 944 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_create_hocr = false |
"Write .html hOCR output file"
Definition at line 904 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_create_pdf = false |
"Write .pdf output file"
Definition at line 905 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_debug_block_rejection = false |
"Block and Row stats"
Definition at line 764 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_debug_doc_rejection = false |
"Page stats"
Definition at line 828 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_debug_fonts = false |
"Output font info per char"
Definition at line 763 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_debug_quality_metrics = false |
"Output data to debug file"
Definition at line 830 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_display_outwords = false |
"Draw output words"
Definition at line 751 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_dont_blkrej_good_wds = false |
"Use word segmentation quality metric"
Definition at line 817 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_dont_rowrej_good_wds = false |
"Use word segmentation quality metric"
Definition at line 819 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_dump_choices = false |
"Dump char choices"
Definition at line 753 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_dump_pageseg_images = false |
"Dump intermediate images made during page segmentation"
Definition at line 710 of file tesseractclass.h.
"Enable correction based on the word bigram dictionary."
Definition at line 766 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_enable_doc_dict = true |
"Add words to the document dictionary"
Definition at line 762 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_fix_fuzzy_spaces = true |
"Try to improve fuzzy spaces"
Definition at line 756 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_fix_hyphens = true |
"Crunch double hyphens?"
Definition at line 759 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_flip_0O = true |
"Contextual 0O O0 flips"
Definition at line 925 of file tesseractclass.h.
"rej good doc wd if more than this fraction rejected"
Definition at line 825 of file tesseractclass.h.
"Reduce rejection on good docs"
Definition at line 801 of file tesseractclass.h.
"Rej blbs near image edge limit"
Definition at line 939 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_init_config_only = false |
"Only initialize with the config file. Useful if the instance is " "not going to be used for OCR but say only for layout analysis."
Definition at line 967 of file tesseractclass.h.
char* tesseract::Tesseract::tessedit_load_sublangs = "" |
"List of languages to load with this one"
Definition at line 954 of file tesseractclass.h.
double tesseract::Tesseract::tessedit_lower_flip_hyphen = 1.5 |
"Aspect ratio dot/hyphen test"
Definition at line 927 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_make_boxes_from_boxes = false |
"Generate more boxes from boxed chars"
Definition at line 708 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_matcher_log = false |
"Log matcher activity"
Definition at line 785 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_minimal_rej_pass1 = false |
"Do minimal rejection on pass 1 output"
Definition at line 783 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_minimal_rejection = false |
"Only reject tess failures"
Definition at line 916 of file tesseractclass.h.
"Which OCR engine(s) to run (Tesseract, Cube, both). Defaults" " to loading and running only Tesseract (no Cube, no combiner)." " (Values from OcrEngineMode enum in tesseractclass.h)"
Definition at line 718 of file tesseractclass.h.
"Acceptance decision algorithm"
Definition at line 979 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_override_permuter = true |
"According to dict_word"
Definition at line 950 of file tesseractclass.h.
"-1 -> All pages, else specific page to process"
Definition at line 946 of file tesseractclass.h.
"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," " 5=line, 6=word, 7=char" " (Values from PageSegMode enum in publictypes.h)"
Definition at line 714 of file tesseractclass.h.
"Run in parallel where possible"
Definition at line 969 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_prefer_joined_punct = false |
"Reward punctuation joins"
Definition at line 871 of file tesseractclass.h.
"Only rej partially rejected words in block rejection"
Definition at line 813 of file tesseractclass.h.
"Only preserve wds longer than this"
Definition at line 821 of file tesseractclass.h.
"Only rej partially rejected words in row rejection"
Definition at line 815 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_redo_xheight = true |
"Check/Correct x-height"
Definition at line 760 of file tesseractclass.h.
"Reject all bad quality wds"
Definition at line 827 of file tesseractclass.h.
double tesseract::Tesseract::tessedit_reject_block_percent = 45.00 |
"%rej allowed before rej whole block"
Definition at line 806 of file tesseractclass.h.
double tesseract::Tesseract::tessedit_reject_doc_percent = 65.00 |
"%rej allowed before rej whole doc"
Definition at line 804 of file tesseractclass.h.
"Rejection algorithm"
Definition at line 923 of file tesseractclass.h.
double tesseract::Tesseract::tessedit_reject_row_percent = 40.00 |
"%rej allowed before rej whole row"
Definition at line 808 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_rejection_debug = false |
"Adaption debug"
Definition at line 924 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_resegment_from_boxes = false |
"Take segmentation and labeling from box file"
Definition at line 702 of file tesseractclass.h.
"Conversion of word/line box file to char box file"
Definition at line 704 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_row_rej_good_docs = true |
"Apply row rejection to good docs"
Definition at line 823 of file tesseractclass.h.
"Adaptation decision algorithm for tess"
Definition at line 781 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_test_adaption = false |
"Test adaption criteria"
Definition at line 784 of file tesseractclass.h.
"Adaptation decision algorithm for tess"
Definition at line 787 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_timing_debug = false |
"Print timing stats"
Definition at line 754 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_train_from_boxes = false |
"Generate training data from boxed chars"
Definition at line 706 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_training_tess = false |
"Call Tess to learn blobs"
Definition at line 752 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_unrej_any_wd = false |
"Dont bother with word plausibility"
Definition at line 758 of file tesseractclass.h.
double tesseract::Tesseract::tessedit_upper_flip_hyphen = 1.8 |
"Aspect ratio dot/hyphen test"
Definition at line 929 of file tesseractclass.h.
"In multilingual mode use params model of the primary language"
Definition at line 956 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_use_reject_spaces = true |
"Reject spaces?"
Definition at line 802 of file tesseractclass.h.
double tesseract::Tesseract::tessedit_whole_wd_rej_row_percent = 70.00 |
"Number of row rejects in whole word rejects " "which prevents whole row rejection"
Definition at line 811 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_word_for_word = false |
"Make output have exactly one word per WERD"
Definition at line 919 of file tesseractclass.h.
"Write block separators in output"
Definition at line 900 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_write_images = false |
"Capture the image from the IPE"
Definition at line 947 of file tesseractclass.h.
"Write all parameters to the given file."
Definition at line 734 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_write_rep_codes = false |
"Write repetition char code"
Definition at line 902 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_write_unlv = false |
"Write .unlv output file"
Definition at line 903 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_zero_kelvin_rejection = false |
"Dont reject ANYTHING AT ALL"
Definition at line 921 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_zero_rejection = false |
"Dont reject ANYTHING"
Definition at line 917 of file tesseractclass.h.
bool tesseract::Tesseract::test_pt = false |
"Test for point"
Definition at line 788 of file tesseractclass.h.
double tesseract::Tesseract::test_pt_x = 99999.99 |
"xcoord"
Definition at line 789 of file tesseractclass.h.
double tesseract::Tesseract::test_pt_y = 99999.99 |
"ycoord"
Definition at line 790 of file tesseractclass.h.
bool tesseract::Tesseract::textord_equation_detect = false |
"Turn on equation detector"
Definition at line 968 of file tesseractclass.h.
bool tesseract::Tesseract::textord_tabfind_show_vlines = false |
"Debug line finding"
Definition at line 961 of file tesseractclass.h.
bool tesseract::Tesseract::textord_use_cjk_fp_model = FALSE |
"Use CJK fixed pitch model"
Definition at line 962 of file tesseractclass.h.
bool tesseract::Tesseract::unlv_tilde_crunching = true |
"Mark v.bad words for tilde crunch"
Definition at line 835 of file tesseractclass.h.
char* tesseract::Tesseract::unrecognised_char = "|" |
"Output char for unidentified blobs"
Definition at line 907 of file tesseractclass.h.
bool tesseract::Tesseract::use_new_state_cost = FALSE |
"use new state cost heuristics for segmentation state evaluation"
Definition at line 1010 of file tesseractclass.h.
"Max allowed deviation of blob top outside of font data"
Definition at line 877 of file tesseractclass.h.
"Min change in xht before actually trying it"
Definition at line 878 of file tesseractclass.h.