tesseract
3.03
|
#include "tessvars.h"
#include "scanutils.h"
#include <ctype.h>
#include <string.h>
#include "genericvector.h"
#include "reject.h"
#include "control.h"
#include "docqual.h"
#include "secname.h"
#include "globaloc.h"
#include "globals.h"
#include "helpers.h"
#include "tesseractclass.h"
Go to the source code of this file.
Namespaces | |
namespace | tesseract |
Functions | |
CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract | |
void | reject_blanks (WERD_RES *word) |
void | reject_poor_matches (WERD_RES *word) |
float | compute_reject_threshold (WERD_CHOICE *word) |
Definition at line 51 of file reject.cpp.
{
/*************************************************************************
 * set_done()
 *
 * Decides whether the word can be marked as finished (word->done).
 *
 * The flag starts out true only when the word is tess_accepted and its best
 * choice string contains no ' ' character. It is then cleared again when:
 *   - on pass 1, the word is not from one of the DAWG permuters (or is
 *     flagged as a dangerous ambiguity) and one_ell_conflict() reports a
 *     conflict; or
 *   - the word is flagged as a dangerous ambiguity, or it is neither from a
 *     DAWG permuter nor from NUMBER_PERM.
 *************************************************************************/
void Tesseract::set_done(WERD_RES *word, inT16 pass) {
  // Baseline: accepted by tess and no blank in the text. The string is only
  // inspected when tess_accepted is set.
  word->done =
      word->tess_accepted &&
      strchr(word->best_choice->unichar_string().string(), ' ') == NULL;

  const bool ambiguous = word->best_choice->dangerous_ambig_found();
  const bool in_dictionary =
      word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
      word->best_choice->permuter() == FREQ_DAWG_PERM ||
      word->best_choice->permuter() == USER_DAWG_PERM;
  const bool needs_ell_check = !in_dictionary || ambiguous;

  // one_ell_conflict() is the last operand so that it is only called when
  // every cheaper condition already holds.
  if (word->done && pass == 1 && needs_ell_check &&
      one_ell_conflict(word, FALSE)) {
    if (tessedit_rejection_debug) {
      tprintf("one_ell_conflict detected\n");
    }
    word->done = FALSE;
  }

  if (word->done) {
    // The permuter is re-read here, after the one_ell_conflict() call above.
    const bool unsupported =
        !in_dictionary && word->best_choice->permuter() != NUMBER_PERM;
    if (unsupported || ambiguous) {
      if (tessedit_rejection_debug) {
        tprintf("non-dict or ambig word detected\n");
      }
      word->done = FALSE;
    }
  }

  if (tessedit_rejection_debug) {
    tprintf("set_done(): done=%d\n", word->done);
    word->best_choice->print("");
  }
}

/*************************************************************************
 * make_reject_map()
 *
 * Sets the done flag to indicate whether the result is acceptable, and
 * builds the reject map for the word.
 *
 * word: the word result to process; its done flag and reject_map are
 *       updated.
 * row:  not referenced anywhere in this function body.
 * pass: forwarded unchanged to set_done().
 *
 * Only tessedit_reject_mode values 0 and 5 are handled; any other value
 * prints "BAD tessedit_reject_mode" and calls err_exit().
 *************************************************************************/
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
  flip_0O(word);
  check_debug_pt(word, -1);  // For trap only
  set_done(word, pass);      // Set acceptance
  word->reject_map.initialise(word->best_choice->unichar_lengths().length());
  reject_blanks(word);

  if (tessedit_reject_mode == 0) {
    /* 0: Rays original heuristic - the baseline */
    if (!word->done) {
      reject_poor_matches(word);
    }
  } else if (tessedit_reject_mode == 5) {
    /*
      5: Reject I/1/l from words where there is no strong contextual
         confirmation; the whole of any unacceptable words (incl PERM rej of
         dubious 1/I/ls); and the whole of any words which are very small
    */
    if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
      word->reject_map.rej_word_small_xht();
    } else {
      one_ell_conflict(word, TRUE);

      /*
        Originally the code here just used the done flag. The conditions for
        setting the done flag are duplicated and unpacked below so that each
        mechanism can be turned on or off independently. This works WITHOUT
        affecting the done flag setting.
      */
      if (rej_use_tess_accepted && !word->tess_accepted) {
        word->reject_map.rej_word_not_tess_accepted();
      }

      if (rej_use_tess_blanks &&
          strchr(word->best_choice->unichar_string().string(), ' ') != NULL) {
        word->reject_map.rej_word_contains_blanks();
      }

      WERD_CHOICE *best_choice = word->best_choice;
      if (rej_use_good_perm) {
        const bool from_dawg =
            best_choice->permuter() == SYSTEM_DAWG_PERM ||
            best_choice->permuter() == FREQ_DAWG_PERM ||
            best_choice->permuter() == USER_DAWG_PERM;
        // A DAWG word passes outright unless the sensible-word check is
        // enabled and finds the string unacceptable.
        const bool passed =
            from_dawg &&
            (!rej_use_sensible_wd ||
             acceptable_word_string(
                 *word->uch_set, best_choice->unichar_string().string(),
                 best_choice->unichar_lengths().string()) != AC_UNACCEPTABLE);

        if (!passed) {
          if (best_choice->permuter() == NUMBER_PERM) {
            if (rej_alphas_in_number_perm) {
              // Walk the string one unichar at a time: `blob` indexes the
              // unichar, `byte_pos` is where its bytes start in the string.
              int blob = 0;
              int byte_pos = 0;
              while (best_choice->unichar_string()[byte_pos] != '\0') {
                if (word->reject_map[blob].accepted() &&
                    word->uch_set->get_isalpha(
                        best_choice->unichar_string().string() + byte_pos,
                        best_choice->unichar_lengths()[blob])) {
                  word->reject_map[blob].setrej_bad_permuter();  // rej alpha
                }
                byte_pos += best_choice->unichar_lengths()[blob];
                ++blob;
              }
            }
          } else {
            word->reject_map.rej_word_bad_permuter();
          }
        }
      }
      /* Ambig word rejection was here once !!*/
    }
  } else {
    tprintf("BAD tessedit_reject_mode\n");
    err_exit();
  }

  if (tessedit_image_border > -1) {
    reject_edge_blobs(word);
  }

  check_debug_pt(word, 10);
  if (tessedit_rejection_debug) {
    tprintf("Permuter Type = %d\n", word->best_choice->permuter());
    tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty(),
            word->best_choice->rating());
    tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
  }

  flip_hyphens(word);
  check_debug_pt(word, 20);
}
}  // namespace tesseract
float compute_reject_threshold | ( | WERD_CHOICE * | word | ) |
Definition at line 229 of file reject.cpp.
{
  // Computes the certainty threshold below which a blob of `word` counts as
  // a poor match.
  //
  // The per-blob certainties are sorted ascending and, for words of three or
  // more blobs, the widest gap between neighbouring values is located; the
  // threshold is the midpoint of that gap. For shorter words (or when every
  // certainty is equal) the threshold is the lowest certainty minus one,
  // i.e. a value that lies below every certainty in the word.
  //
  // word: the choice whose length() and certainty(i) values are read.
  // Returns the threshold; 0.0f for a word with no blobs.
  const int blob_count = word->length();
  if (blob_count <= 0) {
    // Nothing to rank. Reading ratings[0] below would index an empty vector,
    // so bail out early. The value returned is a placeholder: there are no
    // certainties to compare against it.
    return 0.0f;
  }

  GenericVector<float> ratings;
  ratings.init_to_size(blob_count, 0.0f);
  for (int i = 0; i < blob_count; ++i) {
    ratings[i] = word->certainty(i);
  }
  ratings.sort();

  float bestgap = 0.0f;             // biggest gap found so far
  float gapstart = ratings[0] - 1;  // bottom of gap; fallback if none found
  if (blob_count >= 3) {
    for (int index = 0; index < blob_count - 1; index++) {
      const float gap = ratings[index + 1] - ratings[index];
      if (gap > bestgap) {
        bestgap = gap;  // find biggest
        gapstart = ratings[index];
      }
    }
  }
  return gapstart + bestgap / 2;
}
void reject_blanks | ( | WERD_RES * | word | ) |
Definition at line 181 of file reject.cpp.
{
  // Marks every unichar of the best choice whose first byte is ' ' as a
  // tess failure in the word's reject map.
  //
  // `blob` counts unichars and indexes both reject_map and unichar_lengths();
  // `byte_pos` is the position of that unichar's first byte in
  // unichar_string().
  WERD_CHOICE *choice = word->best_choice;
  inT16 blob = 0;
  inT16 byte_pos = 0;
  while (choice->unichar_string()[byte_pos] != '\0') {
    if (choice->unichar_string()[byte_pos] == ' ') {
      // rej unrecognised blobs
      word->reject_map[blob].setrej_tess_failure();
    }
    byte_pos += choice->unichar_lengths()[blob];
    blob += 1;
  }
}
void reject_poor_matches | ( | WERD_RES * | word | ) |
Definition at line 210 of file reject.cpp.
{
  // Rejects individual blobs of the word's best choice:
  //   - a blob whose unichar id is UNICHAR_SPACE is marked as a tess failure;
  //   - any other blob whose certainty is below the threshold obtained from
  //     compute_reject_threshold() is marked as a poor match.
  WERD_CHOICE *choice = word->best_choice;
  const float cutoff = compute_reject_threshold(choice);
  for (int blob = 0; blob < choice->length(); ++blob) {
    if (choice->unichar_id(blob) == UNICHAR_SPACE) {
      word->reject_map[blob].setrej_tess_failure();
      continue;
    }
    if (choice->certainty(blob) < cutoff) {
      word->reject_map[blob].setrej_poor_match();
    }
  }
}