tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/reject.cpp File Reference
#include "tessvars.h"
#include "scanutils.h"
#include <ctype.h>
#include <string.h>
#include "genericvector.h"
#include "reject.h"
#include "control.h"
#include "docqual.h"
#include "secname.h"
#include "globaloc.h"
#include "globals.h"
#include "helpers.h"
#include "tesseractclass.h"

Go to the source code of this file.

Namespaces

namespace  tesseract

Functions

 CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract
void reject_blanks (WERD_RES *word)
void reject_poor_matches (WERD_RES *word)
float compute_reject_threshold (WERD_CHOICE *word)

Function Documentation

Definition at line 51 of file reject.cpp.

                    {
void Tesseract::set_done(WERD_RES *word, inT16 pass) {
  word->done = word->tess_accepted &&
      (strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
      word->best_choice->permuter() == FREQ_DAWG_PERM ||
      word->best_choice->permuter() == USER_DAWG_PERM;
  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
      one_ell_conflict(word, FALSE)) {
    if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
    word->done = FALSE;
  }
  if (word->done && ((!word_from_dict &&
      word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
    if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
      word->done = FALSE;
  }
  if (tessedit_rejection_debug) {
    tprintf("set_done(): done=%d\n", word->done);
    word->best_choice->print("");
  }
}


/*************************************************************************
 * make_reject_map()
 *
 * Sets the done flag to indicate whether the resylt is acceptable.
 *
 * Sets a reject map for the word.
 *************************************************************************/
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
  int i;
  int offset;

  flip_0O(word);
  check_debug_pt(word, -1);     // For trap only
  set_done(word, pass);  // Set acceptance
  word->reject_map.initialise(word->best_choice->unichar_lengths().length());
  reject_blanks(word);
  /*
  0: Rays original heuristic - the baseline
  */
  if (tessedit_reject_mode == 0) {
    if (!word->done)
      reject_poor_matches(word);
  } else if (tessedit_reject_mode == 5) {
    /*
    5: Reject I/1/l from words where there is no strong contextual confirmation;
      the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
      and the whole of any words which are very small
    */
    if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
      word->reject_map.rej_word_small_xht();
    } else {
      one_ell_conflict(word, TRUE);
      /*
        Originally the code here just used the done flag. Now I have duplicated
        and unpacked the conditions for setting the done flag so that each
        mechanism can be turned on or off independently. This works WITHOUT
        affecting the done flag setting.
      */
      if (rej_use_tess_accepted && !word->tess_accepted)
        word->reject_map.rej_word_not_tess_accepted ();

      if (rej_use_tess_blanks &&
        (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
        word->reject_map.rej_word_contains_blanks ();

      WERD_CHOICE* best_choice = word->best_choice;
      if (rej_use_good_perm) {
        if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
             best_choice->permuter() == FREQ_DAWG_PERM ||
             best_choice->permuter() == USER_DAWG_PERM) &&
            (!rej_use_sensible_wd ||
             acceptable_word_string(*word->uch_set,
                                    best_choice->unichar_string().string(),
                                    best_choice->unichar_lengths().string()) !=
                                        AC_UNACCEPTABLE)) {
          // PASSED TEST
        } else if (best_choice->permuter() == NUMBER_PERM) {
          if (rej_alphas_in_number_perm) {
            for (i = 0, offset = 0;
                 best_choice->unichar_string()[offset] != '\0';
                 offset += best_choice->unichar_lengths()[i++]) {
              if (word->reject_map[i].accepted() &&
                  word->uch_set->get_isalpha(
                      best_choice->unichar_string().string() + offset,
                      best_choice->unichar_lengths()[i]))
                word->reject_map[i].setrej_bad_permuter();
              // rej alpha
            }
          }
        } else {
          word->reject_map.rej_word_bad_permuter();
        }
      }
      /* Ambig word rejection was here once !!*/
    }
  } else {
    tprintf("BAD tessedit_reject_mode\n");
    err_exit();
  }

  if (tessedit_image_border > -1)
    reject_edge_blobs(word);

  check_debug_pt (word, 10);
  if (tessedit_rejection_debug) {
    tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
    tprintf("Certainty: %f     Rating: %f\n",
      word->best_choice->certainty (), word->best_choice->rating ());
    tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
  }

  flip_hyphens(word);
  check_debug_pt(word, 20);
}
}  // namespace tesseract

Definition at line 229 of file reject.cpp.

                                                  {
  float threshold;               // rejection threshold
  float bestgap = 0.0f;          // biggest gap
  float gapstart;                // bottom of gap
                                 // super iterator
  BLOB_CHOICE_IT choice_it;      // real iterator

  int blob_count = word->length();
  GenericVector<float> ratings;
  ratings.init_to_size(blob_count, 0.0f);
  for (int i = 0; i < blob_count; ++i) {
    ratings[i] = word->certainty(i);
  }
  ratings.sort();
  gapstart = ratings[0] - 1;     // all reject if none better
  if (blob_count >= 3) {
    for (int index = 0; index < blob_count - 1; index++) {
      if (ratings[index + 1] - ratings[index] > bestgap) {
        bestgap = ratings[index + 1] - ratings[index];
        // find biggest
        gapstart = ratings[index];
      }
    }
  }
  threshold = gapstart + bestgap / 2;

  return threshold;
}
void reject_blanks ( WERD_RES word)

Definition at line 181 of file reject.cpp.

                                   {
  inT16 i;
  inT16 offset;

  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
       offset += word->best_choice->unichar_lengths()[i], i += 1) {
    if (word->best_choice->unichar_string()[offset] == ' ')
                                 //rej unrecognised blobs
      word->reject_map[i].setrej_tess_failure ();
  }
}
void reject_poor_matches ( WERD_RES word)

Definition at line 210 of file reject.cpp.

                                         {
  float threshold = compute_reject_threshold(word->best_choice);
  for (int i = 0; i < word->best_choice->length(); ++i) {
    if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
      word->reject_map[i].setrej_tess_failure();
    else if (word->best_choice->certainty(i) < threshold)
      word->reject_map[i].setrej_poor_match();
  }
}
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines