tesseract  3.03
tesseract::LanguageModel Class Reference

#include <language_model.h>

List of all members.

Public Member Functions

 LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
 ~LanguageModel ()
void InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
bool UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
bool AcceptableChoiceFound ()
void SetAcceptableChoiceFound (bool val)
ParamsModel & getParamsModel ()

Static Public Member Functions

static void ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[])

Public Attributes

int language_model_debug_level = 0
bool language_model_ngram_on = false
int language_model_ngram_order = 8
int language_model_viterbi_list_max_num_prunable = 10
int language_model_viterbi_list_max_size = 500
double language_model_ngram_small_prob = 0.000001
double language_model_ngram_nonmatch_score = -40.0
bool language_model_ngram_use_only_first_uft8_step = false
double language_model_ngram_scale_factor = 0.03
double language_model_ngram_rating_factor = 16.0
bool language_model_ngram_space_delimited_language = true
int language_model_min_compound_length = 3
double language_model_penalty_non_freq_dict_word = 0.1
double language_model_penalty_non_dict_word = 0.15
double language_model_penalty_punc = 0.2
double language_model_penalty_case = 0.1
double language_model_penalty_script = 0.5
double language_model_penalty_chartype = 0.3
double language_model_penalty_font = 0.00
double language_model_penalty_spacing = 0.05
double language_model_penalty_increment = 0.01
int wordrec_display_segmentations = 0
bool language_model_use_sigmoidal_certainty = false

Static Public Attributes

static const LanguageModelFlagsType kSmallestRatingFlag = 0x1
static const LanguageModelFlagsType kLowerCaseFlag = 0x2
static const LanguageModelFlagsType kUpperCaseFlag = 0x4
static const LanguageModelFlagsType kDigitFlag = 0x8
static const LanguageModelFlagsType kXhtConsistentFlag = 0x10
static const float kMaxAvgNgramCost = 25.0f

Protected Member Functions

float CertaintyScore (float cert)
float ComputeAdjustment (int num_problems, float penalty)
float ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
float ComputeAdjustedPathCost (ViterbiStateEntry *vse)
bool GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
int SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const
ViterbiStateEntry * GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
bool AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
void GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
LanguageModelDawgInfo * GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
LanguageModelNgramInfo * GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
float ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_cost)
float ComputeDenom (BLOB_CHOICE_LIST *curr_list)
void FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
void UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
WERD_CHOICE * ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
void ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
bool PrunablePath (const ViterbiStateEntry &vse)
bool AcceptablePath (const ViterbiStateEntry &vse)

Protected Attributes

DawgArgs * dawg_args_
float rating_cert_scale_
const UnicityTable< FontInfo > * fontinfo_table_
Dict * dict_
bool fixed_pitch_
float max_char_wh_ratio_
STRING prev_word_str_
int prev_word_unichar_step_len_
DawgPositionVector * very_beginning_active_dawgs_
DawgPositionVector * beginning_active_dawgs_
bool acceptable_choice_found_
bool correct_segmentation_explored_
ParamsModel params_model_

Detailed Description

Definition at line 42 of file language_model.h.
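
The LanguageModel class scores competing segmentations of a word during Tesseract's Viterbi-style search over the ratings matrix, combining dictionary (Dawg), character-ngram, top-choice and consistency information into a single path cost. The sketch below illustrates the intended calling pattern only; it is not taken from the Tesseract sources, and the creation of word_res, the blob choice lists, the pain points and the bundles is assumed to be done by the surrounding segmentation search in wordrec.

  // Hypothetical driver sketch; local variable names are assumptions.
  LanguageModel lang_model(fontinfo_table, dict);
  lang_model.InitForWord(prev_word, /*fixed_pitch=*/false,
                         /*max_char_wh_ratio=*/2.0f,    // assumed value
                         /*rating_cert_scale=*/-1.0f);  // assumed value
  // Fold each newly classified ratings-matrix cell into the search state.
  lang_model.UpdateState(/*just_classified=*/true, col, row,
                         blob_choice_list, parent_node, pain_points,
                         word_res, &best_choice_bundle, blamer_bundle);
  if (lang_model.AcceptableChoiceFound()) {
    // best_choice_bundle now holds a choice good enough to stop the search.
  }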


Constructor & Destructor Documentation

tesseract::LanguageModel::LanguageModel ( const UnicityTable< FontInfo > *  fontinfo_table,
Dict *  dict 
)

Definition at line 45 of file language_model.cpp.

  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
               dict->getCCUtil()->params()),
    BOOL_INIT_MEMBER(language_model_ngram_on, false,
                     "Turn on/off the use of character ngram model",
                     dict->getCCUtil()->params()),
    INT_MEMBER(language_model_ngram_order, 8,
               "Maximum order of the character ngram model",
               dict->getCCUtil()->params()),
    INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
               "Maximum number of prunable (those for which"
               " PrunablePath() is true) entries in each viterbi list"
               " recorded in BLOB_CHOICEs",
               dict->getCCUtil()->params()),
    INT_MEMBER(language_model_viterbi_list_max_size, 500,
               "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
               dict->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_small_prob, 0.000001,
                  "To avoid overly small denominators use this as the "
                  "floor of the probability returned by the ngram model.",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
                  "Average classifier score of a non-matching unichar.",
                  dict->getCCUtil()->params()),
    BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
                "Use only the first UTF8 step of the given string"
                " when computing log probabilities.",
                dict->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_scale_factor, 0.03,
                  "Strength of the character ngram model relative to the"
                  " character classifier ",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_rating_factor, 16.0,
                  "Factor to bring log-probs into the same range as ratings"
                  " when multiplied by outline length ",
                  dict->getCCUtil()->params()),
    BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
                "Words are delimited by space",
                dict->getCCUtil()->params()),
    INT_MEMBER(language_model_min_compound_length, 3,
               "Minimum length of compound words",
               dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
                  "Penalty for words not in the frequent word dictionary",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_non_dict_word, 0.15,
                  "Penalty for non-dictionary words",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_punc, 0.2,
                  "Penalty for inconsistent punctuation",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_case, 0.1,
                  "Penalty for inconsistent case",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_script, 0.5,
                  "Penalty for inconsistent script",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_chartype, 0.3,
                  "Penalty for inconsistent character type",
                  dict->getCCUtil()->params()),
    // TODO(daria, rays): enable font consistency checking
    // after improving font analysis.
    double_MEMBER(language_model_penalty_font, 0.00,
                  "Penalty for inconsistent font",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_spacing, 0.05,
                  "Penalty for inconsistent spacing",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_increment, 0.01,
                  "Penalty increment",
                  dict->getCCUtil()->params()),
    INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
               dict->getCCUtil()->params()),
    BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
                     "Use sigmoidal score for certainty",
                     dict->getCCUtil()->params()),
  fontinfo_table_(fontinfo_table), dict_(dict),
  fixed_pitch_(false), max_char_wh_ratio_(0.0),
  acceptable_choice_found_(false) {
  ASSERT_HOST(dict_ != NULL);
  dawg_args_ = new DawgArgs(NULL, new DawgPositionVector(), NO_PERM);
  very_beginning_active_dawgs_ = new DawgPositionVector();
  beginning_active_dawgs_ = new DawgPositionVector();
}

Member Function Documentation

bool tesseract::LanguageModel::AcceptableChoiceFound ( ) [inline]

Definition at line 95 of file language_model.h.

{ return acceptable_choice_found_; }

bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry &  vse) [inline, protected]

Definition at line 299 of file language_model.h.

                                                           {
    return (vse.dawg_info != NULL || vse.Consistent() ||
            (vse.ngram_info != NULL && !vse.ngram_info->pruned));
  }
bool tesseract::LanguageModel::AddViterbiStateEntry ( LanguageModelFlagsType  top_choice_flags,
float  denom,
bool  word_end,
int  curr_col,
int  curr_row,
BLOB_CHOICE *  b,
LanguageModelState *  curr_state,
ViterbiStateEntry *  parent_vse,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
) [protected]

Definition at line 548 of file language_model.cpp.

                                 {
  ViterbiStateEntry_IT vit;
  if (language_model_debug_level > 1) {
    tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
            " certainty=%.4f top_choice_flags=0x%x",
            dict_->getUnicharset().id_to_unichar(b->unichar_id()),
            b->rating(), b->certainty(), top_choice_flags);
    if (language_model_debug_level > 5)
      tprintf(" parent_vse=%p\n", parent_vse);
    else
      tprintf("\n");
  }
  // Check whether the list is full.
  if (curr_state != NULL &&
      curr_state->viterbi_state_entries_length >=
          language_model_viterbi_list_max_size) {
    if (language_model_debug_level > 1) {
      tprintf("AddViterbiStateEntry: viterbi list is full!\n");
    }
    return false;
  }

  // Invoke Dawg language model component.
  LanguageModelDawgInfo *dawg_info =
    GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);

  float outline_length =
      AssociateUtils::ComputeOutlineLength(rating_cert_scale_, *b);
  // Invoke Ngram language model component.
  LanguageModelNgramInfo *ngram_info = NULL;
  if (language_model_ngram_on) {
    ngram_info = GenerateNgramInfo(
        dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->certainty(),
        denom, curr_col, curr_row, outline_length, parent_vse);
    ASSERT_HOST(ngram_info != NULL);
  }
  bool liked_by_language_model = dawg_info != NULL ||
      (ngram_info != NULL && !ngram_info->pruned);
  // Quick escape if not liked by the language model, not consistent
  // xheight, and not top choice.
  if (!liked_by_language_model && top_choice_flags == 0) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components very early pruned this entry\n");
    }
    delete ngram_info;
    delete dawg_info;
    return false;
  }

  // Check consistency of the path and set the relevant consistency_info.
  LMConsistencyInfo consistency_info(
    parent_vse != NULL ? &parent_vse->consistency_info : NULL);
  // Start with just the x-height consistency, as it provides significant
  // pruning opportunity.
  consistency_info.ComputeXheightConsistency(
      b, dict_->getUnicharset().get_ispunctuation(b->unichar_id()));
  // Turn off xheight consistent flag if not consistent.
  if (consistency_info.InconsistentXHeight()) {
    top_choice_flags &= ~kXhtConsistentFlag;
  }

  // Quick escape if not liked by the language model, not consistent xheight,
  // and not top choice.
  if (!liked_by_language_model && top_choice_flags == 0) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components early pruned this entry\n");
    }
    delete ngram_info;
    delete dawg_info;
    return false;
  }

  // Compute the rest of the consistency info.
  FillConsistencyInfo(curr_col, word_end, b, parent_vse,
                      word_res, &consistency_info);
  if (dawg_info != NULL && consistency_info.invalid_punc) {
    consistency_info.invalid_punc = false;  // do not penalize dict words
  }

  // Compute cost of associating the blobs that represent the current unichar.
  AssociateStats associate_stats;
  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
                        parent_vse, word_res, &associate_stats);
  if (parent_vse != NULL) {
    associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
    associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
  }

  // Create the new ViterbiStateEntry and compute the adjusted cost of the path.
  ViterbiStateEntry *new_vse = new ViterbiStateEntry(
      parent_vse, b, 0.0, outline_length,
      consistency_info, associate_stats, top_choice_flags, dawg_info,
      ngram_info, (language_model_debug_level > 0) ?
          dict_->getUnicharset().id_to_unichar(b->unichar_id()) : NULL);
  new_vse->cost = ComputeAdjustedPathCost(new_vse);

  // Invoke Top Choice language model component to make the final adjustments
  // to new_vse->top_choice_flags.
  if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
    GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
  }

  // If language model components did not like this unichar - return.
  bool keep = new_vse->top_choice_flags || liked_by_language_model;
  if (!(top_choice_flags & kSmallestRatingFlag) &&  // no non-top choice paths
      consistency_info.inconsistent_script) {       // with inconsistent script
    keep = false;
  }
  if (!keep) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components did not like this entry\n");
    }
    delete new_vse;
    return false;
  }

  // Discard this entry if it represents a prunable path and
  // language_model_viterbi_list_max_num_prunable such entries with a lower
  // cost have already been recorded.
  if (PrunablePath(*new_vse) &&
      (curr_state->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) &&
      new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
    if (language_model_debug_level > 1) {
      tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
              new_vse->cost,
              curr_state->viterbi_state_entries_prunable_max_cost);
    }
    delete new_vse;
    return false;
  }

  // Update best choice if needed.
  if (word_end) {
    UpdateBestChoice(new_vse, pain_points, word_res,
                     best_choice_bundle, blamer_bundle);
    // Discard the entry if UpdateBestChoice() found flaws in it.
    if (new_vse->cost >= WERD_CHOICE::kBadRating &&
        new_vse != best_choice_bundle->best_vse) {
      if (language_model_debug_level > 1) {
        tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
      }
      delete new_vse;
      return false;
    }
  }

  // Add the new ViterbiStateEntry to curr_state->viterbi_state_entries.
  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
                                               false, new_vse);
  curr_state->viterbi_state_entries_length++;
  if (PrunablePath(*new_vse)) {
    curr_state->viterbi_state_entries_prunable_length++;
  }

  // Update lms->viterbi_state_entries_prunable_max_cost and clear
  // top_choice_flags of entries with a higher cost than new_vse->cost.
  if ((curr_state->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) ||
      new_vse->top_choice_flags) {
    ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
    int prunable_counter = language_model_viterbi_list_max_num_prunable;
    vit.set_to_list(&(curr_state->viterbi_state_entries));
    for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
      ViterbiStateEntry *curr_vse = vit.data();
      // Clear the appropriate top choice flags of the entries in the
      // list that have cost higher than new_vse->cost
      // (since they will not be top choices any more).
      if (curr_vse->top_choice_flags && curr_vse != new_vse &&
          curr_vse->cost > new_vse->cost) {
        curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
      }
      if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
      // Update curr_state->viterbi_state_entries_prunable_max_cost.
      if (prunable_counter == 0) {
        curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
        if (language_model_debug_level > 1) {
          tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
                  curr_state->viterbi_state_entries_prunable_max_cost);
        }
        prunable_counter = -1;  // stop counting
      }
    }
  }

  // Print the newly created ViterbiStateEntry.
  if (language_model_debug_level > 2) {
    new_vse->Print("New");
    if (language_model_debug_level > 5)
      curr_state->Print("Updated viterbi list");
  }

  return true;
}
float tesseract::LanguageModel::CertaintyScore ( float  cert) [inline, protected]

Definition at line 104 of file language_model.h.

                                          {
    if (language_model_use_sigmoidal_certainty) {
      // cert is assumed to be between 0 and -dict_->certainty_scale.
      // If you enable language_model_use_sigmoidal_certainty, you
      // need to adjust language_model_ngram_nonmatch_score as well.
      cert = -cert / dict_->certainty_scale;
      return 1.0f / (1.0f + exp(10.0f * cert));
    } else {
      return (-1.0f / cert);
    }
  }
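
For intuition, a worked example (the certainty_scale value here is an assumption, not a documented default): with cert = -5.0 the default branch returns -1.0f / -5.0f = 0.2, while the sigmoidal branch with dict_->certainty_scale == 20 rescales cert to 0.25 and returns 1.0f / (1.0f + exp(2.5)) ≈ 0.076. In both regimes, more negative certainties (worse classifier matches) map to smaller scores.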

float tesseract::LanguageModel::ComputeAdjustedPathCost ( ViterbiStateEntry *  vse) [protected]

Definition at line 1177 of file language_model.cpp.

                                                                   {
  ASSERT_HOST(vse != NULL);
  if (params_model_.Initialized()) {
    float features[PTRAIN_NUM_FEATURE_TYPES];
    ExtractFeaturesFromPath(*vse, features);
    float cost = params_model_.ComputeCost(features);
    if (language_model_debug_level > 3) {
      tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
      if (language_model_debug_level >= 5) {
        for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
          tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
        }
      }
    }
    return cost * vse->outline_length;
  } else {
    float adjustment = 1.0f;
    if (vse->dawg_info == NULL || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
      adjustment += language_model_penalty_non_freq_dict_word;
    }
    if (vse->dawg_info == NULL) {
      adjustment += language_model_penalty_non_dict_word;
      if (vse->length > language_model_min_compound_length) {
        adjustment += ((vse->length - language_model_min_compound_length) *
            language_model_penalty_increment);
      }
    }
    if (vse->associate_stats.shape_cost > 0) {
      adjustment += vse->associate_stats.shape_cost /
          static_cast<float>(vse->length);
    }
    if (language_model_ngram_on) {
      ASSERT_HOST(vse->ngram_info != NULL);
      return vse->ngram_info->ngram_and_classifier_cost * adjustment;
    } else {
      adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
                                                 vse->consistency_info);
      return vse->ratings_sum * adjustment;
    }
  }
}
float tesseract::LanguageModel::ComputeAdjustment ( int  num_problems,
float  penalty 
) [inline, protected]

Definition at line 116 of file language_model.h.

                                                                  {
    if (num_problems == 0) return 0.0f;
    if (num_problems == 1) return penalty;
    return (penalty + (language_model_penalty_increment *
                       static_cast<float>(num_problems-1)));
  }
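
For example, with the default values above (language_model_penalty_case = 0.1, language_model_penalty_increment = 0.01), three case inconsistencies cost 0.1 + 0.01 * (3 - 1) = 0.12: the first problem pays the full penalty and each additional problem adds only the small increment.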
void tesseract::LanguageModel::ComputeAssociateStats ( int  col,
int  row,
float  max_char_wh_ratio,
ViterbiStateEntry *  parent_vse,
WERD_RES *  word_res,
AssociateStats *  associate_stats 
) [inline, protected]

Definition at line 270 of file language_model.h.

                                                                     {
    AssociateUtils::ComputeStats(
        col, row,
        (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
        (parent_vse != NULL) ? parent_vse->length : 0,
        fixed_pitch_, max_char_wh_ratio,
        word_res, language_model_debug_level > 2, associate_stats);
  }
float tesseract::LanguageModel::ComputeConsistencyAdjustment ( const LanguageModelDawgInfo *  dawg_info,
const LMConsistencyInfo &  consistency_info 
) [inline, protected]

Definition at line 127 of file language_model.h.

                                                 {
    if (dawg_info != NULL) {
      return ComputeAdjustment(consistency_info.NumInconsistentCase(),
                               language_model_penalty_case);
    }
    return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
                              language_model_penalty_punc) +
            ComputeAdjustment(consistency_info.NumInconsistentCase(),
                              language_model_penalty_case) +
            ComputeAdjustment(consistency_info.NumInconsistentChartype(),
                              language_model_penalty_chartype) +
            ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
                              language_model_penalty_spacing) +
            (consistency_info.inconsistent_script ?
             language_model_penalty_script : 0.0f) +
            (consistency_info.inconsistent_font ?
             language_model_penalty_font : 0.0f));
  }
float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST *  curr_list) [protected]

Definition at line 978 of file language_model.cpp.

                                                             {
  if (curr_list->empty()) return 1.0f;
  float denom = 0.0f;
  int len = 0;
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    ASSERT_HOST(c_it.data() != NULL);
    ++len;
    denom += CertaintyScore(c_it.data()->certainty());
  }
  assert(len != 0);
  // The ideal situation would be to have the classifier scores for
  // classifying each position as each of the characters in the unicharset.
  // Since we cannot do this because of speed, we add a very crude estimate
  // of what these scores for the "missing" classifications would sum up to.
  denom += (dict_->getUnicharset().size() - len) *
    CertaintyScore(language_model_ngram_nonmatch_score);

  return denom;
}
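
In other words, the returned value approximates a normalizer over the whole character set:

  denom = sum over c in curr_list of CertaintyScore(c->certainty())
          + (unicharset.size() - len) * CertaintyScore(language_model_ngram_nonmatch_score)

so that CertaintyScore(certainty) / denom, as used in ComputeNgramCost(), acts as a crude estimate of p(char | blob).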
float tesseract::LanguageModel::ComputeNgramCost ( const char *  unichar,
float  certainty,
float  denom,
const char *  context,
int *  unichar_step_len,
bool *  found_small_prob,
float *  ngram_cost 
) [protected]

Definition at line 918 of file language_model.cpp.

                                                         {
  const char *context_ptr = context;
  char *modified_context = NULL;
  char *modified_context_end = NULL;
  const char *unichar_ptr = unichar;
  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
  float prob = 0.0f;
  int step = 0;
  while (unichar_ptr < unichar_end &&
         (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
    if (language_model_debug_level > 1) {
      tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
              dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
    }
    prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
    ++(*unichar_step_len);
    if (language_model_ngram_use_only_first_uft8_step) break;
    unichar_ptr += step;
    // If there are multiple UTF8 characters present in unichar, context is
    // updated to include the previously examined characters from unichar,
    // unless use_only_first_uft8_step is true.
    if (unichar_ptr < unichar_end) {
      if (modified_context == NULL) {
        int context_len = strlen(context);
        modified_context =
          new char[context_len + strlen(unichar_ptr) + step + 1];
        strncpy(modified_context, context, context_len);
        modified_context_end = modified_context + context_len;
        context_ptr = modified_context;
      }
      strncpy(modified_context_end, unichar_ptr - step, step);
      modified_context_end += step;
      *modified_context_end = '\0';
    }
  }
  prob /= static_cast<float>(*unichar_step_len);  // normalize
  if (prob < language_model_ngram_small_prob) {
    if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
    *found_small_prob = true;
    prob = language_model_ngram_small_prob;
  }
  *ngram_cost = -1.0*log2(prob);
  float ngram_and_classifier_cost =
      -1.0*log2(CertaintyScore(certainty)/denom) +
      *ngram_cost * language_model_ngram_scale_factor;
  if (language_model_debug_level > 1) {
    tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
            unichar, context_ptr, CertaintyScore(certainty)/denom, prob,
            ngram_and_classifier_cost);
  }
  if (modified_context != NULL) delete[] modified_context;
  return ngram_and_classifier_cost;
}
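
To make the context handling concrete (this ligature example is illustrative, not from the sources): if a single unichar encodes several UTF-8 characters, such as an "ffi" ligature following the context "o", the loop averages p(f | o), p(f | of) and p(i | off), and leaves *unichar_step_len == 3.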
WERD_CHOICE * tesseract::LanguageModel::ConstructWord ( ViterbiStateEntry *  vse,
WERD_RES *  word_res,
DANGERR *  fixpt,
BlamerBundle *  blamer_bundle,
bool *  truth_path 
) [protected]

Definition at line 1368 of file language_model.cpp.

                      {
  if (truth_path != NULL) {
    *truth_path =
        (blamer_bundle != NULL &&
         vse->length == blamer_bundle->correct_segmentation_length());
  }
  BLOB_CHOICE *curr_b = vse->curr_b;
  ViterbiStateEntry *curr_vse = vse;

  int i;
  bool compound = dict_->hyphenated();  // treat hyphenated words as compound

  // Re-compute the variance of the width-to-height ratios (since we now
  // can compute the mean over the whole word).
  float full_wh_ratio_mean = 0.0f;
  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
    vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
    full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
                          static_cast<float>(vse->length));
    vse->associate_stats.full_wh_ratio_var = 0.0f;
  }

  // Construct a WERD_CHOICE by tracing parent pointers.
  WERD_CHOICE *word = new WERD_CHOICE(word_res->uch_set, vse->length);
  word->set_length(vse->length);
  int total_blobs = 0;
  for (i = (vse->length-1); i >= 0; --i) {
    if (blamer_bundle != NULL && truth_path != NULL && *truth_path &&
        !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
        *truth_path = false;
    }
    // The number of blobs used for this choice is row - col + 1.
    int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
    total_blobs += num_blobs;
    word->set_blob_choice(i, num_blobs, curr_b);
    // Update the width-to-height ratio variance. Useful for non-space
    // delimited languages to ensure that the blobs are of uniform width.
    // Skip leading and trailing punctuation when computing the variance.
    if ((full_wh_ratio_mean != 0.0f &&
         ((curr_vse != vse && curr_vse->parent_vse != NULL) ||
          !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
      vse->associate_stats.full_wh_ratio_var +=
        pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
      if (language_model_debug_level > 2) {
        tprintf("full_wh_ratio_var += (%g-%g)^2\n",
                full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
      }
    }

    // Mark the word as compound if compound permuter was set for any of
    // the unichars on the path (usually this will happen for unichars
    // that are compounding operators, like "-" and "/").
    if (!compound && curr_vse->dawg_info &&
        curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;

    // Update curr_* pointers.
    curr_vse = curr_vse->parent_vse;
    if (curr_vse == NULL) break;
    curr_b = curr_vse->curr_b;
  }
  ASSERT_HOST(i == 0);  // check that we recorded all the unichar ids.
  ASSERT_HOST(total_blobs == word_res->ratings->dimension());
  // Re-adjust shape cost to include the updated width-to-height variance.
  if (full_wh_ratio_mean != 0.0f) {
    vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
  }

  word->set_rating(vse->ratings_sum);
  word->set_certainty(vse->min_certainty);
  word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
                      vse->consistency_info.BodyMaxXHeight());
  if (vse->dawg_info != NULL) {
    word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
    word->set_permuter(NGRAM_PERM);
  } else if (vse->top_choice_flags) {
    word->set_permuter(TOP_CHOICE_PERM);
  } else {
    word->set_permuter(NO_PERM);
  }
  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
                                                            word_res->ratings));
  return word;
}
void tesseract::LanguageModel::ExtractFeaturesFromPath ( const ViterbiStateEntry &  vse,
float  features[] 
) [static]

Definition at line 1319 of file language_model.cpp.

                                                    {
  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
  // Record dictionary match info.
  int len = vse.length <= kMaxSmallWordUnichars ? 0 :
      vse.length <= kMaxMediumWordUnichars ? 1 : 2;
  if (vse.dawg_info != NULL) {
    int permuter = vse.dawg_info->permuter;
    if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
      if (vse.consistency_info.num_digits == vse.length) {
        features[PTRAIN_DIGITS_SHORT+len] = 1.0;
      } else {
        features[PTRAIN_NUM_SHORT+len] = 1.0;
      }
    } else if (permuter == DOC_DAWG_PERM) {
      features[PTRAIN_DOC_SHORT+len] = 1.0;
    } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
        permuter == COMPOUND_PERM) {
      features[PTRAIN_DICT_SHORT+len] = 1.0;
    } else if (permuter == FREQ_DAWG_PERM) {
      features[PTRAIN_FREQ_SHORT+len] = 1.0;
    }
  }
  // Record shape cost feature (normalized by path length).
  features[PTRAIN_SHAPE_COST_PER_CHAR] =
      vse.associate_stats.shape_cost / static_cast<float>(vse.length);
  // Record the ngram cost (normalized by the path length).
  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
  if (vse.ngram_info != NULL) {
    features[PTRAIN_NGRAM_COST_PER_CHAR] =
        vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
  }
  // Record consistency-related features.
  // Disabled this feature for now due to its poor performance.
  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
  features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
  features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
  features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == NULL ?
      vse.consistency_info.NumInconsistentChartype() : 0.0;
  features[PTRAIN_NUM_BAD_SPACING] =
      vse.consistency_info.NumInconsistentSpaces();
  // Disabled this feature for now due to its poor performance.
  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;

  // Classifier-related features.
  features[PTRAIN_RATING_PER_CHAR] =
      vse.ratings_sum / static_cast<float>(vse.outline_length);
}
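
The extracted feature vector is consumed by params_model_.ComputeCost() in ComputeAdjustedPathCost() above: when the params model is initialized, these per-path features replace the hand-tuned penalty scheme.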
void tesseract::LanguageModel::FillConsistencyInfo ( int  curr_col,
bool  word_end,
BLOB_CHOICE *  b,
ViterbiStateEntry *  parent_vse,
WERD_RES *  word_res,
LMConsistencyInfo *  consistency_info 
) [protected]

Definition at line 999 of file language_model.cpp.

                                         {
  const UNICHARSET &unicharset = dict_->getUnicharset();
  UNICHAR_ID unichar_id = b->unichar_id();
  BLOB_CHOICE* parent_b = parent_vse != NULL ? parent_vse->curr_b : NULL;

  // Check punctuation validity.
  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
  if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
    if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
        (unicharset.get_isalpha(parent_b->unichar_id()) ||
         unicharset.get_isdigit(parent_b->unichar_id()))) {
      // reset punc_ref for compound words
      consistency_info->punc_ref = NO_EDGE;
    } else {
      bool is_apos = dict_->is_apostrophe(unichar_id);
      bool prev_is_numalpha = (parent_b != NULL &&
          (unicharset.get_isalpha(parent_b->unichar_id()) ||
           unicharset.get_isdigit(parent_b->unichar_id())));
      UNICHAR_ID pattern_unichar_id =
        (unicharset.get_isalpha(unichar_id) ||
         unicharset.get_isdigit(unichar_id) ||
         (is_apos && prev_is_numalpha)) ?
        Dawg::kPatternUnicharID : unichar_id;
      if (consistency_info->punc_ref == NO_EDGE ||
          pattern_unichar_id != Dawg::kPatternUnicharID ||
          dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
          Dawg::kPatternUnicharID) {
        NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(),
                                              consistency_info->punc_ref);
        consistency_info->punc_ref =
          (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
              node, pattern_unichar_id, word_end) : NO_EDGE;
        if (consistency_info->punc_ref == NO_EDGE) {
          consistency_info->invalid_punc = true;
        }
      }
    }
  }

  // Update case related counters.
  if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
    // Reset counters if we are dealing with a compound word.
    consistency_info->num_lower = 0;
    consistency_info->num_non_first_upper = 0;
  }
  else if (unicharset.get_islower(unichar_id)) {
    consistency_info->num_lower++;
  } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
    if (unicharset.get_isupper(parent_b->unichar_id()) ||
        consistency_info->num_lower > 0 ||
        consistency_info->num_non_first_upper > 0) {
      consistency_info->num_non_first_upper++;
    }
  }

  // Initialize consistency_info->script_id (use script of unichar_id
  // if it is not Common, use script id recorded by the parent otherwise).
  // Set inconsistent_script to true if the script of the current unichar
  // is not consistent with that of the parent.
  consistency_info->script_id = unicharset.get_script(unichar_id);
  // Hiragana and Katakana can mix with Han.
  if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) {
    if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
         consistency_info->script_id == unicharset.hiragana_sid()) ||
        (unicharset.katakana_sid() != unicharset.null_sid() &&
         consistency_info->script_id == unicharset.katakana_sid())) {
      consistency_info->script_id = dict_->getUnicharset().han_sid();
    }
  }

  if (parent_vse != NULL &&
      (parent_vse->consistency_info.script_id !=
       dict_->getUnicharset().common_sid())) {
    int parent_script_id = parent_vse->consistency_info.script_id;
    // If script_id is Common, use script id of the parent instead.
    if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
      consistency_info->script_id = parent_script_id;
    }
    if (consistency_info->script_id != parent_script_id) {
      consistency_info->inconsistent_script = true;
    }
  }

  // Update chartype related counters.
  if (unicharset.get_isalpha(unichar_id)) {
    consistency_info->num_alphas++;
  } else if (unicharset.get_isdigit(unichar_id)) {
    consistency_info->num_digits++;
  } else if (!unicharset.get_ispunctuation(unichar_id)) {
    consistency_info->num_other++;
  }

  // Check font and spacing consistency.
  if (fontinfo_table_->size() > 0 && parent_b != NULL) {
    int fontinfo_id = -1;
    if (parent_b->fontinfo_id() == b->fontinfo_id() ||
        parent_b->fontinfo_id2() == b->fontinfo_id()) {
      fontinfo_id = b->fontinfo_id();
    } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
                parent_b->fontinfo_id2() == b->fontinfo_id2()) {
      fontinfo_id = b->fontinfo_id2();
    }
    if (language_model_debug_level > 1) {
      tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
              (parent_b->fontinfo_id() >= 0) ?
                  fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
              (parent_b->fontinfo_id2() >= 0) ?
                  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
              (b->fontinfo_id() >= 0) ?
                  fontinfo_table_->get(b->fontinfo_id()).name : "",
              (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
              (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
              fontinfo_id);
    }
    if (!word_res->blob_widths.empty()) {  // if we have widths/gaps info
      bool expected_gap_found = false;
      float expected_gap;
      int temp_gap;
      if (fontinfo_id >= 0) {  // found a common font
        ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
        if (fontinfo_table_->get(fontinfo_id).get_spacing(
            parent_b->unichar_id(), unichar_id, &temp_gap)) {
          expected_gap = temp_gap;
          expected_gap_found = true;
        }
      } else {
        consistency_info->inconsistent_font = true;
        // Get an average of the expected gaps in each font
        int num_addends = 0;
        expected_gap = 0;
        int temp_fid;
        for (int i = 0; i < 4; ++i) {
          if (i == 0) {
            temp_fid = parent_b->fontinfo_id();
          } else if (i == 1) {
            temp_fid = parent_b->fontinfo_id2();
          } else if (i == 2) {
            temp_fid = b->fontinfo_id();
          } else {
            temp_fid = b->fontinfo_id2();
          }
          ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
          if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
              parent_b->unichar_id(), unichar_id, &temp_gap)) {
            expected_gap += temp_gap;
            num_addends++;
          }
        }
        expected_gap_found = (num_addends > 0);
        if (num_addends > 0) {
          expected_gap /= static_cast<float>(num_addends);
        }
      }
      if (expected_gap_found) {
        float actual_gap =
            static_cast<float>(word_res->GetBlobsGap(curr_col-1));
        float gap_ratio = expected_gap / actual_gap;
        // TODO(daria): find a good way to tune this heuristic estimate.
        if (gap_ratio < 0.5f || gap_ratio > 2.0f) {
          consistency_info->num_inconsistent_spaces++;
        }
        if (language_model_debug_level > 1) {
          tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
                  unicharset.id_to_unichar(parent_b->unichar_id()),
                  parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
                  unichar_id, curr_col, expected_gap, actual_gap);
        }
      }
    }
  }
}
LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo ( bool  word_end,
int  curr_col,
int  curr_row,
const BLOB_CHOICE &  b,
const ViterbiStateEntry *  parent_vse 
) [protected]

Definition at line 770 of file language_model.cpp.

                                         {
  // Initialize active_dawgs from parent_vse if it is not NULL.
  // Otherwise use very_beginning_active_dawgs_.
  if (parent_vse == NULL) {
    dawg_args_->active_dawgs = very_beginning_active_dawgs_;
    dawg_args_->permuter = NO_PERM;
  } else {
    if (parent_vse->dawg_info == NULL) return NULL;  // not a dict word path
    dawg_args_->active_dawgs = parent_vse->dawg_info->active_dawgs;
    dawg_args_->permuter = parent_vse->dawg_info->permuter;
  }

  // Deal with hyphenated words.
  if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
    if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
    return new LanguageModelDawgInfo(dawg_args_->active_dawgs,
                                     COMPOUND_PERM);
  }

  // Deal with compound words.
  if (dict_->compound_marker(b.unichar_id()) &&
      (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
    if (language_model_debug_level > 0) tprintf("Found compound marker\n");
    // Do not allow compound operators at the beginning and end of the word.
    // Do not allow more than one compound operator per word.
    // Do not allow compounding of words with lengths shorter than
    // language_model_min_compound_length
    if (parent_vse == NULL || word_end ||
        dawg_args_->permuter == COMPOUND_PERM ||
        parent_vse->length < language_model_min_compound_length) return NULL;

    int i;
    // Check that the path terminated before the current character is a word.
    bool has_word_ending = false;
    for (i = 0; i < parent_vse->dawg_info->active_dawgs->size(); ++i) {
      const DawgPosition &pos = (*parent_vse->dawg_info->active_dawgs)[i];
      const Dawg *pdawg = pos.dawg_index < 0
          ? NULL : dict_->GetDawg(pos.dawg_index);
      if (pdawg == NULL || pos.back_to_punc) continue;
      if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
          pdawg->end_of_word(pos.dawg_ref)) {
        has_word_ending = true;
        break;
      }
    }
    if (!has_word_ending) return NULL;

    if (language_model_debug_level > 0) tprintf("Compound word found\n");
    return new LanguageModelDawgInfo(beginning_active_dawgs_, COMPOUND_PERM);
  }  // done dealing with compound words

  LanguageModelDawgInfo *dawg_info = NULL;

  // Call LetterIsOkay().
  // Use the normalized IDs so that all shapes of ' can be allowed in words
  // like don't.
  const GenericVector<UNICHAR_ID>& normed_ids =
      dict_->getUnicharset().normed_ids(b.unichar_id());
  DawgPositionVector tmp_active_dawgs;
  for (int i = 0; i < normed_ids.size(); ++i) {
    if (language_model_debug_level > 2)
      tprintf("Test Letter OK for unichar %d, normed %d\n",
              b.unichar_id(), normed_ids[i]);
    dict_->LetterIsOkay(dawg_args_, normed_ids[i],
                        word_end && i == normed_ids.size() - 1);
    if (dawg_args_->permuter == NO_PERM) {
      break;
    } else if (i < normed_ids.size() - 1) {
      tmp_active_dawgs = *dawg_args_->updated_dawgs;
      dawg_args_->active_dawgs = &tmp_active_dawgs;
    }
    if (language_model_debug_level > 2)
      tprintf("Letter was OK for unichar %d, normed %d\n",
              b.unichar_id(), normed_ids[i]);
  }
  dawg_args_->active_dawgs = NULL;
  if (dawg_args_->permuter != NO_PERM) {
    dawg_info = new LanguageModelDawgInfo(dawg_args_->updated_dawgs,
                                          dawg_args_->permuter);
  } else if (language_model_debug_level > 3) {
    tprintf("Letter %s not OK!\n",
            dict_->getUnicharset().id_to_unichar(b.unichar_id()));
  }

  return dawg_info;
}
LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo ( const char *  unichar,
float  certainty,
float  denom,
int  curr_col,
int  curr_row,
float  outline_length,
const ViterbiStateEntry *  parent_vse 
) [protected]

Definition at line 861 of file language_model.cpp.

                                         {
  // Initialize parent context.
  const char *pcontext_ptr = "";
  int pcontext_unichar_step_len = 0;
  if (parent_vse == NULL) {
    pcontext_ptr = prev_word_str_.string();
    pcontext_unichar_step_len = prev_word_unichar_step_len_;
  } else {
    pcontext_ptr = parent_vse->ngram_info->context.string();
    pcontext_unichar_step_len =
      parent_vse->ngram_info->context_unichar_step_len;
  }
  // Compute p(unichar | parent context).
  int unichar_step_len = 0;
  bool pruned = false;
  float ngram_cost;
  float ngram_and_classifier_cost =
      ComputeNgramCost(unichar, certainty, denom,
                       pcontext_ptr, &unichar_step_len,
                       &pruned, &ngram_cost);
  // Normalize just the ngram_and_classifier_cost by outline_length.
  // The ngram_cost is used by the params_model, so it needs to be left as-is,
  // and the params model cost will be normalized by outline_length.
  ngram_and_classifier_cost *=
      outline_length / language_model_ngram_rating_factor;
  // Add the ngram_cost of the parent.
  if (parent_vse != NULL) {
    ngram_and_classifier_cost +=
        parent_vse->ngram_info->ngram_and_classifier_cost;
    ngram_cost += parent_vse->ngram_info->ngram_cost;
  }

  // Shorten parent context string by unichar_step_len unichars.
  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
                    language_model_ngram_order);
  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
  while (num_remove > 0 && *pcontext_ptr != '\0') {
    pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
    --num_remove;
  }

  // Decide whether to prune this ngram path and update pruned accordingly.
  if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;

  // Construct and return the new LanguageModelNgramInfo.
  LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
      pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
      ngram_and_classifier_cost);
  ngram_info->context += unichar;
  ngram_info->context_unichar_step_len += unichar_step_len;
  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
  return ngram_info;
}
void tesseract::LanguageModel::GenerateTopChoiceInfo ( ViterbiStateEntry *  new_vse,
const ViterbiStateEntry *  parent_vse,
LanguageModelState *  lms 
) [protected]

Definition at line 754 of file language_model.cpp.

                                                                   {
  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
  for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
       new_vse->cost >= vit.data()->cost; vit.forward()) {
    // Clear the appropriate flags if the list already contains
    // a top choice entry with a lower cost.
    new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
  }
  if (language_model_debug_level > 2) {
    tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
            new_vse->top_choice_flags);
  }
}
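
A small flag-arithmetic illustration (values chosen for the example): if an existing entry with lower cost holds kLowerCaseFlag | kXhtConsistentFlag (0x2 | 0x10 = 0x12) and new_vse arrives with kSmallestRatingFlag | kLowerCaseFlag (0x3), the loop computes 0x3 & ~0x12 = 0x1, so new_vse keeps only kSmallestRatingFlag and no longer competes as the top lower-case path.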
ViterbiStateEntry * tesseract::LanguageModel::GetNextParentVSE ( bool  just_classified,
bool  mixed_alnum,
const BLOB_CHOICE *  bc,
LanguageModelFlagsType  blob_choice_flags,
const UNICHARSET &  unicharset,
WERD_RES *  word_res,
ViterbiStateEntry_IT *  vse_it,
LanguageModelFlagsType *  top_choice_flags 
) const [protected]

Definition at line 487 of file language_model.cpp.

                                                    {
  for (; !vse_it->cycled_list(); vse_it->forward()) {
    ViterbiStateEntry* parent_vse = vse_it->data();
    // Only consider the parent if it has been updated or
    // if the current ratings cell has just been classified.
    if (!just_classified && !parent_vse->updated) continue;
    if (language_model_debug_level > 2)
      parent_vse->Print("Considering");
    // If the parent is non-alnum, then upper counts as lower.
    *top_choice_flags = blob_choice_flags;
    if ((blob_choice_flags & kUpperCaseFlag) &&
        !parent_vse->HasAlnumChoice(unicharset)) {
      *top_choice_flags |= kLowerCaseFlag;
    }
    *top_choice_flags &= parent_vse->top_choice_flags;
    UNICHAR_ID unichar_id = bc->unichar_id();
    const BLOB_CHOICE* parent_b = parent_vse->curr_b;
    UNICHAR_ID parent_id = parent_b->unichar_id();
    // Digits do not bind to alphas if there is a mix in both parent and current
    // or if the alpha is not the top choice.
    if (unicharset.get_isdigit(unichar_id) &&
        unicharset.get_isalpha(parent_id) &&
        (mixed_alnum || *top_choice_flags == 0))
      continue;  // Digits don't bind to alphas.
    // Likewise alphas do not bind to digits if there is a mix in both or if
    // the digit is not the top choice.
    if (unicharset.get_isalpha(unichar_id) &&
        unicharset.get_isdigit(parent_id) &&
        (mixed_alnum || *top_choice_flags == 0))
      continue;  // Alphas don't bind to digits.
    // If there is a case mix of the same alpha in the parent list, then
    // competing_vse is non-null and will be used to determine whether
    // or not to bind the current blob choice.
    if (parent_vse->competing_vse != NULL) {
      const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
      UNICHAR_ID other_id = competing_b->unichar_id();
      if (language_model_debug_level >= 5) {
        tprintf("Parent %s has competition %s\n",
                unicharset.id_to_unichar(parent_id),
                unicharset.id_to_unichar(other_id));
      }
      if (unicharset.SizesDistinct(parent_id, other_id)) {
        // If other_id matches bc wrt position and size, and parent_id doesn't,
        // don't bind to the current parent.
        if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
                                language_model_debug_level >= 5) &&
            !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
                                language_model_debug_level >= 5))
          continue;  // Competing blobchoice has a better vertical match.
      }
    }
    vse_it->forward();
    return parent_vse;  // This one is good!
  }
  return NULL;  // Ran out of possibilities.
}

ParamsModel & tesseract::LanguageModel::getParamsModel ( ) [inline]

Definition at line 100 of file language_model.h.

{ return params_model_; }
bool tesseract::LanguageModel::GetTopLowerUpperDigit ( BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE **  first_lower,
BLOB_CHOICE **  first_upper,
BLOB_CHOICE **  first_digit 
) const [protected]

Definition at line 374 of file language_model.cpp.

                                                                           {
  BLOB_CHOICE_IT c_it(curr_list);
  const UNICHARSET &unicharset = dict_->getUnicharset();
  BLOB_CHOICE *first_unichar = NULL;
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    UNICHAR_ID unichar_id = c_it.data()->unichar_id();
    if (unicharset.get_fragment(unichar_id)) continue;  // skip fragments
    if (first_unichar == NULL) first_unichar = c_it.data();
    if (*first_lower == NULL && unicharset.get_islower(unichar_id)) {
      *first_lower = c_it.data();
    }
    if (*first_upper == NULL && unicharset.get_isalpha(unichar_id) &&
        !unicharset.get_islower(unichar_id)) {
      *first_upper = c_it.data();
    }
    if (*first_digit == NULL && unicharset.get_isdigit(unichar_id)) {
      *first_digit = c_it.data();
    }
  }
  ASSERT_HOST(first_unichar != NULL);
  bool mixed = (*first_lower != NULL || *first_upper != NULL) &&
      *first_digit != NULL;
  if (*first_lower == NULL) *first_lower = first_unichar;
  if (*first_upper == NULL) *first_upper = first_unichar;
  if (*first_digit == NULL) *first_digit = first_unichar;
  return mixed;
}
void tesseract::LanguageModel::InitForWord ( const WERD_CHOICE *  prev_word,
bool  fixed_pitch,
float  max_char_wh_ratio,
float  rating_cert_scale 
)

Definition at line 138 of file language_model.cpp.

                                                         {
  fixed_pitch_ = fixed_pitch;
  max_char_wh_ratio_ = max_char_wh_ratio;
  rating_cert_scale_ = rating_cert_scale;
  acceptable_choice_found_ = false;
  correct_segmentation_explored_ = false;

  // Initialize vectors with beginning DawgInfos.
  very_beginning_active_dawgs_->clear();
  dict_->init_active_dawgs(very_beginning_active_dawgs_, false);
  beginning_active_dawgs_->clear();
  dict_->default_dawgs(beginning_active_dawgs_, false);

  // Fill prev_word_str_ with the last language_model_ngram_order
  // unichars from prev_word.
  if (language_model_ngram_on) {
    if (prev_word != NULL && prev_word->unichar_string() != NULL) {
      prev_word_str_ = prev_word->unichar_string();
      if (language_model_ngram_space_delimited_language) prev_word_str_ += ' ';
    } else {
      prev_word_str_ = " ";
    }
    const char *str_ptr = prev_word_str_.string();
    const char *str_end = str_ptr + prev_word_str_.length();
    int step;
    prev_word_unichar_step_len_ = 0;
    while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
      str_ptr += step;
      ++prev_word_unichar_step_len_;
    }
    ASSERT_HOST(str_ptr == str_end);
  }
}
bool tesseract::LanguageModel::PrunablePath ( const ViterbiStateEntry &  vse) [inline, protected]

Definition at line 289 of file language_model.h.

                                                         {
    if (vse.top_choice_flags) return false;
    if (vse.dawg_info != NULL &&
        (vse.dawg_info->permuter == SYSTEM_DAWG_PERM ||
         vse.dawg_info->permuter == USER_DAWG_PERM ||
         vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
    return true;
  }
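
Together with the pruning logic in AddViterbiStateEntry(), this means that only paths that are neither current top choices nor matched by a permanent dictionary (system, user, or frequent-word dawg) count against language_model_viterbi_list_max_num_prunable and may be discarded when the prunable portion of the list fills up.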

void tesseract::LanguageModel::SetAcceptableChoiceFound ( bool  val) [inline]

Definition at line 96 of file language_model.h.

{ acceptable_choice_found_ = val; }

int tesseract::LanguageModel::SetTopParentLowerUpperDigit ( LanguageModelState *  parent_node) const [protected]

Definition at line 412 of file language_model.cpp.

                                           {
  if (parent_node == NULL) return -1;
  UNICHAR_ID top_id = INVALID_UNICHAR_ID;
  ViterbiStateEntry* top_lower = NULL;
  ViterbiStateEntry* top_upper = NULL;
  ViterbiStateEntry* top_digit = NULL;
  ViterbiStateEntry* top_choice = NULL;
  float lower_rating = 0.0f;
  float upper_rating = 0.0f;
  float digit_rating = 0.0f;
  float top_rating = 0.0f;
  const UNICHARSET &unicharset = dict_->getUnicharset();
  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
    ViterbiStateEntry* vse = vit.data();
    // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
    // back to the real character if needed.
    ViterbiStateEntry* unichar_vse = vse;
    UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
    float rating = unichar_vse->curr_b->rating();
    while (unichar_id == INVALID_UNICHAR_ID &&
           unichar_vse->parent_vse != NULL) {
      unichar_vse = unichar_vse->parent_vse;
      unichar_id = unichar_vse->curr_b->unichar_id();
      rating = unichar_vse->curr_b->rating();
    }
    if (unichar_id != INVALID_UNICHAR_ID) {
      if (unicharset.get_islower(unichar_id)) {
        if (top_lower == NULL || lower_rating > rating) {
          top_lower = vse;
          lower_rating = rating;
        }
      } else if (unicharset.get_isalpha(unichar_id)) {
        if (top_upper == NULL || upper_rating > rating) {
          top_upper = vse;
          upper_rating = rating;
        }
      } else if (unicharset.get_isdigit(unichar_id)) {
        if (top_digit == NULL || digit_rating > rating) {
          top_digit = vse;
          digit_rating = rating;
        }
      }
    }
    if (top_choice == NULL || top_rating > rating) {
      top_choice = vse;
      top_rating = rating;
      top_id = unichar_id;
    }
  }
  if (top_choice == NULL) return -1;
  bool mixed = (top_lower != NULL || top_upper != NULL) &&
      top_digit != NULL;
  if (top_lower == NULL) top_lower = top_choice;
  top_lower->top_choice_flags |= kLowerCaseFlag;
  if (top_upper == NULL) top_upper = top_choice;
  top_upper->top_choice_flags |= kUpperCaseFlag;
  if (top_digit == NULL) top_digit = top_choice;
  top_digit->top_choice_flags |= kDigitFlag;
  top_choice->top_choice_flags |= kSmallestRatingFlag;
  if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
      (top_choice->top_choice_flags &
          (kLowerCaseFlag | kUpperCaseFlag | kDigitFlag))) {
    // If the compound marker top choice carries any of the top alnum flags,
    // then give it all of them, allowing words like I-295 to be chosen.
    top_choice->top_choice_flags |=
        kLowerCaseFlag | kUpperCaseFlag | kDigitFlag;
  }
  return mixed ? 1 : 0;
}
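
As a usage note, top_choice_flags is an ordinary bitmask over the kSmallestRatingFlag / kLowerCaseFlag / kUpperCaseFlag / kDigitFlag constants, so the roles assigned above can be tested with plain bitwise AND; for example:

  // Illustrative checks against the flags set by SetTopParentLowerUpperDigit().
  bool is_top_digit =
      (vse->top_choice_flags & LanguageModel::kDigitFlag) != 0;
  bool is_top_alnum =
      (vse->top_choice_flags & (LanguageModel::kLowerCaseFlag |
                                LanguageModel::kUpperCaseFlag |
                                LanguageModel::kDigitFlag)) != 0;
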
void tesseract::LanguageModel::UpdateBestChoice ( ViterbiStateEntry * vse,
LMPainPoints * pain_points,
WERD_RES * word_res,
BestChoiceBundle * best_choice_bundle,
BlamerBundle * blamer_bundle
) [protected]

Definition at line 1219 of file language_model.cpp.

                                 {
  bool truth_path;
  WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
                                    blamer_bundle, &truth_path);
  ASSERT_HOST(word != NULL);
  if (dict_->stopper_debug_level >= 1) {
    STRING word_str;
    word->string_and_lengths(&word_str, NULL);
    vse->Print(word_str.string());
  }
  if (language_model_debug_level > 0) {
    word->print("UpdateBestChoice() constructed word");
  }
  // Record features from the current path if necessary.
  ParamsTrainingHypothesis curr_hyp;
  if (blamer_bundle != NULL) {
    if (vse->dawg_info != NULL) vse->dawg_info->permuter =
        static_cast<PermuterType>(word->permuter());
    ExtractFeaturesFromPath(*vse, curr_hyp.features);
    word->string_and_lengths(&(curr_hyp.str), NULL);
    curr_hyp.cost = vse->cost;  // record cost for error rate computations
    if (language_model_debug_level > 0) {
      tprintf("Raw features extracted from %s (cost=%g) [ ",
              curr_hyp.str.string(), curr_hyp.cost);
      for (int deb_i = 0; deb_i < PTRAIN_NUM_FEATURE_TYPES; ++deb_i) {
        tprintf("%g ", curr_hyp.features[deb_i]);
      }
      tprintf("]\n");
    }
    // Record the current hypothesis in params_training_bundle.
    blamer_bundle->AddHypothesis(curr_hyp);
    if (truth_path)
      blamer_bundle->UpdateBestRating(word->rating());
  }
  if (blamer_bundle != NULL && blamer_bundle->GuidedSegsearchStillGoing()) {
    // The word was constructed solely for blamer_bundle->AddHypothesis, so
    // we no longer need it.
    delete word;
    return;
  }
  if (word_res->chopped_word != NULL && !word_res->chopped_word->blobs.empty())
    word->SetScriptPositions(false, word_res->chopped_word);
  // Update and log new raw_choice if needed.
  if (word_res->raw_choice == NULL ||
      word->rating() < word_res->raw_choice->rating()) {
    if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
      tprintf("Updated raw choice\n");
  }
  // Set the modified rating for best choice to vse->cost and log best choice.
  word->set_rating(vse->cost);
  // Call LogNewChoice() for best choice from Dict::adjust_word() since it
  // computes adjust_factor that is used by the adaption code (e.g. by
  // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
  // Note: the rating of the word is not adjusted.
  dict_->adjust_word(word, vse->dawg_info == NULL,
                     vse->consistency_info.xht_decision, 0.0,
                     false, language_model_debug_level > 0);
  // Hand ownership of the word over to the word_res.
  if (!word_res->LogNewCookedChoice(dict_->tessedit_truncate_wordchoice_log,
                                    dict_->stopper_debug_level >= 1, word)) {
    // The word was so bad that it was deleted.
    return;
  }
  if (word_res->best_choice == word) {
    // Word was the new best.
    if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
        AcceptablePath(*vse)) {
      acceptable_choice_found_ = true;
    }
    // Update best_choice_bundle.
    best_choice_bundle->updated = true;
    best_choice_bundle->best_vse = vse;
    if (language_model_debug_level > 0) {
      tprintf("Updated best choice\n");
      word->print_state("New state ");
    }
    // Update hyphen state if we are dealing with a dictionary word.
    if (vse->dawg_info != NULL) {
      if (dict_->has_hyphen_end(*word)) {
        dict_->set_hyphen_word(*word, *(dawg_args_->active_dawgs));
      } else {
        dict_->reset_hyphen_vars(true);
      }
    }

    if (blamer_bundle != NULL) {
      blamer_bundle->set_best_choice_is_dict_and_top_choice(
          vse->dawg_info != NULL && vse->top_choice_flags);
    }
  }
  if (wordrec_display_segmentations) {
    word->DisplaySegmentation(word_res->chopped_word);
  }
}
bool tesseract::LanguageModel::UpdateState ( bool just_classified,
int curr_col,
int curr_row,
BLOB_CHOICE_LIST * curr_list,
LanguageModelState * parent_node,
LMPainPoints * pain_points,
WERD_RES * word_res,
BestChoiceBundle * best_choice_bundle,
BlamerBundle * blamer_bundle
)

Definition at line 246 of file language_model.cpp.

                                 {
  if (language_model_debug_level > 0) {
    tprintf("\nUpdateState: col=%d row=%d %s",
            curr_col, curr_row, just_classified ? "just_classified" : "");
    if (language_model_debug_level > 5)
      tprintf("(parent=%p)\n", parent_node);
    else
      tprintf("\n");
  }
  // Initialize helper variables.
  bool word_end = (curr_row+1 >= word_res->ratings->dimension());
  bool new_changed = false;
  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
  const UNICHARSET& unicharset = dict_->getUnicharset();
  BLOB_CHOICE *first_lower = NULL;
  BLOB_CHOICE *first_upper = NULL;
  BLOB_CHOICE *first_digit = NULL;
  bool has_alnum_mix = false;
  if (parent_node != NULL) {
    int result = SetTopParentLowerUpperDigit(parent_node);
    if (result < 0) {
      if (language_model_debug_level > 0)
        tprintf("No parents found to process\n");
      return false;
    }
    if (result > 0)
      has_alnum_mix = true;
  }
  if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
                             &first_digit))
    has_alnum_mix = false;
  ScanParentsForCaseMix(unicharset, parent_node);
  if (language_model_debug_level > 3 && parent_node != NULL) {
    parent_node->Print("Parent viterbi list");
  }
  LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];

  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
  ViterbiStateEntry_IT vit;
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    BLOB_CHOICE* choice = c_it.data();
    // TODO(antonova): make sure commenting this out is ok for ngram
    // model scoring (I think this was introduced to fix ngram model quirks).
    // Skip NULL unichars unless it is the only choice.
    //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
    UNICHAR_ID unichar_id = choice->unichar_id();
    if (unicharset.get_fragment(unichar_id)) {
      continue;  // skip fragments
    }
    // Set top choice flags.
    LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
    if (c_it.at_first() || !new_changed)
      blob_choice_flags |= kSmallestRatingFlag;
    if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
    if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
    if (first_digit == choice) blob_choice_flags |= kDigitFlag;

    if (parent_node == NULL) {
      // Process the beginning of a word.
      // If there is a better case variant that is not distinguished by size,
      // skip this blob choice, as we have no choice but to accept the result
      // of the character classifier to distinguish between them, even if
      // followed by an upper case.
      // With words like iPoc, and other CamelBackWords, the lower-upper
      // transition can only be achieved if the classifier has the correct case
      // as the top choice, and leaving an initial I lower down the list
      // increases the chances of choosing IPoc simply because it doesn't
      // include such a transition. iPoc will beat iPOC and ipoc because
      // the other words are baseline/x-height inconsistent.
      if (HasBetterCaseVariant(unicharset, choice, curr_list))
        continue;
      // Upper counts as lower at the beginning of a word.
      if (blob_choice_flags & kUpperCaseFlag)
        blob_choice_flags |= kLowerCaseFlag;
      new_changed |= AddViterbiStateEntry(
          blob_choice_flags, denom, word_end, curr_col, curr_row,
          choice, curr_state, NULL, pain_points,
          word_res, best_choice_bundle, blamer_bundle);
    } else {
      // Get viterbi entries from each parent ViterbiStateEntry.
      vit.set_to_list(&parent_node->viterbi_state_entries);
      int vit_counter = 0;
      vit.mark_cycle_pt();
      ViterbiStateEntry* parent_vse = NULL;
      LanguageModelFlagsType top_choice_flags;
      while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
                                            c_it.data(), blob_choice_flags,
                                            unicharset, word_res, &vit,
                                            &top_choice_flags)) != NULL) {
        // Skip pruned entries and do not look at prunable entries if already
        // examined language_model_viterbi_list_max_num_prunable of those.
        if (PrunablePath(*parent_vse) &&
            (++vit_counter > language_model_viterbi_list_max_num_prunable ||
             (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
          continue;
        }
        // If the parent has no alnum choice, (ie choice is the first in a
        // string of alnum), and there is a better case variant that is not
        // distinguished by size, skip this blob choice/parent, as with the
        // initial blob treatment above.
        if (!parent_vse->HasAlnumChoice(unicharset) &&
            HasBetterCaseVariant(unicharset, choice, curr_list))
          continue;
        // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
        // looks good according to the Dawgs or character ngram model.
        new_changed |= AddViterbiStateEntry(
            top_choice_flags, denom, word_end, curr_col, curr_row,
            c_it.data(), curr_state, parent_vse, pain_points,
            word_res, best_choice_bundle, blamer_bundle);
      }
    }
  }
  return new_changed;
}
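
UpdateState() is driven from the segmentation search, once per (column, row) cell of the word's ratings matrix, with the parent state taken from the beam entry that ends at the previous column. A hedged sketch of such a driver loop (the variable names and the unbanded matrix walk are illustrative, not the actual wordrec code):

  // Illustrative driver: visit cells column by column so that parent beam
  // entries are populated before their children are extended.
  MATRIX* ratings = word_res->ratings;
  for (int col = 0; col < ratings->dimension(); ++col) {
    for (int row = col; row < ratings->dimension(); ++row) {
      BLOB_CHOICE_LIST* choices = ratings->get(col, row);
      if (choices == NULL) continue;  // Cell not classified (banding elided).
      LanguageModelState* parent_node =
          (col == 0) ? NULL : best_choice_bundle->beam[col - 1];
      language_model->UpdateState(/*just_classified=*/true, col, row, choices,
                                  parent_node, pain_points, word_res,
                                  best_choice_bundle, blamer_bundle);
    }
  }
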

Member Data Documentation


const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f [static]

Definition at line 53 of file language_model.h.

int tesseract::LanguageModel::language_model_debug_level

"Language model debug level"

Definition at line 306 of file language_model.h.

int tesseract::LanguageModel::language_model_min_compound_length

"Minimum length of compound words"

Definition at line 333 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_nonmatch_score

"Average classifier score of a non-matching unichar"

Definition at line 320 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_on

"Turn on/off the use of character ngram model"

Definition at line 308 of file language_model.h.

int tesseract::LanguageModel::language_model_ngram_order

"Maximum order of the character ngram model"

Definition at line 310 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_rating_factor

"Factor to bring log-probs into the same range as ratings when multiplied by outline length"

Definition at line 329 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_scale_factor

"Strength of the character ngram model relative to the character classifier"

Definition at line 326 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_small_prob

"To avoid overly small denominators use this as the floor of the probability returned by the ngram model"

Definition at line 318 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_space_delimited_language

"Words are delimited by space"

Definition at line 331 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step

"Use only the first UTF8 step of the given string when computing log probabilities"

Definition at line 323 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_case

"Penalty for inconsistent case"

Definition at line 342 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_chartype

"Penalty for inconsistent character type"

Definition at line 346 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_font

"Penalty for inconsistent font"

Definition at line 348 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_increment

"Penalty increment"

Definition at line 351 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_non_dict_word

"Penalty for non-dictionary words"

Definition at line 338 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word

"Penalty for words not in the frequent word dictionary"

Definition at line 336 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_punc

"Penalty for inconsistent punctuation"

Definition at line 340 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_script

"Penalty for inconsistent script"

Definition at line 344 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_spacing

"Penalty for inconsistent spacing"

Definition at line 350 of file language_model.h.

bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty

"Use sigmoidal score for certainty"

Definition at line 354 of file language_model.h.

int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable

"Maximum number of prunable (those for which PrunablePath() is true) entries in each viterbi list recorded in BLOB_CHOICEs"

Definition at line 313 of file language_model.h.

int tesseract::LanguageModel::language_model_viterbi_list_max_size

"Maximum size of viterbi lists recorded in BLOB_CHOICEs"

Definition at line 315 of file language_model.h.

int tesseract::LanguageModel::wordrec_display_segmentations

"Display Segmentations"

Definition at line 352 of file language_model.h.
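
All of the language_model_* and wordrec_* members above are registered Tesseract parameters, so they can also be adjusted at run time through TessBaseAPI::SetVariable(), which takes the value as a string; the settings below are arbitrary examples, not recommendations:

  #include "baseapi.h"  // tesseract::TessBaseAPI

  tesseract::TessBaseAPI api;
  api.Init(NULL, "eng");
  // Values are passed as strings and parsed according to the parameter type.
  api.SetVariable("language_model_debug_level", "1");
  api.SetVariable("language_model_penalty_non_dict_word", "0.2");
  api.SetVariable("language_model_ngram_on", "true");
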


The documentation for this class was generated from the following files:
 language_model.h
 language_model.cpp