Tesseract 3.01
|
#include <language_model.h>
Public Member Functions | |
LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict, WERD_CHOICE **prev_word_best_choice) | |
~LanguageModel () | |
void | InitForWord (const WERD_CHOICE *prev_word, const DENORM *denorm, bool fixed_pitch, float best_choice_cert, float max_char_wh_ratio, HEAP *pain_points, CHUNKS_RECORD *chunks_record) |
void | CleanUp () |
void | DeleteState (BLOB_CHOICE_LIST *choices) |
LanguageModelFlagsType | UpdateState (LanguageModelFlagsType changed, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE_LIST *parent_list, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle) |
void | GenerateNgramModelPainPointsFromColumn (int col, int row, HEAP *pain_points, CHUNKS_RECORD *chunks_record) |
void | GenerateProblematicPathPainPointsFromColumn (int col, int row, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record) |
void | GeneratePainPointsFromColumn (int col, const GenericVector< int > &non_empty_rows, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record) |
void | GeneratePainPointsFromBestChoice (HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle) |
void | GeneratePainPoint (int col, int row, bool ok_to_extend, float priority_adjustment, float worst_piece_cert, bool fragmented, float best_choice_cert, float max_char_wh_ratio, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, HEAP *pain_points) |
bool | AcceptableChoiceFound () |
void | GetWorstPieceCertainty (int col, int row, MATRIX *ratings, float *cert, bool *fragmented) |
Public Attributes | |
int | language_model_debug_level = 0 |
bool | language_model_ngram_on = false |
int | language_model_ngram_order = 8 |
int | language_model_viterbi_list_max_num_prunable = 10 |
int | language_model_viterbi_list_max_size = 500 |
double | language_model_ngram_small_prob = 0.000001 |
double | language_model_ngram_nonmatch_score = -40.0 |
bool | language_model_ngram_use_only_first_uft8_step = false |
double | language_model_ngram_scale_factor = 0.03 |
int | language_model_min_compound_length = 3 |
int | language_model_fixed_length_choices_depth = 3 |
double | language_model_penalty_non_freq_dict_word = 0.1 |
double | language_model_penalty_non_dict_word = 0.15 |
double | language_model_penalty_punc = 0.2 |
double | language_model_penalty_case = 0.1 |
double | language_model_penalty_script = 0.5 |
double | language_model_penalty_chartype = 0.3 |
double | language_model_penalty_font = 0.00 |
double | language_model_penalty_spacing = 0.05 |
double | language_model_penalty_increment = 0.01 |
Static Public Attributes | |
static const float | kInitialPainPointPriorityAdjustment = 5.0f |
static const float | kDefaultPainPointPriorityAdjustment = 2.0f |
static const float | kBestChoicePainPointPriorityAdjustment = 0.5f |
static const float | kCriticalPainPointPriorityAdjustment = 0.1f |
static const float | kMaxAvgNgramCost = 25.0f |
static const int | kMinFixedLengthDawgLength = 2 |
static const float | kLooseMaxCharWhRatio = 2.5f |
static const LanguageModelFlagsType | kSmallestRatingFlag = 0x1 |
static const LanguageModelFlagsType | kLowerCaseFlag = 0x2 |
static const LanguageModelFlagsType | kUpperCaseFlag = 0x4 |
static const LanguageModelFlagsType | kConsistentFlag = 0x8 |
static const LanguageModelFlagsType | kDawgFlag = 0x10 |
static const LanguageModelFlagsType | kNgramFlag = 0x20 |
static const LanguageModelFlagsType | kJustClassifiedFlag = 0x80 |
static const LanguageModelFlagsType | kAllChangedFlag = 0xff |
Protected Member Functions | |
bool | NonAlphaOrDigitMiddle (int col, int row, int dimension, UNICHAR_ID unichar_id) |
bool | IsFragment (BLOB_CHOICE *b) |
bool | IsHan (int script_id) |
void | GetPieceCertainty (BLOB_CHOICE_LIST *blist, float *cert, bool *fragmented) |
float | ComputeAdjustment (int num_problems, float penalty) |
float | ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info) |
float | ComputeConsistencyAdjustedRatingsSum (float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info) |
float | ComputeAdjustedPathCost (float ratings_sum, int length, float dawg_score, const LanguageModelDawgInfo *dawg_info, const LanguageModelNgramInfo *ngram_info, const LanguageModelConsistencyInfo &consistency_info, const AssociateStats &associate_stats, ViterbiStateEntry *parent_vse) |
bool | ProblematicPath (const ViterbiStateEntry &vse, UNICHAR_ID unichar_id, bool word_end) |
void | GetTopChoiceLowerUpper (LanguageModelFlagsType changed, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper) |
LanguageModelFlagsType | AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle) |
void | PrintViterbiStateEntry (const char *msg, ViterbiStateEntry *vse, BLOB_CHOICE *b, CHUNKS_RECORD *chunks_record) |
void | GenerateTopChoiceInfo (float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info, const ViterbiStateEntry *parent_vse, BLOB_CHOICE *b, LanguageModelFlagsType *top_choice_flags, LanguageModelFlagsType *changed) |
LanguageModelDawgInfo * | GenerateDawgInfo (bool word_end, int script_id, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse, LanguageModelFlagsType *changed) |
LanguageModelNgramInfo * | GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, const ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, LanguageModelFlagsType *changed) |
float | ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob) |
float | ComputeDenom (BLOB_CHOICE_LIST *curr_list) |
void | FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, CHUNKS_RECORD *chunks_record, LanguageModelConsistencyInfo *consistency_info) |
void | UpdateBestChoice (BLOB_CHOICE *b, ViterbiStateEntry *vse, HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle) |
WERD_CHOICE * | ConstructWord (BLOB_CHOICE *b, ViterbiStateEntry *vse, CHUNKS_RECORD *chunks_record, BLOB_CHOICE_LIST_VECTOR *best_char_choices, float certainties[], float *dawg_score, STATE *state) |
void | UpdateCoveredByFixedLengthDawgs (const DawgInfoVector &active_dawgs, int word_index, int word_length, int *skip, int *covered, float *dawg_score, bool *dawg_score_done) |
void | ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, AssociateStats *associate_stats) |
bool | PrunablePath (LanguageModelFlagsType top_choice_flags, const LanguageModelDawgInfo *dawg_info) |
bool | UseFixedLengthDawgs (int script_id) |
bool | AcceptablePath (const ViterbiStateEntry &vse) |
Static Protected Member Functions | |
static float | CertaintyScore (float cert) |
Protected Attributes | |
DawgArgs * | dawg_args_ |
GenericVector< bool * > | updated_flags_ |
const UnicityTable< FontInfo > * | fontinfo_table_ |
Dict * | dict_ |
const DENORM * | denorm_ |
bool | fixed_pitch_ |
float | max_char_wh_ratio_ |
STRING | prev_word_str_ |
int | prev_word_unichar_step_len_ |
DawgInfoVector * | beginning_active_dawgs_ |
DawgInfoVector * | beginning_constraints_ |
DawgInfoVector * | fixed_length_beginning_active_dawgs_ |
DawgInfoVector * | empty_dawg_info_vec_ |
float | max_penalty_adjust_ |
bool | acceptable_choice_found_ |
tesseract::LanguageModel::LanguageModel | ( | const UnicityTable< FontInfo > * | fontinfo_table, |
Dict * | dict, | ||
WERD_CHOICE ** | prev_word_best_choice | ||
) |
tesseract::LanguageModel::~LanguageModel | ( | ) |
bool tesseract::LanguageModel::AcceptableChoiceFound | ( | ) | [inline] |
bool tesseract::LanguageModel::AcceptablePath | ( | const ViterbiStateEntry & | vse | ) | [inline, protected] |
LanguageModelFlagsType tesseract::LanguageModel::AddViterbiStateEntry | ( | LanguageModelFlagsType | top_choice_flags, |
float | denom, | ||
bool | word_end, | ||
int | curr_col, | ||
int | curr_row, | ||
BLOB_CHOICE * | b, | ||
BLOB_CHOICE * | parent_b, | ||
ViterbiStateEntry * | parent_vse, | ||
HEAP * | pain_points, | ||
BestPathByColumn * | best_path_by_column[], | ||
CHUNKS_RECORD * | chunks_record, | ||
BestChoiceBundle * | best_choice_bundle | ||
) | [protected] |
static float tesseract::LanguageModel::CertaintyScore | ( | float | cert | ) | [inline, static, protected] |
void tesseract::LanguageModel::CleanUp | ( | ) |
float tesseract::LanguageModel::ComputeAdjustedPathCost | ( | float | ratings_sum, |
int | length, | ||
float | dawg_score, | ||
const LanguageModelDawgInfo * | dawg_info, | ||
const LanguageModelNgramInfo * | ngram_info, | ||
const LanguageModelConsistencyInfo & | consistency_info, | ||
const AssociateStats & | associate_stats, | ||
ViterbiStateEntry * | parent_vse | ||
) | [protected] |
float tesseract::LanguageModel::ComputeAdjustment | ( | int | num_problems, |
float | penalty | ||
) | [inline, protected] |
void tesseract::LanguageModel::ComputeAssociateStats | ( | int | col, |
int | row, | ||
float | max_char_wh_ratio, | ||
ViterbiStateEntry * | parent_vse, | ||
CHUNKS_RECORD * | chunks_record, | ||
AssociateStats * | associate_stats | ||
) | [inline, protected] |
float tesseract::LanguageModel::ComputeConsistencyAdjustedRatingsSum | ( | float | ratings_sum, |
const LanguageModelDawgInfo * | dawg_info, | ||
const LanguageModelConsistencyInfo & | consistency_info | ||
) | [inline, protected] |
float tesseract::LanguageModel::ComputeConsistencyAdjustment | ( | const LanguageModelDawgInfo * | dawg_info, |
const LanguageModelConsistencyInfo & | consistency_info | ||
) | [inline, protected] |
float tesseract::LanguageModel::ComputeDenom | ( | BLOB_CHOICE_LIST * | curr_list | ) | [protected] |
float tesseract::LanguageModel::ComputeNgramCost | ( | const char * | unichar, |
float | certainty, | ||
float | denom, | ||
const char * | context, | ||
int * | unichar_step_len, | ||
bool * | found_small_prob | ||
) | [protected] |
WERD_CHOICE * tesseract::LanguageModel::ConstructWord | ( | BLOB_CHOICE * | b, |
ViterbiStateEntry * | vse, | ||
CHUNKS_RECORD * | chunks_record, | ||
BLOB_CHOICE_LIST_VECTOR * | best_char_choices, | ||
float | certainties[], | ||
float * | dawg_score, | ||
STATE * | state | ||
) | [protected] |
void tesseract::LanguageModel::DeleteState | ( | BLOB_CHOICE_LIST * | choices | ) |
void tesseract::LanguageModel::FillConsistencyInfo | ( | int | curr_col, |
bool | word_end, | ||
BLOB_CHOICE * | b, | ||
ViterbiStateEntry * | parent_vse, | ||
BLOB_CHOICE * | parent_b, | ||
CHUNKS_RECORD * | chunks_record, | ||
LanguageModelConsistencyInfo * | consistency_info | ||
) | [protected] |
LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo | ( | bool | word_end, |
int | script_id, | ||
int | curr_col, | ||
int | curr_row, | ||
const BLOB_CHOICE & | b, | ||
const ViterbiStateEntry * | parent_vse, | ||
LanguageModelFlagsType * | changed | ||
) | [protected] |
LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo | ( | const char * | unichar, |
float | certainty, | ||
float | denom, | ||
int | curr_col, | ||
int | curr_row, | ||
const ViterbiStateEntry * | parent_vse, | ||
BLOB_CHOICE * | parent_b, | ||
LanguageModelFlagsType * | changed | ||
) | [protected] |
void tesseract::LanguageModel::GenerateNgramModelPainPointsFromColumn | ( | int | col, |
int | row, | ||
HEAP * | pain_points, | ||
CHUNKS_RECORD * | chunks_record | ||
) |
void tesseract::LanguageModel::GeneratePainPoint | ( | int | col, |
int | row, | ||
bool | ok_to_extend, | ||
float | priority_adjustment, | ||
float | worst_piece_cert, | ||
bool | fragmented, | ||
float | best_choice_cert, | ||
float | max_char_wh_ratio, | ||
BLOB_CHOICE * | parent_b, | ||
ViterbiStateEntry * | parent_vse, | ||
CHUNKS_RECORD * | chunks_record, | ||
HEAP * | pain_points | ||
) |
void tesseract::LanguageModel::GeneratePainPointsFromBestChoice | ( | HEAP * | pain_points, |
CHUNKS_RECORD * | chunks_record, | ||
BestChoiceBundle * | best_choice_bundle | ||
) |
void tesseract::LanguageModel::GeneratePainPointsFromColumn | ( | int | col, |
const GenericVector< int > & | non_empty_rows, | ||
float | best_choice_cert, | ||
HEAP * | pain_points, | ||
BestPathByColumn * | best_path_by_column[], | ||
CHUNKS_RECORD * | chunks_record | ||
) |
void tesseract::LanguageModel::GenerateProblematicPathPainPointsFromColumn | ( | int | col, |
int | row, | ||
float | best_choice_cert, | ||
HEAP * | pain_points, | ||
BestPathByColumn * | best_path_by_column[], | ||
CHUNKS_RECORD * | chunks_record | ||
) |
void tesseract::LanguageModel::GenerateTopChoiceInfo | ( | float | ratings_sum, |
const LanguageModelDawgInfo * | dawg_info, | ||
const LanguageModelConsistencyInfo & | consistency_info, | ||
const ViterbiStateEntry * | parent_vse, | ||
BLOB_CHOICE * | b, | ||
LanguageModelFlagsType * | top_choice_flags, | ||
LanguageModelFlagsType * | changed | ||
) | [protected] |
void tesseract::LanguageModel::GetPieceCertainty | ( | BLOB_CHOICE_LIST * | blist, |
float * | cert, | ||
bool * | fragmented | ||
) | [inline, protected] |
void tesseract::LanguageModel::GetTopChoiceLowerUpper | ( | LanguageModelFlagsType | changed, |
BLOB_CHOICE_LIST * | curr_list, | ||
BLOB_CHOICE ** | first_lower, | ||
BLOB_CHOICE ** | first_upper | ||
) | [protected] |
void tesseract::LanguageModel::GetWorstPieceCertainty | ( | int | col, |
int | row, | ||
MATRIX * | ratings, | ||
float * | cert, | ||
bool * | fragmented | ||
) | [inline] |
void tesseract::LanguageModel::InitForWord | ( | const WERD_CHOICE * | prev_word, |
const DENORM * | denorm, | ||
bool | fixed_pitch, | ||
float | best_choice_cert, | ||
float | max_char_wh_ratio, | ||
HEAP * | pain_points, | ||
CHUNKS_RECORD * | chunks_record | ||
) |
bool tesseract::LanguageModel::IsFragment | ( | BLOB_CHOICE * | b | ) | [inline, protected] |
bool tesseract::LanguageModel::IsHan | ( | int | script_id | ) | [inline, protected] |
bool tesseract::LanguageModel::NonAlphaOrDigitMiddle | ( | int | col, |
int | row, | ||
int | dimension, | ||
UNICHAR_ID | unichar_id | ||
) | [inline, protected] |
void tesseract::LanguageModel::PrintViterbiStateEntry | ( | const char * | msg, |
ViterbiStateEntry * | vse, | ||
BLOB_CHOICE * | b, | ||
CHUNKS_RECORD * | chunks_record | ||
) | [protected] |
bool tesseract::LanguageModel::ProblematicPath | ( | const ViterbiStateEntry & | vse, |
UNICHAR_ID | unichar_id, | ||
bool | word_end | ||
) | [protected] |
bool tesseract::LanguageModel::PrunablePath | ( | LanguageModelFlagsType | top_choice_flags, |
const LanguageModelDawgInfo * | dawg_info | ||
) | [inline, protected] |
void tesseract::LanguageModel::UpdateBestChoice | ( | BLOB_CHOICE * | b, |
ViterbiStateEntry * | vse, | ||
HEAP * | pain_points, | ||
CHUNKS_RECORD * | chunks_record, | ||
BestChoiceBundle * | best_choice_bundle | ||
) | [protected] |
void tesseract::LanguageModel::UpdateCoveredByFixedLengthDawgs | ( | const DawgInfoVector & | active_dawgs, |
int | word_index, | ||
int | word_length, | ||
int * | skip, | ||
int * | covered, | ||
float * | dawg_score, | ||
bool * | dawg_score_done | ||
) | [protected] |
LanguageModelFlagsType tesseract::LanguageModel::UpdateState | ( | LanguageModelFlagsType | changed, |
int | curr_col, | ||
int | curr_row, | ||
BLOB_CHOICE_LIST * | curr_list, | ||
BLOB_CHOICE_LIST * | parent_list, | ||
HEAP * | pain_points, | ||
BestPathByColumn * | best_path_by_column[], | ||
CHUNKS_RECORD * | chunks_record, | ||
BestChoiceBundle * | best_choice_bundle | ||
) |
bool tesseract::LanguageModel::UseFixedLengthDawgs | ( | int | script_id | ) | [inline, protected] |
bool tesseract::LanguageModel::acceptable_choice_found_ [protected] |
DawgArgs* tesseract::LanguageModel::dawg_args_ [protected] |
const DENORM* tesseract::LanguageModel::denorm_ [protected] |
Dict* tesseract::LanguageModel::dict_ [protected] |
bool tesseract::LanguageModel::fixed_pitch_ [protected] |
const UnicityTable<FontInfo>* tesseract::LanguageModel::fontinfo_table_ [protected] |
const LanguageModelFlagsType tesseract::LanguageModel::kAllChangedFlag = 0xff [static] |
const float tesseract::LanguageModel::kBestChoicePainPointPriorityAdjustment = 0.5f [static] |
const LanguageModelFlagsType tesseract::LanguageModel::kConsistentFlag = 0x8 [static] |
const float tesseract::LanguageModel::kCriticalPainPointPriorityAdjustment = 0.1f [static] |
const LanguageModelFlagsType tesseract::LanguageModel::kDawgFlag = 0x10 [static] |
const float tesseract::LanguageModel::kDefaultPainPointPriorityAdjustment = 2.0f [static] |
const float tesseract::LanguageModel::kInitialPainPointPriorityAdjustment = 5.0f [static] |
const LanguageModelFlagsType tesseract::LanguageModel::kJustClassifiedFlag = 0x80 [static] |
const float tesseract::LanguageModel::kLooseMaxCharWhRatio = 2.5f [static] |
const LanguageModelFlagsType tesseract::LanguageModel::kLowerCaseFlag = 0x2 [static] |
const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f [static] |
const int tesseract::LanguageModel::kMinFixedLengthDawgLength = 2 [static] |
const LanguageModelFlagsType tesseract::LanguageModel::kNgramFlag = 0x20 [static] |
const LanguageModelFlagsType tesseract::LanguageModel::kSmallestRatingFlag = 0x1 [static] |
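const LanguageModelFlagsType tesseract::LanguageModel::kUpperCaseFlag = 0x4 [static] |
int tesseract::LanguageModel::language_model_debug_level = 0 |
"Language model debug level"
int tesseract::LanguageModel::language_model_fixed_length_choices_depth = 3 |
"Depth of blob choice lists to explore when fixed length dawgs are on"
int tesseract::LanguageModel::language_model_min_compound_length = 3 |
"Minimum length of compound words"
double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0 |
"Average classifier score of a non-matching unichar"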
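bool tesseract::LanguageModel::language_model_ngram_on = false |
"Turn on/off the use of character ngram model"
int tesseract::LanguageModel::language_model_ngram_order = 8 |
"Maximum order of the character ngram model"
double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03 |
"Strength of the character ngram model relative to the character classifier"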
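double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001 |
"To avoid overly small denominators use this as the floor of the probability returned by the ngram model"
bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false |
"Use only the first UTF8 step of the given string when computing log probabilities"
double tesseract::LanguageModel::language_model_penalty_case = 0.1 |
"Penalty for inconsistent case"
double tesseract::LanguageModel::language_model_penalty_chartype = 0.3 |
"Penalty for inconsistent character type"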
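double tesseract::LanguageModel::language_model_penalty_font = 0.00 |
"Penalty for inconsistent font"
double tesseract::LanguageModel::language_model_penalty_increment = 0.01 |
"Penalty increment"
double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15 |
"Penalty for non-dictionary words"
double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1 |
"Penalty for words not in the frequent word dictionary"
double tesseract::LanguageModel::language_model_penalty_punc = 0.2 |
"Penalty for inconsistent punctuation"
double tesseract::LanguageModel::language_model_penalty_script = 0.5 |
"Penalty for inconsistent script"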
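double tesseract::LanguageModel::language_model_penalty_spacing = 0.05 |
"Penalty for inconsistent spacing"
int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10 |
"Maximum number of prunable (those for which PrunablePath() is true) entries in each viterbi list recorded in BLOB_CHOICEs"
int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500 |
"Maximum size of viterbi lists recorded in BLOB_CHOICEs"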
float tesseract::LanguageModel::max_char_wh_ratio_ [protected] |
float tesseract::LanguageModel::max_penalty_adjust_ [protected] |
STRING tesseract::LanguageModel::prev_word_str_ [protected] |
int tesseract::LanguageModel::prev_word_unichar_step_len_ [protected] |
GenericVector<bool *> tesseract::LanguageModel::updated_flags_ [protected] |