Tesseract 3.01 — source listing of wordrec/language_model.h
00001 00002 // File: language_model.h 00003 // Description: Functions that utilize the knowledge about the properties, 00004 // structure and statistics of the language to help recognition. 00005 // Author: Daria Antonova 00006 // Created: Mon Nov 11 11:26:43 PST 2009 00007 // 00008 // (C) Copyright 2009, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_ 00022 #define TESSERACT_WORDREC_LANGUAGE_MODEL_H_ 00023 00024 #include "associate.h" 00025 #include "dawg.h" 00026 #include "dict.h" 00027 #include "intproto.h" 00028 #include "matrix.h" 00029 #include "oldheap.h" 00030 #include "params.h" 00031 00032 namespace tesseract { 00033 00034 // Used for expressing various language model flags. 00035 typedef unsigned char LanguageModelFlagsType; 00036 00037 // Struct for keeping track of the consistency of the path. 00038 struct LanguageModelConsistencyInfo { 00039 LanguageModelConsistencyInfo() 00040 : punc_ref(NO_EDGE), num_punc(0), invalid_punc(false), 00041 num_non_first_upper(0), num_lower(0), 00042 script_id(0), inconsistent_script(false), 00043 num_alphas(0), num_digits(0), num_other(0), 00044 num_inconsistent_spaces(0), inconsistent_font(false) {} 00045 inline int NumInconsistentPunc() const { 00046 return invalid_punc ? num_punc : 0; 00047 } 00048 inline int NumInconsistentCase() const { 00049 return (num_non_first_upper > num_lower) ? 
num_lower : num_non_first_upper; 00050 } 00051 inline int NumInconsistentChartype() const { 00052 return (NumInconsistentPunc() + num_other + 00053 ((num_alphas > num_digits) ? num_digits : num_alphas)); 00054 } 00055 inline bool Consistent() const { 00056 return (NumInconsistentPunc() == 0 && NumInconsistentCase() == 0 && 00057 NumInconsistentChartype() == 0 && !inconsistent_script); 00058 } 00059 inline int NumInconsistentSpaces() const { 00060 return num_inconsistent_spaces; 00061 } 00062 00063 EDGE_REF punc_ref; 00064 int num_punc; 00065 bool invalid_punc; 00066 int num_non_first_upper; 00067 int num_lower; 00068 int script_id; 00069 bool inconsistent_script; 00070 int num_alphas; 00071 int num_digits; 00072 int num_other; 00073 int num_inconsistent_spaces; 00074 bool inconsistent_font; 00075 }; 00076 00077 00078 // The following structs are used for storing the state of the language model 00079 // in the segmentation search graph. In this graph the nodes are BLOB_CHOICEs 00080 // and the links are the replationships between the underlying blobs (see 00081 // segsearch.h for a more detailed description). 00082 // Each of the BLOB_CHOICEs contains LanguageModelState struct, which has 00083 // a list of N best paths (list of ViterbiStateEntry) explored by the Viterbi 00084 // search leading up to and including this BLOB_CHOICE. 00085 // Each ViterbiStateEntry contains information from various components of the 00086 // language model: dawgs in which the path is found, character ngram model 00087 // probability of the path, script/chartype/font consistency info, state for 00088 // language-specific heuristics (e.g. hyphenated and compund words, lower/upper 00089 // case preferences, etc). 00090 // Each ViterbiStateEntry also contains the parent pointer, so that the path 00091 // that it represents (WERD_CHOICE) can be constructed by following these 00092 // parent pointers. 
00093 00094 // Struct for storing additional information used by Dawg language model 00095 // component. It stores the set of active dawgs in which the sequence of 00096 // letters on a path can be found and the constraints that have to be 00097 // satisfied at the end of the word (e.g. beginning/ending punctuation). 00098 struct LanguageModelDawgInfo { 00099 LanguageModelDawgInfo(DawgInfoVector *a, DawgInfoVector *c, 00100 PermuterType pt) : permuter(pt) { 00101 active_dawgs = new DawgInfoVector(*a); 00102 constraints = new DawgInfoVector(*c); 00103 } 00104 ~LanguageModelDawgInfo() { 00105 delete active_dawgs; 00106 delete constraints; 00107 } 00108 DawgInfoVector *active_dawgs; 00109 DawgInfoVector *constraints; 00110 PermuterType permuter; 00111 }; 00112 00113 // Struct for storing additional information used by Ngram language model 00114 // component. 00115 struct LanguageModelNgramInfo { 00116 LanguageModelNgramInfo(const char *c, int l, bool p, float nc) 00117 : context(c), context_unichar_step_len(l), pruned(p), ngram_cost(nc) {} 00118 STRING context; // context string 00119 // Length of the context measured by advancing using UNICHAR::utf8_step() 00120 // (should be at most the order of the character ngram model used). 00121 int context_unichar_step_len; 00122 // The paths with pruned set are pruned out from the perspective of the 00123 // character ngram model. They are explored further because they represent 00124 // a dictionary match or a top choice. Thus ngram_info is still computed 00125 // for them in order to calculate the combined cost. 00126 bool pruned; 00127 // -[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) 00128 float ngram_cost; 00129 }; 00130 00131 // Struct for storing the information about a path in the segmentation graph 00132 // explored by Viterbi search. 
00133 struct ViterbiStateEntry : public ELIST_LINK { 00134 ViterbiStateEntry(BLOB_CHOICE *pb, ViterbiStateEntry *pe, 00135 BLOB_CHOICE *b, float c, 00136 const LanguageModelConsistencyInfo &ci, 00137 const AssociateStats &as, 00138 LanguageModelFlagsType tcf, 00139 LanguageModelDawgInfo *d, LanguageModelNgramInfo *n) 00140 : cost(c), parent_b(pb), parent_vse(pe), ratings_sum(b->rating()), 00141 min_certainty(b->certainty()), length(1), consistency_info(ci), 00142 associate_stats(as), top_choice_flags(tcf), dawg_info(d), ngram_info(n), 00143 updated(true) { 00144 if (pe != NULL) { 00145 ratings_sum += pe->ratings_sum; 00146 if (pe->min_certainty < min_certainty) { 00147 min_certainty = pe->min_certainty; 00148 } 00149 length += pe->length; 00150 } 00151 } 00152 ~ViterbiStateEntry() { 00153 delete dawg_info; 00154 delete ngram_info; 00155 } 00156 // Comparator function for sorting ViterbiStateEntry_LISTs in 00157 // non-increasing order of costs. 00158 static int Compare(const void *e1, const void *e2) { 00159 const ViterbiStateEntry *ve1 = 00160 *reinterpret_cast<const ViterbiStateEntry * const *>(e1); 00161 const ViterbiStateEntry *ve2 = 00162 *reinterpret_cast<const ViterbiStateEntry * const *>(e2); 00163 return (ve1->cost < ve2->cost) ? -1 : 1; 00164 } 00165 inline bool Consistent() const { 00166 if (dawg_info != NULL && consistency_info.NumInconsistentCase() == 0) { 00167 return true; 00168 } 00169 return consistency_info.Consistent(); 00170 } 00171 00172 // The cost is an adjusted ratings sum, that is adjusted by all the language 00173 // model components that use Viterbi search. 00174 float cost; 00175 00176 // Pointers to parent BLOB_CHOICE and ViterbiStateEntry (not owned by this). 00177 BLOB_CHOICE *parent_b; 00178 ViterbiStateEntry *parent_vse; 00179 00180 // Various information about the characters on the path represented 00181 // by this ViterbiStateEntry. 
00182 float ratings_sum; // sum of ratings of character on the path 00183 float min_certainty; // minimum certainty on the path 00184 int length; // number of characters on the path 00185 LanguageModelConsistencyInfo consistency_info; // path consistency info 00186 AssociateStats associate_stats; // character widths/gaps/seams 00187 00188 // Flags for marking the entry as a top choice path with 00189 // the smallest rating or lower/upper case letters). 00190 LanguageModelFlagsType top_choice_flags; 00191 00192 // Extra information maintained by Dawg laguage model component 00193 // (owned by ViterbiStateEntry). 00194 LanguageModelDawgInfo *dawg_info; 00195 00196 // Extra information maintained by Ngram laguage model component 00197 // (owned by ViterbiStateEntry). 00198 LanguageModelNgramInfo *ngram_info; 00199 00200 bool updated; // set to true if the entry has just been created/updated 00201 }; 00202 00203 ELISTIZEH(ViterbiStateEntry); 00204 00205 // Struct to store information maintained by various language model components. 00206 struct LanguageModelState { 00207 LanguageModelState(int col, int row) : contained_in_col(col), 00208 contained_in_row(row), viterbi_state_entries_prunable_length(0), 00209 viterbi_state_entries_length(0), 00210 viterbi_state_entries_prunable_max_cost(MAX_FLOAT32) {} 00211 ~LanguageModelState() {} 00212 00213 // Ratings matrix cell that holds this LanguageModelState 00214 // (needed to construct best STATE for rebuild_current_state() 00215 // and best BLOB_CHOICE_LIST_VECTOR for AcceptableChoice()). 00216 int contained_in_col; 00217 int contained_in_row; 00218 00219 // Storage for the Viterbi state. 00220 ViterbiStateEntry_LIST viterbi_state_entries; 00221 // Number and max cost of prunable paths in viterbi_state_entries. 00222 int viterbi_state_entries_prunable_length; 00223 // Total number of entries in viterbi_state_entries. 
00224 int viterbi_state_entries_length; 00225 float viterbi_state_entries_prunable_max_cost; 00226 00227 // TODO(daria): add font consistency checking. 00228 }; 00229 00230 // Bundle together all the things pertaining to the best choice/state. 00231 struct BestChoiceBundle { 00232 BestChoiceBundle(STATE *s, WERD_CHOICE *bc, WERD_CHOICE *rc, 00233 BLOB_CHOICE_LIST_VECTOR *bcc) 00234 : best_state(s), best_choice(bc), raw_choice(rc), 00235 best_char_choices(bcc), updated(false), best_vse(NULL), best_b(NULL) {} 00236 00237 STATE *best_state; 00238 WERD_CHOICE *best_choice; 00239 WERD_CHOICE *raw_choice; 00240 BLOB_CHOICE_LIST_VECTOR *best_char_choices; 00241 bool updated; 00242 DANGERR fixpt; 00243 ViterbiStateEntry *best_vse; // best ViterbiStateEntry and BLOB_CHOICE 00244 BLOB_CHOICE *best_b; // at the end of the best choice path 00245 }; 00246 00247 struct BestPathByColumn { 00248 float avg_cost; 00249 ViterbiStateEntry *best_vse; 00250 BLOB_CHOICE *best_b; 00251 }; 00252 00253 // This class that contains the data structures and functions necessary 00254 // to represent and use the knowledge about the language. 00255 class LanguageModel { 00256 public: 00257 // Adjustments to pain point priority. 00258 static const float kInitialPainPointPriorityAdjustment; 00259 static const float kDefaultPainPointPriorityAdjustment; 00260 static const float kBestChoicePainPointPriorityAdjustment; 00261 static const float kCriticalPainPointPriorityAdjustment; 00262 00263 // Denominator for normalizing per-letter ngram cost when deriving 00264 // penalty adjustments. 00265 static const float kMaxAvgNgramCost; 00266 // Minimum word length for fixed length dawgs. 00267 // TODO(daria): check in the new chi/jpn.traineddata without the 00268 // fixed length dawg of length 1 and delete this variable. 
00269 static const int kMinFixedLengthDawgLength; 00270 // If there is a significant drop in character ngram probability or a 00271 // dangerous ambiguity make the thresholds on what blob combinations 00272 // can be classified looser. 00273 static const float kLooseMaxCharWhRatio; 00274 00275 // Masks for interpreting which language model components 00276 // were changed by the call to UpdateState(). 00277 static const LanguageModelFlagsType kSmallestRatingFlag = 0x1; 00278 static const LanguageModelFlagsType kLowerCaseFlag = 0x2; 00279 static const LanguageModelFlagsType kUpperCaseFlag = 0x4; 00280 static const LanguageModelFlagsType kConsistentFlag = 0x8; 00281 static const LanguageModelFlagsType kDawgFlag = 0x10; 00282 static const LanguageModelFlagsType kNgramFlag = 0x20; 00283 static const LanguageModelFlagsType kJustClassifiedFlag = 0x80; 00284 static const LanguageModelFlagsType kAllChangedFlag = 0xff; 00285 00286 LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, 00287 Dict *dict, WERD_CHOICE **prev_word_best_choice); 00288 ~LanguageModel(); 00289 00290 // Updates data structures that are used for the duration of the segmentation 00291 // search on the current word; 00292 void InitForWord(const WERD_CHOICE *prev_word, const DENORM *denorm, 00293 bool fixed_pitch, float best_choice_cert, 00294 float max_char_wh_ratio, 00295 HEAP *pain_points, CHUNKS_RECORD *chunks_record); 00296 // Resets all the "updated" flags used by the Viterbi search that were 00297 // "registered" during the update of the ratings matrix. 00298 void CleanUp(); 00299 // Deletes and sets to NULL language model states of each of the 00300 // BLOB_CHOICEs in the given BLOB_CHOICE_LIST. 00301 void DeleteState(BLOB_CHOICE_LIST *choices); 00302 00303 // Updates language model state of the given BLOB_CHOICE_LIST (from 00304 // the ratings matrix) a its parent. Updates pain_points if new 00305 // problematic points are found in the segmentation graph. 
00306 // 00307 // At most language_model_viterbi_list_size are kept in each 00308 // LanguageModelState.viterbi_state_entries list. 00309 // At most language_model_viterbi_list_max_num_prunable of those are prunable 00310 // (non-dictionary) paths. 00311 // The entries that represent dictionary word paths are kept at the front 00312 // of the list. 00313 // The list ordered by cost that is computed collectively by several 00314 // language model components (currently dawg and ngram components). 00315 // 00316 // best_path_by_column records the lowest cost path found so far for each 00317 // column of the chunks_record->ratings matrix over all the rows. This 00318 // array is updated if a lower cost ViterbiStateEntry is created in curr_col. 00319 LanguageModelFlagsType UpdateState( 00320 LanguageModelFlagsType changed, 00321 int curr_col, int curr_row, 00322 BLOB_CHOICE_LIST *curr_list, 00323 BLOB_CHOICE_LIST *parent_list, 00324 HEAP *pain_points, 00325 BestPathByColumn *best_path_by_column[], 00326 CHUNKS_RECORD *chunks_record, 00327 BestChoiceBundle *best_choice_bundle); 00328 00329 // Generates pain points from the problematic top choice paths when the 00330 // segmentation search is guided by the character ngram model. 00331 // It is necessary to consider problematic the top choice paths instead of 00332 // the problematic lowest cost paths because the character ngram model 00333 // might assign a very high cost to very improbably paths. For example, 00334 // "liot" might have a much lower cost than "llot", and the character ngram 00335 // model might detect a dip in probability for p(t|lio) at the end of the 00336 // word, but not at the beginning (p(i|l) would be ok). However, looking at 00337 // the dips in character ngram probability of the top choices we would be 00338 // able to stop the problematic points (p(l| l) would be low). 
00339 void GenerateNgramModelPainPointsFromColumn(int col, int row, 00340 HEAP *pain_points, 00341 CHUNKS_RECORD *chunks_record); 00342 00343 // Generates pain points from the problematic lowest cost paths that are 00344 // "promising" (i.e. would have the cost lower than the one recorded in 00345 // best_path_by_column if the problematic ending of the path is removed 00346 // and after being combined with another blob the certainty of the last 00347 // blob is improved). 00348 void GenerateProblematicPathPainPointsFromColumn( 00349 int col, int row, float best_choice_cert, 00350 HEAP *pain_points, BestPathByColumn *best_path_by_column[], 00351 CHUNKS_RECORD *chunks_record); 00352 00353 // This function can be called after processing column col of the 00354 // chunks_record->ratings matrix in order to find the promising paths 00355 // that were terminated or made inconsistent by the character choices 00356 // in column col. If such paths are identified, this function generates 00357 // pain points to combine the problematic cells of the matrix. 00358 void GeneratePainPointsFromColumn( 00359 int col, 00360 const GenericVector<int> &non_empty_rows, 00361 float best_choice_cert, 00362 HEAP *pain_points, 00363 BestPathByColumn *best_path_by_column[], 00364 CHUNKS_RECORD *chunks_record); 00365 00366 // Generates a pain point for each problematic point on the best choice 00367 // path. Such problematic points could be a termination of a dicionary 00368 // word, dip in ngram probability, invalid punctuation, inconsistent 00369 // case/chartype/script or punctuation in the middle of a word. 00370 void GeneratePainPointsFromBestChoice( 00371 HEAP *pain_points, 00372 CHUNKS_RECORD *chunks_record, 00373 BestChoiceBundle *best_choice_bundle); 00374 00375 // Adds a pain point to the given pain_points queue that will cause 00376 // the entry at chunks_record->ratings(col, row) to be classified. 
00377 // The priority of the pain point is set to be: 00378 // 00379 // priority_adjustment * sqrt(avg_parent_cost) 00380 // ---------------------------------------------------- 00381 // sqrt(dict_parent_path_length) * |worst_piece_cert| 00382 // 00383 // The priority is further lowered if fragmented is true. 00384 // 00385 void GeneratePainPoint(int col, int row, bool ok_to_extend, 00386 float priority_adjustment, 00387 float worst_piece_cert, 00388 bool fragmented, 00389 float best_choice_cert, 00390 float max_char_wh_ratio, 00391 BLOB_CHOICE *parent_b, 00392 ViterbiStateEntry *parent_vse, 00393 CHUNKS_RECORD *chunks_record, 00394 HEAP *pain_points); 00395 00396 // Returns true if an acceptable best choice was discovered. 00397 inline bool AcceptableChoiceFound() { return acceptable_choice_found_; } 00398 00399 // Fills cert with the worst certainty of the top non-fragmented choice 00400 // of the left and right neighbor of the given col,row. 00401 // Sets fragmented if any of the neighbors have a fragmented character 00402 // as the top choice. 
00403 inline void GetWorstPieceCertainty(int col, int row, MATRIX *ratings, 00404 float *cert, bool *fragmented) { 00405 *cert = 0.0f; 00406 *fragmented = false; 00407 if (row > 0) { 00408 GetPieceCertainty(ratings->get(col, row-1), cert, fragmented); 00409 } 00410 if (col+1 < ratings->dimension()) { 00411 GetPieceCertainty(ratings->get(col+1, row), cert, fragmented); 00412 } 00413 ASSERT_HOST(*cert < 0.0f); 00414 } 00415 00416 protected: 00417 00418 inline static float CertaintyScore(float cert) { return (-1.0f / cert); } 00419 00420 inline bool NonAlphaOrDigitMiddle(int col, int row, int dimension, 00421 UNICHAR_ID unichar_id) { 00422 return (!dict_->getUnicharset().get_isalpha(unichar_id) && 00423 !dict_->getUnicharset().get_isdigit(unichar_id) && 00424 col > 0 && row+1 < dimension); 00425 } 00426 00427 inline bool IsFragment(BLOB_CHOICE *b) { 00428 return dict_->getUnicharset().get_fragment(b->unichar_id()); 00429 } 00430 00431 inline bool IsHan(int script_id) { 00432 return ((dict_->getUnicharset().han_sid() != 00433 dict_->getUnicharset().null_sid()) && 00434 (script_id == dict_->getUnicharset().han_sid())); 00435 } 00436 00437 // Finds the first non-fragmented character in the given BLOB_CHOICE_LIST 00438 // and updates cert if its certainty is less than the one recorded in cert. 00439 // Sets fragmented if the first choice in BLOB_CHOICE_LIST is a fragment. 00440 inline void GetPieceCertainty(BLOB_CHOICE_LIST *blist, 00441 float *cert, bool *fragmented) { 00442 if (blist == NOT_CLASSIFIED || blist->empty()) return; 00443 BLOB_CHOICE_IT bit(blist); 00444 while (!bit.at_last() && IsFragment(bit.data())) { 00445 *fragmented = true; 00446 bit.forward(); // skip fragments 00447 } 00448 // Each classification must have at least one non-fragmented choice. 
00449 ASSERT_HOST(!IsFragment(bit.data())); 00450 if (bit.data()->certainty() < *cert) *cert = bit.data()->certainty(); 00451 } 00452 00453 inline float ComputeAdjustment(int num_problems, float penalty) { 00454 if (num_problems == 0) return 0.0f; 00455 if (num_problems == 1) return penalty; 00456 return (penalty + (language_model_penalty_increment * 00457 static_cast<float>(num_problems-1))); 00458 } 00459 00460 // Computes the adjustment to the ratings sum based on the given 00461 // consistency_info. The paths with invalid punctuation, inconsistent 00462 // case and character type are penalized proportionally to the number 00463 // of inconsistencies on the path. 00464 inline float ComputeConsistencyAdjustment( 00465 const LanguageModelDawgInfo *dawg_info, 00466 const LanguageModelConsistencyInfo &consistency_info) { 00467 if (dawg_info != NULL) { 00468 return ComputeAdjustment(consistency_info.NumInconsistentCase(), 00469 language_model_penalty_case); 00470 } 00471 return (ComputeAdjustment(consistency_info.NumInconsistentPunc(), 00472 language_model_penalty_punc) + 00473 ComputeAdjustment(consistency_info.NumInconsistentCase(), 00474 language_model_penalty_case) + 00475 ComputeAdjustment(consistency_info.NumInconsistentChartype(), 00476 language_model_penalty_chartype) + 00477 ComputeAdjustment(consistency_info.NumInconsistentSpaces(), 00478 language_model_penalty_spacing) + 00479 (consistency_info.inconsistent_script ? 00480 language_model_penalty_script : 0.0f) + 00481 (consistency_info.inconsistent_font ? 00482 language_model_penalty_font : 0.0f)); 00483 } 00484 00485 // Returns an adjusted ratings sum that includes inconsistency penalties. 
00486 inline float ComputeConsistencyAdjustedRatingsSum( 00487 float ratings_sum, 00488 const LanguageModelDawgInfo *dawg_info, 00489 const LanguageModelConsistencyInfo &consistency_info) { 00490 return (ratings_sum * (1.0f + ComputeConsistencyAdjustment( 00491 dawg_info, consistency_info))); 00492 } 00493 00494 // Returns an adjusted ratings sum that includes inconsistency penalties, 00495 // penalties for non-dictionary paths and paths with dips in ngram 00496 // probability. 00497 float ComputeAdjustedPathCost( 00498 float ratings_sum, int length, float dawg_score, 00499 const LanguageModelDawgInfo *dawg_info, 00500 const LanguageModelNgramInfo *ngram_info, 00501 const LanguageModelConsistencyInfo &consistency_info, 00502 const AssociateStats &associate_stats, 00503 ViterbiStateEntry *parent_vse); 00504 00505 // Returns true if the given ViterbiStateEntry represents a problematic 00506 // path. A path is considered problematic if the last unichar makes it 00507 // inconsistent, introduces a dip in ngram probability or transforms a 00508 // dictionary path into a non-dictionary one. 00509 bool ProblematicPath(const ViterbiStateEntry &vse, 00510 UNICHAR_ID unichar_id, bool word_end); 00511 00512 // Finds the first lower and upper case character in curr_list. 00513 // If none found, chooses the first character in the list. 00514 void GetTopChoiceLowerUpper(LanguageModelFlagsType changed, 00515 BLOB_CHOICE_LIST *curr_list, 00516 BLOB_CHOICE **first_lower, 00517 BLOB_CHOICE **first_upper); 00518 00519 // Helper function that computes the cost of the path composed of the 00520 // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE. 00521 // Adds a new ViterbiStateEntry to the list of viterbi entries 00522 // in the given BLOB_CHOICE if the new path looks good enough. 00523 // Returns LanguageModelFlagsType that indicates which language 00524 // model components were involved in creating the new entry. 
00525 LanguageModelFlagsType AddViterbiStateEntry( 00526 LanguageModelFlagsType top_choice_flags, 00527 float denom, 00528 bool word_end, 00529 int curr_col, int curr_row, 00530 BLOB_CHOICE *b, 00531 BLOB_CHOICE *parent_b, 00532 ViterbiStateEntry *parent_vse, 00533 HEAP *pain_points, 00534 BestPathByColumn *best_path_by_column[], 00535 CHUNKS_RECORD *chunks_record, 00536 BestChoiceBundle *best_choice_bundle); 00537 00538 // Pretty print information in the given ViterbiStateEntry. 00539 void PrintViterbiStateEntry(const char *msg, 00540 ViterbiStateEntry *vse, 00541 BLOB_CHOICE *b, 00542 CHUNKS_RECORD *chunks_record); 00543 00544 // Determines whether a potential entry is a true top choice and 00545 // updates changed accordingly. 00546 // 00547 // Note: The function assumes that b, top_choice_flags and changed 00548 // are not NULL. 00549 void GenerateTopChoiceInfo( 00550 float ratings_sum, 00551 const LanguageModelDawgInfo *dawg_info, 00552 const LanguageModelConsistencyInfo &consistency_info, 00553 const ViterbiStateEntry *parent_vse, 00554 BLOB_CHOICE *b, 00555 LanguageModelFlagsType *top_choice_flags, 00556 LanguageModelFlagsType *changed); 00557 00558 // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and 00559 // unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo 00560 // with updated active dawgs, constraints and permuter. 00561 // 00562 // Note: the caller is responsible for deleting the returned pointer. 00563 LanguageModelDawgInfo *GenerateDawgInfo(bool word_end, int script_id, 00564 int curr_col, int curr_row, 00565 const BLOB_CHOICE &b, 00566 const ViterbiStateEntry *parent_vse, 00567 LanguageModelFlagsType *changed); 00568 00569 // Computes p(unichar | parent context) and records it in ngram_cost. 00570 // If b.unichar_id() is an unlikely continuation of the parent context 00571 // sets found_small_prob to true and returns NULL. 
00572 // Otherwise creates a new LanguageModelNgramInfo entry containing the 00573 // updated context (that includes b.unichar_id() at the end) and returns it. 00574 // 00575 // Note: the caller is responsible for deleting the returned pointer. 00576 LanguageModelNgramInfo *GenerateNgramInfo(const char *unichar, 00577 float certainty, float denom, 00578 int curr_col, int curr_row, 00579 const ViterbiStateEntry *parent_vse, 00580 BLOB_CHOICE *parent_b, 00581 LanguageModelFlagsType *changed); 00582 00583 // Computes -(log(prob(classifier)) + log(prob(ngram model))) 00584 // for the given unichar in the given context. If there are multiple 00585 // unichars at one position - takes the average of their probabilities. 00586 // UNICHAR::utf8_step() is used to separate out individual UTF8 characters, 00587 // since probability_in_context() can only handle one at a time (while 00588 // unicharset might contain ngrams and glyphs composed from multiple UTF8 00589 // characters). 00590 float ComputeNgramCost(const char *unichar, float certainty, float denom, 00591 const char *context, 00592 int *unichar_step_len, bool *found_small_prob); 00593 00594 // Computes the normalization factors for the classifier confidences 00595 // (used by ComputeNgramCost()). 00596 float ComputeDenom(BLOB_CHOICE_LIST *curr_list); 00597 00598 // Fills the given consistenty_info based on parent_vse.consistency_info 00599 // and on the consistency of the given unichar_id with parent_vse. 00600 void FillConsistencyInfo( 00601 int curr_col, bool word_end, BLOB_CHOICE *b, 00602 ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, 00603 CHUNKS_RECORD *chunks_record, 00604 LanguageModelConsistencyInfo *consistency_info); 00605 00606 // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs 00607 // on the path represented by the given BLOB_CHOICE and language model 00608 // state entries (lmse, dse). 
The path is re-constructed by following 00609 // the parent pointers in the the lang model state entries). If the 00610 // constructed WERD_CHOICE is better than the best/raw choice recorded 00611 // in the best_choice_bundle, this function updates the corresponding 00612 // fields and sets best_choice_bunldle->updated to true. 00613 void UpdateBestChoice(BLOB_CHOICE *b, 00614 ViterbiStateEntry *vse, 00615 HEAP *pain_points, 00616 CHUNKS_RECORD *chunks_record, 00617 BestChoiceBundle *best_choice_bundle); 00618 00619 // Constructs a WERD_CHOICE by tracing parent pointers starting with 00620 // the given LanguageModelStateEntry. Returns the constructed word. 00621 // Updates best_char_choices, certainties and state if they are not 00622 // NULL (best_char_choices and certainties are assumed to have the 00623 // length equal to lmse->length). 00624 // The caller is resposible for freeing memory associated with the 00625 // returned WERD_CHOICE. 00626 WERD_CHOICE *ConstructWord(BLOB_CHOICE *b, 00627 ViterbiStateEntry *vse, 00628 CHUNKS_RECORD *chunks_record, 00629 BLOB_CHOICE_LIST_VECTOR *best_char_choices, 00630 float certainties[], 00631 float *dawg_score, 00632 STATE *state); 00633 00634 // This function is used for non-space delimited languages when looking 00635 // for word endings recorded while trying to separate the path into words. 00636 // 00637 // The function increments covered if a valid word ending is found in 00638 // active_dawgs (if covered is incremented, skip is set to the number 00639 // of unichars that should be skipped because they are covered by the 00640 // word whose ending was just discovered). 
00641 // 00642 // dawg_score and dawg_score_done are updated if: 00643 // -- at the end of the path we discover a valid word ending from a 00644 // non-fixed length dawg (this means that the whole word is a 00645 // valid word, so dawg_score is set to 1.0f 00646 // -- word_start is true (dawg_score is set to covered / word length) 00647 // 00648 // Note: this function assumes that skip, covered, dawg_score and 00649 // dawg_score_done are not NULL. 00650 void UpdateCoveredByFixedLengthDawgs(const DawgInfoVector &active_dawgs, 00651 int word_index, int word_length, 00652 int *skip, int *covered, 00653 float *dawg_score, 00654 bool *dawg_score_done); 00655 00656 // Wrapper around AssociateUtils::ComputeStats(). 00657 inline void ComputeAssociateStats(int col, int row, 00658 float max_char_wh_ratio, 00659 ViterbiStateEntry *parent_vse, 00660 CHUNKS_RECORD *chunks_record, 00661 AssociateStats *associate_stats) { 00662 AssociateUtils::ComputeStats( 00663 col, row, 00664 (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL, 00665 (parent_vse != NULL) ? parent_vse->length : 0, 00666 fixed_pitch_, max_char_wh_ratio, denorm_, 00667 chunks_record, language_model_debug_level, associate_stats); 00668 } 00669 00670 // Returns true if the path with such top_choice_flags and dawg_info 00671 // could be pruned out (i.e. is neither a system/user/frequent dictionary 00672 // nor a top choice path). 00673 // In non-space delimited languages all paths can be "somewhat" dictionary 00674 // words. In such languages we can not do dictionary-driven path prunning, 00675 // so paths with non-empty dawg_info are considered prunable. 
00676 inline bool PrunablePath(LanguageModelFlagsType top_choice_flags, 00677 const LanguageModelDawgInfo *dawg_info) { 00678 if (top_choice_flags) return false; 00679 if (dawg_info != NULL && 00680 (dawg_info->permuter == SYSTEM_DAWG_PERM || 00681 dawg_info->permuter == USER_DAWG_PERM || 00682 dawg_info->permuter == FREQ_DAWG_PERM) && 00683 dict_->GetMaxFixedLengthDawgIndex() < 0) return false; 00684 return true; 00685 } 00686 00687 // Returns true if the given script id indicates a path that might consist 00688 // of non-space delimited words (e.g. when dealing with Chinese and Japanese 00689 // languages), and fixed length dawgs were loaded. 00690 // 00691 // TODO(daria): generate fixed length dawgs for Thai. 00692 inline bool UseFixedLengthDawgs(int script_id) { 00693 if (dict_->GetMaxFixedLengthDawgIndex() < 0) return false; 00694 if ((dict_->getUnicharset().han_sid() != 00695 dict_->getUnicharset().null_sid()) && 00696 script_id == dict_->getUnicharset().han_sid()) return true; 00697 if ((dict_->getUnicharset().hiragana_sid() != 00698 dict_->getUnicharset().null_sid()) && 00699 script_id == dict_->getUnicharset().hiragana_sid()) return true; 00700 if ((dict_->getUnicharset().katakana_sid() != 00701 dict_->getUnicharset().null_sid()) && 00702 script_id == dict_->getUnicharset().katakana_sid()) return true; 00703 return false; 00704 } 00705 00706 // Returns true if the given ViterbiStateEntry represents an acceptable path. 00707 inline bool AcceptablePath(const ViterbiStateEntry &vse) { 00708 return (vse.dawg_info != NULL || vse.Consistent() || 00709 (vse.ngram_info != NULL && !vse.ngram_info->pruned)); 00710 } 00711 00712 public: 00713 // Parameters. 
00714 INT_VAR_H(language_model_debug_level, 0, "Language model debug level"); 00715 BOOL_VAR_H(language_model_ngram_on, false, 00716 "Turn on/off the use of character ngram model"); 00717 INT_VAR_H(language_model_ngram_order, 8, 00718 "Maximum order of the character ngram model"); 00719 INT_VAR_H(language_model_viterbi_list_max_num_prunable, 10, 00720 "Maximum number of prunable (those for which PrunablePath() is true)" 00721 "entries in each viterbi list recorded in BLOB_CHOICEs"); 00722 INT_VAR_H(language_model_viterbi_list_max_size, 500, 00723 "Maximum size of viterbi lists recorded in BLOB_CHOICEs"); 00724 double_VAR_H(language_model_ngram_small_prob, 0.000001, 00725 "To avoid overly small denominators use this as the floor" 00726 " of the probability returned by the ngram model"); 00727 double_VAR_H(language_model_ngram_nonmatch_score, -40.0, 00728 "Average classifier score of a non-matching unichar"); 00729 BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step, false, 00730 "Use only the first UTF8 step of the given string" 00731 " when computing log probabilities"); 00732 double_VAR_H(language_model_ngram_scale_factor, 0.03, 00733 "Strength of the character ngram model relative to the" 00734 " character classifier "); 00735 INT_VAR_H(language_model_min_compound_length, 3, 00736 "Minimum length of compound words"); 00737 INT_VAR_H(language_model_fixed_length_choices_depth, 3, 00738 "Depth of blob choice lists to explore" 00739 " when fixed length dawgs are on"); 00740 // Penalties used for adjusting path costs and final word rating. 
  double_VAR_H(language_model_penalty_non_freq_dict_word, 0.1,
               "Penalty for words not in the frequent word dictionary");
  double_VAR_H(language_model_penalty_non_dict_word, 0.15,
               "Penalty for non-dictionary words");
  double_VAR_H(language_model_penalty_punc, 0.2,
               "Penalty for inconsistent punctuation");
  double_VAR_H(language_model_penalty_case, 0.1,
               "Penalty for inconsistent case");
  double_VAR_H(language_model_penalty_script, 0.5,
               "Penalty for inconsistent script");
  double_VAR_H(language_model_penalty_chartype, 0.3,
               "Penalty for inconsistent character type");
  double_VAR_H(language_model_penalty_font, 0.00,
               "Penalty for inconsistent font");
  double_VAR_H(language_model_penalty_spacing, 0.05,
               "Penalty for inconsistent spacing");
  double_VAR_H(language_model_penalty_increment, 0.01, "Penalty increment");

 protected:
  // Member Variables.

  // Temporary DawgArgs struct that is re-used across different words to
  // avoid dynamic memory re-allocation (should be cleared before each use).
  DawgArgs *dawg_args_;
  // List of pointers to updated flags used by Viterbi search to mark
  // recently updated ViterbiStateEntries.
  GenericVector<bool *> updated_flags_;

  // The following variables are set at construction time.

  // Pointer to fontinfo table (not owned by LanguageModel).
  const UnicityTable<FontInfo> *fontinfo_table_;

  // Pointer to Dict class, that is used for querying the dictionaries
  // (the pointer is not owned by LanguageModel).
  Dict *dict_;

  // DENORM computed by Tesseract (not owned by LanguageModel).
  const DENORM *denorm_;
  // TODO(daria): the following variables should become LanguageModel params
  // when the old code in bestfirst.cpp and heuristic.cpp is deprecated.
  //
  // Set to true if we are dealing with fixed pitch text
  // (set to assume_fixed_pitch_char_segment).
  bool fixed_pitch_;
  // Max char width-to-height ratio allowed
  // (set to segsearch_max_char_wh_ratio).
  float max_char_wh_ratio_;

  // The following variables are initialized with InitForWord().

  // String representation of the classification of the previous word
  // (since this is only used by the character ngram model component,
  // only the last language_model_ngram_order unichars of the word are
  // stored).
  STRING prev_word_str_;
  // Number of UTF8 steps (unichars) stored in prev_word_str_.
  int prev_word_unichar_step_len_;
  // Active dawg and constraints vectors (owned elsewhere; see InitForWord()).
  DawgInfoVector *beginning_active_dawgs_;
  DawgInfoVector *beginning_constraints_;
  DawgInfoVector *fixed_length_beginning_active_dawgs_;
  DawgInfoVector *empty_dawg_info_vec_;
  // Maximum adjustment factor for character ngram choices.
  float max_penalty_adjust_;
  // Set to true if an acceptable choice was discovered.
  // Note: it would be nice to use this to terminate the search once an
  // acceptable choice is found. However we do not do that and once an
  // acceptable choice is found we finish looking for alternative choices
  // in the current segmentation graph and then exit the search (no more
  // classifications are done after an acceptable choice is found).
  // This is needed in order to let the search find the words very close to
  // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these
  // choices. This way the stopper will know that the best choice is not
  // ambiguous (i.e. there are best choices in the best choice list that have
  // ratings close to the very best one) and will be less likely to mis-adapt.
  bool acceptable_choice_found_;

};

}  // namespace tesseract

#endif  // TESSERACT_WORDREC_LANGUAGE_MODEL_H_