Tesseract 3.01
/data/source/tesseract-ocr/wordrec/language_model.h
Go to the documentation of this file.
00001 
00002 // File:        language_model.h
00003 // Description: Functions that utilize the knowledge about the properties,
00004 //              structure and statistics of the language to help recognition.
00005 // Author:      Daria Antonova
00006 // Created:     Mon Nov 11 11:26:43 PST 2009
00007 //
00008 // (C) Copyright 2009, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #ifndef TESSERACT_WORDREC_LANGUAGE_MODEL_H_
00022 #define TESSERACT_WORDREC_LANGUAGE_MODEL_H_
00023 
00024 #include "associate.h"
00025 #include "dawg.h"
00026 #include "dict.h"
00027 #include "intproto.h"
00028 #include "matrix.h"
00029 #include "oldheap.h"
00030 #include "params.h"
00031 
00032 namespace tesseract {
00033 
00034 // Used for expressing various language model flags.
00035 typedef unsigned char LanguageModelFlagsType;
00036 
00037 // Struct for keeping track of the consistency of the path.
00038 struct LanguageModelConsistencyInfo {
00039   LanguageModelConsistencyInfo()
00040     : punc_ref(NO_EDGE), num_punc(0), invalid_punc(false),
00041       num_non_first_upper(0), num_lower(0),
00042       script_id(0), inconsistent_script(false),
00043       num_alphas(0), num_digits(0), num_other(0),
00044       num_inconsistent_spaces(0), inconsistent_font(false) {}
00045   inline int NumInconsistentPunc() const {
00046     return invalid_punc ? num_punc : 0;
00047   }
00048   inline int NumInconsistentCase() const {
00049     return (num_non_first_upper > num_lower) ? num_lower : num_non_first_upper;
00050   }
00051   inline int NumInconsistentChartype() const {
00052     return (NumInconsistentPunc() + num_other +
00053             ((num_alphas > num_digits) ? num_digits : num_alphas));
00054   }
00055   inline bool Consistent() const {
00056     return (NumInconsistentPunc() == 0 && NumInconsistentCase() == 0 &&
00057             NumInconsistentChartype() == 0 && !inconsistent_script);
00058   }
00059   inline int  NumInconsistentSpaces() const {
00060     return num_inconsistent_spaces;
00061   }
00062 
00063   EDGE_REF punc_ref;
00064   int num_punc;
00065   bool invalid_punc;
00066   int num_non_first_upper;
00067   int num_lower;
00068   int script_id;
00069   bool inconsistent_script;
00070   int num_alphas;
00071   int num_digits;
00072   int num_other;
00073   int num_inconsistent_spaces;
00074   bool inconsistent_font;
00075 };
00076 
00077 
00078 // The following structs are used for storing the state of the language model
00079 // in the segmentation search graph. In this graph the nodes are BLOB_CHOICEs
00080 // and the links are the relationships between the underlying blobs (see
00081 // segsearch.h for a more detailed description).
00082 // Each of the BLOB_CHOICEs contains LanguageModelState struct, which has
00083 // a list of N best paths (list of ViterbiStateEntry) explored by the Viterbi
00084 // search leading up to and including this BLOB_CHOICE.
00085 // Each ViterbiStateEntry contains information from various components of the
00086 // language model: dawgs in which the path is found, character ngram model
00087 // probability of the path, script/chartype/font consistency info, state for
00088 // language-specific heuristics (e.g. hyphenated and compound words, lower/upper
00089 // case preferences, etc).
00090 // Each ViterbiStateEntry also contains the parent pointer, so that the path
00091 // that it represents (WERD_CHOICE) can be constructed by following these
00092 // parent pointers.
00093 
00094 // Struct for storing additional information used by Dawg language model
00095 // component. It stores the set of active dawgs in which the sequence of
00096 // letters on a path can be found and the constraints that have to be
00097 // satisfied at the end of the word (e.g. beginning/ending punctuation).
00098 struct LanguageModelDawgInfo {
00099   LanguageModelDawgInfo(DawgInfoVector *a, DawgInfoVector *c,
00100                         PermuterType pt) : permuter(pt) {
00101     active_dawgs = new DawgInfoVector(*a);
00102     constraints = new DawgInfoVector(*c);
00103   }
00104   ~LanguageModelDawgInfo() {
00105     delete active_dawgs;
00106     delete constraints;
00107   }
00108   DawgInfoVector *active_dawgs;
00109   DawgInfoVector *constraints;
00110   PermuterType permuter;
00111 };
00112 
00113 // Struct for storing additional information used by Ngram language model
00114 // component.
00115 struct LanguageModelNgramInfo {
00116   LanguageModelNgramInfo(const char *c, int l, bool p, float nc)
00117     : context(c), context_unichar_step_len(l), pruned(p), ngram_cost(nc) {}
00118   STRING context;  // context string
00119   // Length of the context measured by advancing using UNICHAR::utf8_step()
00120   // (should be at most the order of the character ngram model used).
00121   int context_unichar_step_len;
00122   // The paths with pruned set are pruned out from the perspective of the
00123   // character ngram model. They are explored further because they represent
00124   // a dictionary match or a top choice. Thus ngram_info is still computed
00125   // for them in order to calculate the combined cost.
00126   bool pruned;
00127   // -[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path))
00128   float ngram_cost;
00129 };
00130 
00131 // Struct for storing the information about a path in the segmentation graph
00132 // explored by Viterbi search.
00133 struct ViterbiStateEntry : public ELIST_LINK {
00134   ViterbiStateEntry(BLOB_CHOICE *pb, ViterbiStateEntry *pe,
00135                     BLOB_CHOICE *b, float c,
00136                     const LanguageModelConsistencyInfo &ci,
00137                     const AssociateStats &as,
00138                     LanguageModelFlagsType tcf,
00139                     LanguageModelDawgInfo *d, LanguageModelNgramInfo *n)
00140     : cost(c), parent_b(pb), parent_vse(pe), ratings_sum(b->rating()),
00141       min_certainty(b->certainty()), length(1), consistency_info(ci),
00142       associate_stats(as), top_choice_flags(tcf), dawg_info(d), ngram_info(n),
00143       updated(true) {
00144     if (pe != NULL) {
00145       ratings_sum += pe->ratings_sum;
00146       if (pe->min_certainty < min_certainty) {
00147         min_certainty = pe->min_certainty;
00148       }
00149       length += pe->length;
00150     }
00151   }
00152   ~ViterbiStateEntry() {
00153     delete dawg_info;
00154     delete ngram_info;
00155   }
00156   // Comparator function for sorting ViterbiStateEntry_LISTs in
00157   // non-increasing order of costs.
00158   static int Compare(const void *e1, const void *e2) {
00159     const ViterbiStateEntry *ve1 =
00160       *reinterpret_cast<const ViterbiStateEntry * const *>(e1);
00161     const ViterbiStateEntry *ve2 =
00162       *reinterpret_cast<const ViterbiStateEntry * const *>(e2);
00163     return (ve1->cost < ve2->cost) ? -1 : 1;
00164   }
00165   inline bool Consistent() const {
00166     if (dawg_info != NULL && consistency_info.NumInconsistentCase() == 0) {
00167       return true;
00168     }
00169     return consistency_info.Consistent();
00170   }
00171 
00172   // The cost is an adjusted ratings sum, that is adjusted by all the language
00173   // model components that use Viterbi search.
00174   float cost;
00175 
00176   // Pointers to parent BLOB_CHOICE and ViterbiStateEntry (not owned by this).
00177   BLOB_CHOICE *parent_b;
00178   ViterbiStateEntry *parent_vse;
00179 
00180   // Various information about the characters on the path represented
00181   // by this ViterbiStateEntry.
00182   float ratings_sum;  // sum of ratings of character on the path
00183   float min_certainty;  // minimum certainty on the path
00184   int length;  // number of characters on the path
00185   LanguageModelConsistencyInfo consistency_info;  // path consistency info
00186   AssociateStats associate_stats;  // character widths/gaps/seams
00187 
00188   // Flags for marking the entry as a top choice path with
00189   // the smallest rating or lower/upper case letters).
00190   LanguageModelFlagsType top_choice_flags;
00191 
00192   // Extra information maintained by Dawg laguage model component
00193   // (owned by ViterbiStateEntry).
00194   LanguageModelDawgInfo *dawg_info;
00195 
00196   // Extra information maintained by Ngram laguage model component
00197   // (owned by ViterbiStateEntry).
00198   LanguageModelNgramInfo *ngram_info;
00199 
00200   bool updated;  // set to true if the entry has just been created/updated
00201 };
00202 
00203 ELISTIZEH(ViterbiStateEntry);
00204 
00205 // Struct to store information maintained by various language model components.
00206 struct LanguageModelState {
00207   LanguageModelState(int col, int row) : contained_in_col(col),
00208       contained_in_row(row), viterbi_state_entries_prunable_length(0),
00209       viterbi_state_entries_length(0),
00210       viterbi_state_entries_prunable_max_cost(MAX_FLOAT32) {}
00211   ~LanguageModelState() {}
00212 
00213   // Ratings matrix cell that holds this LanguageModelState
00214   // (needed to construct best STATE for rebuild_current_state()
00215   // and best BLOB_CHOICE_LIST_VECTOR for AcceptableChoice()).
00216   int contained_in_col;
00217   int contained_in_row;
00218 
00219   // Storage for the Viterbi state.
00220   ViterbiStateEntry_LIST viterbi_state_entries;
00221   // Number and max cost of prunable paths in viterbi_state_entries.
00222   int viterbi_state_entries_prunable_length;
00223   // Total number of entries in viterbi_state_entries.
00224   int viterbi_state_entries_length;
00225   float viterbi_state_entries_prunable_max_cost;
00226 
00227   // TODO(daria): add font consistency checking.
00228 };
00229 
00230 // Bundle together all the things pertaining to the best choice/state.
00231 struct BestChoiceBundle {
00232   BestChoiceBundle(STATE *s, WERD_CHOICE *bc, WERD_CHOICE *rc,
00233                    BLOB_CHOICE_LIST_VECTOR *bcc)
00234     : best_state(s), best_choice(bc), raw_choice(rc),
00235       best_char_choices(bcc), updated(false), best_vse(NULL), best_b(NULL) {}
00236 
00237   STATE *best_state;
00238   WERD_CHOICE *best_choice;
00239   WERD_CHOICE *raw_choice;
00240   BLOB_CHOICE_LIST_VECTOR *best_char_choices;
00241   bool updated;
00242   DANGERR fixpt;
00243   ViterbiStateEntry *best_vse; // best ViterbiStateEntry and BLOB_CHOICE
00244   BLOB_CHOICE *best_b;         // at the end of the best choice path
00245 };
00246 
00247 struct BestPathByColumn {
00248   float avg_cost;
00249   ViterbiStateEntry *best_vse;
00250   BLOB_CHOICE *best_b;
00251 };
00252 
00253 // This class that contains the data structures and functions necessary
00254 // to represent and use the knowledge about the language.
00255 class LanguageModel {
00256  public:
00257   // Adjustments to pain point priority.
00258   static const float kInitialPainPointPriorityAdjustment;
00259   static const float kDefaultPainPointPriorityAdjustment;
00260   static const float kBestChoicePainPointPriorityAdjustment;
00261   static const float kCriticalPainPointPriorityAdjustment;
00262 
00263   // Denominator for normalizing per-letter ngram cost when deriving
00264   // penalty adjustments.
00265   static const float kMaxAvgNgramCost;
00266   // Minimum word length for fixed length dawgs.
00267   // TODO(daria): check in the new chi/jpn.traineddata without the
00268   // fixed length dawg of length 1 and delete this variable.
00269   static const int kMinFixedLengthDawgLength;
00270   // If there is a significant drop in character ngram probability or a
00271   // dangerous ambiguity make the thresholds on what blob combinations
00272   // can be classified looser.
00273   static const float kLooseMaxCharWhRatio;
00274 
00275   // Masks for interpreting which language model components
00276   // were changed by the call to UpdateState().
00277   static const LanguageModelFlagsType kSmallestRatingFlag = 0x1;
00278   static const LanguageModelFlagsType kLowerCaseFlag = 0x2;
00279   static const LanguageModelFlagsType kUpperCaseFlag = 0x4;
00280   static const LanguageModelFlagsType kConsistentFlag = 0x8;
00281   static const LanguageModelFlagsType kDawgFlag = 0x10;
00282   static const LanguageModelFlagsType kNgramFlag = 0x20;
00283   static const LanguageModelFlagsType kJustClassifiedFlag = 0x80;
00284   static const LanguageModelFlagsType kAllChangedFlag = 0xff;
00285 
00286   LanguageModel(const UnicityTable<FontInfo> *fontinfo_table,
00287                 Dict *dict, WERD_CHOICE **prev_word_best_choice);
00288   ~LanguageModel();
00289 
00290   // Updates data structures that are used for the duration of the segmentation
00291   // search on the current word;
00292   void InitForWord(const WERD_CHOICE *prev_word, const DENORM *denorm,
00293                    bool fixed_pitch, float best_choice_cert,
00294                    float max_char_wh_ratio,
00295                    HEAP *pain_points, CHUNKS_RECORD *chunks_record);
00296   // Resets all the "updated" flags used by the Viterbi search that were
00297   // "registered" during the update of the ratings matrix.
00298   void CleanUp();
00299   // Deletes and sets to NULL language model states of each of the
00300   // BLOB_CHOICEs in the given BLOB_CHOICE_LIST.
00301   void DeleteState(BLOB_CHOICE_LIST *choices);
00302 
00303   // Updates language model state of the given BLOB_CHOICE_LIST (from
00304   // the ratings matrix) a its parent. Updates pain_points if new
00305   // problematic points are found in the segmentation graph.
00306   //
00307   // At most language_model_viterbi_list_size are kept in each
00308   // LanguageModelState.viterbi_state_entries list.
00309   // At most language_model_viterbi_list_max_num_prunable of those are prunable
00310   // (non-dictionary) paths.
00311   // The entries that represent dictionary word paths are kept at the front
00312   // of the list.
00313   // The list ordered by cost that is computed collectively by several
00314   // language model components (currently dawg and ngram components).
00315   //
00316   // best_path_by_column records the lowest cost path found so far for each
00317   // column of the chunks_record->ratings matrix over all the rows. This
00318   // array is updated if a lower cost ViterbiStateEntry is created in curr_col.
00319   LanguageModelFlagsType UpdateState(
00320       LanguageModelFlagsType changed,
00321       int curr_col, int curr_row,
00322       BLOB_CHOICE_LIST *curr_list,
00323       BLOB_CHOICE_LIST *parent_list,
00324       HEAP *pain_points,
00325       BestPathByColumn *best_path_by_column[],
00326       CHUNKS_RECORD *chunks_record,
00327       BestChoiceBundle *best_choice_bundle);
00328 
00329   // Generates pain points from the problematic top choice paths when the
00330   // segmentation search is guided by the character ngram model.
00331   // It is necessary to consider problematic the top choice paths instead of
00332   // the problematic lowest cost paths because the character ngram model
00333   // might assign a very high cost to very improbably paths. For example,
00334   // "liot" might have a much lower cost than "llot", and the character ngram
00335   // model might detect a dip in probability for p(t|lio) at the end of the
00336   // word, but not at the beginning (p(i|l) would be ok). However, looking at
00337   // the dips in character ngram probability of the top choices we would be
00338   // able to stop the problematic points (p(l| l) would be low).
00339   void GenerateNgramModelPainPointsFromColumn(int col, int row,
00340                                               HEAP *pain_points,
00341                                               CHUNKS_RECORD *chunks_record);
00342 
00343   // Generates pain points from the problematic lowest cost paths that are
00344   // "promising" (i.e. would have the cost lower than the one recorded in
00345   // best_path_by_column if the problematic ending of the path is removed
00346   // and after being combined with another blob the certainty of the last
00347   // blob is improved).
00348   void GenerateProblematicPathPainPointsFromColumn(
00349       int col, int row, float best_choice_cert,
00350       HEAP *pain_points, BestPathByColumn *best_path_by_column[],
00351       CHUNKS_RECORD *chunks_record);
00352 
00353   // This function can be called after processing column col of the
00354   // chunks_record->ratings matrix in order to find the promising paths
00355   // that were terminated or made inconsistent by the character choices
00356   // in column col. If such paths are identified, this function generates
00357   // pain points to combine the problematic cells of the matrix.
00358   void GeneratePainPointsFromColumn(
00359     int col,
00360     const GenericVector<int> &non_empty_rows,
00361     float best_choice_cert,
00362     HEAP *pain_points,
00363     BestPathByColumn *best_path_by_column[],
00364     CHUNKS_RECORD *chunks_record);
00365 
00366   // Generates a pain point for each problematic point on the best choice
00367   // path. Such problematic points could be a termination of a dicionary
00368   // word, dip in ngram probability, invalid punctuation, inconsistent
00369   // case/chartype/script or punctuation in the middle of a word.
00370   void GeneratePainPointsFromBestChoice(
00371       HEAP *pain_points,
00372       CHUNKS_RECORD *chunks_record,
00373       BestChoiceBundle *best_choice_bundle);
00374 
00375   // Adds a pain point to the given pain_points queue that will cause
00376   // the entry at chunks_record->ratings(col, row) to be classified.
00377   // The priority of the pain point is set to be:
00378   //
00379   // priority_adjustment * sqrt(avg_parent_cost)
00380   // ----------------------------------------------------
00381   // sqrt(dict_parent_path_length) * |worst_piece_cert|
00382   //
00383   // The priority is further lowered if fragmented is true.
00384   //
00385   void GeneratePainPoint(int col, int row, bool ok_to_extend,
00386                          float priority_adjustment,
00387                          float worst_piece_cert,
00388                          bool fragmented,
00389                          float best_choice_cert,
00390                          float max_char_wh_ratio,
00391                          BLOB_CHOICE *parent_b,
00392                          ViterbiStateEntry *parent_vse,
00393                          CHUNKS_RECORD *chunks_record,
00394                          HEAP *pain_points);
00395 
00396   // Returns true if an acceptable best choice was discovered.
00397   inline bool AcceptableChoiceFound() { return acceptable_choice_found_; }
00398 
00399   // Fills cert with the worst certainty of the top non-fragmented choice
00400   // of the left and right neighbor of the given col,row.
00401   // Sets fragmented if any of the neighbors have a fragmented character
00402   // as the top choice.
00403   inline void GetWorstPieceCertainty(int col, int row, MATRIX *ratings,
00404                                      float *cert, bool *fragmented) {
00405     *cert = 0.0f;
00406     *fragmented = false;
00407     if (row > 0) {
00408       GetPieceCertainty(ratings->get(col, row-1), cert, fragmented);
00409     }
00410     if (col+1 < ratings->dimension()) {
00411       GetPieceCertainty(ratings->get(col+1, row), cert, fragmented);
00412     }
00413     ASSERT_HOST(*cert < 0.0f);
00414   }
00415 
00416  protected:
00417 
00418   inline static float CertaintyScore(float cert) { return (-1.0f / cert); }
00419 
00420   inline bool NonAlphaOrDigitMiddle(int col, int row, int dimension,
00421                                     UNICHAR_ID unichar_id) {
00422     return (!dict_->getUnicharset().get_isalpha(unichar_id) &&
00423             !dict_->getUnicharset().get_isdigit(unichar_id) &&
00424             col > 0 && row+1 < dimension);
00425   }
00426 
00427   inline bool IsFragment(BLOB_CHOICE *b) {
00428     return dict_->getUnicharset().get_fragment(b->unichar_id());
00429   }
00430 
00431   inline bool IsHan(int script_id) {
00432     return ((dict_->getUnicharset().han_sid() !=
00433              dict_->getUnicharset().null_sid()) &&
00434             (script_id == dict_->getUnicharset().han_sid()));
00435   }
00436 
00437   // Finds the first non-fragmented character in the given BLOB_CHOICE_LIST
00438   // and updates cert if its certainty is less than the one recorded in cert.
00439   // Sets fragmented if the first choice in BLOB_CHOICE_LIST is a fragment.
00440   inline void GetPieceCertainty(BLOB_CHOICE_LIST *blist,
00441                                 float *cert, bool *fragmented) {
00442     if (blist == NOT_CLASSIFIED || blist->empty()) return;
00443     BLOB_CHOICE_IT bit(blist);
00444     while (!bit.at_last() && IsFragment(bit.data())) {
00445       *fragmented = true;
00446       bit.forward();  // skip fragments
00447     }
00448     // Each classification must have at least one non-fragmented choice.
00449     ASSERT_HOST(!IsFragment(bit.data()));
00450     if (bit.data()->certainty() < *cert) *cert = bit.data()->certainty();
00451   }
00452 
00453   inline float ComputeAdjustment(int num_problems, float penalty) {
00454     if (num_problems == 0) return 0.0f;
00455     if (num_problems == 1) return penalty;
00456     return (penalty + (language_model_penalty_increment *
00457                        static_cast<float>(num_problems-1)));
00458   }
00459 
00460   // Computes the adjustment to the ratings sum based on the given
00461   // consistency_info. The paths with invalid punctuation, inconsistent
00462   // case and character type are penalized proportionally to the number
00463   // of inconsistencies on the path.
00464   inline float ComputeConsistencyAdjustment(
00465       const LanguageModelDawgInfo *dawg_info,
00466       const LanguageModelConsistencyInfo &consistency_info) {
00467     if (dawg_info != NULL) {
00468       return ComputeAdjustment(consistency_info.NumInconsistentCase(),
00469                                language_model_penalty_case);
00470     }
00471     return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
00472                               language_model_penalty_punc) +
00473             ComputeAdjustment(consistency_info.NumInconsistentCase(),
00474                               language_model_penalty_case) +
00475             ComputeAdjustment(consistency_info.NumInconsistentChartype(),
00476                               language_model_penalty_chartype) +
00477             ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
00478                               language_model_penalty_spacing) +
00479             (consistency_info.inconsistent_script ?
00480              language_model_penalty_script : 0.0f) +
00481             (consistency_info.inconsistent_font ?
00482              language_model_penalty_font : 0.0f));
00483   }
00484 
00485   // Returns an adjusted ratings sum that includes inconsistency penalties.
00486   inline float ComputeConsistencyAdjustedRatingsSum(
00487       float ratings_sum,
00488       const LanguageModelDawgInfo *dawg_info,
00489       const LanguageModelConsistencyInfo &consistency_info) {
00490     return (ratings_sum * (1.0f + ComputeConsistencyAdjustment(
00491         dawg_info, consistency_info)));
00492   }
00493 
00494   // Returns an adjusted ratings sum that includes inconsistency penalties,
00495   // penalties for non-dictionary paths and paths with dips in ngram
00496   // probability.
00497   float ComputeAdjustedPathCost(
00498       float ratings_sum, int length, float dawg_score,
00499       const LanguageModelDawgInfo *dawg_info,
00500       const LanguageModelNgramInfo *ngram_info,
00501       const LanguageModelConsistencyInfo &consistency_info,
00502       const AssociateStats &associate_stats,
00503       ViterbiStateEntry *parent_vse);
00504 
00505   // Returns true if the given ViterbiStateEntry represents a problematic
00506   // path. A path is considered problematic if the last unichar makes it
00507   // inconsistent, introduces a dip in ngram probability or transforms a
00508   // dictionary path into a non-dictionary one.
00509   bool ProblematicPath(const ViterbiStateEntry &vse,
00510                        UNICHAR_ID unichar_id, bool word_end);
00511 
00512   // Finds the first lower and upper case character in curr_list.
00513   // If none found, chooses the first character in the list.
00514   void GetTopChoiceLowerUpper(LanguageModelFlagsType changed,
00515                               BLOB_CHOICE_LIST *curr_list,
00516                               BLOB_CHOICE **first_lower,
00517                               BLOB_CHOICE **first_upper);
00518 
00519   // Helper function that computes the cost of the path composed of the
00520   // path in the given parent ViterbiStateEntry and the given BLOB_CHOICE.
00521   // Adds a new ViterbiStateEntry to the list of viterbi entries
00522   // in the given BLOB_CHOICE if the new path looks good enough.
00523   // Returns LanguageModelFlagsType that indicates which language
00524   // model components were involved in creating the new entry.
00525   LanguageModelFlagsType AddViterbiStateEntry(
00526       LanguageModelFlagsType top_choice_flags,
00527       float denom,
00528       bool word_end,
00529       int curr_col, int curr_row,
00530       BLOB_CHOICE *b,
00531       BLOB_CHOICE *parent_b,
00532       ViterbiStateEntry *parent_vse,
00533       HEAP *pain_points,
00534       BestPathByColumn *best_path_by_column[],
00535       CHUNKS_RECORD *chunks_record,
00536       BestChoiceBundle *best_choice_bundle);
00537 
00538   // Pretty print information in the given ViterbiStateEntry.
00539   void PrintViterbiStateEntry(const char *msg,
00540                               ViterbiStateEntry *vse,
00541                               BLOB_CHOICE *b,
00542                               CHUNKS_RECORD *chunks_record);
00543 
00544   // Determines whether a potential entry is a true top choice and
00545   // updates changed accordingly.
00546   //
00547   // Note: The function assumes that b, top_choice_flags and changed
00548   // are not NULL.
00549   void GenerateTopChoiceInfo(
00550       float ratings_sum,
00551       const LanguageModelDawgInfo *dawg_info,
00552       const LanguageModelConsistencyInfo &consistency_info,
00553       const ViterbiStateEntry *parent_vse,
00554       BLOB_CHOICE *b,
00555       LanguageModelFlagsType *top_choice_flags,
00556       LanguageModelFlagsType *changed);
00557 
00558   // Calls dict_->LetterIsOk() with DawgArgs initialized from parent_vse and
00559   // unichar from b.unichar_id(). Constructs and returns LanguageModelDawgInfo
00560   // with updated active dawgs, constraints and permuter.
00561   //
00562   // Note: the caller is responsible for deleting the returned pointer.
00563   LanguageModelDawgInfo *GenerateDawgInfo(bool word_end, int script_id,
00564                                           int curr_col, int curr_row,
00565                                           const BLOB_CHOICE &b,
00566                                           const ViterbiStateEntry *parent_vse,
00567                                           LanguageModelFlagsType *changed);
00568 
00569   // Computes p(unichar | parent context) and records it in ngram_cost.
00570   // If b.unichar_id() is an unlikely continuation of the parent context
00571   // sets found_small_prob to true and returns NULL.
00572   // Otherwise creates a new LanguageModelNgramInfo entry containing the
00573   // updated context (that includes b.unichar_id() at the end) and returns it.
00574   //
00575   // Note: the caller is responsible for deleting the returned pointer.
00576   LanguageModelNgramInfo *GenerateNgramInfo(const char *unichar,
00577                                             float certainty, float denom,
00578                                             int curr_col, int curr_row,
00579                                             const ViterbiStateEntry *parent_vse,
00580                                             BLOB_CHOICE *parent_b,
00581                                             LanguageModelFlagsType *changed);
00582 
00583   // Computes -(log(prob(classifier)) + log(prob(ngram model)))
00584   // for the given unichar in the given context. If there are multiple
00585   // unichars at one position - takes the average of their probabilities.
00586   // UNICHAR::utf8_step() is used to separate out individual UTF8 characters,
00587   // since probability_in_context() can only handle one at a time (while
00588   // unicharset might contain ngrams and glyphs composed from multiple UTF8
00589   // characters).
00590   float ComputeNgramCost(const char *unichar, float certainty, float denom,
00591                          const char *context,
00592                          int *unichar_step_len, bool *found_small_prob);
00593 
00594   // Computes the normalization factors for the classifier confidences
00595   // (used by ComputeNgramCost()).
00596   float ComputeDenom(BLOB_CHOICE_LIST *curr_list);
00597 
00598   // Fills the given consistenty_info based on parent_vse.consistency_info
00599   // and on the consistency of the given unichar_id with parent_vse.
00600   void FillConsistencyInfo(
00601       int curr_col, bool word_end, BLOB_CHOICE *b,
00602       ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b,
00603       CHUNKS_RECORD *chunks_record,
00604       LanguageModelConsistencyInfo *consistency_info);
00605 
00606   // Constructs WERD_CHOICE by recording unichar_ids of the BLOB_CHOICEs
00607   // on the path represented by the given BLOB_CHOICE and language model
00608   // state entries (lmse, dse). The path is re-constructed by following
00609   // the parent pointers in the the lang model state entries). If the
00610   // constructed WERD_CHOICE is better than the best/raw choice recorded
00611   // in the best_choice_bundle, this function updates the corresponding
00612   // fields and sets best_choice_bunldle->updated to true.
00613   void UpdateBestChoice(BLOB_CHOICE *b,
00614                         ViterbiStateEntry *vse,
00615                         HEAP *pain_points,
00616                         CHUNKS_RECORD *chunks_record,
00617                         BestChoiceBundle *best_choice_bundle);
00618 
00619   // Constructs a WERD_CHOICE by tracing parent pointers starting with
00620   // the given LanguageModelStateEntry. Returns the constructed word.
00621   // Updates best_char_choices, certainties and state if they are not
00622   // NULL (best_char_choices and certainties are assumed to have the
00623   // length equal to lmse->length).
00624   // The caller is resposible for freeing memory associated with the
00625   // returned WERD_CHOICE.
00626   WERD_CHOICE *ConstructWord(BLOB_CHOICE *b,
00627                              ViterbiStateEntry *vse,
00628                              CHUNKS_RECORD *chunks_record,
00629                              BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00630                              float certainties[],
00631                              float *dawg_score,
00632                              STATE *state);
00633 
  // This function is used for non-space delimited languages when looking
  // for word endings recorded while trying to separate the path into words.
  //
  // The function increments covered if a valid word ending is found in
  // active_dawgs (if covered is incremented, skip is set to the number
  // of unichars that should be skipped because they are covered by the
  // word whose ending was just discovered).
  //
  // dawg_score and dawg_score_done are updated if:
  // -- at the end of the path we discover a valid word ending from a
  //    non-fixed length dawg (this means that the whole word is a
  //    valid word, so dawg_score is set to 1.0f)
  // -- word_start is true (dawg_score is set to covered / word length)
  //
  // Note: this function assumes that skip, covered, dawg_score and
  // dawg_score_done are not NULL.
  void UpdateCoveredByFixedLengthDawgs(const DawgInfoVector &active_dawgs,
                                       int word_index, int word_length,
                                       int *skip, int *covered,
                                       float *dawg_score,
                                       bool *dawg_score_done);
00655 
00656   // Wrapper around AssociateUtils::ComputeStats().
00657   inline void ComputeAssociateStats(int col, int row,
00658                                     float max_char_wh_ratio,
00659                                     ViterbiStateEntry *parent_vse,
00660                                     CHUNKS_RECORD *chunks_record,
00661                                     AssociateStats *associate_stats) {
00662   AssociateUtils::ComputeStats(
00663       col, row,
00664       (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
00665       (parent_vse != NULL) ? parent_vse->length : 0,
00666       fixed_pitch_, max_char_wh_ratio, denorm_,
00667       chunks_record, language_model_debug_level, associate_stats);
00668   }
00669 
00670   // Returns true if the path with such top_choice_flags and dawg_info
00671   // could be pruned out (i.e. is neither a system/user/frequent dictionary
00672   // nor a top choice path).
00673   // In non-space delimited languages all paths can be "somewhat" dictionary
00674   // words. In such languages we can not do dictionary-driven path prunning,
00675   // so paths with non-empty dawg_info are considered prunable.
00676   inline bool PrunablePath(LanguageModelFlagsType top_choice_flags,
00677                            const LanguageModelDawgInfo *dawg_info) {
00678     if (top_choice_flags) return false;
00679     if (dawg_info != NULL &&
00680         (dawg_info->permuter == SYSTEM_DAWG_PERM ||
00681          dawg_info->permuter == USER_DAWG_PERM ||
00682          dawg_info->permuter == FREQ_DAWG_PERM) &&
00683          dict_->GetMaxFixedLengthDawgIndex() < 0) return false;
00684     return true;
00685   }
00686 
00687   // Returns true if the given script id indicates a path that might consist
00688   // of non-space delimited words (e.g. when dealing with Chinese and Japanese
00689   // languages), and fixed length dawgs were loaded.
00690   //
00691   // TODO(daria): generate fixed length dawgs for Thai.
00692   inline bool UseFixedLengthDawgs(int script_id) {
00693     if (dict_->GetMaxFixedLengthDawgIndex() < 0) return false;
00694     if ((dict_->getUnicharset().han_sid() !=
00695          dict_->getUnicharset().null_sid()) &&
00696         script_id == dict_->getUnicharset().han_sid()) return true;
00697     if ((dict_->getUnicharset().hiragana_sid() !=
00698          dict_->getUnicharset().null_sid()) &&
00699         script_id == dict_->getUnicharset().hiragana_sid()) return true;
00700     if ((dict_->getUnicharset().katakana_sid() !=
00701          dict_->getUnicharset().null_sid()) &&
00702         script_id == dict_->getUnicharset().katakana_sid()) return true;
00703     return false;
00704   }
00705 
00706   // Returns true if the given ViterbiStateEntry represents an acceptable path.
00707   inline bool AcceptablePath(const ViterbiStateEntry &vse) {
00708     return (vse.dawg_info != NULL || vse.Consistent() ||
00709             (vse.ngram_info != NULL && !vse.ngram_info->pruned));
00710   }
00711 
00712  public:
00713   // Parameters.
00714   INT_VAR_H(language_model_debug_level, 0, "Language model debug level");
00715   BOOL_VAR_H(language_model_ngram_on, false,
00716              "Turn on/off the use of character ngram model");
00717   INT_VAR_H(language_model_ngram_order, 8,
00718             "Maximum order of the character ngram model");
00719   INT_VAR_H(language_model_viterbi_list_max_num_prunable, 10,
00720             "Maximum number of prunable (those for which PrunablePath() is true)"
00721             "entries in each viterbi list recorded in BLOB_CHOICEs");
00722   INT_VAR_H(language_model_viterbi_list_max_size, 500,
00723             "Maximum size of viterbi lists recorded in BLOB_CHOICEs");
00724   double_VAR_H(language_model_ngram_small_prob, 0.000001,
00725                "To avoid overly small denominators use this as the floor"
00726                " of the probability returned by the ngram model");
00727   double_VAR_H(language_model_ngram_nonmatch_score, -40.0,
00728                "Average classifier score of a non-matching unichar");
00729   BOOL_VAR_H(language_model_ngram_use_only_first_uft8_step, false,
00730              "Use only the first UTF8 step of the given string"
00731              " when computing log probabilities");
00732   double_VAR_H(language_model_ngram_scale_factor, 0.03,
00733                "Strength of the character ngram model relative to the"
00734                " character classifier ");
00735   INT_VAR_H(language_model_min_compound_length, 3,
00736             "Minimum length of compound words");
00737   INT_VAR_H(language_model_fixed_length_choices_depth, 3,
00738             "Depth of blob choice lists to explore"
00739             " when fixed length dawgs are on");
00740   // Penalties used for adjusting path costs and final word rating.
00741   double_VAR_H(language_model_penalty_non_freq_dict_word, 0.1,
00742                "Penalty for words not in the frequent word dictionary");
00743   double_VAR_H(language_model_penalty_non_dict_word, 0.15,
00744                "Penalty for non-dictionary words");
00745   double_VAR_H(language_model_penalty_punc, 0.2,
00746                "Penalty for inconsistent punctuation");
00747   double_VAR_H(language_model_penalty_case, 0.1,
00748                "Penalty for inconsistent case");
00749   double_VAR_H(language_model_penalty_script, 0.5,
00750                "Penalty for inconsistent script");
00751   double_VAR_H(language_model_penalty_chartype, 0.3,
00752                "Penalty for inconsistent character type");
00753   double_VAR_H(language_model_penalty_font, 0.00,
00754                "Penalty for inconsistent font");
00755   double_VAR_H(language_model_penalty_spacing, 0.05,
00756                "Penalty for inconsistent spacing");
00757   double_VAR_H(language_model_penalty_increment, 0.01, "Penalty increment");
00758 
 protected:
  // Member Variables.

  // Temporary DawgArgs struct that is re-used across different words to
  // avoid dynamic memory re-allocation (should be cleared before each use).
  DawgArgs *dawg_args_;
  // List of pointers to updated flags used by Viterbi search to mark
  // recently updated ViterbiStateEntries.
  GenericVector<bool *> updated_flags_;

  // The following variables are set at construction time.

  // Pointer to fontinfo table (not owned by LanguageModel).
  const UnicityTable<FontInfo> *fontinfo_table_;

  // Pointer to Dict class, that is used for querying the dictionaries
  // (the pointer is not owned by LanguageModel).
  Dict *dict_;

  // DENORM computed by Tesseract (not owned by LanguageModel).
  const DENORM *denorm_;
  // TODO(daria): the following variables should become LanguageModel params
  // when the old code in bestfirst.cpp and heuristic.cpp is deprecated.
  //
  // Set to true if we are dealing with fixed pitch text
  // (set to assume_fixed_pitch_char_segment).
  bool fixed_pitch_;
  // Max char width-to-height ratio allowed
  // (set to segsearch_max_char_wh_ratio).
  float max_char_wh_ratio_;

  // The following variables are initialized with InitForWord().

  // String representation of the classification of the previous word
  // (since this is only used by the character ngram model component,
  // only the last language_model_ngram_order characters of the word
  // are stored).
  STRING prev_word_str_;
  // Length of prev_word_str_ in unichar steps (per the name; presumably
  // maintained alongside prev_word_str_ by InitForWord — confirm there).
  int prev_word_unichar_step_len_;
  // Active dawg and constraints vectors.
  DawgInfoVector *beginning_active_dawgs_;
  DawgInfoVector *beginning_constraints_;
  DawgInfoVector *fixed_length_beginning_active_dawgs_;
  DawgInfoVector *empty_dawg_info_vec_;
  // Maximum adjustment factor for character ngram choices.
  float max_penalty_adjust_;
  // Set to true if an acceptable choice was discovered.
  // Note: it would be nice to use this to terminate the search once an
  // acceptable choice is found. However we do not do that; once an
  // acceptable choice is found we finish looking for alternative choices
  // in the current segmentation graph and then exit the search (no more
  // classifications are done after an acceptable choice is found).
  // This is needed in order to let the search find the words very close to
  // the best choice in rating (e.g. what/What, Cat/cat, etc) and log these
  // choices. This way the stopper will know that the best choice is not
  // ambiguous (i.e. there are best choices in the best choice list that have
  // ratings close to the very best one) and will be less likely to mis-adapt.
  bool acceptable_choice_found_;
00817 };
00818 
00819 }  // namespace tesseract
00820 
00821 #endif  // TESSERACT_WORDREC_LANGUAGE_MODEL_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines