Tesseract 3.01
/data/source/tesseract-ocr/dict/dict.h
Go to the documentation of this file.
00001 
00002 // File:        dict.h
00003 // Description: dict class.
00004 // Author:      Samuel Charron
00005 //
00006 // (C) Copyright 2006, Google Inc.
00007 // Licensed under the Apache License, Version 2.0 (the "License");
00008 // you may not use this file except in compliance with the License.
00009 // You may obtain a copy of the License at
00010 // http://www.apache.org/licenses/LICENSE-2.0
00011 // Unless required by applicable law or agreed to in writing, software
00012 // distributed under the License is distributed on an "AS IS" BASIS,
00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014 // See the License for the specific language governing permissions and
00015 // limitations under the License.
00016 //
00018 
00019 #ifndef TESSERACT_DICT_DICT_H_
00020 #define TESSERACT_DICT_DICT_H_
00021 
00022 #include "ambigs.h"
00023 #include "dawg.h"
00024 #include "host.h"
00025 #include "image.h"
00026 #include "oldlist.h"
00027 #include "ratngs.h"
00028 #include "stopper.h"
00029 #include "trie.h"
00030 #include "unicharset.h"
00031 #include "permute.h"
00032 
// Capacity of DawgArgs::rating_array. The replacement list is fully
// parenthesized so the cast stays a single operand wherever the macro expands
// (e.g. under sizeof or next to postfix operators).
#define MAX_WERD_LENGTH        ((inT64) 128)
// Value every DawgArgs::rating_array slot is initialized to. Parenthesized so
// the unary minus cannot merge with neighbouring tokens at the expansion site.
#define NO_RATING               (-1)
00035 
// Plain record describing one character (or character fragment) while a word
// is being permuted. Dict passes it by pointer as prev_char_frag_info /
// char_frag_info to fragment_state_okay() and the go_deeper callbacks.
00037 struct CHAR_FRAGMENT_INFO {
00038   UNICHAR_ID unichar_id;  // id of the unichar this record refers to
00039   const CHAR_FRAGMENT *fragment;  // NOTE(review): presumably NULL for a whole (non-fragment) character - confirm
00040   int num_fragments;  // NOTE(review): looks like a count of fragments combined so far - confirm
00041   float rating;  // rating associated with this character
00042   float certainty;  // certainty associated with this character
00043 };
00044 
00045 namespace tesseract {
00046 
00047 typedef GenericVector<Dawg *> DawgVector;
00048 
00049 //
00050 // Constants
00051 //
00052 static const int kAnyWordLength = -1;
00053 static const int kRatingPad = 4;
00054 // TODO(daria): If hyphens are different in different languages and can be
00055 // inferred from training data we should load their values dynamically.
00056 static const char kHyphenSymbol[] = "-";
00057 static const int kMaxNumDawgEdgees = 2000000;
00058 static const int kMaxDocDawgEdges = 250000;
00059 static const int kMaxUserDawgEdges = 50000;
00060 static const float kSimCertaintyScale = -10.0;   // similarity matcher scaling
00061 static const float kSimCertaintyOffset = -10.0;  // similarity matcher offset
00062 static const float kSimilarityFloor = 100.0;  // worst E*L product to stop on
00063 static const int kDocDictMaxRepChars = 4;
00064 
00065 struct DawgArgs {
00066   DawgArgs(DawgInfoVector *d, DawgInfoVector *c, DawgInfoVector *ud,
00067            DawgInfoVector *uc, float r, PermuterType p, int len, int e) :
00068     active_dawgs(d), constraints(c), updated_active_dawgs(ud),
00069     updated_constraints(uc), rating_margin(r) {
00070     for (int i = 0; i < MAX_WERD_LENGTH; ++i) {
00071       rating_array[i] = NO_RATING;
00072     }
00073     permuter = p;
00074     sought_word_length = len;
00075     end_char_choice_index = e;
00076   }
00077   DawgInfoVector *active_dawgs;
00078   DawgInfoVector *constraints;
00079   DawgInfoVector *updated_active_dawgs;
00080   DawgInfoVector *updated_constraints;
00081   PermuterType permuter;
00082   int sought_word_length;
00083 
00084   // TODO(daria): remove these fields when permdawg is deprecated.
00085   float rating_margin;  
00086   float rating_array[MAX_WERD_LENGTH];
00087   int end_char_choice_index;
00088 };
00089 
00090 class Dict {
00091  public:
00092   // Gain factor for ambiguity threshold.
00093   static const float kStopperAmbiguityThresholdGain;
00094   // Certainty offset for ambiguity threshold.
00095   static const float kStopperAmbiguityThresholdOffset;
00096 
00097   Dict(Image* image_ptr);
00098   ~Dict();
00099   Image* getImage() {
00100     return image_ptr_;
00101   }
00102   UNICHARSET& getUnicharset() {
00103     return getImage()->getCCUtil()->unicharset;
00104   }
00105   const UnicharAmbigs &getUnicharAmbigs() {
00106     return getImage()->getCCUtil()->unichar_ambigs;
00107   }
00108 
00109   inline bool compound_marker(UNICHAR_ID unichar_id) {
00110     return (unichar_id == getUnicharset().unichar_to_id("-") ||
00111             unichar_id == getUnicharset().unichar_to_id("/"));
00112   }
00113 
00114   /* hyphen.cpp ************************************************************/
00115 
00117   inline bool hyphenated() { return
00118     !last_word_on_line_ && hyphen_word_ && GetMaxFixedLengthDawgIndex() < 0;
00119   }
00121   inline int hyphen_base_size() {
00122     return this->hyphenated() ? hyphen_word_->length() : 0;
00123   }
00127   inline void copy_hyphen_info(WERD_CHOICE *word) {
00128     if (this->hyphenated()) {
00129       *word = *hyphen_word_;
00130       if (hyphen_debug_level) word->print("copy_hyphen_info: ");
00131     }
00132   }
00136   inline void remove_hyphen_head(WERD_CHOICE *word) {
00137     if (this->hyphenated()) {
00138       word->remove_unichar_ids(0, hyphen_word_->length());
00139       if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: ");
00140     }
00141   }
00143   inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) {
00144     return (last_word_on_line_ && !first_pos &&
00145             unichar_id == hyphen_unichar_id_);
00146   }
00148   inline bool has_hyphen_end(const WERD_CHOICE &word) {
00149     int word_index = word.length() - 1;
00150     return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
00151   }
00155   void reset_hyphen_vars(bool last_word_on_line);
00158   void set_hyphen_word(const WERD_CHOICE &word,
00159                        const DawgInfoVector &active_dawgs,
00160                        const DawgInfoVector &constraints);
00161 
00162   /* permdawg.cpp ************************************************************/
00165   inline void update_best_choice(const WERD_CHOICE &word,
00166                                  WERD_CHOICE *best_choice) {
00167     if (word.rating() < best_choice->rating()) *best_choice = word;
00168   }
00172   void init_active_dawgs(int sought_word_length,
00173                          DawgInfoVector *active_dawgs,
00174                          bool ambigs_mode);
00177   void init_constraints(DawgInfoVector *constraints);
// A rating limit of zero or below signals ambigs mode.
inline bool ambigs_mode(float rating_limit) {
  const bool limit_not_positive = rating_limit <= 0.0;
  return limit_not_positive;
}
00185   WERD_CHOICE *dawg_permute_and_select(
00186       const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit,
00187       int sought_word_length, int end_char_choice_index);
00188   WERD_CHOICE *dawg_permute_and_select(
00189       const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
00190     return dawg_permute_and_select(char_choices, rating_limit,
00191                                    kAnyWordLength, 0);
00192   }
00200   void go_deeper_dawg_fxn(
00201       const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
00202       int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00203       bool word_ending, WERD_CHOICE *word, float certainties[],
00204       float *limit, WERD_CHOICE *best_choice, int *attempts_left,
00205       void *void_more_args);
00206 
00207   /* permute.cpp *************************************************************/
00208   WERD_CHOICE *get_top_choice_word(
00209       const BLOB_CHOICE_LIST_VECTOR &char_choices);
00210   WERD_CHOICE *permute_top_choice(
00211     const BLOB_CHOICE_LIST_VECTOR &char_choices,
00212     float* rating_limit,
00213     WERD_CHOICE *raw_choice,
00214     BOOL8 *any_alpha);
00215   const char* choose_il1(const char *first_char,       //first choice
00216                          const char *second_char,      //second choice
00217                          const char *third_char,       //third choice
00218                          const char *prev_char,        //prev in word
00219                          const char *next_char,        //next in word
00220                          const char *next_next_char);  //after next next in word
00221   WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00222                            const WERD_CHOICE *best_choice,
00223                            WERD_CHOICE *raw_choice);
00224   void end_permute();
00225   void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00226                        float rating_limit,
00227                        int start,
00228                        int end,
00229                        WERD_CHOICE *current_word);
00230   bool permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00231                           WERD_CHOICE *best_choice,
00232                           WERD_CHOICE *raw_choice);
00233   WERD_CHOICE *permute_compound_words(
00234       const BLOB_CHOICE_LIST_VECTOR &char_choices,
00235       float rating_limit);
00239   WERD_CHOICE *permute_fixed_length_words(
00240       const BLOB_CHOICE_LIST_VECTOR &char_choices,
00241       PermuterState *permuter_state);
00243   void incorporate_segcost(WERD_CHOICE* word);
00247   WERD_CHOICE *permute_script_words(
00248       const BLOB_CHOICE_LIST_VECTOR &char_choices,
00249       PermuterState *permuter_state);
00251   WERD_CHOICE *permute_chartype_words(
00252       const BLOB_CHOICE_LIST_VECTOR &char_choices,
00253       PermuterState *permuter_state);
00254 
00258   char top_word_chartype(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00259                          char* pos_chartypes);
00260 
00261   WERD_CHOICE *top_fragments_permute_and_select(
00262       const BLOB_CHOICE_LIST_VECTOR &char_choices,
00263       float rating_limit);
00268   void go_deeper_top_fragments_fxn(
00269       const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
00270       int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00271       bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
00272       WERD_CHOICE *best_choice, int *attempts_left, void *more_args);
00273 
00275   bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
00276                            float curr_rating, float curr_certainty,
00277                            const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00278                            const char *debug, int word_ending,
00279                            CHAR_FRAGMENT_INFO *char_frag_info);
00280   void permute_choices(
00281       const char *debug,
00282       const BLOB_CHOICE_LIST_VECTOR &char_choices,
00283       int char_choice_index,
00284       const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00285       WERD_CHOICE *word,
00286       float certainties[],
00287       float *limit,
00288       WERD_CHOICE *best_choice,
00289       int *attempts_left,
00290       void *more_args);
00291 
00292   void append_choices(
00293       const char *debug,
00294       const BLOB_CHOICE_LIST_VECTOR &char_choices,
00295       const BLOB_CHOICE &blob_choice,
00296       int char_choice_index,
00297       const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00298       WERD_CHOICE *word,
00299       float certainties[],
00300       float *limit,
00301       WERD_CHOICE *best_choice,
00302       int *attempts_left,
00303       void *more_args);
00305   void (Dict::*go_deeper_fxn_)(const char *debug,
00306                                const BLOB_CHOICE_LIST_VECTOR &char_choices,
00307                                int char_choice_index,
00308                                const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00309                                bool word_ending, WERD_CHOICE *word,
00310                                float certainties[], float *limit,
00311                                WERD_CHOICE *best_choice, int *attempts_left,
00312                                void *void_more_args);
00313   /* stopper.cpp *************************************************************/
00314   bool NoDangerousAmbig(WERD_CHOICE *BestChoice,
00315                         DANGERR *fixpt,
00316                         bool fix_replaceable,
00317                         BLOB_CHOICE_LIST_VECTOR *Choices,
00318                         bool *modified_blobs);
00327   void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
00328                     UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
00329                     BLOB_CHOICE_LIST_VECTOR *blob_choices,
00330                     bool *modified_blobs);
00331 
00332   inline void DisableChoiceAccum() { keep_word_choices_ = false; }
00333   inline void EnableChoiceAccum() { keep_word_choices_ = true; }
00334   inline bool ChoiceAccumEnabled() { return keep_word_choices_; }
00335 
00337   int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
00344   VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice,
00345                                 FLOAT32 AdjustFactor,
00346                                 const float Certainties[]);
00348   void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice);
00351   bool StringSameAs(const WERD_CHOICE &WordChoice,
00352                     VIABLE_CHOICE ViableChoice);
00354   bool StringSameAs(const char *String,
00355                     const char *String_lengths,
00356                     VIABLE_CHOICE ViableChoice);
00364   int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
00365                          const WERD_CHOICE &BestChoice);
00367   bool AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
00368                         WERD_CHOICE *BestChoice,
00369                         DANGERR *fixpt,
00370                         ACCEPTABLE_CHOICE_CALLER caller,
00371                         bool *modified_blobs);
00375   bool AcceptableResult(const WERD_CHOICE &BestChoice);
00378   int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice);
00386   void LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[],
00387                     bool raw_choice, WERD_CHOICE *WordChoice);
00388   void EndDangerousAmbigs();
00390   bool CurrentBestChoiceIs(const WERD_CHOICE &WordChoice);
00392   FLOAT32 CurrentBestChoiceAdjustFactor();
00394   bool CurrentWordAmbig();
00396   void DebugWordChoices();
00398   void PrintAmbigAlternatives(FILE *file, const char *label,
00399                               int label_num_unichars);
00402   void FillViableChoice(const WERD_CHOICE &WordChoice,
00403                         FLOAT32 AdjustFactor, const float Certainties[],
00404                         bool SameString, VIABLE_CHOICE ViableChoice);
00407   bool AlternativeChoicesWorseThan(FLOAT32 Threshold);
00410   void FilterWordChoices();
00425   void FindClassifierErrors(FLOAT32 MinRating,
00426                             FLOAT32 MaxRating,
00427                             FLOAT32 RatingMargin,
00428                             FLOAT32 Thresholds[]);
00431   void InitChoiceAccum();
00433   void ClearBestChoiceAccum();
00437   void LogNewSegmentation(PIECES_STATE BlobWidth);
00440   void LogNewSplit(int Blob);
00443   void AddNewChunk(VIABLE_CHOICE Choice, int Blob);
00445   void SettupStopperPass1();
00447   void SettupStopperPass2();
00448   /* context.cpp *************************************************************/
00450   int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset);
00453   bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
00454 
00455   /* dict.cpp ****************************************************************/
00456 
00459   void Load();
00460   void End();
00461 
00462   // Resets the document dictionary analogous to ResetAdaptiveClassifier.
00463   void ResetDocumentDictionary() {
00464     if (pending_words_ != NULL)
00465       pending_words_->clear();
00466     if (document_words_ != NULL)
00467       document_words_->clear();
00468   }
00469 
00532   //
00533   int def_letter_is_okay(void* void_dawg_args,
00534                          UNICHAR_ID unichar_id, bool word_end);
00535 
00536   int (Dict::*letter_is_okay_)(void* void_dawg_args,
00537                                UNICHAR_ID unichar_id, bool word_end);
00539   int LetterIsOkay(void* void_dawg_args,
00540                    UNICHAR_ID unichar_id, bool word_end) {
00541     return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
00542   }
00543 
00544 
00546   double (Dict::*probability_in_context_)(const char* lang,
00547                                           const char* context,
00548                                           int context_bytes,
00549                                           const char* character,
00550                                           int character_bytes);
00552   double ProbabilityInContext(const char* context,
00553                               int context_bytes,
00554                               const char* character,
00555                               int character_bytes) {
00556     return (this->*probability_in_context_)(
00557         getImage()->getCCUtil()->lang.string(),
00558         context, context_bytes,
00559         character, character_bytes);
00560   }
00561 
// Default probability-in-context implementation: ignores every argument and
// always returns 0.0. All parameters are explicitly voided (including lang)
// so the function compiles cleanly with unused-parameter warnings enabled.
double def_probability_in_context(
    const char* lang, const char* context, int context_bytes,
    const char* character, int character_bytes) {
  (void) lang;
  (void) context;
  (void) context_bytes;
  (void) character;
  (void) character_bytes;
  return 0.0;
}
00572   double ngram_probability_in_context(const char* lang,
00573                                       const char* context,
00574                                       int context_bytes,
00575                                       const char* character,
00576                                       int character_bytes);
00577 
00579   inline const int NumDawgs() const { return dawgs_.size(); }
00581   inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
00583   inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
00585   inline const Dawg *GetFixedLengthDawg(int word_length) const {
00586     if (word_length > max_fixed_length_dawgs_wdlen_) return NULL;
00587     assert(dawgs_.size() > word_length);
00588     return dawgs_[word_length];
00589   }
00590   inline const int GetMaxFixedLengthDawgIndex() const {
00591     return max_fixed_length_dawgs_wdlen_;
00592   }
00594   static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
00595     if (edge_ref == NO_EDGE) return 0;  // beginning to explore the dawg
00596     NODE_REF node = dawg->next_node(edge_ref);
00597     if (node == 0) node = NO_EDGE;  // end of word
00598     return node;
00599   }
00605   inline bool ConstraintsOk(const DawgInfoVector &constraints,
00606                             int word_end, DawgType current_dawg_type) {
00607     if (!word_end) return true;
00608     if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true;
00609     for (int c = 0; c < constraints.length(); ++c) {
00610       const DawgInfo &cinfo = constraints[c];
00611       Dawg *cdawg = dawgs_[cinfo.dawg_index];
00612       if (!cdawg->end_of_word(cinfo.ref)) {
00613         if (dawg_debug_level >= 3) {
00614           tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n",
00615                   cinfo.dawg_index, cinfo.ref);
00616         }
00617         return false;
00618       }
00619     }
00620     return true;
00621   }
00622 
00628   void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info,
00629                            UNICHAR_ID unichar_id, bool word_end,
00630                            DawgArgs *dawg_args, PermuterType *current_permuter);
00631 
00635 
00641   static void ReadFixedLengthDawgs(DawgType type, const STRING &lang,
00642                                    PermuterType perm, int debug_level,
00643                                    FILE *file, DawgVector *dawg_vec,
00644                                    int *max_wdlen);
00647   static void WriteFixedLengthDawgs(
00648       const GenericVector<SquishedDawg *> &dawg_vec,
00649       int num_dawgs, int debug_level, FILE *output_file);
00650 
00652   inline bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
00653     return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
00654             perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
00655             perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM));
00656   }
00657   int valid_word(const WERD_CHOICE &word, bool numbers_ok);
00658   int valid_word(const WERD_CHOICE &word) {
00659     return valid_word(word, false);  // return NO_PERM for words with digits
00660   }
00661   int valid_word_or_number(const WERD_CHOICE &word) {
00662     return valid_word(word, true);  // return NUMBER_PERM for valid numbers
00663   }
00665   int valid_word(const char *string) {
00666     WERD_CHOICE word(string, getUnicharset());
00667     return valid_word(word);
00668   }
00673   bool valid_punctuation(const WERD_CHOICE &word);
00675   int good_choice(const WERD_CHOICE &choice);
00677   void add_document_word(const WERD_CHOICE &best_choice);
00678   int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00679                           const UNICHARSET &unicharset);
00681   void adjust_word(WERD_CHOICE *word, float *certainty_array,
00682                    const BLOB_CHOICE_LIST_VECTOR *char_choices,
00683                    bool nonword, float additional_adjust, bool debug);
00684   void adjust_word(WERD_CHOICE *word, float *certainty_array, bool debug) {
00685     adjust_word(word, certainty_array, NULL, false, 0.0f, debug);
00686   }
00687   void adjust_non_word(WERD_CHOICE *word, float *certainty_array, bool debug) {
00688     adjust_word(word, certainty_array, NULL, true, 0.0f, debug);
00689   }
00691   inline void SetWordsegRatingAdjustFactor(float f) {
00692     wordseg_rating_adjust_factor_ = f;
00693   }
00694 
00695  private:
00697   Image* image_ptr_;
00704   UnicharAmbigs *dang_ambigs_table_;
00706   UnicharAmbigs *replace_ambigs_table_;
00711   bool keep_word_choices_;
00713   FLOAT32 reject_offset_;
00715   PIECES_STATE current_segmentation_;
00717   VIABLE_CHOICE best_raw_choice_;
00718   LIST raw_choices_;
00719   LIST best_choices_;
00720   // Hyphen-related variables.
00721   UNICHAR_ID hyphen_unichar_id_;
00722   WERD_CHOICE *hyphen_word_;
00723   DawgInfoVector hyphen_active_dawgs_;
00724   DawgInfoVector hyphen_constraints_;
00725   bool last_word_on_line_;
00726   // Dawgs.
00727   DawgVector dawgs_;
00728   SuccessorListsVector successors_;
00729   Trie *pending_words_;
00732   // TODO(daria): need to support multiple languages in the future,
00733   // so maybe will need to maintain a list of dawgs of each kind.
00734   Dawg *freq_dawg_;
00735   Dawg *punc_dawg_;
00736   Trie *document_words_;
00739   int max_fixed_length_dawgs_wdlen_;
00742   float wordseg_rating_adjust_factor_;
00743 
00744  public:
00748   STRING_VAR_H(user_words_suffix, "", "A list of user-provided words.");
00749   STRING_VAR_H(user_patterns_suffix, "",
00750                "A list of user-provided patterns.");
00751   BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
00752   BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
00753   BOOL_VAR_H(load_punc_dawg, true,
00754              "Load dawg with punctuation patterns.");
00755   BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
00756   BOOL_VAR_H(load_fixed_length_dawgs, true,  "Load fixed length"
00757              " dawgs (e.g. for non-space delimited languages)");
00758   double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
00759                "Score multiplier for word matches which have good case and"
00760                "are frequent in the given language (lower is better).");
00761 
00762   double_VAR_H(segment_penalty_dict_case_ok, 1.1,
00763                "Score multiplier for word matches that have good case "
00764                "(lower is better).");
00765 
00766   double_VAR_H(segment_penalty_dict_case_bad, 1.3125,
00767                "Default score multiplier for word matches, which may have "
00768                "case issues (lower is better).");
00769 
00770   // TODO(daria): remove this param when ngram permuter is deprecated.
00771   double_VAR_H(segment_penalty_ngram_best_choice, 1.24,
00772                "Multipler to for the best choice from the ngram model.");
00773 
00774   double_VAR_H(segment_penalty_dict_nonword, 1.25,
00775                "Score multiplier for glyph fragment segmentations which "
00776                "do not match a dictionary word (lower is better).");
00777 
00778   double_VAR_H(segment_penalty_garbage, 1.50,
00779                "Score multiplier for poorly cased strings that are not in"
00780                " the dictionary and generally look like garbage (lower is"
00781                " better).");
00782   INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
00783             ", to 2 for more details, to 3 to see all the debug messages");
00784   INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
00785   INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list.");
00786   BOOL_VAR_H(use_only_first_uft8_step, false,
00787              "Use only the first UTF8 step of the given string"
00788              " when computing log probabilities.");
00789   double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
00790   double_VAR_H(stopper_nondict_certainty_base, -2.50,
00791                "Certainty threshold for non-dict words");
00792   double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0,
00793                "Reject certainty offset");
00794   INT_VAR_H(stopper_smallword_size, 2,
00795             "Size of dict word to be treated as non-dict word");
00796   double_VAR_H(stopper_certainty_per_char, -0.50,
00797                "Certainty to add for each dict char above small word size.");
00798   double_VAR_H(stopper_allowable_character_badness, 3.0,
00799                "Max certaintly variation allowed in a word (in sigma)");
00800   INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
00801   BOOL_VAR_H(stopper_no_acceptable_choices, false,
00802              "Make AcceptableChoice() always return false. Useful"
00803              " when there is a need to explore all segmentations");
00804   BOOL_VAR_H(save_raw_choices, false, "Save all explored raw choices");
00805   INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
00806   STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
00807                " should be printed to stdout");
00808   STRING_VAR_H(word_to_debug_lengths, "",
00809                "Lengths of unichars in word_to_debug");
00810   INT_VAR_H(fragments_debug, 0, "Debug character fragments");
00811   INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process");
00812   BOOL_VAR_H(permute_debug, 0, "Debug char permutation process");
00813   double_VAR_H(bestrate_pruning_factor, 2.0, "Multiplying factor of"
00814                " current best rate to prune other hypotheses");
00815   BOOL_VAR_H(permute_script_word, 0,
00816              "Turn on word script consistency permuter");
00817   BOOL_VAR_H(segment_segcost_rating, 0,
00818              "incorporate segmentation cost in word rating?");
00819   double_VAR_H(segment_reward_script, 0.95,
00820                "Score multipler for script consistency within a word. "
00821                "Being a 'reward' factor, it should be <= 1. "
00822                "Smaller value implies bigger reward.");
00823   BOOL_VAR_H(permute_fixed_length_dawg, 0,
00824              "Turn on fixed-length phrasebook search permuter");
00825   BOOL_VAR_H(permute_chartype_word, 0,
00826              "Turn on character type (property) consistency permuter");
00827   double_VAR_H(segment_reward_chartype, 0.97,
00828                "Score multipler for char type consistency within a word. ");
00829   // TODO(daria): remove this param when ngram permuter is deprecated.
00830   double_VAR_H(segment_reward_ngram_best_choice, 0.99,
00831                "Score multipler for ngram permuter's best choice"
00832                " (only used in the Han script path).");
00833   BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
00834   BOOL_VAR_H(doc_dict_enable, 1, "Enable Document Dictionary ");
00835   double_VAR_H(doc_dict_pending_threshold, 0.0,
00836                "Worst certainty for using pending dictionary");
00837   double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty"
00838                " for words that can be inserted into the document dictionary");
00839   BOOL_VAR_H(ngram_permuter_activated, false,
00840              "Activate character-level n-gram-based permuter");
00841   INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different"
00842             " character choices to consider during permutation."
00843             " This limit is especially useful when user patterns"
00844             " are specified, since overly generic patterns can result in"
00845             " dawg search exploring an overly large number of options.");
00846   BOOL_VAR_H(permute_only_top, false, "Run only the top choice permuter");
00847 };
00848 }  // namespace tesseract
00849 
00850 #endif  // TESSERACT_DICT_DICT_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines