Tesseract 3.01
|
00001 00002 // File: dict.h 00003 // Description: dict class. 00004 // Author: Samuel Charron 00005 // 00006 // (C) Copyright 2006, Google Inc. 00007 // Licensed under the Apache License, Version 2.0 (the "License"); 00008 // you may not use this file except in compliance with the License. 00009 // You may obtain a copy of the License at 00010 // http://www.apache.org/licenses/LICENSE-2.0 00011 // Unless required by applicable law or agreed to in writing, software 00012 // distributed under the License is distributed on an "AS IS" BASIS, 00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 // See the License for the specific language governing permissions and 00015 // limitations under the License. 00016 // 00018 00019 #ifndef TESSERACT_DICT_DICT_H_ 00020 #define TESSERACT_DICT_DICT_H_ 00021 00022 #include "ambigs.h" 00023 #include "dawg.h" 00024 #include "host.h" 00025 #include "image.h" 00026 #include "oldlist.h" 00027 #include "ratngs.h" 00028 #include "stopper.h" 00029 #include "trie.h" 00030 #include "unicharset.h" 00031 #include "permute.h" 00032 00033 #define MAX_WERD_LENGTH (inT64) 128 00034 #define NO_RATING -1 00035 00037 struct CHAR_FRAGMENT_INFO { 00038 UNICHAR_ID unichar_id; 00039 const CHAR_FRAGMENT *fragment; 00040 int num_fragments; 00041 float rating; 00042 float certainty; 00043 }; 00044 00045 namespace tesseract { 00046 00047 typedef GenericVector<Dawg *> DawgVector; 00048 00049 // 00050 // Constants 00051 // 00052 static const int kAnyWordLength = -1; 00053 static const int kRatingPad = 4; 00054 // TODO(daria): If hyphens are different in different languages and can be 00055 // inferred from training data we should load their values dynamically. 00056 static const char kHyphenSymbol[] = "-"; 00057 static const int kMaxNumDawgEdgees = 2000000; 00058 static const int kMaxDocDawgEdges = 250000; 00059 static const int kMaxUserDawgEdges = 50000; 00060 static const float kSimCertaintyScale = -10.0; // similarity matcher scaling 00061 static const float kSimCertaintyOffset = -10.0; // similarity matcher offset 00062 static const float kSimilarityFloor = 100.0; // worst E*L product to stop on 00063 static const int kDocDictMaxRepChars = 4; 00064 00065 struct DawgArgs { 00066 DawgArgs(DawgInfoVector *d, DawgInfoVector *c, DawgInfoVector *ud, 00067 DawgInfoVector *uc, float r, PermuterType p, int len, int e) : 00068 active_dawgs(d), constraints(c), updated_active_dawgs(ud), 00069 updated_constraints(uc), rating_margin(r) { 00070 for (int i = 0; i < MAX_WERD_LENGTH; ++i) { 00071 rating_array[i] = NO_RATING; 00072 } 00073 permuter = p; 00074 sought_word_length = len; 00075 end_char_choice_index = e; 00076 } 00077 DawgInfoVector *active_dawgs; 00078 DawgInfoVector *constraints; 00079 DawgInfoVector *updated_active_dawgs; 00080 DawgInfoVector *updated_constraints; 00081 PermuterType permuter; 00082 int sought_word_length; 00083 00084 // TODO(daria): remove these fields when permdawg is deprecated. 00085 float rating_margin; 00086 float rating_array[MAX_WERD_LENGTH]; 00087 int end_char_choice_index; 00088 }; 00089 00090 class Dict { 00091 public: 00092 // Gain factor for ambiguity threshold. 00093 static const float kStopperAmbiguityThresholdGain; 00094 // Certainty offset for ambiguity threshold. 00095 static const float kStopperAmbiguityThresholdOffset; 00096 00097 Dict(Image* image_ptr); 00098 ~Dict(); 00099 Image* getImage() { 00100 return image_ptr_; 00101 } 00102 UNICHARSET& getUnicharset() { 00103 return getImage()->getCCUtil()->unicharset; 00104 } 00105 const UnicharAmbigs &getUnicharAmbigs() { 00106 return getImage()->getCCUtil()->unichar_ambigs; 00107 } 00108 00109 inline bool compound_marker(UNICHAR_ID unichar_id) { 00110 return (unichar_id == getUnicharset().unichar_to_id("-") || 00111 unichar_id == getUnicharset().unichar_to_id("/")); 00112 } 00113 00114 /* hyphen.cpp ************************************************************/ 00115 00117 inline bool hyphenated() { return 00118 !last_word_on_line_ && hyphen_word_ && GetMaxFixedLengthDawgIndex() < 0; 00119 } 00121 inline int hyphen_base_size() { 00122 return this->hyphenated() ? hyphen_word_->length() : 0; 00123 } 00127 inline void copy_hyphen_info(WERD_CHOICE *word) { 00128 if (this->hyphenated()) { 00129 *word = *hyphen_word_; 00130 if (hyphen_debug_level) word->print("copy_hyphen_info: "); 00131 } 00132 } 00136 inline void remove_hyphen_head(WERD_CHOICE *word) { 00137 if (this->hyphenated()) { 00138 word->remove_unichar_ids(0, hyphen_word_->length()); 00139 if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: "); 00140 } 00141 } 00143 inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) { 00144 return (last_word_on_line_ && !first_pos && 00145 unichar_id == hyphen_unichar_id_); 00146 } 00148 inline bool has_hyphen_end(const WERD_CHOICE &word) { 00149 int word_index = word.length() - 1; 00150 return has_hyphen_end(word.unichar_id(word_index), word_index == 0); 00151 } 00155 void reset_hyphen_vars(bool last_word_on_line); 00158 void set_hyphen_word(const WERD_CHOICE &word, 00159 const DawgInfoVector &active_dawgs, 00160 const DawgInfoVector &constraints); 00161 00162 /* permdawg.cpp ************************************************************/ 00165 inline void update_best_choice(const WERD_CHOICE &word, 00166 WERD_CHOICE *best_choice) { 00167 if (word.rating() < best_choice->rating()) *best_choice = word; 00168 } 00172 void init_active_dawgs(int sought_word_length, 00173 DawgInfoVector *active_dawgs, 00174 bool ambigs_mode); 00177 void init_constraints(DawgInfoVector *constraints); 00179 inline bool ambigs_mode(float rating_limit) { return rating_limit <= 0.0; } 00185 WERD_CHOICE *dawg_permute_and_select( 00186 const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit, 00187 int sought_word_length, int end_char_choice_index); 00188 WERD_CHOICE *dawg_permute_and_select( 00189 const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) { 00190 return dawg_permute_and_select(char_choices, rating_limit, 00191 kAnyWordLength, 0); 00192 } 00200 void go_deeper_dawg_fxn( 00201 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, 00202 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00203 bool word_ending, WERD_CHOICE *word, float certainties[], 00204 float *limit, WERD_CHOICE *best_choice, int *attempts_left, 00205 void *void_more_args); 00206 00207 /* permute.cpp *************************************************************/ 00208 WERD_CHOICE *get_top_choice_word( 00209 const BLOB_CHOICE_LIST_VECTOR &char_choices); 00210 WERD_CHOICE *permute_top_choice( 00211 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00212 float* rating_limit, 00213 WERD_CHOICE *raw_choice, 00214 BOOL8 *any_alpha); 00215 const char* choose_il1(const char *first_char, //first choice 00216 const char *second_char, //second choice 00217 const char *third_char, //third choice 00218 const char *prev_char, //prev in word 00219 const char *next_char, //next in word 00220 const char *next_next_char); //after next next in word 00221 WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00222 const WERD_CHOICE *best_choice, 00223 WERD_CHOICE *raw_choice); 00224 void end_permute(); 00225 void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00226 float rating_limit, 00227 int start, 00228 int end, 00229 WERD_CHOICE *current_word); 00230 bool permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00231 WERD_CHOICE *best_choice, 00232 WERD_CHOICE *raw_choice); 00233 WERD_CHOICE *permute_compound_words( 00234 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00235 float rating_limit); 00239 WERD_CHOICE *permute_fixed_length_words( 00240 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00241 PermuterState *permuter_state); 00243 void incorporate_segcost(WERD_CHOICE* word); 00247 WERD_CHOICE *permute_script_words( 00248 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00249 PermuterState *permuter_state); 00251 WERD_CHOICE *permute_chartype_words( 00252 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00253 PermuterState *permuter_state); 00254 00258 char top_word_chartype(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00259 char* pos_chartypes); 00260 00261 WERD_CHOICE *top_fragments_permute_and_select( 00262 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00263 float rating_limit); 00268 void go_deeper_top_fragments_fxn( 00269 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, 00270 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00271 bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, 00272 WERD_CHOICE *best_choice, int *attempts_left, void *more_args); 00273 00275 bool fragment_state_okay(UNICHAR_ID curr_unichar_id, 00276 float curr_rating, float curr_certainty, 00277 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00278 const char *debug, int word_ending, 00279 CHAR_FRAGMENT_INFO *char_frag_info); 00280 void permute_choices( 00281 const char *debug, 00282 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00283 int char_choice_index, 00284 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00285 WERD_CHOICE *word, 00286 float certainties[], 00287 float *limit, 00288 WERD_CHOICE *best_choice, 00289 int *attempts_left, 00290 void *more_args); 00291 00292 void append_choices( 00293 const char *debug, 00294 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00295 const BLOB_CHOICE &blob_choice, 00296 int char_choice_index, 00297 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00298 WERD_CHOICE *word, 00299 float certainties[], 00300 float *limit, 00301 WERD_CHOICE *best_choice, 00302 int *attempts_left, 00303 void *more_args); 00305 void (Dict::*go_deeper_fxn_)(const char *debug, 00306 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00307 int char_choice_index, 00308 const CHAR_FRAGMENT_INFO *prev_char_frag_info, 00309 bool word_ending, WERD_CHOICE *word, 00310 float certainties[], float *limit, 00311 WERD_CHOICE *best_choice, int *attempts_left, 00312 void *void_more_args); 00313 /* stopper.cpp *************************************************************/ 00314 bool NoDangerousAmbig(WERD_CHOICE *BestChoice, 00315 DANGERR *fixpt, 00316 bool fix_replaceable, 00317 BLOB_CHOICE_LIST_VECTOR *Choices, 00318 bool *modified_blobs); 00327 void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, 00328 UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, 00329 BLOB_CHOICE_LIST_VECTOR *blob_choices, 00330 bool *modified_blobs); 00331 00332 inline void DisableChoiceAccum() { keep_word_choices_ = false; } 00333 inline void EnableChoiceAccum() { keep_word_choices_ = true; } 00334 inline bool ChoiceAccumEnabled() { return keep_word_choices_; } 00335 00337 int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice); 00344 VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice, 00345 FLOAT32 AdjustFactor, 00346 const float Certainties[]); 00348 void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice); 00351 bool StringSameAs(const WERD_CHOICE &WordChoice, 00352 VIABLE_CHOICE ViableChoice); 00354 bool StringSameAs(const char *String, 00355 const char *String_lengths, 00356 VIABLE_CHOICE ViableChoice); 00364 int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices, 00365 const WERD_CHOICE &BestChoice); 00367 bool AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices, 00368 WERD_CHOICE *BestChoice, 00369 DANGERR *fixpt, 00370 ACCEPTABLE_CHOICE_CALLER caller, 00371 bool *modified_blobs); 00375 bool AcceptableResult(const WERD_CHOICE &BestChoice); 00378 int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice); 00386 void LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[], 00387 bool raw_choice, WERD_CHOICE *WordChoice); 00388 void EndDangerousAmbigs(); 00390 bool CurrentBestChoiceIs(const WERD_CHOICE &WordChoice); 00392 FLOAT32 CurrentBestChoiceAdjustFactor(); 00394 bool CurrentWordAmbig(); 00396 void DebugWordChoices(); 00398 void PrintAmbigAlternatives(FILE *file, const char *label, 00399 int label_num_unichars); 00402 void FillViableChoice(const WERD_CHOICE &WordChoice, 00403 FLOAT32 AdjustFactor, const float Certainties[], 00404 bool SameString, VIABLE_CHOICE ViableChoice); 00407 bool AlternativeChoicesWorseThan(FLOAT32 Threshold); 00410 void FilterWordChoices(); 00425 void FindClassifierErrors(FLOAT32 MinRating, 00426 FLOAT32 MaxRating, 00427 FLOAT32 RatingMargin, 00428 FLOAT32 Thresholds[]); 00431 void InitChoiceAccum(); 00433 void ClearBestChoiceAccum(); 00437 void LogNewSegmentation(PIECES_STATE BlobWidth); 00440 void LogNewSplit(int Blob); 00443 void AddNewChunk(VIABLE_CHOICE Choice, int Blob); 00445 void SettupStopperPass1(); 00447 void SettupStopperPass2(); 00448 /* context.cpp *************************************************************/ 00450 int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset); 00453 bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset); 00454 00455 /* dict.cpp ****************************************************************/ 00456 00459 void Load(); 00460 void End(); 00461 00462 // Resets the document dictionary analogous to ResetAdaptiveClassifier. 00463 void ResetDocumentDictionary() { 00464 if (pending_words_ != NULL) 00465 pending_words_->clear(); 00466 if (document_words_ != NULL) 00467 document_words_->clear(); 00468 } 00469 00532 // 00533 int def_letter_is_okay(void* void_dawg_args, 00534 UNICHAR_ID unichar_id, bool word_end); 00535 00536 int (Dict::*letter_is_okay_)(void* void_dawg_args, 00537 UNICHAR_ID unichar_id, bool word_end); 00539 int LetterIsOkay(void* void_dawg_args, 00540 UNICHAR_ID unichar_id, bool word_end) { 00541 return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end); 00542 } 00543 00544 00546 double (Dict::*probability_in_context_)(const char* lang, 00547 const char* context, 00548 int context_bytes, 00549 const char* character, 00550 int character_bytes); 00552 double ProbabilityInContext(const char* context, 00553 int context_bytes, 00554 const char* character, 00555 int character_bytes) { 00556 return (this->*probability_in_context_)( 00557 getImage()->getCCUtil()->lang.string(), 00558 context, context_bytes, 00559 character, character_bytes); 00560 } 00561 00563 double def_probability_in_context( 00564 const char* lang, const char* context, int context_bytes, 00565 const char* character, int character_bytes) { 00566 (void) context; 00567 (void) context_bytes; 00568 (void) character; 00569 (void) character_bytes; 00570 return 0.0; 00571 } 00572 double ngram_probability_in_context(const char* lang, 00573 const char* context, 00574 int context_bytes, 00575 const char* character, 00576 int character_bytes); 00577 00579 inline const int NumDawgs() const { return dawgs_.size(); } 00581 inline const Dawg *GetDawg(int index) const { return dawgs_[index]; } 00583 inline const Dawg *GetPuncDawg() const { return punc_dawg_; } 00585 inline const Dawg *GetFixedLengthDawg(int word_length) const { 00586 if (word_length > max_fixed_length_dawgs_wdlen_) return NULL; 00587 assert(dawgs_.size() > word_length); 00588 return dawgs_[word_length]; 00589 } 00590 inline const int GetMaxFixedLengthDawgIndex() const { 00591 return max_fixed_length_dawgs_wdlen_; 00592 } 00594 static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) { 00595 if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg 00596 NODE_REF node = dawg->next_node(edge_ref); 00597 if (node == 0) node = NO_EDGE; // end of word 00598 return node; 00599 } 00605 inline bool ConstraintsOk(const DawgInfoVector &constraints, 00606 int word_end, DawgType current_dawg_type) { 00607 if (!word_end) return true; 00608 if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true; 00609 for (int c = 0; c < constraints.length(); ++c) { 00610 const DawgInfo &cinfo = constraints[c]; 00611 Dawg *cdawg = dawgs_[cinfo.dawg_index]; 00612 if (!cdawg->end_of_word(cinfo.ref)) { 00613 if (dawg_debug_level >= 3) { 00614 tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n", 00615 cinfo.dawg_index, cinfo.ref); 00616 } 00617 return false; 00618 } 00619 } 00620 return true; 00621 } 00622 00628 void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info, 00629 UNICHAR_ID unichar_id, bool word_end, 00630 DawgArgs *dawg_args, PermuterType *current_permuter); 00631 00635 00641 static void ReadFixedLengthDawgs(DawgType type, const STRING &lang, 00642 PermuterType perm, int debug_level, 00643 FILE *file, DawgVector *dawg_vec, 00644 int *max_wdlen); 00647 static void WriteFixedLengthDawgs( 00648 const GenericVector<SquishedDawg *> &dawg_vec, 00649 int num_dawgs, int debug_level, FILE *output_file); 00650 00652 inline bool valid_word_permuter(uinT8 perm, bool numbers_ok) { 00653 return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || 00654 perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM || 00655 perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM)); 00656 } 00657 int valid_word(const WERD_CHOICE &word, bool numbers_ok); 00658 int valid_word(const WERD_CHOICE &word) { 00659 return valid_word(word, false); // return NO_PERM for words with digits 00660 } 00661 int valid_word_or_number(const WERD_CHOICE &word) { 00662 return valid_word(word, true); // return NUMBER_PERM for valid numbers 00663 } 00665 int valid_word(const char *string) { 00666 WERD_CHOICE word(string, getUnicharset()); 00667 return valid_word(word); 00668 } 00673 bool valid_punctuation(const WERD_CHOICE &word); 00675 int good_choice(const WERD_CHOICE &choice); 00677 void add_document_word(const WERD_CHOICE &best_choice); 00678 int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices, 00679 const UNICHARSET &unicharset); 00681 void adjust_word(WERD_CHOICE *word, float *certainty_array, 00682 const BLOB_CHOICE_LIST_VECTOR *char_choices, 00683 bool nonword, float additional_adjust, bool debug); 00684 void adjust_word(WERD_CHOICE *word, float *certainty_array, bool debug) { 00685 adjust_word(word, certainty_array, NULL, false, 0.0f, debug); 00686 } 00687 void adjust_non_word(WERD_CHOICE *word, float *certainty_array, bool debug) { 00688 adjust_word(word, certainty_array, NULL, true, 0.0f, debug); 00689 } 00691 inline void SetWordsegRatingAdjustFactor(float f) { 00692 wordseg_rating_adjust_factor_ = f; 00693 } 00694 00695 private: 00697 Image* image_ptr_; 00704 UnicharAmbigs *dang_ambigs_table_; 00706 UnicharAmbigs *replace_ambigs_table_; 00711 bool keep_word_choices_; 00713 FLOAT32 reject_offset_; 00715 PIECES_STATE current_segmentation_; 00717 VIABLE_CHOICE best_raw_choice_; 00718 LIST raw_choices_; 00719 LIST best_choices_; 00720 // Hyphen-related variables. 00721 UNICHAR_ID hyphen_unichar_id_; 00722 WERD_CHOICE *hyphen_word_; 00723 DawgInfoVector hyphen_active_dawgs_; 00724 DawgInfoVector hyphen_constraints_; 00725 bool last_word_on_line_; 00726 // Dawgs. 00727 DawgVector dawgs_; 00728 SuccessorListsVector successors_; 00729 Trie *pending_words_; 00732 // TODO(daria): need to support multiple languages in the future, 00733 // so maybe will need to maintain a list of dawgs of each kind. 00734 Dawg *freq_dawg_; 00735 Dawg *punc_dawg_; 00736 Trie *document_words_; 00739 int max_fixed_length_dawgs_wdlen_; 00742 float wordseg_rating_adjust_factor_; 00743 00744 public: 00748 STRING_VAR_H(user_words_suffix, "", "A list of user-provided words."); 00749 STRING_VAR_H(user_patterns_suffix, "", 00750 "A list of user-provided patterns."); 00751 BOOL_VAR_H(load_system_dawg, true, "Load system word dawg."); 00752 BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg."); 00753 BOOL_VAR_H(load_punc_dawg, true, 00754 "Load dawg with punctuation patterns."); 00755 BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns."); 00756 BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length" 00757 " dawgs (e.g. for non-space delimited languages)"); 00758 double_VAR_H(segment_penalty_dict_frequent_word, 1.0, 00759 "Score multiplier for word matches which have good case and" 00760 "are frequent in the given language (lower is better)."); 00761 00762 double_VAR_H(segment_penalty_dict_case_ok, 1.1, 00763 "Score multiplier for word matches that have good case " 00764 "(lower is better)."); 00765 00766 double_VAR_H(segment_penalty_dict_case_bad, 1.3125, 00767 "Default score multiplier for word matches, which may have " 00768 "case issues (lower is better)."); 00769 00770 // TODO(daria): remove this param when ngram permuter is deprecated. 00771 double_VAR_H(segment_penalty_ngram_best_choice, 1.24, 00772 "Multipler to for the best choice from the ngram model."); 00773 00774 double_VAR_H(segment_penalty_dict_nonword, 1.25, 00775 "Score multiplier for glyph fragment segmentations which " 00776 "do not match a dictionary word (lower is better)."); 00777 00778 double_VAR_H(segment_penalty_garbage, 1.50, 00779 "Score multiplier for poorly cased strings that are not in" 00780 " the dictionary and generally look like garbage (lower is" 00781 " better)."); 00782 INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info" 00783 ", to 2 for more details, to 3 to see all the debug messages"); 00784 INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words."); 00785 INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list."); 00786 BOOL_VAR_H(use_only_first_uft8_step, false, 00787 "Use only the first UTF8 step of the given string" 00788 " when computing log probabilities."); 00789 double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor"); 00790 double_VAR_H(stopper_nondict_certainty_base, -2.50, 00791 "Certainty threshold for non-dict words"); 00792 double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0, 00793 "Reject certainty offset"); 00794 INT_VAR_H(stopper_smallword_size, 2, 00795 "Size of dict word to be treated as non-dict word"); 00796 double_VAR_H(stopper_certainty_per_char, -0.50, 00797 "Certainty to add for each dict char above small word size."); 00798 double_VAR_H(stopper_allowable_character_badness, 3.0, 00799 "Max certaintly variation allowed in a word (in sigma)"); 00800 INT_VAR_H(stopper_debug_level, 0, "Stopper debug level"); 00801 BOOL_VAR_H(stopper_no_acceptable_choices, false, 00802 "Make AcceptableChoice() always return false. Useful" 00803 " when there is a need to explore all segmentations"); 00804 BOOL_VAR_H(save_raw_choices, false, "Save all explored raw choices"); 00805 INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list"); 00806 STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information" 00807 " should be printed to stdout"); 00808 STRING_VAR_H(word_to_debug_lengths, "", 00809 "Lengths of unichars in word_to_debug"); 00810 INT_VAR_H(fragments_debug, 0, "Debug character fragments"); 00811 INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process"); 00812 BOOL_VAR_H(permute_debug, 0, "Debug char permutation process"); 00813 double_VAR_H(bestrate_pruning_factor, 2.0, "Multiplying factor of" 00814 " current best rate to prune other hypotheses"); 00815 BOOL_VAR_H(permute_script_word, 0, 00816 "Turn on word script consistency permuter"); 00817 BOOL_VAR_H(segment_segcost_rating, 0, 00818 "incorporate segmentation cost in word rating?"); 00819 double_VAR_H(segment_reward_script, 0.95, 00820 "Score multipler for script consistency within a word. " 00821 "Being a 'reward' factor, it should be <= 1. " 00822 "Smaller value implies bigger reward."); 00823 BOOL_VAR_H(permute_fixed_length_dawg, 0, 00824 "Turn on fixed-length phrasebook search permuter"); 00825 BOOL_VAR_H(permute_chartype_word, 0, 00826 "Turn on character type (property) consistency permuter"); 00827 double_VAR_H(segment_reward_chartype, 0.97, 00828 "Score multipler for char type consistency within a word. "); 00829 // TODO(daria): remove this param when ngram permuter is deprecated. 00830 double_VAR_H(segment_reward_ngram_best_choice, 0.99, 00831 "Score multipler for ngram permuter's best choice" 00832 " (only used in the Han script path)."); 00833 BOOL_VAR_H(save_doc_words, 0, "Save Document Words"); 00834 BOOL_VAR_H(doc_dict_enable, 1, "Enable Document Dictionary "); 00835 double_VAR_H(doc_dict_pending_threshold, 0.0, 00836 "Worst certainty for using pending dictionary"); 00837 double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty" 00838 " for words that can be inserted into the document dictionary"); 00839 BOOL_VAR_H(ngram_permuter_activated, false, 00840 "Activate character-level n-gram-based permuter"); 00841 INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different" 00842 " character choices to consider during permutation." 00843 " This limit is especially useful when user patterns" 00844 " are specified, since overly generic patterns can result in" 00845 " dawg search exploring an overly large number of options."); 00846 BOOL_VAR_H(permute_only_top, false, "Run only the top choice permuter"); 00847 }; 00848 } // namespace tesseract 00849 00850 #endif // THIRD_PARTY_TESSERACT_DICT_DICT_H_