00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00018
00019 #ifndef TESSERACT_DICT_DICT_H_
00020 #define TESSERACT_DICT_DICT_H_
00021
00022 #include "ambigs.h"
00023 #include "choices.h"
00024 #include "choicearr.h"
00025 #include "dawg.h"
00026 #include "image.h"
00027 #include "ratngs.h"
00028 #include "stopper.h"
00029 #include "trie.h"
00030 #include "unicharset.h"
00031
00032 extern STRING_VAR_H(global_user_words_suffix, "user-words",
00033 "A list of user-provided words.");
00034 extern INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
00035
// Word-level constants.
//
// Every replacement list is fully parenthesized so that the macro expands to
// a single primary expression no matter which operators surround it at the
// point of use (e.g. `sizeof MAX_WERD_LENGTH`).

// Capacity of per-word arrays such as DawgArgs::rating_array; typed inT64.
#define MAX_WERD_LENGTH (static_cast<inT64>(40))
// Sentinel stored in rating slots that have not been assigned a rating.
#define NO_RATING (-1)
// Word-quality constants, in increasing order:
// FREQ_WERD < GOOD_WERD < OK_WERD.
// NOTE(review): presumably rating multipliers per word class -- confirm
// against the code that uses them.
#define FREQ_WERD (1.0)
#define GOOD_WERD (1.1)
#define OK_WERD (1.3125)
00041
00043 struct CHAR_FRAGMENT_INFO {
00044 UNICHAR_ID unichar_id;
00045 const CHAR_FRAGMENT *fragment;
00046 int num_fragments;
00047 float rating;
00048 float certainty;
00049 };
00050
00051 namespace tesseract {
00052
00053 typedef GenericVector<Dawg *> DawgVector;
00054
00055 struct DawgArgs {
00056 DawgArgs(DawgInfoVector *d, DawgInfoVector *c,
00057 DawgInfoVector *ud, DawgInfoVector *uc, float r) :
00058 active_dawgs(d), constraints(c), updated_active_dawgs(ud),
00059 updated_constraints(uc), rating_margin(r) {
00060 for (int i = 0; i < MAX_WERD_LENGTH; ++i) {
00061 rating_array[i] = NO_RATING;
00062 }
00063 permuter = NO_PERM;
00064 }
00065 DawgInfoVector *active_dawgs;
00066 DawgInfoVector *constraints;
00067 DawgInfoVector *updated_active_dawgs;
00068 DawgInfoVector *updated_constraints;
00069 PermuterType permuter;
00070 float rating_margin;
00071 float rating_array[MAX_WERD_LENGTH];
00072 };
00073
00074 class Dict {
00075 public:
00076 Dict(Image* image_ptr);
00077 ~Dict();
00078 Image* getImage() {
00079 return image_ptr_;
00080 }
00081 UNICHARSET& getUnicharset() {
00082 return getImage()->getCCUtil()->unicharset;
00083 }
00084 const UnicharAmbigs &getUnicharAmbigs() {
00085 return getImage()->getCCUtil()->unichar_ambigs;
00086 }
00087
00088
00089
00091 inline bool hyphenated() { return !last_word_on_line_ && hyphen_word_; }
00093 inline int hyphen_base_size() {
00094 return this->hyphenated() ? hyphen_word_->length() : 0;
00095 }
00099 inline void copy_hyphen_info(WERD_CHOICE *word) {
00100 if (this->hyphenated()) {
00101 *word = *hyphen_word_;
00102 if (hyphen_debug_level) word->print("copy_hyphen_info: ");
00103 }
00104 }
00108 inline void remove_hyphen_head(WERD_CHOICE *word) {
00109 if (this->hyphenated()) {
00110 word->remove_unichar_ids(0, hyphen_word_->length());
00111 if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: ");
00112 }
00113 }
00115 inline bool has_hyphen_end(const WERD_CHOICE &word) {
00116 int word_index = word.length() - 1;
00117 return (last_word_on_line_ && word_index > 0 &&
00118 word.unichar_id(word_index) == hyphen_unichar_id_);
00119 }
00123 void reset_hyphen_vars(bool last_word_on_line);
00126 void set_hyphen_word(const WERD_CHOICE &word,
00127 const DawgInfoVector &active_dawgs,
00128 const DawgInfoVector &constraints);
00129
00130
00133 inline void update_best_choice(
00134 const WERD_CHOICE &word, WERD_CHOICE *best_choice) {
00135 if (word.rating() < best_choice->rating()) {
00136 *best_choice = word;
00137 }
00138 }
00142 void init_active_dawgs(DawgInfoVector *active_dawgs);
00145 void init_constraints(DawgInfoVector *constraints);
00151 WERD_CHOICE *dawg_permute_and_select(
00152 const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit);
00153 void adjust_word(WERD_CHOICE *best_choice,
00154 float *certainty_array);
00162 void go_deeper_dawg_fxn(
00163 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
00164 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00165 bool word_ending, WERD_CHOICE *word, float certainties[],
00166 float *limit, WERD_CHOICE *best_choice, void *void_more_args);
00167
00168
00169 void add_document_word(const WERD_CHOICE &best_choice);
00170 void init_permute();
00171 WERD_CHOICE *permute_top_choice(
00172 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00173 float* rating_limit,
00174 WERD_CHOICE *raw_choice,
00175 BOOL8 *any_alpha);
00176 const char* choose_il1(const char *first_char,
00177 const char *second_char,
00178 const char *third_char,
00179 const char *prev_char,
00180 const char *next_char,
00181 const char *next_next_char);
00183 int valid_word(const WERD_CHOICE &word) {
00184 return valid_word(word, false);
00185 }
00187 int valid_word_or_number(const WERD_CHOICE &word) {
00188 return valid_word(word, true);
00189 }
00190 int valid_word(const WERD_CHOICE &word, bool numbers_ok);
00191 bool valid_punctuation(const WERD_CHOICE &word);
00192 WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00193 float rating_limit,
00194 WERD_CHOICE *raw_choice);
00195 void end_permute();
00196 void adjust_non_word(WERD_CHOICE *word, float *adjust_factor);
00197 void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00198 float rating_limit,
00199 int start,
00200 int end,
00201 WERD_CHOICE *current_word);
00202 void permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00203 float limit,
00204 WERD_CHOICE *best_choice,
00205 WERD_CHOICE *raw_choice);
00206 WERD_CHOICE *permute_compound_words(
00207 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00208 float rating_limit);
00210 bool word_script_eq(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00211 int target_script_id);
00213 void incorporate_segcost(WERD_CHOICE* word);
00215 WERD_CHOICE *permute_script_words(
00216 const BLOB_CHOICE_LIST_VECTOR &char_choices);
00217
00218 WERD_CHOICE *top_fragments_permute_and_select(
00219 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00220 float rating_limit);
00225 void go_deeper_top_fragments_fxn(
00226 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
00227 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00228 bool word_ending, WERD_CHOICE *word, float certainties[],
00229 float *limit, WERD_CHOICE *best_choice, void *more_args);
00230
00232 bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
00233 float curr_rating, float curr_certainty,
00234 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00235 const char *debug, int word_ending,
00236 CHAR_FRAGMENT_INFO *char_frag_info);
00237 void permute_choices(
00238 const char *debug,
00239 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00240 int char_choice_index,
00241 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00242 WERD_CHOICE *word,
00243 float certainties[],
00244 float *limit,
00245 WERD_CHOICE *best_choice,
00246 void *more_args);
00247
00248 void append_choices(
00249 const char *debug,
00250 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00251 const BLOB_CHOICE &blob_choice,
00252 int char_choice_index,
00253 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00254 WERD_CHOICE *word,
00255 float certainties[],
00256 float *limit,
00257 WERD_CHOICE *best_choice,
00258 void *more_args);
00259
00260 void (Dict::*go_deeper_fxn_)(const char *debug,
00261 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00262 int char_choice_index,
00263 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00264 bool word_ending, WERD_CHOICE *word,
00265 float certainties[], float *limit,
00266 WERD_CHOICE *best_choice, void *void_more_args);
00267
00268 int NoDangerousAmbig(WERD_CHOICE *BestChoice,
00269 DANGERR *fixpt,
00270 bool fix_replaceable,
00271 BLOB_CHOICE_LIST_VECTOR *Choices,
00272 bool *modified_blobs);
00273 void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
00274 UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
00275 BLOB_CHOICE_LIST_VECTOR *blob_choices,
00276 bool *modified_blobs);
00277
00278 inline void DisableChoiceAccum() { keep_word_choices_ = FALSE; }
00279 inline void EnableChoiceAccum() { keep_word_choices_ = TRUE; }
00280
00281 int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
00282 VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice,
00283 FLOAT32 AdjustFactor,
00284 const float Certainties[]);
00285 void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice);
00286 int StringSameAs(const char *String,
00287 const char *String_lengths,
00288 VIABLE_CHOICE ViableChoice);
00289 bool StringSameAs(const WERD_CHOICE &WordChoice,
00290 VIABLE_CHOICE ViableChoice);
00291 int AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
00292 WERD_CHOICE *BestChoice,
00293 const WERD_CHOICE &RawChoice,
00294 DANGERR *fixpt,
00295 ACCEPTABLE_CHOICE_CALLER caller,
00296 bool *modified_blobs);
00297 int AcceptableResult(const WERD_CHOICE &BestChoice,
00298 const WERD_CHOICE &RawChoice);
00299 int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice);
00300 void LogNewChoice(const WERD_CHOICE &WordChoice, FLOAT32 AdjustFactor,
00301 const float Certainties[], bool raw_choice);
00302 void EndDangerousAmbigs();
00303 int CurrentBestChoiceIs(const WERD_CHOICE &WordChoice);
00304 FLOAT32 CurrentBestChoiceAdjustFactor();
00305 int CurrentWordAmbig();
00306 void DebugWordChoices();
00307 void PrintAmbigAlternatives(FILE *file, const char *label,
00308 int label_num_unichars);
00309 void FillViableChoice(const WERD_CHOICE &WordChoice,
00310 FLOAT32 AdjustFactor, const float Certainties[],
00311 bool SameString, VIABLE_CHOICE ViableChoice);
00312 int AlternativeChoicesWorseThan(FLOAT32 Threshold);
00313 void FilterWordChoices();
00314 void FindClassifierErrors(FLOAT32 MinRating,
00315 FLOAT32 MaxRating,
00316 FLOAT32 RatingMargin,
00317 FLOAT32 Thresholds[]);
00318 void InitChoiceAccum();
00319 void LogNewSegmentation(PIECES_STATE BlobWidth);
00320 void LogNewSplit(int Blob);
00321 void SettupStopperPass1();
00322 void SettupStopperPass2();
00323
00324 void print_word_string(const char* str);
00325 void print_word_choice(const char *label, A_CHOICE* choice);
00326 void print_choices(const char *label,
00327 CHOICES rating);
00328
00329 A_CHOICE *ngram_permute_and_select(CHOICES_LIST char_choices,
00330 float rating_limit,
00331 const Dawg *dawg);
00332
00333
00396 int def_letter_is_okay(void* void_dawg_args, int word_index,
00397 const void* word, bool word_end);
00398
00399 int new_letter_is_okay(void* void_dawg_args, int word_index,
00400 const void* word, bool word_end);
00401 int (Dict::*letter_is_okay_)(void* void_dawg_args, int word_index,
00402 const void *word, bool word_end);
00404 inline const int NumDawgs() const { return dawgs_.size(); }
00406 inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
00414 inline bool ConstraintsOk(const DawgInfoVector &constraints,
00415 int word_end, DawgType current_dawg_type) {
00416 if (!word_end) return true;
00417 if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true;
00418 for (int c = 0; c < constraints.length(); ++c) {
00419 const DawgInfo &cinfo = constraints[c];
00420 Dawg *cdawg = dawgs_[cinfo.dawg_index];
00421 if (!cdawg->end_of_word(cinfo.ref)) {
00422 if (dawg_debug_level >= 3) {
00423 tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n",
00424 cinfo.dawg_index, cinfo.ref);
00425 }
00426 return false;
00427 }
00428 }
00429 return true;
00430 }
00432 static inline void UpdatePermuter(PermuterType new_permuter,
00433 PermuterType *permuter) {
00434 if (dawg_debug_level >= 3) tprintf("Letter found\n");
00435 if (new_permuter > *permuter) *permuter = new_permuter;
00436 }
00437
00438
00439
00440
00441 void LogNewWordChoice(A_CHOICE *a_choice,
00442 FLOAT32 adjust_factor,
00443 const float certainties[],
00444 const UNICHARSET &unicharset);
00445 int valid_word(const char *string);
00446
00447 private:
00449 Image* image_ptr_;
00456 UnicharAmbigs *dang_ambigs_table_;
00458 UnicharAmbigs *replace_ambigs_table_;
00463 BOOL8 keep_word_choices_;
00465 FLOAT32 reject_offset_;
00467 PIECES_STATE current_segmentation_;
00469 VIABLE_CHOICE best_raw_choice_;
00470 LIST raw_choices_;
00471 LIST best_choices_;
00472
00473 UNICHAR_ID hyphen_unichar_id_;
00474 WERD_CHOICE *hyphen_word_;
00475 DawgInfoVector hyphen_active_dawgs_;
00476 DawgInfoVector hyphen_constraints_;
00477 bool last_word_on_line_;
00478
00479 DawgVector dawgs_;
00480 SuccessorListsVector successors_;
00481 Dawg *freq_dawg_;
00482 Trie *pending_words_;
00485
00486
00487 Trie *document_words_;
00488 };
00489 }
00490
00491 #endif  // TESSERACT_DICT_DICT_H_