Tesseract 3.01
|
00001 00002 // File: classify.h 00003 // Description: classify class. 00004 // Author: Samuel Charron 00005 // 00006 // (C) Copyright 2006, Google Inc. 00007 // Licensed under the Apache License, Version 2.0 (the "License"); 00008 // you may not use this file except in compliance with the License. 00009 // You may obtain a copy of the License at 00010 // http://www.apache.org/licenses/LICENSE-2.0 00011 // Unless required by applicable law or agreed to in writing, software 00012 // distributed under the License is distributed on an "AS IS" BASIS, 00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00014 // See the License for the specific language governing permissions and 00015 // limitations under the License. 00016 // 00018 00019 #ifndef TESSERACT_CLASSIFY_CLASSIFY_H__ 00020 #define TESSERACT_CLASSIFY_CLASSIFY_H__ 00021 00022 #include "adaptive.h" 00023 #include "ccstruct.h" 00024 #include "classify.h" 00025 #include "dict.h" 00026 #include "featdefs.h" 00027 #include "intfx.h" 00028 #include "intmatcher.h" 00029 #include "normalis.h" 00030 #include "ratngs.h" 00031 #include "ocrfeatures.h" 00032 #include "unicity_table.h" 00033 00034 class ScrollView; 00035 class WERD_CHOICE; 00036 class WERD_RES; 00037 struct ADAPT_RESULTS; 00038 struct NORM_PROTOS; 00039 00040 static const int kUnknownFontinfoId = -1; 00041 static const int kBlankFontinfoId = -2; 00042 00043 namespace tesseract { 00044 00045 // How segmented is a blob. In this enum, character refers to a classifiable 00046 // unit, but that is too long and character is usually easier to understand. 00047 enum CharSegmentationType { 00048 CST_FRAGMENT, // A partial character. 00049 CST_WHOLE, // A correctly segmented character. 00050 CST_IMPROPER, // More than one but less than 2 characters. 00051 CST_NGRAM // Multiple characters. 00052 }; 00053 00054 class Classify : public CCStruct { 00055 public: 00056 Classify(); 00057 virtual ~Classify(); 00058 Dict& getDict() { 00059 return dict_; 00060 } 00061 00062 // Set the denorm for classification. Takes a copy. 00063 void set_denorm(const DENORM* denorm) { 00064 denorm_ = *denorm; 00065 } 00066 00067 /* adaptive.cpp ************************************************************/ 00068 ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset); 00069 int GetFontinfoId(ADAPT_CLASS Class, uinT8 ConfigId); 00070 int ClassPruner(INT_TEMPLATES IntTemplates, 00071 inT16 NumFeatures, 00072 INT_FEATURE_ARRAY Features, 00073 CLASS_NORMALIZATION_ARRAY NormalizationFactors, 00074 CLASS_CUTOFF_ARRAY ExpectedNumFeatures, 00075 CLASS_PRUNER_RESULTS Results); 00076 void ReadNewCutoffs(FILE *CutoffFile, inT64 end_offset, 00077 CLASS_CUTOFF_ARRAY Cutoffs); 00078 void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates); 00079 void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates); 00080 ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File); 00081 /* normmatch.cpp ************************************************************/ 00082 FLOAT32 ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature, BOOL8 DebugMatch); 00083 void FreeNormProtos(); 00084 NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset); 00085 /* protos.cpp ***************************************************************/ 00086 void ReadClassFile(); 00087 void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class); 00088 INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos, 00089 const UNICHARSET& target_unicharset); 00090 /* adaptmatch.cpp ***********************************************************/ 00091 // Learn the given word using its chopped_word, seam_array, denorm, 00092 // box_word, best_state, and correct_text to learn both correctly and 00093 // incorrectly segmented blobs. If filename is not NULL, then LearnBlob 00094 // is called and the data will be written to a file for static training. 00095 // Otherwise AdaptToBlob is called for adaption within a document. 00096 // If rejmap is not NULL, then only chars with a rejmap entry of '1' will 00097 // be learned, otherwise all chars with good correct_text are learned. 00098 void LearnWord(const char* filename, const char *rejmap, WERD_RES *word); 00099 00100 // Builds a blob of length fragments, from the word, starting at start, 00101 // and then learn it, as having the given correct_text. 00102 // If filename is not NULL, then LearnBlob 00103 // is called and the data will be written to a file for static training. 00104 // Otherwise AdaptToBlob is called for adaption within a document. 00105 // threshold is a magic number required by AdaptToChar and generated by 00106 // GetAdaptThresholds. 00107 // Although it can be partly inferred from the string, segmentation is 00108 // provided to explicitly clarify the character segmentation. 00109 void LearnPieces(const char* filename, int start, int length, 00110 float threshold, CharSegmentationType segmentation, 00111 const char* correct_text, WERD_RES *word); 00112 void InitAdaptiveClassifier(bool load_pre_trained_templates); 00113 void InitAdaptedClass(TBLOB *Blob, 00114 CLASS_ID ClassId, 00115 int FontinfoId, 00116 ADAPT_CLASS Class, 00117 ADAPT_TEMPLATES Templates); 00118 void AdaptToPunc(TBLOB *Blob, 00119 CLASS_ID ClassId, 00120 int FontinfoId, 00121 FLOAT32 Threshold); 00122 void AmbigClassifier(TBLOB *Blob, 00123 INT_TEMPLATES Templates, 00124 ADAPT_CLASS *Classes, 00125 UNICHAR_ID *Ambiguities, 00126 ADAPT_RESULTS *Results); 00127 void MasterMatcher(INT_TEMPLATES templates, 00128 inT16 num_features, 00129 INT_FEATURE_ARRAY features, 00130 CLASS_NORMALIZATION_ARRAY norm_factors, 00131 ADAPT_CLASS* classes, 00132 int debug, 00133 int num_classes, 00134 const TBOX& blob_box, 00135 CLASS_PRUNER_RESULTS results, 00136 ADAPT_RESULTS* final_results); 00137 void ConvertMatchesToChoices(ADAPT_RESULTS *Results, 00138 BLOB_CHOICE_LIST *Choices); 00139 void AddNewResult(ADAPT_RESULTS *results, 00140 CLASS_ID class_dd, 00141 FLOAT32 rating, 00142 int config, 00143 int config2, 00144 int fontinfo_id, 00145 int fontinfo_id2); 00146 int GetAdaptiveFeatures(TBLOB *Blob, 00147 INT_FEATURE_ARRAY IntFeatures, 00148 FEATURE_SET *FloatFeatures); 00149 00150 #ifndef GRAPHICS_DISABLED 00151 void DebugAdaptiveClassifier(TBLOB *Blob, 00152 ADAPT_RESULTS *Results); 00153 #endif 00154 void GetAdaptThresholds (TWERD * Word, 00155 const WERD_CHOICE& BestChoice, 00156 const WERD_CHOICE& BestRawChoice, 00157 FLOAT32 Thresholds[]); 00158 00159 PROTO_ID MakeNewTempProtos(FEATURE_SET Features, 00160 int NumBadFeat, 00161 FEATURE_ID BadFeat[], 00162 INT_CLASS IClass, 00163 ADAPT_CLASS Class, 00164 BIT_VECTOR TempProtoMask); 00165 int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, 00166 CLASS_ID ClassId, 00167 int FontinfoId, 00168 int NumFeatures, 00169 INT_FEATURE_ARRAY Features, 00170 FEATURE_SET FloatFeatures); 00171 void MakePermanent(ADAPT_TEMPLATES Templates, 00172 CLASS_ID ClassId, 00173 int ConfigId, 00174 TBLOB *Blob); 00175 void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results); 00176 void RemoveExtraPuncs(ADAPT_RESULTS *Results); 00177 void RemoveBadMatches(ADAPT_RESULTS *Results); 00178 void SetAdaptiveThreshold(FLOAT32 Threshold); 00179 void ShowBestMatchFor(TBLOB *Blob, 00180 CLASS_ID ClassId, 00181 BOOL8 AdaptiveOn, 00182 BOOL8 PreTrainedOn); 00183 UNICHAR_ID *BaselineClassifier(TBLOB *Blob, 00184 ADAPT_TEMPLATES Templates, 00185 ADAPT_RESULTS *Results); 00186 int CharNormClassifier(TBLOB *Blob, 00187 INT_TEMPLATES Templates, 00188 ADAPT_RESULTS *Results); 00189 UNICHAR_ID *GetAmbiguities(TBLOB *Blob, 00190 CLASS_ID CorrectClass); 00191 void DoAdaptiveMatch(TBLOB *Blob, 00192 ADAPT_RESULTS *Results); 00193 void AdaptToChar(TBLOB *Blob, 00194 CLASS_ID ClassId, 00195 int FontinfoId, 00196 FLOAT32 Threshold); 00197 void DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class); 00198 int AdaptableWord(TWERD *Word, 00199 const WERD_CHOICE &BestChoiceWord, 00200 const WERD_CHOICE &RawChoiceWord); 00201 void EndAdaptiveClassifier(); 00202 void PrintAdaptiveStatistics(FILE *File); 00203 void SettupPass1(); 00204 void SettupPass2(); 00205 void AdaptiveClassifier(TBLOB *Blob, 00206 BLOB_CHOICE_LIST *Choices, 00207 CLASS_PRUNER_RESULTS cp_results); 00208 void ClassifyAsNoise(ADAPT_RESULTS *Results); 00209 void ResetAdaptiveClassifier(); 00210 00211 int GetBaselineFeatures(TBLOB *Blob, 00212 INT_TEMPLATES Templates, 00213 INT_FEATURE_ARRAY IntFeatures, 00214 CLASS_NORMALIZATION_ARRAY CharNormArray, 00215 inT32 *BlobLength); 00216 FLOAT32 GetBestRatingFor(TBLOB *Blob, 00217 CLASS_ID ClassId); 00218 int GetCharNormFeatures(TBLOB *Blob, 00219 INT_TEMPLATES Templates, 00220 INT_FEATURE_ARRAY IntFeatures, 00221 CLASS_NORMALIZATION_ARRAY CharNormArray, 00222 inT32 *BlobLength, 00223 inT32 *FeatureOutlineIndex); 00224 int GetIntBaselineFeatures(TBLOB *Blob, 00225 INT_TEMPLATES Templates, 00226 INT_FEATURE_ARRAY IntFeatures, 00227 CLASS_NORMALIZATION_ARRAY CharNormArray, 00228 inT32 *BlobLength); 00229 int GetIntCharNormFeatures(TBLOB *Blob, 00230 INT_TEMPLATES Templates, 00231 INT_FEATURE_ARRAY IntFeatures, 00232 CLASS_NORMALIZATION_ARRAY CharNormArray, 00233 inT32 *BlobLength, 00234 inT32 *FeatureOutlineArray); 00235 00236 bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config); 00237 void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob); 00238 00239 void ResetFeaturesHaveBeenExtracted(); 00240 bool AdaptiveClassifierIsFull() { return NumAdaptationsFailed > 0; } 00241 bool LooksLikeGarbage(TBLOB *blob); 00242 void RefreshDebugWindow(ScrollView **win, const char *msg, 00243 int y_offset, const TBOX &wbox); 00244 /* float2int.cpp ************************************************************/ 00245 void ComputeIntCharNormArray(FEATURE NormFeature, 00246 INT_TEMPLATES Templates, 00247 CLASS_NORMALIZATION_ARRAY CharNormArray); 00248 void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures); 00249 /* intproto.cpp *************************************************************/ 00250 INT_TEMPLATES ReadIntTemplates(FILE *File); 00251 void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, 00252 const UNICHARSET& target_unicharset); 00253 CLASS_ID GetClassToDebug(const char *Prompt, bool* adaptive_on, 00254 bool* pretrained_on); 00255 void ShowMatchDisplay(); 00256 /* font detection ***********************************************************/ 00257 UnicityTable<FontInfo>& get_fontinfo_table() { 00258 return fontinfo_table_; 00259 } 00260 UnicityTable<FontSet>& get_fontset_table() { 00261 return fontset_table_; 00262 } 00263 /* mfoutline.cpp ***********************************************************/ 00264 void NormalizeOutlines(LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale); 00265 /* outfeat.cpp ***********************************************************/ 00266 FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob); 00267 /* picofeat.cpp ***********************************************************/ 00268 FEATURE_SET ExtractPicoFeatures(TBLOB *Blob); 00269 00270 00271 // Member variables. 00272 00273 // Parameters. 00274 INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP"); 00275 BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier"); 00276 INT_VAR_H(classify_debug_level, 0, "Classify debug level"); 00277 00278 /* mfoutline.cpp ***********************************************************/ 00279 /* control knobs used to control normalization of outlines */ 00280 INT_VAR_H(classify_norm_method, character, "Normalization Method ..."); 00281 double_VAR_H(classify_char_norm_range, 0.2, 00282 "Character Normalization Range ..."); 00283 double_VAR_H(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ..."); 00284 double_VAR_H(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ..."); 00285 double_VAR_H(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ..."); 00286 double_VAR_H(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ..."); 00287 00288 /* adaptmatch.cpp ***********************************************************/ 00289 BOOL_VAR_H(tess_cn_matching, 0, "Character Normalized Matching"); 00290 BOOL_VAR_H(tess_bn_matching, 0, "Baseline Normalized Matching"); 00291 BOOL_VAR_H(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier"); 00292 BOOL_VAR_H(classify_use_pre_adapted_templates, 0, 00293 "Use pre-adapted classifier templates"); 00294 BOOL_VAR_H(classify_save_adapted_templates, 0, 00295 "Save adapted templates to a file"); 00296 BOOL_VAR_H(classify_enable_adaptive_debugger, 0, "Enable match debugger"); 00297 INT_VAR_H(matcher_debug_level, 0, "Matcher Debug Level"); 00298 INT_VAR_H(matcher_debug_flags, 0, "Matcher Debug Flags"); 00299 INT_VAR_H(classify_learning_debug_level, 0, "Learning Debug Level: "); 00300 double_VAR_H(matcher_good_threshold, 0.125, "Good Match (0-1)"); 00301 double_VAR_H(matcher_great_threshold, 0.0, "Great Match (0-1)"); 00302 double_VAR_H(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)"); 00303 double_VAR_H(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)"); 00304 double_VAR_H(matcher_rating_margin, 0.1, "New template margin (0-1)"); 00305 double_VAR_H(matcher_avg_noise_size, 12.0, "Avg. noise blob length: "); 00306 INT_VAR_H(matcher_permanent_classes_min, 1, "Min # of permanent classes"); 00307 INT_VAR_H(matcher_min_examples_for_prototyping, 3, 00308 "Reliable Config Threshold"); 00309 INT_VAR_H(matcher_sufficient_examples_for_prototyping, 5, 00310 "Enable adaption even if the ambiguities have not been seen"); 00311 double_VAR_H(matcher_clustering_max_angle_delta, 0.015, 00312 "Maximum angle delta for prototype clustering"); 00313 double_VAR_H(classify_misfit_junk_penalty, 0.0, 00314 "Penalty to apply when a non-alnum is vertically out of " 00315 "its expected textline position"); 00316 BOOL_VAR_H(classify_enable_int_fx, 1, "Enable integer fx"); 00317 BOOL_VAR_H(classify_enable_new_adapt_rules, 1, "Enable new adaptation rules"); 00318 double_VAR_H(rating_scale, 1.5, "Rating scaling factor"); 00319 double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor"); 00320 double_VAR_H(tessedit_class_miss_scale, 0.00390625, 00321 "Scale factor for features not used"); 00322 INT_VAR_H(classify_adapt_proto_threshold, 230, 00323 "Threshold for good protos during adaptive 0-255"); 00324 INT_VAR_H(classify_adapt_feature_threshold, 230, 00325 "Threshold for good features during adaptive 0-255"); 00326 BOOL_VAR_H(disable_character_fragments, TRUE, 00327 "Do not include character fragments in the" 00328 " results of the classifier"); 00329 double_VAR_H(classify_character_fragments_garbage_certainty_threshold, -3.0, 00330 "Exclude fragments that do not match any whole character" 00331 " with at least this certainty"); 00332 BOOL_VAR_H(classify_debug_character_fragments, FALSE, 00333 "Bring up graphical debugging windows for fragments training"); 00334 BOOL_VAR_H(matcher_debug_separate_windows, FALSE, 00335 "Use two different windows for debugging the matching: " 00336 "One for the protos and one for the features."); 00337 STRING_VAR_H(classify_learn_debug_str, "", "Class str to debug learning"); 00338 00339 /* intmatcher.cpp **********************************************************/ 00340 INT_VAR_H(classify_class_pruner_threshold, 229, 00341 "Class Pruner Threshold 0-255: "); 00342 INT_VAR_H(classify_class_pruner_multiplier, 30, 00343 "Class Pruner Multiplier 0-255: "); 00344 INT_VAR_H(classify_cp_cutoff_strength, 7, 00345 "Class Pruner CutoffStrength: "); 00346 INT_VAR_H(classify_integer_matcher_multiplier, 14, 00347 "Integer Matcher Multiplier 0-255: "); 00348 00349 // Use class variables to hold onto built-in templates and adapted templates. 00350 INT_TEMPLATES PreTrainedTemplates; 00351 ADAPT_TEMPLATES AdaptedTemplates; 00352 00353 // Create dummy proto and config masks for use with the built-in templates. 00354 BIT_VECTOR AllProtosOn; 00355 BIT_VECTOR PrunedProtos; 00356 BIT_VECTOR AllConfigsOn; 00357 BIT_VECTOR AllProtosOff; 00358 BIT_VECTOR AllConfigsOff; 00359 BIT_VECTOR TempProtoMask; 00360 bool EnableLearning; 00361 /* normmatch.cpp */ 00362 NORM_PROTOS *NormProtos; 00363 /* font detection ***********************************************************/ 00364 UnicityTable<FontInfo> fontinfo_table_; 00365 UnicityTable<FontSet> fontset_table_; 00366 00367 INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word"); 00368 BOOL_VAR_H(classify_bln_numeric_mode, 0, 00369 "Assume the input is numbers [0-9]."); 00370 protected: 00371 IntegerMatcher im_; 00372 FEATURE_DEFS_STRUCT feature_defs_; 00373 // Must be set for the classifier to operate. Ususally set in 00374 // Tesseract::recog_word_recursive, being the main word-level entry point. 00375 DENORM denorm_; 00376 00377 private: 00378 00379 Dict dict_; 00380 00381 /* variables used to hold performance statistics */ 00382 int AdaptiveMatcherCalls; 00383 int BaselineClassifierCalls; 00384 int CharNormClassifierCalls; 00385 int AmbigClassifierCalls; 00386 int NumWordsAdaptedTo; 00387 int NumCharsAdaptedTo; 00388 int NumBaselineClassesTried; 00389 int NumCharNormClassesTried; 00390 int NumAmbigClassesTried; 00391 int NumClassesOutput; 00392 int NumAdaptationsFailed; 00393 00394 /* variables used to hold onto extracted features. This is used 00395 to map from the old scheme in which baseline features and char norm 00396 features are extracted separately, to the new scheme in which they 00397 are extracted at the same time. */ 00398 bool FeaturesHaveBeenExtracted; 00399 bool FeaturesOK; 00400 INT_FEATURE_ARRAY BaselineFeatures; 00401 INT_FEATURE_ARRAY CharNormFeatures; 00402 INT_FX_RESULT_STRUCT FXInfo; 00403 00404 CLASS_CUTOFF_ARRAY CharNormCutoffs; 00405 CLASS_CUTOFF_ARRAY BaselineCutoffs; 00406 ScrollView* learn_debug_win_; 00407 ScrollView* learn_fragmented_word_debug_win_; 00408 ScrollView* learn_fragments_debug_win_; 00409 }; 00410 } // namespace tesseract 00411 00412 #endif // TESSERACT_CLASSIFY_CLASSIFY_H__