Tesseract 3.01
/data/source/tesseract-ocr/classify/classify.h
Go to the documentation of this file.
00001 
00002 // File:        classify.h
00003 // Description: classify class.
00004 // Author:      Samuel Charron
00005 //
00006 // (C) Copyright 2006, Google Inc.
00007 // Licensed under the Apache License, Version 2.0 (the "License");
00008 // you may not use this file except in compliance with the License.
00009 // You may obtain a copy of the License at
00010 // http://www.apache.org/licenses/LICENSE-2.0
00011 // Unless required by applicable law or agreed to in writing, software
00012 // distributed under the License is distributed on an "AS IS" BASIS,
00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014 // See the License for the specific language governing permissions and
00015 // limitations under the License.
00016 //
00018 
00019 #ifndef TESSERACT_CLASSIFY_CLASSIFY_H__
00020 #define TESSERACT_CLASSIFY_CLASSIFY_H__
00021 
00022 #include "adaptive.h"
00023 #include "ccstruct.h"
00024 #include "classify.h"
00025 #include "dict.h"
00026 #include "featdefs.h"
00027 #include "intfx.h"
00028 #include "intmatcher.h"
00029 #include "normalis.h"
00030 #include "ratngs.h"
00031 #include "ocrfeatures.h"
00032 #include "unicity_table.h"
00033 
// Forward declarations. Within this header these types are only used through
// pointers or references, so their full definitions are not required here.
00034 class ScrollView;
00035 class WERD_CHOICE;
00036 class WERD_RES;
00037 struct ADAPT_RESULTS;
00038 struct NORM_PROTOS;
00039 
// NOTE(review): negative sentinel font-info ids. Presumably chosen so that they
// can never collide with a valid (non-negative) fontinfo table index -- confirm
// against the code that produces and consumes fontinfo ids.
00040 static const int kUnknownFontinfoId = -1;
00041 static const int kBlankFontinfoId = -2;
00042 
00043 namespace tesseract {
00044 
00045 // Describes how a blob is segmented. In this enum, "character" means a
00046 // classifiable unit; the shorter word is used because it is easier to read.
      // The enumerators have no explicit values, so they are 0..3 in the order
      // declared below.
00047 enum CharSegmentationType {
00048   CST_FRAGMENT,  // A partial character.
00049   CST_WHOLE,     // A correctly segmented character.
00050   CST_IMPROPER,  // More than one but fewer than two characters.
00051   CST_NGRAM      // Multiple characters.
00052 };
00053 
00054 class Classify : public CCStruct {
        // Classifier class, derived from CCStruct. It declares the classification
        // and adaptation member functions, the tunable parameters, and the
        // classifier state. The /* xxx.cpp */ banners group the declarations by
        // source file name (presumably the file implementing them -- confirm).
00055  public:
00056   Classify();
00057   virtual ~Classify();
        // Returns a mutable reference to the dictionary member dict_.
00058   Dict& getDict() {
00059     return dict_;
00060   }
00061 
00062   // Sets the denorm used for classification by copying *denorm into denorm_.
        // denorm is dereferenced unconditionally, so it must not be NULL.
00063   void set_denorm(const DENORM* denorm) {
00064     denorm_ = *denorm;
00065   }
00066 
00067   /* adaptive.cpp ************************************************************/
00068   ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
00069   int GetFontinfoId(ADAPT_CLASS Class, uinT8 ConfigId);
00070   int ClassPruner(INT_TEMPLATES IntTemplates,
00071                             inT16 NumFeatures,
00072                             INT_FEATURE_ARRAY Features,
00073                             CLASS_NORMALIZATION_ARRAY NormalizationFactors,
00074                             CLASS_CUTOFF_ARRAY ExpectedNumFeatures,
00075                             CLASS_PRUNER_RESULTS Results);
00076   void ReadNewCutoffs(FILE *CutoffFile, inT64 end_offset,
00077                       CLASS_CUTOFF_ARRAY Cutoffs);
00078   void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
00079   void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
00080   ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File);
00081   /* normmatch.cpp ************************************************************/
00082   FLOAT32 ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature, BOOL8 DebugMatch);
00083   void FreeNormProtos();
00084   NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset);
00085   /* protos.cpp ***************************************************************/
00086   void ReadClassFile();
00087   void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class);
00088   INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos,
00089                                    const UNICHARSET& target_unicharset);
00090   /* adaptmatch.cpp ***********************************************************/
00091   // Learn the given word using its chopped_word, seam_array, denorm,
00092   // box_word, best_state, and correct_text to learn both correctly and
00093   // incorrectly segmented blobs. If filename is not NULL, then LearnBlob
00094   // is called and the data will be written to a file for static training.
00095   // Otherwise AdaptToBlob is called for adaption within a document.
00096   // If rejmap is not NULL, then only chars with a rejmap entry of '1' will
00097   // be learned, otherwise all chars with good correct_text are learned.
00098   void LearnWord(const char* filename, const char *rejmap, WERD_RES *word);
00099 
00100   // Builds a blob of length fragments, from the word, starting at start,
00101   // and then learn it, as having the given correct_text.
00102   // If filename is not NULL, then LearnBlob
00103   // is called and the data will be written to a file for static training.
00104   // Otherwise AdaptToBlob is called for adaption within a document.
00105   // threshold is a magic number required by AdaptToChar and generated by
00106   // GetAdaptThresholds.
00107   // Although it can be partly inferred from the string, segmentation is
00108   // provided to explicitly clarify the character segmentation.
00109   void LearnPieces(const char* filename, int start, int length,
00110                    float threshold, CharSegmentationType segmentation,
00111                    const char* correct_text, WERD_RES *word);
00112   void InitAdaptiveClassifier(bool load_pre_trained_templates);
00113   void InitAdaptedClass(TBLOB *Blob,
00114                         CLASS_ID ClassId,
00115                         int FontinfoId,
00116                         ADAPT_CLASS Class,
00117                         ADAPT_TEMPLATES Templates);
00118   void AdaptToPunc(TBLOB *Blob,
00119                    CLASS_ID ClassId,
00120                    int FontinfoId,
00121                    FLOAT32 Threshold);
00122   void AmbigClassifier(TBLOB *Blob,
00123                        INT_TEMPLATES Templates,
00124                        ADAPT_CLASS *Classes,
00125                        UNICHAR_ID *Ambiguities,
00126                        ADAPT_RESULTS *Results);
00127   void MasterMatcher(INT_TEMPLATES templates,
00128                      inT16 num_features,
00129                      INT_FEATURE_ARRAY features,
00130                      CLASS_NORMALIZATION_ARRAY norm_factors,
00131                      ADAPT_CLASS* classes,
00132                      int debug,
00133                      int num_classes,
00134                      const TBOX& blob_box,
00135                      CLASS_PRUNER_RESULTS results,
00136                      ADAPT_RESULTS* final_results);
00137   void ConvertMatchesToChoices(ADAPT_RESULTS *Results,
00138                                BLOB_CHOICE_LIST *Choices);
00139   void AddNewResult(ADAPT_RESULTS *results,
00140                     CLASS_ID class_dd,
00141                     FLOAT32 rating,
00142                     int config,
00143                     int config2,
00144                     int fontinfo_id,
00145                     int fontinfo_id2);
00146   int GetAdaptiveFeatures(TBLOB *Blob,
00147                           INT_FEATURE_ARRAY IntFeatures,
00148                           FEATURE_SET *FloatFeatures);
00149 
        // Only declared when graphics support is compiled in.
00150 #ifndef GRAPHICS_DISABLED
00151   void DebugAdaptiveClassifier(TBLOB *Blob,
00152                                ADAPT_RESULTS *Results);
00153 #endif
00154   void GetAdaptThresholds (TWERD * Word,
00155                            const WERD_CHOICE& BestChoice,
00156                            const WERD_CHOICE& BestRawChoice,
00157                            FLOAT32 Thresholds[]);
00158 
00159   PROTO_ID MakeNewTempProtos(FEATURE_SET Features,
00160                              int NumBadFeat,
00161                              FEATURE_ID BadFeat[],
00162                              INT_CLASS IClass,
00163                              ADAPT_CLASS Class,
00164                              BIT_VECTOR TempProtoMask);
00165   int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
00166                              CLASS_ID ClassId,
00167                              int FontinfoId,
00168                              int NumFeatures,
00169                              INT_FEATURE_ARRAY Features,
00170                              FEATURE_SET FloatFeatures);
00171   void MakePermanent(ADAPT_TEMPLATES Templates,
00172                      CLASS_ID ClassId,
00173                      int ConfigId,
00174                      TBLOB *Blob);
00175   void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results);
00176   void RemoveExtraPuncs(ADAPT_RESULTS *Results);
00177   void RemoveBadMatches(ADAPT_RESULTS *Results);
00178   void SetAdaptiveThreshold(FLOAT32 Threshold);
00179   void ShowBestMatchFor(TBLOB *Blob,
00180                         CLASS_ID ClassId,
00181                         BOOL8 AdaptiveOn,
00182                         BOOL8 PreTrainedOn);
00183   UNICHAR_ID *BaselineClassifier(TBLOB *Blob,
00184                                  ADAPT_TEMPLATES Templates,
00185                                  ADAPT_RESULTS *Results);
00186   int CharNormClassifier(TBLOB *Blob,
00187                          INT_TEMPLATES Templates,
00188                          ADAPT_RESULTS *Results);
00189   UNICHAR_ID *GetAmbiguities(TBLOB *Blob,
00190                              CLASS_ID CorrectClass);
00191   void DoAdaptiveMatch(TBLOB *Blob,
00192                        ADAPT_RESULTS *Results);
00193   void AdaptToChar(TBLOB *Blob,
00194                    CLASS_ID ClassId,
00195                    int FontinfoId,
00196                    FLOAT32 Threshold);
00197   void DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class);
00198   int AdaptableWord(TWERD *Word,
00199                   const WERD_CHOICE &BestChoiceWord,
00200                   const WERD_CHOICE &RawChoiceWord);
00201   void EndAdaptiveClassifier();
00202   void PrintAdaptiveStatistics(FILE *File);
00203   void SettupPass1();
00204   void SettupPass2();
00205   void AdaptiveClassifier(TBLOB *Blob,
00206                           BLOB_CHOICE_LIST *Choices,
00207                           CLASS_PRUNER_RESULTS cp_results);
00208   void ClassifyAsNoise(ADAPT_RESULTS *Results);
00209   void ResetAdaptiveClassifier();
00210 
00211   int GetBaselineFeatures(TBLOB *Blob,
00212                           INT_TEMPLATES Templates,
00213                           INT_FEATURE_ARRAY IntFeatures,
00214                           CLASS_NORMALIZATION_ARRAY CharNormArray,
00215                           inT32 *BlobLength);
00216   FLOAT32 GetBestRatingFor(TBLOB *Blob,
00217                            CLASS_ID ClassId);
00218   int GetCharNormFeatures(TBLOB *Blob,
00219                           INT_TEMPLATES Templates,
00220                           INT_FEATURE_ARRAY IntFeatures,
00221                           CLASS_NORMALIZATION_ARRAY CharNormArray,
00222                           inT32 *BlobLength,
00223                           inT32 *FeatureOutlineIndex);
00224   int GetIntBaselineFeatures(TBLOB *Blob,
00225                              INT_TEMPLATES Templates,
00226                              INT_FEATURE_ARRAY IntFeatures,
00227                              CLASS_NORMALIZATION_ARRAY CharNormArray,
00228                              inT32 *BlobLength);
00229   int GetIntCharNormFeatures(TBLOB *Blob,
00230                              INT_TEMPLATES Templates,
00231                              INT_FEATURE_ARRAY IntFeatures,
00232                              CLASS_NORMALIZATION_ARRAY CharNormArray,
00233                              inT32 *BlobLength,
00234                              inT32 *FeatureOutlineArray);
00235 
00236   bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config);
00237   void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob);
00238 
00239   void ResetFeaturesHaveBeenExtracted();
        // Returns true once the NumAdaptationsFailed counter is above zero, i.e.
        // "full" is reported as soon as at least one adaptation has failed.
00240   bool AdaptiveClassifierIsFull() { return NumAdaptationsFailed > 0; }
00241   bool LooksLikeGarbage(TBLOB *blob);
00242   void RefreshDebugWindow(ScrollView **win, const char *msg,
00243                           int y_offset, const TBOX &wbox);
00244   /* float2int.cpp ************************************************************/
00245   void ComputeIntCharNormArray(FEATURE NormFeature,
00246                                INT_TEMPLATES Templates,
00247                                CLASS_NORMALIZATION_ARRAY CharNormArray);
00248   void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
00249   /* intproto.cpp *************************************************************/
00250   INT_TEMPLATES ReadIntTemplates(FILE *File);
00251   void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
00252                          const UNICHARSET& target_unicharset);
00253   CLASS_ID GetClassToDebug(const char *Prompt, bool* adaptive_on,
00254                            bool* pretrained_on);
00255   void ShowMatchDisplay();
00256   /* font detection ***********************************************************/
        // Mutable access to the fontinfo_table_ member.
00257   UnicityTable<FontInfo>& get_fontinfo_table() {
00258     return fontinfo_table_;
00259   }
        // Mutable access to the fontset_table_ member.
00260   UnicityTable<FontSet>& get_fontset_table() {
00261     return fontset_table_;
00262   }
00263   /* mfoutline.cpp ***********************************************************/
00264   void NormalizeOutlines(LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale);
00265   /* outfeat.cpp ***********************************************************/
00266   FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob);
00267   /* picofeat.cpp ***********************************************************/
00268   FEATURE_SET ExtractPicoFeatures(TBLOB *Blob);
00269 
00270 
00271   // Member variables.
00272 
00273   // Parameters.
        // Each parameter is declared through an INT_VAR_H / BOOL_VAR_H /
        // double_VAR_H / STRING_VAR_H macro. The macros are not defined in this
        // header; each use passes a name, a value (presumably the default --
        // confirm against the macro definition) and a description string.
00274   INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");
00275   BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier");
00276   INT_VAR_H(classify_debug_level, 0, "Classify debug level");
00277 
00278   /* mfoutline.cpp ***********************************************************/
00279   /* control knobs used to control normalization of outlines */
00280   INT_VAR_H(classify_norm_method, character, "Normalization Method   ...");
00281   double_VAR_H(classify_char_norm_range, 0.2,
00282              "Character Normalization Range ...");
00283   double_VAR_H(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...");
00284   double_VAR_H(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...");
00285   double_VAR_H(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...");
00286   double_VAR_H(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...");
00287 
00288   /* adaptmatch.cpp ***********************************************************/
00289   BOOL_VAR_H(tess_cn_matching, 0, "Character Normalized Matching");
00290   BOOL_VAR_H(tess_bn_matching, 0, "Baseline Normalized Matching");
00291   BOOL_VAR_H(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier");
00292   BOOL_VAR_H(classify_use_pre_adapted_templates, 0,
00293              "Use pre-adapted classifier templates");
00294   BOOL_VAR_H(classify_save_adapted_templates, 0,
00295              "Save adapted templates to a file");
00296   BOOL_VAR_H(classify_enable_adaptive_debugger, 0, "Enable match debugger");
00297   INT_VAR_H(matcher_debug_level, 0, "Matcher Debug Level");
00298   INT_VAR_H(matcher_debug_flags, 0, "Matcher Debug Flags");
00299   INT_VAR_H(classify_learning_debug_level, 0, "Learning Debug Level: ");
00300   double_VAR_H(matcher_good_threshold, 0.125, "Good Match (0-1)");
00301   double_VAR_H(matcher_great_threshold, 0.0, "Great Match (0-1)");
00302   double_VAR_H(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)");
00303   double_VAR_H(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)");
00304   double_VAR_H(matcher_rating_margin, 0.1, "New template margin (0-1)");
00305   double_VAR_H(matcher_avg_noise_size, 12.0, "Avg. noise blob length: ");
00306   INT_VAR_H(matcher_permanent_classes_min, 1, "Min # of permanent classes");
00307   INT_VAR_H(matcher_min_examples_for_prototyping, 3,
00308             "Reliable Config Threshold");
00309   INT_VAR_H(matcher_sufficient_examples_for_prototyping, 5,
00310             "Enable adaption even if the ambiguities have not been seen");
00311   double_VAR_H(matcher_clustering_max_angle_delta, 0.015,
00312                "Maximum angle delta for prototype clustering");
00313   double_VAR_H(classify_misfit_junk_penalty, 0.0,
00314                "Penalty to apply when a non-alnum is vertically out of "
00315                "its expected textline position");
00316   BOOL_VAR_H(classify_enable_int_fx, 1, "Enable integer fx");
00317   BOOL_VAR_H(classify_enable_new_adapt_rules, 1, "Enable new adaptation rules");
00318   double_VAR_H(rating_scale, 1.5, "Rating scaling factor");
00319   double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
00320   double_VAR_H(tessedit_class_miss_scale, 0.00390625,
00321                "Scale factor for features not used");
00322   INT_VAR_H(classify_adapt_proto_threshold, 230,
00323             "Threshold for good protos during adaptive 0-255");
00324   INT_VAR_H(classify_adapt_feature_threshold, 230,
00325             "Threshold for good features during adaptive 0-255");
00326   BOOL_VAR_H(disable_character_fragments, TRUE,
00327              "Do not include character fragments in the"
00328              " results of the classifier");
00329   double_VAR_H(classify_character_fragments_garbage_certainty_threshold, -3.0,
00330                "Exclude fragments that do not match any whole character"
00331                " with at least this certainty");
00332   BOOL_VAR_H(classify_debug_character_fragments, FALSE,
00333              "Bring up graphical debugging windows for fragments training");
00334   BOOL_VAR_H(matcher_debug_separate_windows, FALSE,
00335              "Use two different windows for debugging the matching: "
00336              "One for the protos and one for the features.");
00337   STRING_VAR_H(classify_learn_debug_str, "", "Class str to debug learning");
00338 
00339   /* intmatcher.cpp **********************************************************/
00340   INT_VAR_H(classify_class_pruner_threshold, 229,
00341             "Class Pruner Threshold 0-255:        ");
00342   INT_VAR_H(classify_class_pruner_multiplier, 30,
00343             "Class Pruner Multiplier 0-255:       ");
00344   INT_VAR_H(classify_cp_cutoff_strength, 7,
00345             "Class Pruner CutoffStrength:         ");
00346   INT_VAR_H(classify_integer_matcher_multiplier, 14,
00347             "Integer Matcher Multiplier  0-255:   ");
00348 
00349   // Use class variables to hold onto built-in templates and adapted templates.
00350   INT_TEMPLATES PreTrainedTemplates;
00351   ADAPT_TEMPLATES AdaptedTemplates;
00352 
00353   // Create dummy proto and config masks for use with the built-in templates.
00354   BIT_VECTOR AllProtosOn;
00355   BIT_VECTOR PrunedProtos;
00356   BIT_VECTOR AllConfigsOn;
00357   BIT_VECTOR AllProtosOff;
00358   BIT_VECTOR AllConfigsOff;
00359   BIT_VECTOR TempProtoMask;
00360   bool EnableLearning;
00361   /* normmatch.cpp */
00362   NORM_PROTOS *NormProtos;
00363   /* font detection ***********************************************************/
        // Tables exposed by get_fontinfo_table() and get_fontset_table().
00364   UnicityTable<FontInfo> fontinfo_table_;
00365   UnicityTable<FontSet> fontset_table_;
00366 
00367   INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word");
00368   BOOL_VAR_H(classify_bln_numeric_mode, 0,
00369              "Assume the input is numbers [0-9].");
00370  protected:
00371   IntegerMatcher im_;
00372   FEATURE_DEFS_STRUCT feature_defs_;
00373   // Must be set for the classifier to operate. Usually set in
00374   // Tesseract::recog_word_recursive, being the main word-level entry point.
        // Written by set_denorm(), which stores a copy of its argument here.
00375   DENORM denorm_;
00376 
00377  private:
00378 
        // Dictionary handed out by reference from getDict().
00379   Dict dict_;
00380 
00381   /* variables used to hold performance statistics */
00382   int AdaptiveMatcherCalls;
00383   int BaselineClassifierCalls;
00384   int CharNormClassifierCalls;
00385   int AmbigClassifierCalls;
00386   int NumWordsAdaptedTo;
00387   int NumCharsAdaptedTo;
00388   int NumBaselineClassesTried;
00389   int NumCharNormClassesTried;
00390   int NumAmbigClassesTried;
00391   int NumClassesOutput;
        // Also read by AdaptiveClassifierIsFull(), which tests it against zero.
00392   int NumAdaptationsFailed;
00393 
00394   /* variables used to hold onto extracted features.  This is used
00395   to map from the old scheme in which baseline features and char norm
00396   features are extracted separately, to the new scheme in which they
00397   are extracted at the same time. */
00398   bool FeaturesHaveBeenExtracted;
00399   bool FeaturesOK;
00400   INT_FEATURE_ARRAY BaselineFeatures;
00401   INT_FEATURE_ARRAY CharNormFeatures;
00402   INT_FX_RESULT_STRUCT FXInfo;
00403 
00404   CLASS_CUTOFF_ARRAY CharNormCutoffs;
00405   CLASS_CUTOFF_ARRAY BaselineCutoffs;
        // NOTE(review): judging by their names these are debug display windows for
        // the learning code; ownership and lifetime are not visible in this header.
00406   ScrollView* learn_debug_win_;
00407   ScrollView* learn_fragmented_word_debug_win_;
00408   ScrollView* learn_fragments_debug_win_;
00409 };
00410 }  // namespace tesseract
00411 
00412 #endif  // TESSERACT_CLASSIFY_CLASSIFY_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines