00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00018
00019 #ifndef TESSERACT_CLASSIFY_CLASSIFY_H__
00020 #define TESSERACT_CLASSIFY_CLASSIFY_H__
00021
00022 #include "adaptive.h"
00023 #include "ccstruct.h"
00024 #include "classify.h"
00025 #include "dict.h"
00026 #include "fxdefs.h"
00027 #include "intmatcher.h"
00028 #include "ratngs.h"
00029 #include "ocrfeatures.h"
00030 #include "unicity_table.h"
00031
00032 class WERD_CHOICE;     // appears below only as `const WERD_CHOICE&` parameters
00033 struct ADAPT_RESULTS;  // appears below only as pointer parameters
00034 struct NORM_PROTOS;    // appears below only as a pointer (return type and member)
00035
00036 namespace tesseract {
      // Classify groups the classification entry points and their shared state.
      // It derives publicly from CCStruct and holds a Dict (private member dict_,
      // reachable through getDict()). Apart from dict_, every member declared
      // here -- methods, template handles, bit vectors and tuning variables -- is
      // public.
      //
      // Only declarations are visible in this header; the definitions live
      // elsewhere. Notes below that go beyond what the signatures show are marked
      // as assumptions to be confirmed against the implementation.
00037 class Classify : public CCStruct {
00038 public:
      // Construction / destruction. The destructor is declared without `virtual`
      // here; whether CCStruct already makes it virtual is not visible in this
      // file.
00039 Classify();
00040 ~Classify();
      // Returns a mutable reference to the dictionary held by this object.
00041 Dict& getDict() {
00042 return dict_;
00043 }
00044
      // --- Adapted templates, class pruning and cutoffs ------------------------
      // NOTE(review): the FILE* overloads presumably read from / write to an
      // already-open stream owned by the caller -- confirm ownership in the
      // definitions.
00045 ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
00046 int ClassPruner(INT_TEMPLATES IntTemplates,
00047 inT16 NumFeatures,
00048 INT_FEATURE_ARRAY Features,
00049 CLASS_NORMALIZATION_ARRAY NormalizationFactors,
00050 CLASS_CUTOFF_ARRAY ExpectedNumFeatures,
00051 CLASS_PRUNER_RESULTS Results,
00052 int Debug);
      // end_offset presumably marks where the cutoff data ends inside CutoffFile
      // -- TODO confirm.
00053 void ReadNewCutoffs(FILE *CutoffFile, inT64 end_offset,
00054 CLASS_CUTOFF_ARRAY Cutoffs);
00055 void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
00056 void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
00057 ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File);
00058
      // --- Normalization prototypes ---------------------------------------------
      // ReadNormProtos returns a raw NORM_PROTOS pointer; FreeNormProtos takes no
      // argument, so it presumably releases the NormProtos member declared further
      // down -- confirm.
00059 FLOAT32 ComputeNormMatch(CLASS_ID ClassId, FEATURE Feature, BOOL8 DebugMatch);
00060 void FreeNormProtos();
00061 NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset);
00062
      // --- Class file / integer template creation -------------------------------
00063 void ReadClassFile();
00064 INT_TEMPLATES
00065 CreateIntTemplates(CLASSES FloatProtos,
00066 const UNICHARSET& target_unicharset);
00067
      // --- Adaptive classifier ---------------------------------------------------
      // Most of the following take the blob (TBLOB*) together with LINE_STATS*
      // for the line it sits on, and report through an ADAPT_RESULTS* out
      // parameter. Return-value conventions of the int-returning functions are
      // not visible from this header.
00068 void AdaptToWord(TWERD *Word,
00069 TEXTROW *Row,
00070 const WERD_CHOICE& BestChoice,
00071 const WERD_CHOICE& BestRawChoice,
00072 const char *rejmap);
00073 void InitAdaptiveClassifier();
00074 void InitAdaptedClass(TBLOB *Blob,
00075 LINE_STATS *LineStats,
00076 CLASS_ID ClassId,
00077 ADAPT_CLASS Class,
00078 ADAPT_TEMPLATES Templates);
00079 void AdaptToPunc(TBLOB *Blob,
00080 LINE_STATS *LineStats,
00081 CLASS_ID ClassId,
00082 FLOAT32 Threshold);
00083 void AmbigClassifier(TBLOB *Blob,
00084 LINE_STATS *LineStats,
00085 INT_TEMPLATES Templates,
00086 UNICHAR_ID *Ambiguities,
00087 ADAPT_RESULTS *Results);
00088 void MasterMatcher(INT_TEMPLATES templates,
00089 inT16 num_features,
00090 INT_FEATURE_ARRAY features,
00091 CLASS_NORMALIZATION_ARRAY norm_factors,
00092 ADAPT_CLASS* classes,
00093 int debug,
00094 int num_classes,
00095 CLASS_PRUNER_RESULTS results,
00096 ADAPT_RESULTS* final_results);
00097 void ConvertMatchesToChoices(ADAPT_RESULTS *Results,
00098 BLOB_CHOICE_LIST *Choices);
00099 void AddNewResult(ADAPT_RESULTS *Results,
00100 CLASS_ID ClassId,
00101 FLOAT32 Rating,
00102 int ConfigId);
      // Declared only when GRAPHICS_DISABLED is not defined.
00103 #ifndef GRAPHICS_DISABLED
00104 void DebugAdaptiveClassifier(TBLOB *Blob,
00105 LINE_STATS *LineStats,
00106 ADAPT_RESULTS *Results);
00107 #endif
      // Thresholds is an unsized array parameter; its required length is not
      // stated here (presumably one entry per blob in Word -- TODO confirm).
00108 void GetAdaptThresholds (TWERD * Word,
00109 LINE_STATS * LineStats,
00110 const WERD_CHOICE& BestChoice,
00111 const WERD_CHOICE& BestRawChoice,
00112 FLOAT32 Thresholds[]);
00113
00114 int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
00115 CLASS_ID ClassId,
00116 int NumFeatures,
00117 INT_FEATURE_ARRAY Features,
00118 FEATURE_SET FloatFeatures);
00119 void MakePermanent(ADAPT_TEMPLATES Templates,
00120 CLASS_ID ClassId,
00121 int ConfigId,
00122 TBLOB *Blob,
00123 LINE_STATS *LineStats);
00124 void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results);
00125 void RemoveExtraPuncs(ADAPT_RESULTS *Results);
00126 void RemoveBadMatches(ADAPT_RESULTS *Results);
00127 void ShowBestMatchFor(TBLOB *Blob,
00128 LINE_STATS *LineStats,
00129 CLASS_ID ClassId,
00130 BOOL8 AdaptiveOn,
00131 BOOL8 PreTrainedOn);
      // BaselineClassifier and GetAmbiguities return a raw UNICHAR_ID pointer;
      // who owns the returned storage is not visible here -- check the
      // definitions before freeing or retaining it.
00132 UNICHAR_ID *BaselineClassifier(TBLOB *Blob,
00133 LINE_STATS *LineStats,
00134 ADAPT_TEMPLATES Templates,
00135 ADAPT_RESULTS *Results);
00136 int CharNormClassifier(TBLOB *Blob,
00137 LINE_STATS *LineStats,
00138 INT_TEMPLATES Templates,
00139 ADAPT_RESULTS *Results);
00140 UNICHAR_ID *GetAmbiguities(TBLOB *Blob,
00141 LINE_STATS *LineStats,
00142 CLASS_ID CorrectClass);
00143 void DoAdaptiveMatch(TBLOB *Blob,
00144 LINE_STATS *LineStats,
00145 ADAPT_RESULTS *Results);
00146 void AdaptToChar(TBLOB *Blob,
00147 LINE_STATS *LineStats,
00148 CLASS_ID ClassId,
00149 FLOAT32 Threshold);
00150 int AdaptableWord(TWERD *Word,
00151 const WERD_CHOICE &BestChoiceWord,
00152 const WERD_CHOICE &RawChoiceWord);
00153 void EndAdaptiveClassifier();
00154 void PrintAdaptiveStatistics(FILE *File);
      // NOTE: the "Settup" spelling is part of the public interface; correcting
      // it here would break existing callers.
00155 void SettupPass1();
00156 void SettupPass2();
00157 void AdaptiveClassifier(TBLOB *Blob,
00158 TBLOB *DotBlob,
00159 TEXTROW *Row,
00160 BLOB_CHOICE_LIST *Choices,
00161 CLASS_PRUNER_RESULTS cp_results);
00162 void ClassifyAsNoise(ADAPT_RESULTS *Results);
00163 void ResetAdaptiveClassifier();
00164
      // --- Feature extraction helpers --------------------------------------------
      // GetCharNormFeatures and GetIntCharNormFeatures are declared with identical
      // parameter lists; how they differ is determined by their definitions.
      // BlobLength is a pointer parameter, presumably an output -- confirm.
00165 FLOAT32 GetBestRatingFor(TBLOB *Blob,
00166 LINE_STATS *LineStats,
00167 CLASS_ID ClassId);
00168 int GetCharNormFeatures(TBLOB *Blob,
00169 LINE_STATS *LineStats,
00170 INT_TEMPLATES Templates,
00171 INT_FEATURE_ARRAY IntFeatures,
00172 CLASS_NORMALIZATION_ARRAY CharNormArray,
00173 inT32 *BlobLength);
00174 int GetIntCharNormFeatures(TBLOB *Blob,
00175 LINE_STATS *LineStats,
00176 INT_TEMPLATES Templates,
00177 INT_FEATURE_ARRAY IntFeatures,
00178 CLASS_NORMALIZATION_ARRAY CharNormArray,
00179 inT32 *BlobLength);
00180
00181
00182 void ComputeIntCharNormArray(FEATURE NormFeature,
00183 INT_TEMPLATES Templates,
00184 CLASS_NORMALIZATION_ARRAY CharNormArray);
00185
      // --- Integer template I/O and debugging --------------------------------------
00186 INT_TEMPLATES ReadIntTemplates(FILE *File);
00187 void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
00188 const UNICHARSET& target_unicharset);
00189 CLASS_ID GetClassToDebug(const char *Prompt);
00190
      // Accessors returning mutable references to the font tables declared near
      // the end of the class. Those tables are themselves public members, so
      // these accessors do not restrict access.
00191 UnicityTable<FontInfo>& get_fontinfo_table() {
00192 return fontinfo_table_;
00193 }
00194 UnicityTable<FontSet>& get_fontset_table() {
00195 return fontset_table_;
00196 }
00197
00198
      // INT_VAR_H / BOOL_VAR_H are macros defined outside this file. From the
      // call shape (name, default-looking value, description string) they
      // presumably declare a named tunable member -- confirm against the macro
      // definition.
00199 INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");
00200
00201
      // Template handles shared by the methods above (public data members).
00202 INT_TEMPLATES PreTrainedTemplates;
00203 ADAPT_TEMPLATES AdaptedTemplates;
00204
      // NOTE(review): presumably set once PreTrainedTemplates has been loaded --
      // confirm where it is written.
00205 bool inttemp_loaded_;
00206
00207
      // Bit vectors; the names suggest fixed all-on / all-off masks for protos
      // and configs plus scratch masks -- confirm sizes and lifetime in the
      // implementation.
00208 BIT_VECTOR AllProtosOn;
00209 BIT_VECTOR PrunedProtos;
00210 BIT_VECTOR AllConfigsOn;
00211 BIT_VECTOR AllProtosOff;
00212 BIT_VECTOR AllConfigsOff;
00213 BIT_VECTOR TempProtoMask;
00214
00215 BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier");
00216
00217 BOOL_VAR_H(classify_recog_devanagari, false,
00218 "Whether recognizing a language with devanagari script.");
      // Plain bool declared next to the classify_enable_learning variable above;
      // how the two relate is not visible in this header.
00219 bool EnableLearning;
00220
      // Raw pointer; ownership is not expressed in the type (see ReadNormProtos /
      // FreeNormProtos above).
00221 NORM_PROTOS *NormProtos;
00222
      // Font tables returned by get_fontinfo_table() / get_fontset_table().
00223 UnicityTable<FontInfo> fontinfo_table_;
00224 UnicityTable<FontSet> fontset_table_;
00225 private:
      // The only private member; exposed by reference through getDict().
00226 Dict dict_;
00227 };
00228 }  // namespace tesseract
00229
00230 #endif // TESSERACT_CLASSIFY_CLASSIFY_H__