00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00020
00021 #ifndef TESSERACT_CCUTIL_AMBIGS_H_
00022 #define TESSERACT_CCUTIL_AMBIGS_H_
00023
00024 #include "elst.h"
00025 #include "tprintf.h"
00026 #include "unichar.h"
00027 #include "unicharset.h"
00028 #include "genericvector.h"
00029
00030 #define MAX_AMBIG_SIZE 10
00031
00032 extern INT_VAR_H(global_ambigs_debug_level, 0,
00033 "Debug level for unichar ambiguities");
00034 extern BOOL_VAR_H(use_definite_ambigs_for_classifier, 0,
00035 "Use definite ambiguities when running character classifier");
00036
00037 namespace tesseract {
00038
// Namespace-scope constants for the ambiguity-file handling code.
// NOTE(review): their users are defined outside this header; the roles noted
// below are inferred from the names and message texts only.

// Buffer size constant (presumably for a line/unigram read buffer).
static const int kUnigramAmbigsBufferSize = 1000;
// A single space followed by the terminating NUL.
static const char kAmbigNgramSeparator[] = " ";
// Tab and space characters.
static const char kAmbigDelimiters[] = "\t ";
// printf-style format; takes one int argument (a line number).
static const char kIllegalMsg[] =
    "Illegal ambiguity specification on line %d\n";
// printf-style format; takes one C-string argument (the offending unichar).
static const char kIllegalUnicharMsg[] =
    "Illegal unichar %s in ambiguity specification\n";
00046
// Classification of an ambiguity entry. The numeric values are the same ones
// the compiler would assign implicitly (0, 1, 2, ...); they are spelled out
// here only for readability.
// NOTE(review): the precise meaning of each kind is defined by the code that
// consumes it, which is not part of this header.
enum AmbigType {
  NOT_AMBIG = 0,
  REPLACE_AMBIG = 1,
  DEFINITE_AMBIG = 2,
  SIMILAR_AMBIG = 3,
  CASE_AMBIG = 4,

  // Sentinel: equals the number of real entries above. Keep it last.
  AMBIG_TYPE_COUNT = 5
};
00056
00057
00058
00059 class UnicharIdArrayUtils {
00060 public:
00061
00062
00063
00064
00065 static inline int compare(const UNICHAR_ID array1[],
00066 const UNICHAR_ID array2[]) {
00067 const UNICHAR_ID *ptr1 = array1;
00068 const UNICHAR_ID *ptr2 = array2;
00069 while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) {
00070 if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1;
00071 ++ptr1;
00072 ++ptr2;
00073 }
00074 if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0;
00075 return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1;
00076 }
00077
00078
00079
00080
00081 static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
00082 int i = 0;
00083 do {
00084 dst[i] = src[i];
00085 } while (dst[i++] != INVALID_UNICHAR_ID);
00086 return i - 1;
00087 }
00088
00089
00090
00091 static inline void print(const UNICHAR_ID array[],
00092 const UNICHARSET &unicharset) {
00093 const UNICHAR_ID *ptr = array;
00094 if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]");
00095 while (*ptr != INVALID_UNICHAR_ID) {
00096 tprintf("%s ", unicharset.id_to_unichar(*ptr++));
00097 }
00098 tprintf("( ");
00099 ptr = array;
00100 while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++);
00101 tprintf(")\n");
00102 }
00103 };
00104
00105
00106
00107 class AmbigSpec : public ELIST_LINK {
00108 public:
00109 AmbigSpec();
00110 ~AmbigSpec() {}
00111
00112
00113
00114
00115 static int compare_ambig_specs(const void *spec1, const void *spec2) {
00116 const AmbigSpec *s1 =
00117 *reinterpret_cast<const AmbigSpec * const *>(spec1);
00118 const AmbigSpec *s2 =
00119 *reinterpret_cast<const AmbigSpec * const *>(spec2);
00120 return UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
00121 }
00122
00123 UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
00124 UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
00125 UNICHAR_ID correct_ngram_id;
00126 AmbigType type;
00127 int wrong_ngram_size;
00128 };
00129 ELISTIZEH(AmbigSpec);
00130
00131
00132
00133 typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector;
00134 typedef GenericVector<UNICHAR_ID> UnicharIdVector;
00135
00136 class UnicharAmbigs {
00137 public:
00138 UnicharAmbigs() {}
00139 ~UnicharAmbigs() {
00140 replace_ambigs_.delete_data_pointers();
00141 dang_ambigs_.delete_data_pointers();
00142 one_to_one_definite_ambigs_.delete_data_pointers();
00143 }
00144
00145 const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
00146 const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158 void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset,
00159 UNICHARSET *unicharset);
00160
00161
00162 const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {
00163 if (one_to_one_definite_ambigs_.empty()) return NULL;
00164 return one_to_one_definite_ambigs_[unichar_id];
00165 }
00166
00167 private:
00168
00169 bool ParseAmbiguityLine(int line_num, int version,
00170 const UNICHARSET &unicharset, char *buffer,
00171 int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
00172 int *ReplacementAmbigPartSize,
00173 char *ReplacementString, int *type);
00174 void InsertIntoTable(UnicharAmbigsVector &table,
00175 int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
00176 int ReplacementAmbigPartSize,
00177 const char *ReplacementString, int type,
00178 AmbigSpec *ambig_spec, UNICHARSET *unicharset);
00179 UnicharAmbigsVector dang_ambigs_;
00180 UnicharAmbigsVector replace_ambigs_;
00181 GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
00182 };
00183
00184 }
00185
00186 #endif // TESSERACT_CCUTIL_AMBIGS_H_