00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 #ifndef DAWG_H
00028 #define DAWG_H
00029
00030
00031
00032
00033
00034 #include "elst.h"
00035 #include "general.h"
00036 #include "ratngs.h"
00037 #include "varable.h"
00038
00039
00040
00041
00042
00043 extern INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info, to"
00044 " 2 for more details, to 3 to see all the debug messages");
00045
// Sentinel EDGE_REF / NODE_REF value meaning "no edge": a 64-bit literal with
// all bits set, converted to inT64 (i.e. -1 on two's-complement targets).
// The literal suffix differs between the MSW32 build and everything else.
// The expansion is fully parenthesized so that it behaves as a single
// operand wherever it is used (e.g. after sizeof).
#ifdef __MSW32__
#define NO_EDGE ((inT64) 0xffffffffffffffffi64)
#else
#define NO_EDGE ((inT64) 0xffffffffffffffffll)
#endif
00051
00052
00053
00054
00055 class UNICHARSET;
00056
00057 typedef uinT64 EDGE_RECORD;
00058 typedef EDGE_RECORD *EDGE_ARRAY;
00059 typedef inT64 EDGE_REF;
00060 typedef inT64 NODE_REF;
00061 typedef EDGE_REF *NODE_MAP;
00062
00063 namespace tesseract {
00064
00065 struct NodeChild {
00066 UNICHAR_ID unichar_id;
00067 EDGE_REF edge_ref;
00068 NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {}
00069 NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
00070 };
00071
00072 typedef GenericVector<NodeChild> NodeChildVector;
00073 typedef GenericVector<int> SuccessorList;
00074 typedef GenericVector<SuccessorList *> SuccessorListsVector;
00075
// Kinds of dawg. The enumerators are used as array indices
// (see kDawgSuccessors), so their order and values must not change.
enum DawgType {
  DAWG_TYPE_PUNCTUATION,
  DAWG_TYPE_PREFIX,
  DAWG_TYPE_ROOT,
  DAWG_TYPE_WORD,
  DAWG_TYPE_SUFFIX,
  DAWG_TYPE_NUMBER,

  DAWG_TYPE_COUNT  // number of dawg types; must remain the last enumerator
};
00086
00087
00088
00089
// Edge direction values, as returned by Dawg::direction_from_edge_rec().
#define FORWARD_EDGE ((inT32) 0)
#define BACKWARD_EDGE ((inT32) 1)
// NOTE(review): presumably the cap on edges printed per node when
// displaying a dawg -- it is not used in this header.
#define MAX_NODE_EDGES_DISPLAY ((inT64) 100)
// Flag bits of an edge record. They are shifted left by the dawg's
// flag_start_bit_ before being tested against / stored into a record.
#define LAST_FLAG ((inT64) 1)       // edge is the last one of its node
#define DIRECTION_FLAG ((inT64) 2)  // set for backward edges
#define WERD_END_FLAG ((inT64) 4)   // edge ends a word
// The letter (unichar id) occupies the lowest bits of an edge record.
#define LETTER_START_BIT 0
// Number of flag bits defined above.
#define NUM_FLAG_BITS 3
// printf-style conversion used to print EDGE_REF / NODE_REF values.
#define REFFORMAT "%lld"
00099
00100
00101
00102 static const bool kBeginningDawgsType[] = {1, 1, 0, 1, 0, 1 };
00103
00104 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
00105 { 0, 1, 0, 1, 0, 0 },
00106 { 0, 0, 1, 1, 0, 0 },
00107 { 0, 0, 0, 0, 1, 0 },
00108 { 1, 0, 0, 0, 0, 0 },
00109 { 1, 0, 0, 0, 0, 0 },
00110 { 0, 0, 0, 0, 0, 0 }
00111 };
00112
00113 static const char kWildcard[] = "*";
00114
00115
00116
00117
00118
00119
00129
00130 class Dawg {
00131 public:
00133 static const inT16 kDawgMagicNumber = 42;
00137 static const UNICHAR_ID kPatternUnicharID = 0;
00138
00139 inline DawgType type() const { return type_; }
00140 inline const STRING &lang() const { return lang_; }
00141 inline PermuterType permuter() const { return perm_; }
00142
00143 virtual ~Dawg() {};
00144
00146 bool word_in_dawg(const WERD_CHOICE &word) const;
00147
00150 int check_for_words(const char *filename,
00151 const UNICHARSET &unicharset,
00152 bool enable_wildcard) const;
00153
00154
00155
00157 virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
00158 bool word_end) const = 0;
00159
00162 virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const = 0;
00163
00166 virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;
00167
00170 virtual bool end_of_word(EDGE_REF edge_ref) const = 0;
00171
00173 virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;
00174
00177 virtual void print_node(NODE_REF node, int max_num_edges) const = 0;
00178
00179 protected:
00180 Dawg() {}
00181
00183 inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00184 return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
00185 }
00187 inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00188 return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ?
00189 BACKWARD_EDGE : FORWARD_EDGE;
00190 }
00192 inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00193 return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
00194 }
00196 inline UNICHAR_ID unichar_id_from_edge_rec(
00197 const EDGE_RECORD &edge_rec) const {
00198 return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
00199 }
00201 inline void set_next_node_in_edge_rec(
00202 EDGE_RECORD *edge_rec, EDGE_REF value) {
00203 *edge_rec &= (~next_node_mask_);
00204 *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
00205 }
00207 inline void set_last_flag_in_edge_rec(EDGE_RECORD *edge_rec) {
00208 *edge_rec |= (LAST_FLAG << flag_start_bit_);
00209 }
00217 inline int given_greater_than_edge_rec(NODE_REF next_node,
00218 bool word_end,
00219 UNICHAR_ID unichar_id,
00220 const EDGE_RECORD &edge_rec) const {
00221 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
00222 NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
00223 bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
00224 if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
00225 curr_word_end, curr_unichar_id)) return 0;
00226 if (unichar_id > curr_unichar_id) return 1;
00227 if (unichar_id == curr_unichar_id) {
00228 if (next_node > curr_next_node) return 1;
00229 if (next_node == curr_next_node) {
00230 if (word_end > curr_word_end) return 1;
00231 }
00232 }
00233 return -1;
00234 }
00238 inline bool edge_rec_match(NODE_REF next_node,
00239 bool word_end,
00240 UNICHAR_ID unichar_id,
00241 NODE_REF other_next_node,
00242 bool other_word_end,
00243 UNICHAR_ID other_unichar_id) const {
00244 return ((unichar_id == other_unichar_id) &&
00245 (next_node == NO_EDGE || next_node == other_next_node) &&
00246 (!word_end || (word_end == other_word_end)));
00247 }
00248
00251 void init(DawgType type, const STRING &lang,
00252 PermuterType perm, int unicharset_size);
00253
00259 bool match_words(WERD_CHOICE *word, inT32 index,
00260 NODE_REF node, UNICHAR_ID wildcard) const;
00261
00262
00263 DawgType type_;
00264 STRING lang_;
00266 PermuterType perm_;
00267
00268
00269
00270
00271 int unicharset_size_;
00272 int flag_start_bit_;
00273 int next_node_start_bit_;
00274 uinT64 next_node_mask_;
00275 uinT64 flags_mask_;
00276 uinT64 letter_mask_;
00277 };
00278
00279
00282
00283 struct DawgInfo {
00284 DawgInfo() : dawg_index(-1), ref(NO_EDGE) {}
00285 DawgInfo(int i, EDGE_REF r) : dawg_index(i), ref(r) {}
00286 bool operator==(const DawgInfo &other) {
00287 return (this->dawg_index == other.dawg_index &&
00288 this->ref == other.ref);
00289 }
00290 int dawg_index;
00291 EDGE_REF ref;
00292 };
00293 class DawgInfoVector : public GenericVector<DawgInfo> {
00294 public:
00296 ~DawgInfoVector() {
00297 if (size_reserved_ > 0) {
00298 delete[] data_;
00299 size_used_ = 0;
00300 size_reserved_ = 0;
00301 }
00302 }
00305 void clear() { size_used_ = 0; }
00309 inline bool add_unique(const DawgInfo &new_info, const char *debug_msg) {
00310 for (int i = 0; i < size_used_; ++i) {
00311 if (data_[i] == new_info) return false;
00312 }
00313 push_back(new_info);
00314 if (dawg_debug_level) {
00315 tprintf("%s[%d, " REFFORMAT "]\n", debug_msg,
00316 new_info.dawg_index, new_info.ref);
00317 }
00318 return true;
00319 }
00323 inline bool remove(const DawgInfo &info) {
00324 for (int i = 0; i < size_used_; ++i) {
00325 if (data_[i] == info) {
00326 for (int j = i + 1; j < size_used_; ++j) {
00327 data_[j-1] = data_[j];
00328 }
00329 size_used_--;
00330 return true;
00331 }
00332 }
00333 return false;
00334 }
00335 };
00336
00337
00344
00345 class SquishedDawg : public Dawg {
00346 public:
00347 SquishedDawg(FILE *file, DawgType type,
00348 const STRING &lang, PermuterType perm) {
00349 read_squished_dawg(file, type, lang, perm);
00350 num_forward_edges_in_node0 = num_forward_edges(0);
00351 }
00352 SquishedDawg(const char* filename, DawgType type,
00353 const STRING &lang, PermuterType perm) {
00354 FILE *file = fopen(filename, "rb");
00355 if (file == NULL) {
00356 tprintf("Failed to open dawg file %s\n", filename);
00357 exit(1);
00358 }
00359 read_squished_dawg(file, type, lang, perm);
00360 num_forward_edges_in_node0 = num_forward_edges(0);
00361 fclose(file);
00362 }
00363 SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
00364 const STRING &lang, PermuterType perm, int unicharset_size) :
00365 edges_(edges), num_edges_(num_edges) {
00366 init(type, lang, perm, unicharset_size);
00367 num_forward_edges_in_node0 = num_forward_edges(0);
00368 if (dawg_debug_level > 3) print_all("SquishedDawg:");
00369 }
00370 ~SquishedDawg();
00371
00373 EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
00374 bool word_end) const;
00375
00378 void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const {
00379 EDGE_REF edge = node;
00380 if (!edge_occupied(edge) || edge == NO_EDGE) return;
00381 assert(forward_edge(edge));
00382 do {
00383 vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
00384 } while (!last_edge(edge++));
00385 }
00386
00389 NODE_REF next_node(EDGE_REF edge) const {
00390 return next_node_from_edge_rec((edges_[edge]));
00391 }
00392
00395 bool end_of_word(EDGE_REF edge_ref) const {
00396 return end_of_word_from_edge_rec((edges_[edge_ref]));
00397 }
00398
00400 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const {
00401 return unichar_id_from_edge_rec((edges_[edge_ref]));
00402 }
00403
00406 void print_node(NODE_REF node, int max_num_edges) const;
00407
00409 void write_squished_dawg(const char *filename);
00410
00411 private:
00413 inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {
00414 set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
00415 }
00417 inline void set_empty_edge(EDGE_REF edge_ref) {
00418 (edges_[edge_ref] = next_node_mask_);
00419 }
00421 inline void clear_all_edges() {
00422 for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
00423 }
00425 inline void clear_last_flag(EDGE_REF edge_ref) {
00426 (edges_[edge_ref] &= ~(LAST_FLAG << flag_start_bit_));
00427 }
00429 inline bool forward_edge(EDGE_REF edge_ref) const {
00430 return (edge_occupied(edge_ref) &&
00431 (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
00432 }
00434 inline bool backward_edge(EDGE_REF edge_ref) const {
00435 return (edge_occupied(edge_ref) &&
00436 (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
00437 }
00439 inline bool edge_occupied(EDGE_REF edge_ref) const {
00440 return (edges_[edge_ref] != next_node_mask_);
00441 }
00443 inline bool last_edge(EDGE_REF edge_ref) const {
00444 return (edges_[edge_ref] & (LAST_FLAG << flag_start_bit_)) != 0;
00445 }
00446
00448 inT32 num_forward_edges(NODE_REF node) const;
00449
00451 void read_squished_dawg(FILE *file, DawgType type,
00452 const STRING &lang, PermuterType perm);
00453
00455 void print_edge(EDGE_REF edge) const;
00456
00458 void print_all(const char* msg) {
00459 tprintf("\n__________________________\n%s\n", msg);
00460 for (int i = 0; i < num_edges_; ++i) print_edge(i);
00461 tprintf("__________________________\n");
00462 }
00464 NODE_MAP build_node_map(inT32 *num_nodes) const;
00465
00466
00467
00468 EDGE_ARRAY edges_;
00469 int num_edges_;
00470 int num_forward_edges_in_node0;
00471 };
00472 }
00473
00474 #endif