Tesseract 3.01
|
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: dawg.h (Formerly dawg.h) 00005 * Description: Definition of a class that represents Directed Accyclic Word 00006 * Graph (DAWG), functions to build and manipulate the DAWG. 00007 * Author: Mark Seaman, SW Productivity 00008 * Created: Fri Oct 16 14:37:00 1987 00009 * Modified: Wed Jun 19 16:50:24 1991 (Mark Seaman) marks@hpgrlt 00010 * Language: C 00011 * Package: N/A 00012 * Status: Reusable Software Component 00013 * 00014 * (c) Copyright 1987, Hewlett-Packard Company. 00015 ** Licensed under the Apache License, Version 2.0 (the "License"); 00016 ** you may not use this file except in compliance with the License. 00017 ** You may obtain a copy of the License at 00018 ** http://www.apache.org/licenses/LICENSE-2.0 00019 ** Unless required by applicable law or agreed to in writing, software 00020 ** distributed under the License is distributed on an "AS IS" BASIS, 00021 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00022 ** See the License for the specific language governing permissions and 00023 ** limitations under the License. 00024 * 00025 *********************************************************************************/ 00026 00027 #ifndef DICT_DAWG_H_ 00028 #define DICT_DAWG_H_ 00029 00030 /*---------------------------------------------------------------------- 00031 I n c l u d e s 00032 ----------------------------------------------------------------------*/ 00033 00034 #include "elst.h" 00035 #include "ratngs.h" 00036 #include "params.h" 00037 00038 #ifndef __GNUC__ 00039 #ifdef __MSW32__ 00040 #define NO_EDGE (inT64) 0xffffffffffffffffi64 00041 #endif /*__MSW32__*/ 00042 #else 00043 #define NO_EDGE (inT64) 0xffffffffffffffffll 00044 #endif /*__GNUC__*/ 00045 00046 /*---------------------------------------------------------------------- 00047 T y p e s 00048 ----------------------------------------------------------------------*/ 00049 class UNICHARSET; 00050 00051 typedef uinT64 EDGE_RECORD; 00052 typedef EDGE_RECORD *EDGE_ARRAY; 00053 typedef inT64 EDGE_REF; 00054 typedef inT64 NODE_REF; 00055 typedef EDGE_REF *NODE_MAP; 00056 00057 namespace tesseract { 00058 00059 struct NodeChild { 00060 UNICHAR_ID unichar_id; 00061 EDGE_REF edge_ref; 00062 NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {} 00063 NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {} 00064 }; 00065 00066 typedef GenericVector<NodeChild> NodeChildVector; 00067 typedef GenericVector<int> SuccessorList; 00068 typedef GenericVector<SuccessorList *> SuccessorListsVector; 00069 00070 enum DawgType { 00071 DAWG_TYPE_PUNCTUATION, 00072 DAWG_TYPE_WORD, 00073 DAWG_TYPE_NUMBER, 00074 DAWG_TYPE_PATTERN, 00075 00076 DAWG_TYPE_COUNT // number of enum entries 00077 }; 00078 00079 /*---------------------------------------------------------------------- 00080 C o n s t a n t s 00081 ----------------------------------------------------------------------*/ 00082 00083 #define FORWARD_EDGE (inT32) 0 00084 #define BACKWARD_EDGE (inT32) 1 00085 #define MAX_NODE_EDGES_DISPLAY (inT64) 100 00086 #define MARKER_FLAG (inT64) 1 00087 #define DIRECTION_FLAG (inT64) 2 00088 #define WERD_END_FLAG (inT64) 4 00089 #define LETTER_START_BIT 0 00090 #define NUM_FLAG_BITS 3 00091 #define REFFORMAT "%lld" 00092 00093 // Set kBeginningDawgsType[i] to true if a Dawg of 00094 // DawgType i can contain the beginning of a word. 00095 static const bool kBeginningDawgsType[] = { 1, 1, 1, 1 }; 00096 00097 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = { 00098 { 0, 1, 1, 0 }, // for DAWG_TYPE_PUNCTUATION 00099 { 1, 0, 0, 0 }, // for DAWG_TYPE_WORD 00100 { 1, 0, 0, 0 }, // for DAWG_TYPE_NUMBER 00101 { 0, 0, 0, 0 }, // for DAWG_TYPE_PATTERN 00102 }; 00103 00104 static const char kWildcard[] = "*"; 00105 00106 00107 /*---------------------------------------------------------------------- 00108 C l a s s e s a n d S t r u c t s 00109 ----------------------------------------------------------------------*/ 00110 // 00120 // 00121 class Dawg { 00122 public: 00124 static const inT16 kDawgMagicNumber = 42; 00128 static const UNICHAR_ID kPatternUnicharID = 0; 00129 00130 inline DawgType type() const { return type_; } 00131 inline const STRING &lang() const { return lang_; } 00132 inline PermuterType permuter() const { return perm_; } 00133 00134 virtual ~Dawg() {}; 00135 00137 bool word_in_dawg(const WERD_CHOICE &word) const; 00138 00141 int check_for_words(const char *filename, 00142 const UNICHARSET &unicharset, 00143 bool enable_wildcard) const; 00144 00145 // Pure virtual function that should be implemented by the derived classes. 00146 00148 virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, 00149 bool word_end) const = 0; 00150 00153 virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const = 0; 00154 00157 virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0; 00158 00161 virtual bool end_of_word(EDGE_REF edge_ref) const = 0; 00162 00164 virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0; 00165 00168 virtual void print_node(NODE_REF node, int max_num_edges) const = 0; 00169 00172 virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, 00173 const UNICHARSET &unicharset, 00174 GenericVector<UNICHAR_ID> *vec) const {}; 00175 00179 virtual EDGE_REF pattern_loop_edge( 00180 EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const { 00181 return false; 00182 } 00183 00184 protected: 00185 Dawg() {} 00186 00188 inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const { 00189 return ((edge_rec & next_node_mask_) >> next_node_start_bit_); 00190 } 00192 inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const { 00193 return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0; 00194 } 00196 inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const { 00197 return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ? 00198 BACKWARD_EDGE : FORWARD_EDGE; 00199 } 00201 inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const { 00202 return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0; 00203 } 00205 inline UNICHAR_ID unichar_id_from_edge_rec( 00206 const EDGE_RECORD &edge_rec) const { 00207 return ((edge_rec & letter_mask_) >> LETTER_START_BIT); 00208 } 00210 inline void set_next_node_in_edge_rec( 00211 EDGE_RECORD *edge_rec, EDGE_REF value) { 00212 *edge_rec &= (~next_node_mask_); 00213 *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_); 00214 } 00216 inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) { 00217 *edge_rec |= (MARKER_FLAG << flag_start_bit_); 00218 } 00226 inline int given_greater_than_edge_rec(NODE_REF next_node, 00227 bool word_end, 00228 UNICHAR_ID unichar_id, 00229 const EDGE_RECORD &edge_rec) const { 00230 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec); 00231 NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec); 00232 bool curr_word_end = end_of_word_from_edge_rec(edge_rec); 00233 if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node, 00234 curr_word_end, curr_unichar_id)) return 0; 00235 if (unichar_id > curr_unichar_id) return 1; 00236 if (unichar_id == curr_unichar_id) { 00237 if (next_node > curr_next_node) return 1; 00238 if (next_node == curr_next_node) { 00239 if (word_end > curr_word_end) return 1; 00240 } 00241 } 00242 return -1; 00243 } 00247 inline bool edge_rec_match(NODE_REF next_node, 00248 bool word_end, 00249 UNICHAR_ID unichar_id, 00250 NODE_REF other_next_node, 00251 bool other_word_end, 00252 UNICHAR_ID other_unichar_id) const { 00253 return ((unichar_id == other_unichar_id) && 00254 (next_node == NO_EDGE || next_node == other_next_node) && 00255 (!word_end || (word_end == other_word_end))); 00256 } 00257 00260 void init(DawgType type, const STRING &lang, 00261 PermuterType perm, int unicharset_size, int debug_level); 00262 00268 bool match_words(WERD_CHOICE *word, inT32 index, 00269 NODE_REF node, UNICHAR_ID wildcard) const; 00270 00271 // Member Variables. 00272 DawgType type_; 00273 STRING lang_; 00275 PermuterType perm_; 00276 // Variables to construct various edge masks. Formerly: 00277 // #define NEXT_EDGE_MASK (inT64) 0xfffffff800000000i64 00278 // #define FLAGS_MASK (inT64) 0x0000000700000000i64 00279 // #define LETTER_MASK (inT64) 0x00000000ffffffffi64 00280 int unicharset_size_; 00281 int flag_start_bit_; 00282 int next_node_start_bit_; 00283 uinT64 next_node_mask_; 00284 uinT64 flags_mask_; 00285 uinT64 letter_mask_; 00286 // Level of debug statements to print to stdout. 00287 int debug_level_; 00288 }; 00289 00290 // 00293 // 00294 struct DawgInfo { 00295 DawgInfo() : dawg_index(-1), ref(NO_EDGE) {} 00296 DawgInfo(int i, EDGE_REF r) : dawg_index(i), ref(r) {} 00297 bool operator==(const DawgInfo &other) { 00298 return (this->dawg_index == other.dawg_index && this->ref == other.ref); 00299 } 00300 int dawg_index; 00301 EDGE_REF ref; 00302 }; 00303 class DawgInfoVector : public GenericVector<DawgInfo> { 00304 public: 00306 ~DawgInfoVector() { 00307 if (size_reserved_ > 0) { 00308 delete[] data_; 00309 size_used_ = 0; 00310 size_reserved_ = 0; 00311 } 00312 } 00315 void clear() { size_used_ = 0; } 00319 inline bool add_unique(const DawgInfo &new_info, bool debug, 00320 const char *debug_msg) { 00321 for (int i = 0; i < size_used_; ++i) { 00322 if (data_[i] == new_info) return false; 00323 } 00324 push_back(new_info); 00325 if (debug) { 00326 tprintf("%s[%d, " REFFORMAT "]\n", debug_msg, 00327 new_info.dawg_index, new_info.ref); 00328 } 00329 return true; 00330 } 00331 }; 00332 00333 // 00340 // 00341 class SquishedDawg : public Dawg { 00342 public: 00343 SquishedDawg(FILE *file, DawgType type, const STRING &lang, 00344 PermuterType perm, int debug_level) { 00345 read_squished_dawg(file, type, lang, perm, debug_level); 00346 num_forward_edges_in_node0 = num_forward_edges(0); 00347 } 00348 SquishedDawg(const char* filename, DawgType type, 00349 const STRING &lang, PermuterType perm, int debug_level) { 00350 FILE *file = fopen(filename, "rb"); 00351 if (file == NULL) { 00352 tprintf("Failed to open dawg file %s\n", filename); 00353 exit(1); 00354 } 00355 read_squished_dawg(file, type, lang, perm, debug_level); 00356 num_forward_edges_in_node0 = num_forward_edges(0); 00357 fclose(file); 00358 } 00359 SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, 00360 const STRING &lang, PermuterType perm, 00361 int unicharset_size, int debug_level) : 00362 edges_(edges), num_edges_(num_edges) { 00363 init(type, lang, perm, unicharset_size, debug_level); 00364 num_forward_edges_in_node0 = num_forward_edges(0); 00365 if (debug_level > 3) print_all("SquishedDawg:"); 00366 } 00367 ~SquishedDawg(); 00368 00369 int NumEdges() { return num_edges_; } 00370 00372 EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, 00373 bool word_end) const; 00374 00377 void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const { 00378 EDGE_REF edge = node; 00379 if (!edge_occupied(edge) || edge == NO_EDGE) return; 00380 assert(forward_edge(edge)); // we don't expect any backward edges to 00381 do { // be present when this funciton is called 00382 vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge)); 00383 } while (!last_edge(edge++)); 00384 } 00385 00388 NODE_REF next_node(EDGE_REF edge) const { 00389 return next_node_from_edge_rec((edges_[edge])); 00390 } 00391 00394 bool end_of_word(EDGE_REF edge_ref) const { 00395 return end_of_word_from_edge_rec((edges_[edge_ref])); 00396 } 00397 00399 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const { 00400 return unichar_id_from_edge_rec((edges_[edge_ref])); 00401 } 00402 00405 void print_node(NODE_REF node, int max_num_edges) const; 00406 00408 void write_squished_dawg(FILE *file); 00409 00412 void write_squished_dawg(const char *filename) { 00413 FILE *file = fopen(filename, "wb"); 00414 if (file == NULL) { 00415 tprintf("Error opening %s\n", filename); 00416 exit(1); 00417 } 00418 this->write_squished_dawg(file); 00419 fclose(file); 00420 } 00421 00422 private: 00424 inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) { 00425 set_next_node_in_edge_rec(&(edges_[edge_ref]), value); 00426 } 00428 inline void set_empty_edge(EDGE_REF edge_ref) { 00429 (edges_[edge_ref] = next_node_mask_); 00430 } 00432 inline void clear_all_edges() { 00433 for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge); 00434 } 00436 inline void clear_marker_flag(EDGE_REF edge_ref) { 00437 (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_)); 00438 } 00440 inline bool forward_edge(EDGE_REF edge_ref) const { 00441 return (edge_occupied(edge_ref) && 00442 (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); 00443 } 00445 inline bool backward_edge(EDGE_REF edge_ref) const { 00446 return (edge_occupied(edge_ref) && 00447 (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref]))); 00448 } 00450 inline bool edge_occupied(EDGE_REF edge_ref) const { 00451 return (edges_[edge_ref] != next_node_mask_); 00452 } 00454 inline bool last_edge(EDGE_REF edge_ref) const { 00455 return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0; 00456 } 00457 00459 inT32 num_forward_edges(NODE_REF node) const; 00460 00462 void read_squished_dawg(FILE *file, DawgType type, const STRING &lang, 00463 PermuterType perm, int debug_level); 00464 00466 void print_edge(EDGE_REF edge) const; 00467 00469 void print_all(const char* msg) { 00470 tprintf("\n__________________________\n%s\n", msg); 00471 for (int i = 0; i < num_edges_; ++i) print_edge(i); 00472 tprintf("__________________________\n"); 00473 } 00475 NODE_MAP build_node_map(inT32 *num_nodes) const; 00476 00477 00478 // Member variables. 00479 EDGE_ARRAY edges_; 00480 int num_edges_; 00481 int num_forward_edges_in_node0; 00482 }; 00483 00484 } // namespace tesseract 00485 00486 #endif // DICT_DAWG_H_