00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifndef TRIE_H
00026 #define TRIE_H
00027
00028 #include "dawg.h"
00029 #include "cutil.h"
00030
00031 class UNICHARSET;
00032
00033
00034
00035
00036
00037
00038 typedef inT64 EDGE_INDEX;
00039 typedef bool *NODE_MARKER;
00040 typedef GenericVector<EDGE_RECORD> EDGE_VECTOR;
00041
00042 struct TRIE_NODE_RECORD {
00043 EDGE_VECTOR forward_edges;
00044 EDGE_VECTOR backward_edges;
00045 };
00046 typedef GenericVector<TRIE_NODE_RECORD *> TRIE_NODES;
00047
00048 namespace tesseract {
00049
00056 class Trie : public Dawg {
00057 public:
00058
00059
00060
00061
00062 Trie(DawgType type, const STRING &lang, PermuterType perm,
00063 uinT64 max_num_edges, int unicharset_size) {
00064 init(type, lang, perm, unicharset_size);
00065 num_edges_ = 0;
00066 max_num_edges_ = max_num_edges;
00067 deref_node_index_mask_ = ~letter_mask_;
00068 new_dawg_node();
00069 }
00070 ~Trie() { nodes_.delete_data_pointers(); }
00071
00073 EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id,
00074 bool word_end) const {
00075 EDGE_RECORD *edge_ptr;
00076 EDGE_INDEX edge_index;
00077 if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
00078 &edge_ptr, &edge_index)) return NO_EDGE;
00079 return make_edge_ref(node_ref, edge_index);
00080 }
00081
00086 void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const {
00087 const EDGE_VECTOR &forward_edges = nodes_[node]->forward_edges;
00088 for (int i = 0; i < forward_edges.size(); ++i) {
00089 vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]),
00090 make_edge_ref(node, i)));
00091 }
00092 }
00093
00098 NODE_REF next_node(EDGE_REF edge_ref) const {
00099 if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE;
00100 return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
00101 }
00102
00107 bool end_of_word(EDGE_REF edge_ref) const {
00108 if (edge_ref == NO_EDGE || num_edges_ == 0) return false;
00109 return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
00110 }
00111
00113 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const {
00114 if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID;
00115 return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
00116 }
00117
00118
00119
00120 void print_node(NODE_REF node, int max_num_edges) const;
00121
00122
00123
00124
00125
00126 SquishedDawg *trie_to_dawg();
00127
00128
00129 bool read_word_list(const char *filename,
00130 const UNICHARSET &unicharset);
00131
00132
00133 void add_word_to_dawg(const WERD_CHOICE &word);
00134
00135 protected:
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153 inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const {
00154 uinT64 edge_index = (edge_ref & letter_mask_) >> LETTER_START_BIT;
00155 uinT64 node_index =
00156 (edge_ref & deref_node_index_mask_) >> flag_start_bit_;
00157 TRIE_NODE_RECORD *node_rec = nodes_[node_index];
00158 return &(node_rec->forward_edges[edge_index]);
00159 }
00161 inline EDGE_REF make_edge_ref(NODE_REF node_index,
00162 EDGE_INDEX edge_index) const {
00163 return ((node_index << flag_start_bit_) |
00164 (edge_index << LETTER_START_BIT));
00165 }
00167 inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, int direction,
00168 bool word_end, UNICHAR_ID unichar_id) {
00169 EDGE_RECORD flags = 0;
00170 if (word_end) flags |= WERD_END_FLAG;
00171 if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG;
00172 *edge = ((nxt << next_node_start_bit_) |
00173 (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
00174 (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
00175 }
00177 inline void print_edge_rec(const EDGE_RECORD &edge_rec) const {
00178 tprintf("|" REFFORMAT "|%s%s|%d|", next_node_from_edge_rec(edge_rec),
00179 (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
00180 end_of_word_from_edge_rec(edge_rec) ? ",E" : "",
00181 unichar_id_from_edge_rec(edge_rec));
00182 }
00183
00184
00185 inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) {
00186 NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
00187 return (node_ref != NO_EDGE &&
00188 nodes_[node_ref]->forward_edges.size() == 1);
00189 }
00190
00191
00192
00193 void print_all(const char* msg, int max_num_edges) {
00194 tprintf("\n__________________________\n%s\n", msg);
00195 for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
00196 tprintf("__________________________\n");
00197 }
00198
00199
00200
00201
00202
00203 bool edge_char_of(NODE_REF node_ref, NODE_REF next_node,
00204 int direction, bool word_end, UNICHAR_ID unichar_id,
00205 EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const;
00206
00207
00208
00209 bool add_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
00210 bool word_end, UNICHAR_ID unichar_id);
00211
00212
00213
00214 bool add_new_edge(NODE_REF node1, NODE_REF node2,
00215 bool word_end, UNICHAR_ID unichar_id) {
00216 return (add_edge_linkage(node1, node2, FORWARD_EDGE,
00217 word_end, unichar_id) &&
00218 add_edge_linkage(node2, node1, BACKWARD_EDGE,
00219 word_end, unichar_id));
00220 }
00221
00222
00223
00224 void add_word_ending(EDGE_RECORD *edge,
00225 NODE_REF the_next_node,
00226 UNICHAR_ID unichar_id);
00227
00228
00229 NODE_REF new_dawg_node();
00230
00231
00232
00233 void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
00234 bool word_end, UNICHAR_ID unichar_id);
00235
00236
00237
00238 void remove_edge(NODE_REF node1, NODE_REF node2,
00239 bool word_end, UNICHAR_ID unichar_id) {
00240 remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
00241 remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
00242 }
00243
00244
00245
00246
00247 bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1,
00248 const EDGE_RECORD &edge2);
00249
00250
00251
00252
00253
00254
00255 bool reduce_lettered_edges(EDGE_INDEX edge_index,
00256 UNICHAR_ID unichar_id,
00257 NODE_REF node,
00258 const EDGE_VECTOR &backward_edges,
00259 NODE_MARKER reduced_nodes);
00260
00267 void sort_edges(EDGE_VECTOR *edges);
00268
00270 void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes);
00271
00272
00273
00274 TRIE_NODES nodes_;
00275 uinT64 num_edges_;
00276 uinT64 max_num_edges_;
00277 uinT64 deref_direction_mask_;
00278 uinT64 deref_node_index_mask_;
00279 };
00280 }
00281
00282 #endif