// Tesseract 3.01
00001 /* -*-C-*- 00002 ******************************************************************************** 00003 * 00004 * File: trie.h (Formerly trie.h) 00005 * Description: Functions to build a trie data structure. 00006 * Author: Mark Seaman, SW Productivity 00007 * Created: Fri Oct 16 14:37:00 1987 00008 * Modified: Fri Jul 26 11:26:34 1991 (Mark Seaman) marks@hpgrlt 00009 * Language: C 00010 * Package: N/A 00011 * Status: Reusable Software Component 00012 * 00013 * (c) Copyright 1987, Hewlett-Packard Company. 00014 ** Licensed under the Apache License, Version 2.0 (the "License"); 00015 ** you may not use this file except in compliance with the License. 00016 ** You may obtain a copy of the License at 00017 ** http://www.apache.org/licenses/LICENSE-2.0 00018 ** Unless required by applicable law or agreed to in writing, software 00019 ** distributed under the License is distributed on an "AS IS" BASIS, 00020 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00021 ** See the License for the specific language governing permissions and 00022 ** limitations under the License. 00023 * 00024 *********************************************************************************/ 00025 #ifndef TRIE_H 00026 #define TRIE_H 00027 00028 #include "dawg.h" 00029 #include "cutil.h" 00030 #include "genericvector.h" 00031 00032 class UNICHARSET; 00033 00034 // Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed 00035 // max int32, we will need to change GenericVector to use int64 for size 00036 // and address indices. This does not seem to be needed immediately, 00037 // since currently the largest number of edges limit used by tesseract 00038 // (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32. 00039 // There are also int casts below to satisfy the WIN32 compiler that would 00040 // need to be changed. 
00041 // It might be cleanest to change the types of most of the Trie/Dawg related 00042 // typedefs to int and restrict the casts to extracting these values from 00043 // the 64 bit EDGE_RECORD. 00044 typedef inT64 EDGE_INDEX; // index of an edge in a given node 00045 typedef bool *NODE_MARKER; 00046 typedef GenericVector<EDGE_RECORD> EDGE_VECTOR; 00047 00048 struct TRIE_NODE_RECORD { 00049 EDGE_VECTOR forward_edges; 00050 EDGE_VECTOR backward_edges; 00051 }; 00052 typedef GenericVector<TRIE_NODE_RECORD *> TRIE_NODES; 00053 00054 namespace tesseract { 00055 00062 class Trie : public Dawg { 00063 public: 00064 // Minimum number of concrete characters at the beginning of user patterns. 00065 static const int kSaneNumConcreteChars = 4; 00066 // Various unicode whitespace characters are used to denote unichar patterns, 00067 // (character classifier would never produce these whitespace characters as a 00068 // valid classification). 00069 static const char kAlphaPatternUnicode[]; 00070 static const char kDigitPatternUnicode[]; 00071 static const char kAlphanumPatternUnicode[]; 00072 static const char kPuncPatternUnicode[]; 00073 static const char kLowerPatternUnicode[]; 00074 static const char kUpperPatternUnicode[]; 00075 00076 // max_num_edges argument allows limiting the amount of memory this 00077 // Trie can consume (if a new word insert would cause the Trie to 00078 // contain more edges than max_num_edges, all the edges are cleared 00079 // so that new inserts can proceed). 00080 Trie(DawgType type, const STRING &lang, PermuterType perm, 00081 uinT64 max_num_edges, int unicharset_size, int debug_level) { 00082 init(type, lang, perm, unicharset_size, debug_level); 00083 num_edges_ = 0; 00084 max_num_edges_ = max_num_edges; 00085 deref_node_index_mask_ = ~letter_mask_; 00086 new_dawg_node(); // need to allocate node 0 00087 initialized_patterns_ = false; 00088 } 00089 ~Trie() { nodes_.delete_data_pointers(); } 00090 00091 // Reset the Trie to empty. 
00092 void clear(); 00093 00095 EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, 00096 bool word_end) const { 00097 EDGE_RECORD *edge_ptr; 00098 EDGE_INDEX edge_index; 00099 if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, 00100 &edge_ptr, &edge_index)) return NO_EDGE; 00101 return make_edge_ref(node_ref, edge_index); 00102 } 00103 00108 void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const { 00109 const EDGE_VECTOR &forward_edges = 00110 nodes_[static_cast<int>(node)]->forward_edges; 00111 for (int i = 0; i < forward_edges.size(); ++i) { 00112 vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]), 00113 make_edge_ref(node, i))); 00114 } 00115 } 00116 00121 NODE_REF next_node(EDGE_REF edge_ref) const { 00122 if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE; 00123 return next_node_from_edge_rec(*deref_edge_ref(edge_ref)); 00124 } 00125 00130 bool end_of_word(EDGE_REF edge_ref) const { 00131 if (edge_ref == NO_EDGE || num_edges_ == 0) return false; 00132 return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref)); 00133 } 00134 00136 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const { 00137 if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID; 00138 return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref)); 00139 } 00140 00141 // Prints the contents of the node indicated by the given NODE_REF. 00142 // At most max_num_edges will be printed. 00143 void print_node(NODE_REF node, int max_num_edges) const; 00144 00145 // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg. 00146 // Eliminates redundant edges and returns the pointer to the SquishedDawg. 00147 // Note: the caller is responsible for deallocating memory associated 00148 // with the returned SquishedDawg pointer. 00149 SquishedDawg *trie_to_dawg(); 00150 00151 // Inserts the list of words from the given file into the Trie. 
00152 bool read_word_list(const char *filename, 00153 const UNICHARSET &unicharset); 00154 00155 // Inserts the list of patterns from the given file into the Trie. 00156 // The pattern list file should contain one pattern per line in UTF-8 format. 00157 // 00158 // Each pattern can contain any non-whitespace characters, however only the 00159 // patterns that contain characters from the unicharset of the corresponding 00160 // language will be useful. 00161 // The only meta character is '\'. To be used in a pattern as an ordinary 00162 // string it should be escaped with '\' (e.g. string "C:\Documents" should 00163 // be written in the patterns file as "C:\\Documents"). 00164 // This function supports a very limited regular expression syntax. One can 00165 // express a character, a certain character class and a number of times the 00166 // entity should be repeated in the pattern. 00167 // 00168 // To denote a character class use one of: 00169 // \c - unichar for which UNICHARSET::get_isalpha() is true (character) 00170 // \d - unichar for which UNICHARSET::get_isdigit() is true 00171 // \n - unichar for which UNICHARSET::get_isdigit() and 00172 // UNICHARSET::isalpha() are true 00173 // \p - unichar for which UNICHARSET::get_ispunct() is true 00174 // \a - unichar for which UNICHARSET::get_islower() is true 00175 // \A - unichar for which UNICHARSET::get_isupper() is true 00176 // 00177 // \* could be specified after each character or pattern to indicate that 00178 // the character/pattern can be repeated any number of times before the next 00179 // character/pattern occurs. 00180 // 00181 // Examples: 00182 // 1-8\d\d-GOOG-411 will be expanded to strings: 00183 // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411. 00184 // 00185 // http://www.\n\*.com will be expanded to strings like: 00186 // http://www.a.com http://www.a123.com ... 
http://www.ABCDefgHIJKLMNop.com 00187 // 00188 // Note: In choosing which patterns to include please be aware of the fact 00189 // providing very generic patterns will make tesseract run slower. 00190 // For example \n\* at the beginning of the pattern will make Tesseract 00191 // consider all the combinations of proposed character choices for each 00192 // of the segmentations, which will be unacceptably slow. 00193 // Because of potential problems with speed that could be difficult to 00194 // identify, each user pattern has to have at least kSaneNumConcreteChars 00195 // concrete characters from the unicharset at the beginning. 00196 bool read_pattern_list(const char *filename, const UNICHARSET &unicharset); 00197 00198 // Initializes the values of *_pattern_ unichar ids. 00199 // This function should be called before calling read_pattern_list(). 00200 void initialize_patterns(UNICHARSET *unicharset); 00201 00202 // Fills in the given unichar id vector with the unichar ids that represent 00203 // the patterns of the character classes of the given unichar_id. 00204 void unichar_id_to_patterns(UNICHAR_ID unichar_id, 00205 const UNICHARSET &unicharset, 00206 GenericVector<UNICHAR_ID> *vec) const; 00207 00208 // Returns the given EDGE_REF if the EDGE_RECORD that it points to has 00209 // a self loop and the given unichar_id matches the unichar_id stored in the 00210 // EDGE_RECORD, returns NO_EDGE otherwise. 00211 virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, 00212 UNICHAR_ID unichar_id, 00213 bool word_end) const { 00214 if (edge_ref == NO_EDGE) return NO_EDGE; 00215 EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref); 00216 return (marker_flag_from_edge_rec(*edge_rec) && 00217 unichar_id == unichar_id_from_edge_rec(*edge_rec) && 00218 word_end == end_of_word_from_edge_rec(*edge_rec)) ? 00219 edge_ref : NO_EDGE; 00220 } 00221 00222 // Adds a word to the Trie (creates the necessary nodes and edges). 
00223 // 00224 // If repetitions vector is not NULL, each entry in the vector indicates 00225 // whether the unichar id with the corresponding index in the word is allowed 00226 // to repeat an unlimited number of times. For each entry that is true, MARKER 00227 // flag of the corresponding edge created for this unichar id is set to true). 00228 void add_word_to_dawg(const WERD_CHOICE &word, 00229 const GenericVector<bool> *repetitions); 00230 void add_word_to_dawg(const WERD_CHOICE &word) { 00231 add_word_to_dawg(word, NULL); 00232 } 00233 00234 protected: 00235 // The structure of an EDGE_REF for Trie edges is as follows: 00236 // [LETTER_START_BIT, flag_start_bit_): 00237 // edge index in *_edges in a TRIE_NODE_RECORD 00238 // [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector) 00239 // 00240 // With this arrangement there are enough bits to represent edge indices 00241 // (each node can have at most unicharset_size_ forward edges and 00242 // the position of flag_start_bit is set to be log2(unicharset_size_)). 00243 // It is also possible to accommodate a maximum number of nodes that is at 00244 // least as large as that of the SquishedDawg representation (in SquishedDawg 00245 // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent 00246 // the next node index). 00247 // 00248 00249 // Returns the pointer to EDGE_RECORD after decoding the location 00250 // of the edge from the information in the given EDGE_REF. 00251 // This function assumes that EDGE_REF holds valid node/edge indices. 
00252 inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const { 00253 int edge_index = static_cast<int>( 00254 (edge_ref & letter_mask_) >> LETTER_START_BIT); 00255 int node_index = static_cast<int>( 00256 (edge_ref & deref_node_index_mask_) >> flag_start_bit_); 00257 TRIE_NODE_RECORD *node_rec = nodes_[node_index]; 00258 return &(node_rec->forward_edges[edge_index]); 00259 } 00261 inline EDGE_REF make_edge_ref(NODE_REF node_index, 00262 EDGE_INDEX edge_index) const { 00263 return ((node_index << flag_start_bit_) | 00264 (edge_index << LETTER_START_BIT)); 00265 } 00267 inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, 00268 int direction, bool word_end, UNICHAR_ID unichar_id) { 00269 EDGE_RECORD flags = 0; 00270 if (repeats) flags |= MARKER_FLAG; 00271 if (word_end) flags |= WERD_END_FLAG; 00272 if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG; 00273 *edge = ((nxt << next_node_start_bit_) | 00274 (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) | 00275 (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT)); 00276 } 00278 inline void print_edge_rec(const EDGE_RECORD &edge_rec) const { 00279 tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec), 00280 marker_flag_from_edge_rec(edge_rec) ? "R," : "", 00281 (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B", 00282 end_of_word_from_edge_rec(edge_rec) ? ",E" : "", 00283 unichar_id_from_edge_rec(edge_rec)); 00284 } 00285 // Returns true if the next node in recorded the given EDGE_RECORD 00286 // has exactly one forward edge. 00287 inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) { 00288 NODE_REF node_ref = next_node_from_edge_rec(edge_rec); 00289 return (node_ref != NO_EDGE && 00290 nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1); 00291 } 00292 00293 // Prints the contents of the Trie. 00294 // At most max_num_edges will be printed for each node. 
00295 void print_all(const char* msg, int max_num_edges) { 00296 tprintf("\n__________________________\n%s\n", msg); 00297 for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges); 00298 tprintf("__________________________\n"); 00299 } 00300 00301 // Finds the edge with the given direction, word_end and unichar_id 00302 // in the node indicated by node_ref. Fills in the pointer to the 00303 // EDGE_RECORD and the index of the edge with the the values 00304 // corresponding to the edge found. Returns true if an edge was found. 00305 bool edge_char_of(NODE_REF node_ref, NODE_REF next_node, 00306 int direction, bool word_end, UNICHAR_ID unichar_id, 00307 EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const; 00308 00309 // Adds an single edge linkage between node1 and node2 in the direction 00310 // indicated by direction argument. 00311 bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, 00312 int direction, bool word_end, 00313 UNICHAR_ID unichar_id); 00314 00315 // Adds forward edge linkage from node1 to node2 and the corresponding 00316 // backward edge linkage in the other direction. 00317 bool add_new_edge(NODE_REF node1, NODE_REF node2, 00318 bool repeats, bool word_end, UNICHAR_ID unichar_id) { 00319 return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE, 00320 word_end, unichar_id) && 00321 add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE, 00322 word_end, unichar_id)); 00323 } 00324 00325 // Sets the word ending flags in an already existing edge pair. 00326 // Returns true on success. 00327 void add_word_ending(EDGE_RECORD *edge, 00328 NODE_REF the_next_node, 00329 bool repeats, 00330 UNICHAR_ID unichar_id); 00331 00332 // Allocates space for a new node in the Trie. 00333 NODE_REF new_dawg_node(); 00334 00335 // Removes a single edge linkage to between node1 and node2 in the 00336 // direction indicated by direction argument. 
00337 void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, 00338 bool word_end, UNICHAR_ID unichar_id); 00339 00340 // Removes forward edge linkage from node1 to node2 and the corresponding 00341 // backward edge linkage in the other direction. 00342 void remove_edge(NODE_REF node1, NODE_REF node2, 00343 bool word_end, UNICHAR_ID unichar_id) { 00344 remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id); 00345 remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id); 00346 } 00347 00348 // Compares edge1 and edge2 in the given node to see if they point to two 00349 // next nodes that could be collapsed. If they do, performs the reduction 00350 // and returns true. 00351 bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, 00352 const EDGE_RECORD &edge2); 00353 00354 // Assuming that edge_index indicates the first edge in a group of edges 00355 // in this node with a particular letter value, looks through these edges 00356 // to see if any of them can be collapsed. If so does it. Returns to the 00357 // caller when all edges with this letter have been reduced. 00358 // Returns true if further reduction is possible with this same letter. 00359 bool reduce_lettered_edges(EDGE_INDEX edge_index, 00360 UNICHAR_ID unichar_id, 00361 NODE_REF node, 00362 const EDGE_VECTOR &backward_edges, 00363 NODE_MARKER reduced_nodes); 00364 00371 void sort_edges(EDGE_VECTOR *edges); 00372 00374 void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes); 00375 00376 // Returns the pattern unichar id for the given character class code. 00377 UNICHAR_ID character_class_to_pattern(char ch); 00378 00379 // Member variables 00380 TRIE_NODES nodes_; 00381 uinT64 num_edges_; 00382 uinT64 max_num_edges_; 00383 uinT64 deref_direction_mask_; 00384 uinT64 deref_node_index_mask_; 00385 // Variables for translating character class codes denoted in user patterns 00386 // file to the unichar ids used to represent them in a Trie. 
00387 bool initialized_patterns_; 00388 UNICHAR_ID alpha_pattern_; 00389 UNICHAR_ID digit_pattern_; 00390 UNICHAR_ID alphanum_pattern_; 00391 UNICHAR_ID punc_pattern_; 00392 UNICHAR_ID lower_pattern_; 00393 UNICHAR_ID upper_pattern_; 00394 }; 00395 } // namespace tesseract 00396 00397 #endif