Tesseract 3.01
|
#include <trie.h>
Public Member Functions | |
Trie (DawgType type, const STRING &lang, PermuterType perm, uinT64 max_num_edges, int unicharset_size, int debug_level) | |
~Trie () | |
void | clear () |
EDGE_REF | edge_char_of (NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const |
void | unichar_ids_of (NODE_REF node, NodeChildVector *vec) const |
NODE_REF | next_node (EDGE_REF edge_ref) const |
bool | end_of_word (EDGE_REF edge_ref) const |
UNICHAR_ID | edge_letter (EDGE_REF edge_ref) const |
void | print_node (NODE_REF node, int max_num_edges) const |
SquishedDawg * | trie_to_dawg () |
bool | read_word_list (const char *filename, const UNICHARSET &unicharset) |
bool | read_pattern_list (const char *filename, const UNICHARSET &unicharset) |
void | initialize_patterns (UNICHARSET *unicharset) |
void | unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const |
virtual EDGE_REF | pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const |
void | add_word_to_dawg (const WERD_CHOICE &word, const GenericVector< bool > *repetitions) |
void | add_word_to_dawg (const WERD_CHOICE &word) |
Static Public Attributes | |
static const int | kSaneNumConcreteChars = 4 |
static const char | kAlphaPatternUnicode [] = "\u2000" |
static const char | kDigitPatternUnicode [] = "\u2001" |
static const char | kAlphanumPatternUnicode [] = "\u2002" |
static const char | kPuncPatternUnicode [] = "\u2003" |
static const char | kLowerPatternUnicode [] = "\u2004" |
static const char | kUpperPatternUnicode [] = "\u2005" |
Protected Member Functions | |
EDGE_RECORD * | deref_edge_ref (EDGE_REF edge_ref) const |
EDGE_REF | make_edge_ref (NODE_REF node_index, EDGE_INDEX edge_index) const |
void | link_edge (EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id) |
void | print_edge_rec (const EDGE_RECORD &edge_rec) const |
bool | can_be_eliminated (const EDGE_RECORD &edge_rec) |
void | print_all (const char *msg, int max_num_edges) |
bool | edge_char_of (NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end, UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const |
bool | add_edge_linkage (NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id) |
bool | add_new_edge (NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id) |
void | add_word_ending (EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id) |
NODE_REF | new_dawg_node () |
void | remove_edge_linkage (NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id) |
void | remove_edge (NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id) |
bool | eliminate_redundant_edges (NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2) |
bool | reduce_lettered_edges (EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, const EDGE_VECTOR &backward_edges, NODE_MARKER reduced_nodes) |
void | sort_edges (EDGE_VECTOR *edges) |
void | reduce_node_input (NODE_REF node, NODE_MARKER reduced_nodes) |
UNICHAR_ID | character_class_to_pattern (char ch) |
Protected Attributes | |
TRIE_NODES | nodes_ |
vector of nodes in the Trie | |
uinT64 | num_edges_ |
sum of all edges (forward and backward) | |
uinT64 | max_num_edges_ |
maximum number of edges allowed | |
uinT64 | deref_direction_mask_ |
mask for EDGE_REF to extract direction | |
uinT64 | deref_node_index_mask_ |
mask for EDGE_REF to extract node index | |
bool | initialized_patterns_ |
UNICHAR_ID | alpha_pattern_ |
UNICHAR_ID | digit_pattern_ |
UNICHAR_ID | alphanum_pattern_ |
UNICHAR_ID | punc_pattern_ |
UNICHAR_ID | lower_pattern_ |
UNICHAR_ID | upper_pattern_ |
Concrete class for the Trie data structure (extends the Dawg base class) that stores a list of words and also allows new words to be added dynamically. This class stores a vector of pointers to TRIE_NODE_RECORDs, each of which has a vector of forward and backward edges.
tesseract::Trie::Trie | ( | DawgType | type, |
const STRING & | lang, | ||
PermuterType | perm, | ||
uinT64 | max_num_edges, | ||
int | unicharset_size, | ||
int | debug_level | ||
) | [inline] |
tesseract::Trie::~Trie | ( | ) | [inline] |
bool tesseract::Trie::add_edge_linkage | ( | NODE_REF | node1, |
NODE_REF | node2, | ||
bool | repeats, | ||
int | direction, | ||
bool | word_end, | ||
UNICHAR_ID | unichar_id | ||
) | [protected] |
bool tesseract::Trie::add_new_edge | ( | NODE_REF | node1, |
NODE_REF | node2, | ||
bool | repeats, | ||
bool | word_end, | ||
UNICHAR_ID | unichar_id | ||
) | [inline, protected] |
void tesseract::Trie::add_word_ending | ( | EDGE_RECORD * | edge, |
NODE_REF | the_next_node, | ||
bool | repeats, | ||
UNICHAR_ID | unichar_id | ||
) | [protected] |
void tesseract::Trie::add_word_to_dawg | ( | const WERD_CHOICE & | word, |
const GenericVector< bool > * | repetitions | ||
) |
void tesseract::Trie::add_word_to_dawg | ( | const WERD_CHOICE & | word | ) | [inline] |
bool tesseract::Trie::can_be_eliminated | ( | const EDGE_RECORD & | edge_rec | ) | [inline, protected] |
UNICHAR_ID tesseract::Trie::character_class_to_pattern | ( | char | ch | ) | [protected] |
void tesseract::Trie::clear | ( | ) |
EDGE_RECORD* tesseract::Trie::deref_edge_ref | ( | EDGE_REF | edge_ref | ) | const [inline, protected] |
EDGE_REF tesseract::Trie::edge_char_of | ( | NODE_REF | node_ref, |
UNICHAR_ID | unichar_id, | ||
bool | word_end | ||
) | const [inline, virtual] |
Returns the edge that corresponds to the letter out of this node.
Implements tesseract::Dawg.
bool tesseract::Trie::edge_char_of | ( | NODE_REF | node_ref, |
NODE_REF | next_node, | ||
int | direction, | ||
bool | word_end, | ||
UNICHAR_ID | unichar_id, | ||
EDGE_RECORD ** | edge_ptr, | ||
EDGE_INDEX * | edge_index | ||
) | const [protected] |
UNICHAR_ID tesseract::Trie::edge_letter | ( | EDGE_REF | edge_ref | ) | const [inline, virtual] |
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Implements tesseract::Dawg.
bool tesseract::Trie::eliminate_redundant_edges | ( | NODE_REF | node, |
const EDGE_RECORD & | edge1, | ||
const EDGE_RECORD & | edge2 | ||
) | [protected] |
bool tesseract::Trie::end_of_word | ( | EDGE_REF | edge_ref | ) | const [inline, virtual] |
Returns true if the edge indicated by the given EDGE_REF marks the end of a word.
Implements tesseract::Dawg.
void tesseract::Trie::initialize_patterns | ( | UNICHARSET * | unicharset | ) |
void tesseract::Trie::link_edge | ( | EDGE_RECORD * | edge, |
NODE_REF | nxt, | ||
bool | repeats, | ||
int | direction, | ||
bool | word_end, | ||
UNICHAR_ID | unichar_id | ||
) | [inline, protected] |
Sets up this edge record to the requested values.
EDGE_REF tesseract::Trie::make_edge_ref | ( | NODE_REF | node_index, |
EDGE_INDEX | edge_index | ||
) | const [inline, protected] |
Constructs EDGE_REF from the given node_index and edge_index.
NODE_REF tesseract::Trie::new_dawg_node | ( | ) | [protected] |
NODE_REF tesseract::Trie::next_node | ( | EDGE_REF | edge_ref | ) | const [inline, virtual] |
Returns the next node visited by following the edge indicated by the given EDGE_REF.
Implements tesseract::Dawg.
virtual EDGE_REF tesseract::Trie::pattern_loop_edge | ( | EDGE_REF | edge_ref, |
UNICHAR_ID | unichar_id, | ||
bool | word_end | ||
) | const [inline, virtual] |
Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD; returns NO_EDGE otherwise.
Reimplemented from tesseract::Dawg.
void tesseract::Trie::print_all | ( | const char * | msg, |
int | max_num_edges | ||
) | [inline, protected] |
void tesseract::Trie::print_edge_rec | ( | const EDGE_RECORD & | edge_rec | ) | const [inline, protected] |
Prints the given EDGE_RECORD.
void tesseract::Trie::print_node | ( | NODE_REF | node, |
int | max_num_edges | ||
) | const [virtual] |
Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.
Implements tesseract::Dawg.
bool tesseract::Trie::read_pattern_list | ( | const char * | filename, |
const UNICHARSET & | unicharset | ||
) |
bool tesseract::Trie::read_word_list | ( | const char * | filename, |
const UNICHARSET & | unicharset | ||
) |
bool tesseract::Trie::reduce_lettered_edges | ( | EDGE_INDEX | edge_index, |
UNICHAR_ID | unichar_id, | ||
NODE_REF | node, | ||
const EDGE_VECTOR & | backward_edges, | ||
NODE_MARKER | reduced_nodes | ||
) | [protected] |
void tesseract::Trie::reduce_node_input | ( | NODE_REF | node, |
NODE_MARKER | reduced_nodes | ||
) | [protected] |
Eliminates any redundant edges from this node in the Trie.
void tesseract::Trie::remove_edge | ( | NODE_REF | node1, |
NODE_REF | node2, | ||
bool | word_end, | ||
UNICHAR_ID | unichar_id | ||
) | [inline, protected] |
void tesseract::Trie::remove_edge_linkage | ( | NODE_REF | node1, |
NODE_REF | node2, | ||
int | direction, | ||
bool | word_end, | ||
UNICHAR_ID | unichar_id | ||
) | [protected] |
void tesseract::Trie::sort_edges | ( | EDGE_VECTOR * | edges | ) | [protected] |
Orders the consecutive EDGE_RECORDs in the given EDGE_VECTOR in increasing order of unichar ids. This function is normally called for all edges in a single node, and since the number of edges in each node is usually quite small, selection sort is used.
SquishedDawg * tesseract::Trie::trie_to_dawg | ( | ) |
void tesseract::Trie::unichar_id_to_patterns | ( | UNICHAR_ID | unichar_id, |
const UNICHARSET & | unicharset, | ||
GenericVector< UNICHAR_ID > * | vec | ||
) | const [virtual] |
Fills vec with unichar ids that represent the character classes of the given unichar_id.
Reimplemented from tesseract::Dawg.
void tesseract::Trie::unichar_ids_of | ( | NODE_REF | node, |
NodeChildVector * | vec | ||
) | const [inline, virtual] |
Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.
Implements tesseract::Dawg.
UNICHAR_ID tesseract::Trie::alpha_pattern_ [protected] |
UNICHAR_ID tesseract::Trie::alphanum_pattern_ [protected] |
uinT64 tesseract::Trie::deref_direction_mask_ [protected] |
mask for EDGE_REF to extract direction
uinT64 tesseract::Trie::deref_node_index_mask_ [protected] |
mask for EDGE_REF to extract node index
UNICHAR_ID tesseract::Trie::digit_pattern_ [protected] |
bool tesseract::Trie::initialized_patterns_ [protected] |
const char tesseract::Trie::kAlphanumPatternUnicode [] = "\u2002" [static] |
const char tesseract::Trie::kAlphaPatternUnicode [] = "\u2000" [static] |
const char tesseract::Trie::kDigitPatternUnicode [] = "\u2001" [static] |
const char tesseract::Trie::kLowerPatternUnicode [] = "\u2004" [static] |
const char tesseract::Trie::kPuncPatternUnicode [] = "\u2003" [static] |
const int tesseract::Trie::kSaneNumConcreteChars = 4 [static] |
const char tesseract::Trie::kUpperPatternUnicode [] = "\u2005" [static] |
UNICHAR_ID tesseract::Trie::lower_pattern_ [protected] |
uinT64 tesseract::Trie::max_num_edges_ [protected] |
maximum number of edges allowed
TRIE_NODES tesseract::Trie::nodes_ [protected] |
vector of nodes in the Trie
uinT64 tesseract::Trie::num_edges_ [protected] |
sum of all edges (forward and backward)
UNICHAR_ID tesseract::Trie::punc_pattern_ [protected] |
UNICHAR_ID tesseract::Trie::upper_pattern_ [protected] |