Tesseract 3.01
tesseract::Trie Class Reference

#include <trie.h>

Inheritance diagram for tesseract::Trie:
tesseract::Dawg

List of all members.

Public Member Functions

 Trie (DawgType type, const STRING &lang, PermuterType perm, uinT64 max_num_edges, int unicharset_size, int debug_level)
 ~Trie ()
void clear ()
EDGE_REF edge_char_of (NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
void unichar_ids_of (NODE_REF node, NodeChildVector *vec) const
NODE_REF next_node (EDGE_REF edge_ref) const
bool end_of_word (EDGE_REF edge_ref) const
UNICHAR_ID edge_letter (EDGE_REF edge_ref) const
void print_node (NODE_REF node, int max_num_edges) const
SquishedDawg * trie_to_dawg ()
bool read_word_list (const char *filename, const UNICHARSET &unicharset)
bool read_pattern_list (const char *filename, const UNICHARSET &unicharset)
void initialize_patterns (UNICHARSET *unicharset)
void unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
virtual EDGE_REF pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
void add_word_to_dawg (const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
void add_word_to_dawg (const WERD_CHOICE &word)

Static Public Attributes

static const int kSaneNumConcreteChars = 4
static const char kAlphaPatternUnicode [] = "\u2000"
static const char kDigitPatternUnicode [] = "\u2001"
static const char kAlphanumPatternUnicode [] = "\u2002"
static const char kPuncPatternUnicode [] = "\u2003"
static const char kLowerPatternUnicode [] = "\u2004"
static const char kUpperPatternUnicode [] = "\u2005"

Protected Member Functions

EDGE_RECORD * deref_edge_ref (EDGE_REF edge_ref) const
EDGE_REF make_edge_ref (NODE_REF node_index, EDGE_INDEX edge_index) const
void link_edge (EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
void print_edge_rec (const EDGE_RECORD &edge_rec) const
bool can_be_eliminated (const EDGE_RECORD &edge_rec)
void print_all (const char *msg, int max_num_edges)
bool edge_char_of (NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end, UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const
bool add_edge_linkage (NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
bool add_new_edge (NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
void add_word_ending (EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
NODE_REF new_dawg_node ()
void remove_edge_linkage (NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
void remove_edge (NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
bool eliminate_redundant_edges (NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
bool reduce_lettered_edges (EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, const EDGE_VECTOR &backward_edges, NODE_MARKER reduced_nodes)
void sort_edges (EDGE_VECTOR *edges)
void reduce_node_input (NODE_REF node, NODE_MARKER reduced_nodes)
UNICHAR_ID character_class_to_pattern (char ch)

Protected Attributes

TRIE_NODES nodes_
 vector of nodes in the Trie
uinT64 num_edges_
 sum of all edges (forward and backward)
uinT64 max_num_edges_
 maximum number of edges allowed
uinT64 deref_direction_mask_
 mask for EDGE_REF to extract direction
uinT64 deref_node_index_mask_
 mask for EDGE_REF to extract node index
bool initialized_patterns_
UNICHAR_ID alpha_pattern_
UNICHAR_ID digit_pattern_
UNICHAR_ID alphanum_pattern_
UNICHAR_ID punc_pattern_
UNICHAR_ID lower_pattern_
UNICHAR_ID upper_pattern_

Detailed Description

Concrete class for a Trie data structure (extends the Dawg base class) that allows storing a list of words as well as dynamically adding new words. This class stores a vector of pointers to TRIE_NODE_RECORDs, each of which has a vector of forward and backward edges.


Constructor & Destructor Documentation

tesseract::Trie::Trie ( DawgType  type,
const STRING &  lang,
PermuterType  perm,
uinT64  max_num_edges,
int  unicharset_size,
int  debug_level 
) [inline]
tesseract::Trie::~Trie ( ) [inline]

Member Function Documentation

bool tesseract::Trie::add_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
) [protected]
bool tesseract::Trie::add_new_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
bool  word_end,
UNICHAR_ID  unichar_id 
) [inline, protected]
void tesseract::Trie::add_word_ending ( EDGE_RECORD *  edge,
NODE_REF  the_next_node,
bool  repeats,
UNICHAR_ID  unichar_id 
) [protected]
void tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE &  word,
const GenericVector< bool > *  repetitions 
)
void tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE &  word) [inline]
bool tesseract::Trie::can_be_eliminated ( const EDGE_RECORD &  edge_rec) [inline, protected]
UNICHAR_ID tesseract::Trie::character_class_to_pattern ( char  ch) [protected]
void tesseract::Trie::clear ( )
EDGE_RECORD* tesseract::Trie::deref_edge_ref ( EDGE_REF  edge_ref) const [inline, protected]
EDGE_REF tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const [inline, virtual]

Returns the edge that corresponds to the letter out of this node.

Implements tesseract::Dawg.

bool tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
NODE_REF  next_node,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id,
EDGE_RECORD **  edge_ptr,
EDGE_INDEX *  edge_index 
) const [protected]
UNICHAR_ID tesseract::Trie::edge_letter ( EDGE_REF  edge_ref) const [inline, virtual]

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

bool tesseract::Trie::eliminate_redundant_edges ( NODE_REF  node,
const EDGE_RECORD &  edge1,
const EDGE_RECORD &  edge2 
) [protected]
bool tesseract::Trie::end_of_word ( EDGE_REF  edge_ref) const [inline, virtual]

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implements tesseract::Dawg.

void tesseract::Trie::initialize_patterns ( UNICHARSET *  unicharset)
void tesseract::Trie::link_edge ( EDGE_RECORD *  edge,
NODE_REF  nxt,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
) [inline, protected]

Sets up this edge record to the requested values.

EDGE_REF tesseract::Trie::make_edge_ref ( NODE_REF  node_index,
EDGE_INDEX  edge_index 
) const [inline, protected]

Constructs EDGE_REF from the given node_index and edge_index.

NODE_REF tesseract::Trie::new_dawg_node ( ) [protected]
NODE_REF tesseract::Trie::next_node ( EDGE_REF  edge_ref) const [inline, virtual]

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

virtual EDGE_REF tesseract::Trie::pattern_loop_edge ( EDGE_REF  edge_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const [inline, virtual]

Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD; returns NO_EDGE otherwise.

Reimplemented from tesseract::Dawg.

void tesseract::Trie::print_all ( const char *  msg,
int  max_num_edges 
) [inline, protected]
void tesseract::Trie::print_edge_rec ( const EDGE_RECORD &  edge_rec) const [inline, protected]

Prints the given EDGE_RECORD.

void tesseract::Trie::print_node ( NODE_REF  node,
int  max_num_edges 
) const [virtual]

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implements tesseract::Dawg.

bool tesseract::Trie::read_pattern_list ( const char *  filename,
const UNICHARSET &  unicharset 
)
bool tesseract::Trie::read_word_list ( const char *  filename,
const UNICHARSET &  unicharset 
)
bool tesseract::Trie::reduce_lettered_edges ( EDGE_INDEX  edge_index,
UNICHAR_ID  unichar_id,
NODE_REF  node,
const EDGE_VECTOR &  backward_edges,
NODE_MARKER  reduced_nodes 
) [protected]
void tesseract::Trie::reduce_node_input ( NODE_REF  node,
NODE_MARKER  reduced_nodes 
) [protected]

Eliminates any redundant edges from this node in the Trie.

void tesseract::Trie::remove_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  word_end,
UNICHAR_ID  unichar_id 
) [inline, protected]
void tesseract::Trie::remove_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
) [protected]
void tesseract::Trie::sort_edges ( EDGE_VECTOR *  edges) [protected]

Order num_edges of consecutive EDGE_RECORDS in the given EDGE_VECTOR in increasing order of unichar ids. This function is normally called for all edges in a single node, and since the number of edges in each node is usually quite small, selection sort is used.

SquishedDawg * tesseract::Trie::trie_to_dawg ( )
void tesseract::Trie::unichar_id_to_patterns ( UNICHAR_ID  unichar_id,
const UNICHARSET &  unicharset,
GenericVector< UNICHAR_ID > *  vec 
) const [virtual]

Fills vec with unichar ids that represent the character classes of the given unichar_id.

Reimplemented from tesseract::Dawg.

void tesseract::Trie::unichar_ids_of ( NODE_REF  node,
NodeChildVector *  vec 
) const [inline, virtual]

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implements tesseract::Dawg.


Member Data Documentation

uinT64 tesseract::Trie::deref_direction_mask_ [protected]

mask for EDGE_REF to extract direction

uinT64 tesseract::Trie::deref_node_index_mask_ [protected]

mask for EDGE_REF to extract node index

const char tesseract::Trie::kAlphanumPatternUnicode[] = "\u2002" [static]
const char tesseract::Trie::kAlphaPatternUnicode[] = "\u2000" [static]
const char tesseract::Trie::kDigitPatternUnicode[] = "\u2001" [static]
const char tesseract::Trie::kLowerPatternUnicode[] = "\u2004" [static]
const char tesseract::Trie::kPuncPatternUnicode[] = "\u2003" [static]
const char tesseract::Trie::kUpperPatternUnicode[] = "\u2005" [static]

uinT64 tesseract::Trie::max_num_edges_ [protected]

maximum number of edges allowed

TRIE_NODES tesseract::Trie::nodes_ [protected]

vector of nodes in the Trie

uinT64 tesseract::Trie::num_edges_ [protected]

sum of all edges (forward and backward)


The documentation for this class was generated from the following files:
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines