Tesseract 3.01
/data/source/tesseract-ocr/dict/dawg.h
Go to the documentation of this file.
00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:         dawg.h  (Formerly dawg.h)
00005  * Description:  Definition of a class that represents Directed Acyclic Word
00006  *               Graph (DAWG), functions to build and manipulate the DAWG.
00007  * Author:       Mark Seaman, SW Productivity
00008  * Created:      Fri Oct 16 14:37:00 1987
00009  * Modified:     Wed Jun 19 16:50:24 1991 (Mark Seaman) marks@hpgrlt
00010  * Language:     C
00011  * Package:      N/A
00012  * Status:       Reusable Software Component
00013  *
00014  * (c) Copyright 1987, Hewlett-Packard Company.
00015  ** Licensed under the Apache License, Version 2.0 (the "License");
00016  ** you may not use this file except in compliance with the License.
00017  ** You may obtain a copy of the License at
00018  ** http://www.apache.org/licenses/LICENSE-2.0
00019  ** Unless required by applicable law or agreed to in writing, software
00020  ** distributed under the License is distributed on an "AS IS" BASIS,
00021  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00022  ** See the License for the specific language governing permissions and
00023  ** limitations under the License.
00024  *
00025  *********************************************************************************/
00026 
00027 #ifndef DICT_DAWG_H_
00028 #define DICT_DAWG_H_
00029 
00030 /*----------------------------------------------------------------------
00031               I n c l u d e s
00032 ----------------------------------------------------------------------*/
00033 
00034 #include "elst.h"
00035 #include "ratngs.h"
00036 #include "params.h"
00037 
00038 #ifndef __GNUC__
00039 #ifdef __MSW32__
      // NO_EDGE: all 64 bits set, cast to inT64. This header uses it as the
      // "no edge" sentinel for EDGE_REF values. The two definitions differ only
      // in the integer-literal suffix (i64 under __MSW32__, ll under __GNUC__).
00040 #define NO_EDGE                (inT64) 0xffffffffffffffffi64
00041 #endif  /*__MSW32__*/
      // NOTE(review): when neither __GNUC__ nor __MSW32__ is defined, NO_EDGE is
      // left undefined by this block.
00042 #else
00043 #define NO_EDGE                (inT64) 0xffffffffffffffffll
00044 #endif /*__GNUC__*/
00045 
00046 /*----------------------------------------------------------------------
00047               T y p e s
00048 ----------------------------------------------------------------------*/
00049 class UNICHARSET;
00050 
      // One edge packed into 64 bits. The Dawg helpers below unpack it with
      // masks and shifts into a unichar id, flag bits and a next-node reference.
00051 typedef uinT64 EDGE_RECORD;
      // Pointer to a contiguous array of edge records.
00052 typedef EDGE_RECORD *EDGE_ARRAY;
      // Reference to an edge; SquishedDawg uses it as an index into its edge
      // array. NO_EDGE is the "none" value.
00053 typedef inT64 EDGE_REF;
      // Reference to a node. SquishedDawg::unichar_ids_of() uses a NODE_REF
      // directly as the EDGE_REF of the node's first edge.
00054 typedef inT64 NODE_REF;
      // Array of edge references; the return type of
      // SquishedDawg::build_node_map().
00055 typedef EDGE_REF *NODE_MAP;
00056 
00057 namespace tesseract {
00058 
00059 struct NodeChild {
        // A (unichar id, edge reference) pair describing one child of a node;
        // Dawg::unichar_ids_of() fills a NodeChildVector with these.
00060   UNICHAR_ID unichar_id;
00061   EDGE_REF edge_ref;
        // Builds a child from an explicit id and edge reference.
00062   NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {}
        // Default: an invalid unichar id paired with the NO_EDGE sentinel.
00063   NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
00064 };
00065 
00066 typedef GenericVector<NodeChild> NodeChildVector;  // children of one node
      // List of int values. NOTE(review): by name these identify successor
      // dawgs; what the ints index is not visible in this header.
00067 typedef GenericVector<int> SuccessorList;
      // Vector of pointers to SuccessorList. Ownership of the pointed-to lists is
      // not established in this header.
00068 typedef GenericVector<SuccessorList *> SuccessorListsVector;
00069 
00070 enum DawgType {
        // Kinds of dawg. The enumerator values index the rows and columns of
        // kDawgSuccessors and the entries of kBeginningDawgsType, so the order
        // here must stay in step with those tables.
00071   DAWG_TYPE_PUNCTUATION,
00072   DAWG_TYPE_WORD,
00073   DAWG_TYPE_NUMBER,
00074   DAWG_TYPE_PATTERN,
00075 
00076   DAWG_TYPE_COUNT  // number of enum entries
00077 };
00078 
00079 /*----------------------------------------------------------------------
00080               C o n s t a n t s
00081 ----------------------------------------------------------------------*/
00082 
00083 #define FORWARD_EDGE           (inT32) 0  // direction_from_edge_rec() result
00084 #define BACKWARD_EDGE          (inT32) 1  // direction_from_edge_rec() result
      // NOTE(review): not used in this header; by name, a cap on the number of
      // edges printed per node.
00085 #define MAX_NODE_EDGES_DISPLAY (inT64) 100
      // Flag bits of an edge record. Each is shifted left by the dawg's
      // flag_start_bit_ before being tested or set.
00086 #define MARKER_FLAG            (inT64) 1
00087 #define DIRECTION_FLAG         (inT64) 2
00088 #define WERD_END_FLAG          (inT64) 4
      // Shift applied after masking with letter_mask_ to obtain the unichar id.
00089 #define LETTER_START_BIT       0
      // Number of flag bits; matches the three flags defined above.
00090 #define NUM_FLAG_BITS          3
      // printf-style conversion used when printing an EDGE_REF.
00091 #define REFFORMAT "%lld"
00092 
00093 // Set kBeginningDawgsType[i] to true if a Dawg of
00094 // DawgType i can contain the beginning of a word.
      // Sized by DAWG_TYPE_COUNT, like kDawgSuccessors, so that a newly added
      // DawgType gets a zero-initialized (false) entry instead of an
      // out-of-bounds read.
00095 static const bool kBeginningDawgsType[DAWG_TYPE_COUNT] = { 1, 1, 1, 1 };
00096 
      // Successor table indexed [DawgType][DawgType]; each row is labelled with
      // the type it describes. NOTE(review): presumably entry [i][j] is true
      // when a dawg of type j may follow one of type i - confirm against the
      // code that reads it.
00097 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
00098   { 0, 1, 1, 0 },  // for DAWG_TYPE_PUNCTUATION
00099   { 1, 0, 0, 0 },  // for DAWG_TYPE_WORD
00100   { 1, 0, 0, 0 },  // for DAWG_TYPE_NUMBER
00101   { 0, 0, 0, 0 },  // for DAWG_TYPE_PATTERN
00102 };
00103 
      // Wildcard string. NOTE(review): its consumer is not visible in this
      // header.
00104 static const char kWildcard[] = "*";
00105 
00106 
00107 /*----------------------------------------------------------------------
00108               C l a s s e s   a n d   S t r u c t s
00109 ----------------------------------------------------------------------*/
00110 //
      // Abstract base class for a Directed Acyclic Word Graph. It declares the
      // traversal interface (edge lookup, child enumeration, end-of-word test,
      // ...) as pure virtual functions and supplies the bit-field helpers that
      // pack and unpack an EDGE_RECORD into its unichar id, flag bits and
      // next-node reference, using the masks and shifts held in the member
      // variables at the bottom of the class.
00120 //
00121 class Dawg {
00122  public:
        // Constant named as the dawg magic number. Its use is not visible in this
        // header (presumably a check on serialized dawg data - confirm).
00124   static const inT16 kDawgMagicNumber = 42;
        // Unichar id constant associated with patterns by name. NOTE(review):
        // how it is consumed is not visible in this header.
00128   static const UNICHAR_ID kPatternUnicharID = 0;
00129 
        // Read-only accessors for the type, language and permuter of this dawg.
00130   inline DawgType type() const { return type_; }
00131   inline const STRING &lang() const { return lang_; }
00132   inline PermuterType permuter() const { return perm_; }
00133 
00134   virtual ~Dawg() {}
00135 
        // Word lookup on this dawg; defined outside this header.
00137   bool word_in_dawg(const WERD_CHOICE &word) const;
00138 
        // Defined outside this header. NOTE(review): by its signature it checks
        // the words found in filename against this dawg and returns a count;
        // confirm the exact meaning of the return value in the implementation.
00141   int check_for_words(const char *filename,
00142                       const UNICHARSET &unicharset,
00143                       bool enable_wildcard) const;
00144 
00145   // Pure virtual functions that should be implemented by the derived classes.
00146 
        // Looks up the edge leaving node for unichar_id with the requested
        // word_end. NOTE(review): NO_EDGE is presumably the "not found" result;
        // confirm in the implementations.
00148   virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
00149                                 bool word_end) const = 0;
00150 
        // Appends to vec one NodeChild (unichar id + edge reference) per edge
        // leaving node.
00153   virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const = 0;
00154 
        // Returns the node that the edge identified by edge_ref leads to.
00157   virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;
00158 
        // Returns true if the edge identified by edge_ref ends a word.
00161   virtual bool end_of_word(EDGE_REF edge_ref) const = 0;
00162 
        // Returns the unichar id stored on the edge identified by edge_ref.
00164   virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;
00165 
        // Prints node; max_num_edges is presumably a cap on the edges shown
        // (implementations are outside this header).
00168   virtual void print_node(NODE_REF node, int max_num_edges) const = 0;
00169 
        // Hook for dawgs that store patterns. The default implementation does
        // nothing and leaves vec untouched.
00172   virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id,
00173                                       const UNICHARSET &unicharset,
00174                                       GenericVector<UNICHAR_ID> *vec) const {}
00175 
        // Hook for dawgs that store patterns. The default implementation reports
        // that there is no such edge by returning NO_EDGE, the sentinel this
        // header uses for absent edge references. (Returning a bool here would
        // convert to 0, which is a valid edge reference.)
00179   virtual EDGE_REF pattern_loop_edge(
00180       EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const {
00181     return NO_EDGE;
00182   }
00183 
00184  protected:
        // Leaves every member uninitialized; init() is declared below to set
        // them up.
00185   Dawg() {}
00186 
        // Extracts the next-node reference from edge_rec.
00188   inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00189     return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
00190   }
        // Returns true if the marker flag is set in edge_rec.
00192   inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00193     return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0;
00194   }
        // Returns BACKWARD_EDGE if the direction flag is set in edge_rec,
        // FORWARD_EDGE otherwise.
00196   inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00197     return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ?
00198       BACKWARD_EDGE : FORWARD_EDGE;
00199   }
        // Returns true if the word-end flag is set in edge_rec.
00201   inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {
00202     return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
00203   }
        // Extracts the unichar id from edge_rec.
00205   inline UNICHAR_ID unichar_id_from_edge_rec(
00206       const EDGE_RECORD &edge_rec) const {
00207     return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
00208   }
        // Overwrites the next-node field of *edge_rec with value: the old field
        // is cleared, then value is shifted into place and masked so it cannot
        // spill into the other fields.
00210   inline void set_next_node_in_edge_rec(
00211       EDGE_RECORD *edge_rec, EDGE_REF value) {
00212     *edge_rec &= (~next_node_mask_);
00213     *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
00214   }
        // Sets the marker flag in *edge_rec.
00216   inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) {
00217     *edge_rec |= (MARKER_FLAG << flag_start_bit_);
00218   }
        // Three-way comparison of the given (unichar_id, next_node, word_end)
        // against the values stored in edge_rec:
        //   0  if edge_rec_match() accepts the pair,
        //   1  if the given values are greater, comparing unichar_id first, then
        //      next_node, then word_end,
        //  -1  otherwise.
00226   inline int given_greater_than_edge_rec(NODE_REF next_node,
00227                                          bool word_end,
00228                                          UNICHAR_ID unichar_id,
00229                                          const EDGE_RECORD &edge_rec) const {
00230     UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
00231     NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
00232     bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
00233     if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
00234                        curr_word_end, curr_unichar_id)) return 0;
00235     if (unichar_id > curr_unichar_id) return 1;
00236     if (unichar_id == curr_unichar_id) {
00237       if (next_node > curr_next_node) return 1;
00238       if (next_node == curr_next_node) {
00239         if (word_end > curr_word_end) return 1;
00240       }
00241     }
00242     return -1;
00243   }
        // Returns true if the requested edge matches the other one: the unichar
        // ids must be equal; next_node must equal other_next_node unless it is
        // NO_EDGE (which matches any node); and word_end must equal
        // other_word_end unless it is false (which matches either value).
00247   inline bool edge_rec_match(NODE_REF next_node,
00248                              bool word_end,
00249                              UNICHAR_ID unichar_id,
00250                              NODE_REF other_next_node,
00251                              bool other_word_end,
00252                              UNICHAR_ID other_unichar_id) const {
00253     return ((unichar_id == other_unichar_id) &&
00254             (next_node == NO_EDGE || next_node == other_next_node) &&
00255             (!word_end || (word_end == other_word_end)));
00256   }
00257 
        // Initializes the dawg from the given settings; defined outside this
        // header. NOTE(review): presumably also derives the masks and start bits
        // below from unicharset_size - confirm.
00260   void init(DawgType type, const STRING &lang,
00261             PermuterType perm, int unicharset_size, int debug_level);
00262 
        // Defined outside this header. NOTE(review): by its signature it matches
        // word from position index starting at node, treating the wildcard
        // unichar id specially - confirm in the implementation.
00268   bool match_words(WERD_CHOICE *word, inT32 index,
00269                    NODE_REF node, UNICHAR_ID wildcard) const;
00270 
00271   // Member Variables.
00272   DawgType type_;
00273   STRING lang_;
        // Permuter value returned by permuter().
00275   PermuterType perm_;
00276   // Variables to construct various edge masks. Formerly:
00277   // #define NEXT_EDGE_MASK (inT64) 0xfffffff800000000i64
00278   // #define FLAGS_MASK     (inT64) 0x0000000700000000i64
00279   // #define LETTER_MASK    (inT64) 0x00000000ffffffffi64
00280   int unicharset_size_;
00281   int flag_start_bit_;
00282   int next_node_start_bit_;
00283   uinT64 next_node_mask_;
00284   uinT64 flags_mask_;
00285   uinT64 letter_mask_;
00286   // Level of debug statements to print to stdout.
00287   int debug_level_;
00288 };
00289 
00290 //
      // Pairs the index of a dawg with an edge reference inside that dawg.
00293 //
00294 struct DawgInfo {
        // Default: index -1 and the NO_EDGE sentinel, i.e. "refers to nothing".
00295   DawgInfo() : dawg_index(-1), ref(NO_EDGE) {}
00296   DawgInfo(int i, EDGE_REF r) : dawg_index(i), ref(r) {}
        // Two infos are equal when both the dawg index and the edge reference
        // match. Declared const so that const objects can be compared too.
00297   bool operator==(const DawgInfo &other) const {
00298     return (this->dawg_index == other.dawg_index && this->ref == other.ref);
00299   }
00300   int dawg_index;
00301   EDGE_REF ref;
00302 };
00303 class DawgInfoVector : public GenericVector<DawgInfo> {
        // A GenericVector of DawgInfo with a clear() that keeps its storage and
        // an insert-if-absent helper.
00304  public:
        // Frees data_ with delete[] when storage is reserved, then zeroes the
        // size counters. NOTE(review): resetting size_reserved_ presumably stops
        // the GenericVector destructor from freeing data_ a second time - confirm
        // against GenericVector.
00306   ~DawgInfoVector() {
00307     if (size_reserved_ > 0) {
00308       delete[] data_;
00309       size_used_ = 0;
00310       size_reserved_ = 0;
00311     }
00312   }
        // Empties the vector by resetting the element count only; data_ and
        // size_reserved_ are left untouched, so the storage is kept.
00315   void clear() { size_used_ = 0; }
        // Appends new_info unless an equal element is already present (linear
        // scan). Returns true if it was appended, false if it was a duplicate.
        // When debug is true and the element is appended, prints debug_msg
        // followed by the dawg index and edge reference.
00319   inline bool add_unique(const DawgInfo &new_info, bool debug,
00320                          const char *debug_msg) {
00321     for (int i = 0; i < size_used_; ++i) {
00322       if (data_[i] == new_info) return false;
00323     }
00324     push_back(new_info);
00325     if (debug) {
00326       tprintf("%s[%d, " REFFORMAT "]\n", debug_msg,
00327               new_info.dawg_index, new_info.ref);
00328     }
00329     return true;
00330   }
00331 };
00332 
00333 //
      // Dawg stored as a flat array of edge records (edges_). The edges of a
      // node occupy consecutive slots; the marker flag is set on the last one
      // (see unichar_ids_of() and last_edge()). An unused slot holds exactly
      // next_node_mask_ (see set_empty_edge() and edge_occupied()).
00340 //
00341 class SquishedDawg : public Dawg {
00342  public:
        // Loads the dawg from an already opened file via read_squished_dawg()
        // and caches the number of forward edges of node 0. The file is not
        // closed here.
00343   SquishedDawg(FILE *file, DawgType type, const STRING &lang,
00344                PermuterType perm, int debug_level) {
00345     read_squished_dawg(file, type, lang, perm, debug_level);
00346     num_forward_edges_in_node0 = num_forward_edges(0);
00347   }
        // Same as above, but opens filename for binary reading and closes it
        // afterwards. If the file cannot be opened, prints a message and
        // terminates the process with exit(1).
00348   SquishedDawg(const char* filename, DawgType type,
00349                const STRING &lang, PermuterType perm, int debug_level) {
00350     FILE *file = fopen(filename, "rb");
00351     if (file == NULL) {
00352       tprintf("Failed to open dawg file %s\n", filename);
00353       exit(1);
00354     }
00355     read_squished_dawg(file, type, lang, perm, debug_level);
00356     num_forward_edges_in_node0 = num_forward_edges(0);
00357     fclose(file);
00358   }
        // Wraps an existing array of num_edges edge records; the pointer is
        // stored, not copied. NOTE(review): whether the destructor frees edges
        // is not visible in this header. Prints every edge when
        // debug_level > 3.
00359   SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
00360                const STRING &lang, PermuterType perm,
00361                int unicharset_size, int debug_level) :
00362     edges_(edges), num_edges_(num_edges) {
00363     init(type, lang, perm, unicharset_size, debug_level);
00364     num_forward_edges_in_node0 = num_forward_edges(0);
00365     if (debug_level > 3) print_all("SquishedDawg:");
00366   }
00367   ~SquishedDawg();
00368 
        // Returns the number of edge records in this dawg.
00369   int NumEdges() const { return num_edges_; }
00370 
        // Edge lookup for unichar_id (with the requested word_end) from node;
        // defined outside this header.
00372   EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
00373                         bool word_end) const;
00374 
        // Appends to vec one NodeChild per edge of node, walking consecutive
        // edge slots from the node's first edge until the one carrying the
        // marker flag. Does nothing if node is NO_EDGE or its first slot is
        // unoccupied.
00377   void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const {
00378     EDGE_REF edge = node;
          // Test for NO_EDGE first: edge_occupied() indexes edges_ with the
          // reference, so it must not be called with the sentinel value.
00379     if (edge == NO_EDGE || !edge_occupied(edge)) return;
00380     assert(forward_edge(edge));  // we don't expect any backward edges to
00381     do {                         // be present when this function is called
00382       vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
00383     } while (!last_edge(edge++));
00384   }
00385 
        // Returns the next-node reference stored in the given edge. No bounds
        // check is performed on edge.
00388   NODE_REF next_node(EDGE_REF edge) const {
00389     return next_node_from_edge_rec((edges_[edge]));
00390   }
00391 
        // Returns true if the word-end flag is set on the given edge. No bounds
        // check is performed on edge_ref.
00394   bool end_of_word(EDGE_REF edge_ref) const {
00395     return end_of_word_from_edge_rec((edges_[edge_ref]));
00396   }
00397 
        // Returns the unichar id stored on the given edge.
00399   UNICHAR_ID edge_letter(EDGE_REF edge_ref) const {
00400     return unichar_id_from_edge_rec((edges_[edge_ref]));
00401   }
00402 
        // Prints node; defined outside this header.
00405   void print_node(NODE_REF node, int max_num_edges) const;
00406 
        // Writes the dawg to an already opened file; defined outside this header.
00408   void write_squished_dawg(FILE *file);
00409 
        // Opens filename for binary writing, writes the dawg to it and closes it.
        // If the file cannot be opened, prints a message and terminates the
        // process with exit(1).
00412   void write_squished_dawg(const char *filename) {
00413     FILE *file = fopen(filename, "wb");
00414     if (file == NULL) {
00415       tprintf("Error opening %s\n", filename);
00416       exit(1);
00417     }
00418     this->write_squished_dawg(file);
00419     fclose(file);
00420   }
00421 
00422  private:
        // Stores value as the next-node reference of the given edge.
00424   inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {
00425     set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
00426   }
        // Marks the slot as unused by storing next_node_mask_ as the whole
        // record; edge_occupied() tests for exactly this value.
00428   inline void set_empty_edge(EDGE_REF edge_ref) {
00429     (edges_[edge_ref] = next_node_mask_);
00430   }
        // Marks every slot in edges_ as unused.
00432   inline void clear_all_edges() {
00433     for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
00434   }
        // Clears the marker flag of the given edge.
00436   inline void clear_marker_flag(EDGE_REF edge_ref) {
00437      (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_));
00438   }
        // Returns true if the slot is occupied and holds a forward edge.
00440   inline bool forward_edge(EDGE_REF edge_ref) const {
00441     return (edge_occupied(edge_ref) &&
00442             (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
00443   }
        // Returns true if the slot is occupied and holds a backward edge.
00445   inline bool backward_edge(EDGE_REF edge_ref) const {
00446     return (edge_occupied(edge_ref) &&
00447             (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
00448   }
        // Returns true if the slot holds anything other than the empty-slot
        // value. Indexes edges_ without a bounds check, so edge_ref must be a
        // valid index (in particular not NO_EDGE).
00450   inline bool edge_occupied(EDGE_REF edge_ref) const {
00451     return (edges_[edge_ref] != next_node_mask_);
00452   }
        // Returns true if the marker flag is set on the given edge, i.e. it is
        // the last edge of its node.
00454   inline bool last_edge(EDGE_REF edge_ref) const {
00455     return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0;
00456   }
00457 
        // Counts the forward edges of node; defined outside this header.
00459   inT32 num_forward_edges(NODE_REF node) const;
00460 
        // Loads the dawg from file; defined outside this header.
00462   void read_squished_dawg(FILE *file, DawgType type, const STRING &lang,
00463                           PermuterType perm, int debug_level);
00464 
        // Prints a single edge; defined outside this header.
00466   void print_edge(EDGE_REF edge) const;
00467 
        // Prints msg followed by every edge slot in edges_, framed by separator
        // lines.
00469   void print_all(const char* msg) {
00470     tprintf("\n__________________________\n%s\n", msg);
00471     for (int i = 0; i < num_edges_; ++i) print_edge(i);
00472     tprintf("__________________________\n");
00473   }
        // Defined outside this header. NOTE(review): by its signature it returns
        // a node map and reports the node count through num_nodes; ownership of
        // the returned array is not established here.
00475   NODE_MAP build_node_map(inT32 *num_nodes) const;
00476 
00477 
00478   // Member variables.
00479   EDGE_ARRAY edges_;
00480   int num_edges_;
        // Cached result of num_forward_edges(0), set by every constructor.
00481   int num_forward_edges_in_node0;
00482 };
00483 
00484 }  // namespace tesseract
00485 
00486 #endif  // DICT_DAWG_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines