Tesseract 3.01
/data/source/tesseract-ocr/cube/word_list_lang_model.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        word_list_lang_model.h
00003  * Description: Declaration of the Word List Language Model Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2008
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 // The WordListLangModel class abstracts a language model that is based on
00021 // a list of words. It inherits from the LangModel abstract class.
00022 // Besides providing the methods inherited from the LangModel abstract class,
00023 // the class provides methods to add new strings to the language model:
00024 // AddString & AddString32
00025 
00026 #ifndef WORD_LIST_LANG_MODEL_H
00027 #define WORD_LIST_LANG_MODEL_H
00028 
00029 #include <vector>
00030 
00031 #include "cube_reco_context.h"
00032 #include "lang_model.h"
00033 #include "tess_lang_mod_edge.h"
00034 
00035 namespace tesseract {
00036 
00037 class Trie;
00038 
00039 class WordListLangModel : public LangModel {  // LangModel based on a list of words
00040  public:
00041   explicit WordListLangModel(CubeRecoContext *cntxt);
00042   ~WordListLangModel();
00043   // Returns a pointer to the root edge of the language model.
00044   LangModEdge *Root();
00045   // Returns the edges that fan out of the specified edge, and their count (via edge_cnt).
00046   LangModEdge **GetEdges(CharAltList *alt_list,
00047                          LangModEdge *edge,
00048                          int *edge_cnt);
00049   // Returns true if a sequence of 32-bit characters is valid within this
00050   // language model, false if not. An EndOfWord flag is specified: if true,
00051   // the sequence has to end on a valid word. The function also optionally
00052   // returns the list of language model edges traversed to parse the string.
00053   bool IsValidSequence(const char_32 *sequence,
00054                        bool eow_flag,
00055                        LangModEdge **edges);
00056   bool IsLeadingPunc(char_32 ch) { return false; }  // not yet implemented: always returns false
00057   bool IsTrailingPunc(char_32 ch) { return false; }  // not yet implemented: always returns false
00058   bool IsDigit(char_32 ch) { return false; }  // not yet implemented: always returns false
00059   // Adds a new UTF-8 string to the language model.
00060   bool AddString(const char *char_ptr);
00061   // Adds a new UTF-32 string to the language model.
00062   bool AddString32(const char_32 *char_32_ptr);
00063   // Computes all the variants of a 32-bit string in terms of the class-ids.
00064   // This is needed for languages that have ligatures: a word can then have
00065   // more than one spelling in terms of the class-ids.
00066   static void WordVariants(const CharSet &char_set, string_32 str32,
00067                            vector<WERD_CHOICE> *word_variants);
00068  private:
00069   // Constants needed to configure the language model.
00070   static const int kMaxEdge = 512;
00071   static const int kMaxDawgEdges = 20000;
00072 
00073   CubeRecoContext *cntxt_;  // NOTE(review): presumably the context given to the constructor -- confirm in the .cpp
00074   Trie *dawg_;  // NOTE(review): presumably the trie holding the added words -- confirm in the .cpp
00075   bool init_;  // NOTE(review): presumably records whether Init() has run -- confirm in the .cpp
00076   // Initializes the language model.
00077   bool Init();
00078   // Cleanup; the resources involved are not visible in this header.
00079   void Cleanup();
00080   // Recursive helper function for the public WordVariants() overload.
00081   static void WordVariants(const CharSet &char_set,
00082                            string_32 prefix_str32, WERD_CHOICE *word_so_far,
00083                            string_32 str32, vector<WERD_CHOICE> *word_variants);
00084 };
00085 }  // tesseract
00086 
00087 #endif  // WORD_LIST_LANG_MODEL_H
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines