Tesseract 3.01
/data/source/tesseract-ocr/cube/char_bigrams.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        char_bigrams.h
00003  * Description: Declaration of a Character Bigrams Class
00004  * Author:    Ahmad Abdulkader
00005  * Created:   2007
00006  *
00007  * (C) Copyright 2008, Google Inc.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 // The CharBigram class represents the interface to the character bigram
00021 // table used by Cube
00022 // A CharBigram object can be constructed from the Char Bigrams file
00023 // Given a sequence of characters, the "Cost" method returns the Char Bigram
00024 // cost of the string according to the table
00025 
00026 #ifndef CHAR_BIGRAMS_H
00027 #define CHAR_BIGRAMS_H
00028 
00029 #include <string>
00030 #include "char_set.h"
00031 
00032 namespace tesseract {
00033 
00034 // structure representing a single bigram value
00035 struct Bigram {
00036   int cnt;
00037   int cost;
00038 };
00039 
00040 // structure representing the char bigram array of characters
00041 // following a specific character
00042 struct CharBigram {
00043   int total_cnt;
00044   char_32 max_char;
00045   Bigram *bigram;
00046 };
00047 
00048 // structure representing the whole bigram table
00049 struct CharBigramTable {
00050   int total_cnt;
00051   int worst_cost;
00052   char_32 max_char;
00053   CharBigram *char_bigram;
00054 };
00055 
00056 class CharBigrams {
00057  public:
00058   CharBigrams();
00059   ~CharBigrams();
00060   // Construct the CharBigrams class from a file
00061   static CharBigrams *Create(const string &data_file_path,
00062                              const string &lang);
00063   // Top-level function to return the mean character bigram cost of a
00064   // sequence of characters.  If char_set and unicharset are not NULL
00065   // and cube and tesseract share the same unicharset, use
00066   // tesseract functions to return a case-invariant cost.
00067   // This avoids unnecessarily penalizing all-one-case words or
00068   // capitalized words (first-letter upper-case and remaining letters
00069   // lower-case).
00070   int Cost(const char_32 *str, CharSet *char_set, UNICHARSET *unicharset) const;
00071 
00072  protected:
00073   // Returns the character bigram cost of two characters.
00074   int PairCost(char_32 ch1, char_32 ch2) const;
00075   // Returns the mean character bigram cost of a sequence of
00076   // characters. Adds a space at the beginning and end to account for
00077   // cost of starting and ending characters.
00078   int MeanCostWithSpaces(const char_32 *char_32_ptr) const;
00079 
00080  private:
00081   // Only words this length or greater qualify for case-invariant character
00082   // bigram cost.
00083   static const int kMinLengthCaseInvariant = 4;
00084 
00085 
00086   CharBigramTable bigram_table_;
00087 };
00088 }
00089 
00090 #endif  // CHAR_BIGRAMS_H
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines