Tesseract 3.01
/data/source/tesseract-ocr/ccstruct/ratngs.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        ratngs.h  (Formerly ratings.h)
00003  * Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes.
00004  * Author:      Ray Smith
00005  * Created:     Thu Apr 23 11:40:38 BST 1992
00006  *
00007  * (C) Copyright 1992, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifndef           RATNGS_H
00021 #define           RATNGS_H
00022 
00023 #include <assert.h>
00024 
00025 #include "clst.h"
00026 #include "genericvector.h"
00027 #include "notdll.h"
00028 #include "unichar.h"
00029 #include "unicharset.h"
00030 #include "werd.h"
00031 
00032 class BLOB_CHOICE: public ELIST_LINK
00033 {
00034   public:
00035     BLOB_CHOICE() {
00036       unichar_id_ = INVALID_UNICHAR_ID;
00037       fontinfo_id_ = -1;
00038       fontinfo_id2_ = -1;
00039       rating_ = MAX_FLOAT32;
00040       certainty_ = -MAX_FLOAT32;
00041       script_id_ = -1;
00042       language_model_state_ = NULL;
00043     }
00044     BLOB_CHOICE(UNICHAR_ID src_unichar_id,  // character id
00045                 float src_rating,          // rating
00046                 float src_cert,            // certainty
00047                 inT16 src_fontinfo_id,      // font
00048                 inT16 src_fontinfo_id2,     // 2nd choice font
00049                 int script_id);            // script
00050     BLOB_CHOICE(const BLOB_CHOICE &other);
00051     ~BLOB_CHOICE() {}
00052 
00053     UNICHAR_ID unichar_id() const {
00054       return unichar_id_;
00055     }
00056     float rating() const {
00057       return rating_;
00058     }
00059     float certainty() const {
00060       return certainty_;
00061     }
00062     inT16 fontinfo_id() const {
00063       return fontinfo_id_;
00064     }
00065     inT16 fontinfo_id2() const {
00066       return fontinfo_id2_;
00067     }
00068     int script_id() const {
00069       return script_id_;
00070     }
00071     void *language_model_state() {
00072       return language_model_state_;
00073     }
00074     inT16 xgap_before() {
00075       return xgap_before_;
00076     }
00077     inT16 xgap_after() {
00078       return xgap_after_;
00079     }
00080 
00081     void set_unichar_id(UNICHAR_ID newunichar_id) {
00082       unichar_id_ = newunichar_id;
00083     }
00084     void set_rating(float newrat) {
00085       rating_ = newrat;
00086     }
00087     void set_certainty(float newrat) {
00088       certainty_ = newrat;
00089     }
00090     void set_fontinfo_id(inT16 newfont) {
00091       fontinfo_id_ = newfont;
00092     }
00093     void set_fontinfo_id2(inT16 newfont) {
00094       fontinfo_id2_ = newfont;
00095     }
00096     void set_script(int newscript_id) {
00097       script_id_ = newscript_id;
00098     }
00099     void set_language_model_state(void *language_model_state) {
00100       language_model_state_ = language_model_state;
00101     }
00102     void set_xgap_before(inT16 gap) {
00103       xgap_before_ = gap;
00104     }
00105     void set_xgap_after(inT16 gap) {
00106       xgap_after_ = gap;
00107     }
00108     static BLOB_CHOICE* deep_copy(const BLOB_CHOICE* src) {
00109       BLOB_CHOICE* choice = new BLOB_CHOICE;
00110       *choice = *src;
00111       return choice;
00112     }
00113     void print(const UNICHARSET *unicharset) {
00114       tprintf("r%.2f c%.2f : %d %s", rating_, certainty_, unichar_id_,
00115               (unicharset == NULL) ? "" :
00116               unicharset->debug_str(unichar_id_).string());
00117     }
00118 
00119  private:
00120   UNICHAR_ID unichar_id_;          // unichar id
00121   inT16 fontinfo_id_;              // char font information
00122   inT16 fontinfo_id2_;             // 2nd choice font information
00123   float rating_;                  // size related
00124   float certainty_;               // absolute
00125   int script_id_;
00126   // Stores language model information about this BLOB_CHOICE. Used during
00127   // the segmentation search for BLOB_CHOICEs in BLOB_CHOICE_LISTs that are
00128   // recorded in the ratings matrix.
00129   // The pointer is owned/managed by the segmentation search.
00130   void *language_model_state_;
00131   inT16 xgap_before_;
00132   inT16 xgap_after_;
00133 };
00134 
00135 // Make BLOB_CHOICE listable.
00136 ELISTIZEH (BLOB_CHOICE) CLISTIZEH (BLOB_CHOICE_LIST)
00137 
// Permuter codes used in WERD_CHOICEs.
// The numeric values are spelled out explicitly so that they stay fixed
// even if an enumerator is later inserted or reordered.
enum PermuterType {
  NO_PERM = 0,
  PUNC_PERM = 1,
  TOP_CHOICE_PERM = 2,
  LOWER_CASE_PERM = 3,
  UPPER_CASE_PERM = 4,
  NGRAM_PERM = 5,
  NUMBER_PERM = 6,
  USER_PATTERN_PERM = 7,
  SYSTEM_DAWG_PERM = 8,
  DOC_DAWG_PERM = 9,
  USER_DAWG_PERM = 10,
  FREQ_DAWG_PERM = 11,
  COMPOUND_PERM = 12,
};
00154 
00155 class WERD_CHOICE {
00156  public:
00157   static const float kBadRating;
00158 
00159   WERD_CHOICE() { this->init(8); }
00160   WERD_CHOICE(int reserved) { this->init(reserved); }
00161   WERD_CHOICE(const char *src_string,
00162               const char *src_lengths,
00163               float src_rating,
00164               float src_certainty,
00165               uinT8 src_permuter,
00166               const UNICHARSET &unicharset) {
00167     this->init(src_string, src_lengths, src_rating,
00168                src_certainty, src_permuter, unicharset);
00169   }
00170   WERD_CHOICE (const char *src_string, const UNICHARSET &unicharset);
00171   WERD_CHOICE(const WERD_CHOICE &word) {
00172     this->init(word.length());
00173     this->operator=(word);
00174   }
00175   ~WERD_CHOICE();
00176 
00177   inline int length() const {
00178     return length_;
00179   }
00180   inline const UNICHAR_ID *unichar_ids() const {
00181     return unichar_ids_;
00182   }
00183   inline const UNICHAR_ID unichar_id(int index) const {
00184     assert(index < length_);
00185     return unichar_ids_[index];
00186   }
00187   inline const char *fragment_lengths() const {
00188     return fragment_lengths_;
00189   }
00190   inline const char fragment_length(int index) const {
00191     assert(index < length_);
00192     return fragment_lengths_[index];
00193   }
00194   inline float rating() const {
00195     return rating_;
00196   }
00197   inline float certainty() const {
00198     return certainty_;
00199   }
00200   inline uinT8 permuter() const {
00201     return permuter_;
00202   }
00203   inline bool fragment_mark() const {
00204     return fragment_mark_;
00205   }
00206   inline BLOB_CHOICE_LIST_CLIST* blob_choices() {
00207     return blob_choices_;
00208   }
00209   inline void set_unichar_id(UNICHAR_ID unichar_id, int index) {
00210     assert(index < length_);
00211     unichar_ids_[index] = unichar_id;
00212   }
00213   inline void set_fragment_length(char flen, int index) {
00214     assert(index < length_);
00215     fragment_lengths_[index] = flen;
00216   }
00217   inline void set_rating(float new_val) {
00218     rating_ = new_val;
00219   }
00220   inline void set_certainty(float new_val) {
00221     certainty_ = new_val;
00222   }
00223   inline void set_permuter(uinT8 perm) {
00224     permuter_ = perm;
00225   }
00226   inline void set_fragment_mark(bool new_fragment_mark) {
00227     fragment_mark_ = new_fragment_mark;
00228   }
00229   // Note: this function should only be used if all the fields
00230   // are populated manually with set_* functions (rather than
00231   // (copy)constructors and append_* functions).
00232   inline void set_length(int len) {
00233     ASSERT_HOST(reserved_ >= len);
00234     length_ = len;
00235   }
00236   void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices);
00237 
00239   inline void double_the_size() {
00240     unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy(
00241         reserved_, unichar_ids_);
00242     fragment_lengths_ = GenericVector<char>::double_the_size_memcpy(
00243         reserved_, fragment_lengths_);
00244     reserved_ *= 2;
00245   }
00246 
00249   inline void init(int reserved) {
00250     reserved_ = reserved;
00251     unichar_ids_ = new UNICHAR_ID[reserved];
00252     fragment_lengths_ = new char[reserved];
00253     length_ = 0;
00254     rating_ = 0.0;
00255     certainty_ = MAX_FLOAT32;
00256     permuter_ = NO_PERM;
00257     fragment_mark_ = false;
00258     blob_choices_ = NULL;
00259     unichar_string_ = "";
00260     unichar_lengths_ = "";
00261   }
00262 
00268   void init(const char *src_string, const char *src_lengths,
00269             float src_rating, float src_certainty,
00270             uinT8 src_permuter, const UNICHARSET &current_unicharset);
00271 
00273   inline void make_bad() {
00274     length_ = 0;
00275     rating_ = kBadRating;
00276     certainty_ = -MAX_FLOAT32;
00277     fragment_mark_ = false;
00278     unichar_string_ = "";
00279     unichar_lengths_ = "";
00280   }
00281 
00285   inline void append_unichar_id_space_allocated(
00286       UNICHAR_ID unichar_id, char fragment_length,
00287       float rating, float certainty) {
00288     assert(reserved_ > length_);
00289     length_++;
00290     this->set_unichar_id(unichar_id, fragment_length,
00291                          rating, certainty, length_-1);
00292   }
00293 
00294   void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length,
00295                          float rating, float certainty);
00296 
00297   inline void set_unichar_id(UNICHAR_ID unichar_id, char fragment_length,
00298                              float rating, float certainty, int index) {
00299     assert(index < length_);
00300     unichar_ids_[index] = unichar_id;
00301     fragment_lengths_[index] = fragment_length;
00302     rating_ += rating;
00303     if (certainty < certainty_) {
00304       certainty_ = certainty;
00305     }
00306   }
00307 
00308   bool contains_unichar_id(UNICHAR_ID unichar_id) const;
00309   void remove_unichar_ids(int index, int num);
00310   inline void remove_last_unichar_id() { --length_; }
00311   inline void remove_unichar_id(int index) { this->remove_unichar_ids(index, 1); }
00312   void string_and_lengths(const UNICHARSET &current_unicharset,
00313                           STRING *word_str, STRING *word_lengths_str) const;
00314   const STRING debug_string(const UNICHARSET &current_unicharset) const {
00315     STRING word_str;
00316     for (int i = 0; i < length_; ++i) {
00317       word_str += current_unicharset.debug_str(unichar_ids_[i]);
00318       word_str += " ";
00319     }
00320     return word_str;
00321   }
00325   void populate_unichars(const UNICHARSET &current_unicharset) {
00326     this->string_and_lengths(current_unicharset, &unichar_string_,
00327                              &unichar_lengths_);
00328   }
00331   void depopulate_unichars() {
00332     unichar_string_ = "";
00333     unichar_lengths_ = "";
00334   }
00337   const STRING &unichar_string() const {
00338     assert(unichar_string_.length() <= 0 ||
00339            unichar_string_.length() >= length_);  // sanity check
00340     return unichar_string_;
00341   }
00344   const STRING &unichar_lengths() const {
00345     assert(unichar_lengths_.length() <= 0 ||
00346            unichar_lengths_.length() == length_);  // sanity check
00347     return unichar_lengths_;
00348   }
00349   const void print() const { this->print(""); }
00350   const void print(const char *msg) const;
00351 
00352   WERD_CHOICE& operator+= (     // concatanate
00353     const WERD_CHOICE & second);// second on first
00354 
00355   WERD_CHOICE& operator= (const WERD_CHOICE& source);
00356 
00357  private:
00358   UNICHAR_ID *unichar_ids_;  // unichar ids that represent the text of the word
00359   char *fragment_lengths_;   // number of fragments in each unichar
00360   int reserved_;             // size of the above arrays
00361   int length_;               // word length
00362   float rating_;             // size related
00363   float certainty_;          // absolute
00364   uinT8 permuter_;           // permuter code
00365   bool fragment_mark_;       // if true, indicates that this choice
00366                              // was chosen over a better one that
00367                              // contained a fragment
00368   BLOB_CHOICE_LIST_CLIST *blob_choices_;  // best choices for each blob
00369 
00370   // The following variables are only populated by calling populate_unichars().
00371   // They are not synchronized with the values in unichar_ids otherwise.
00372   STRING unichar_string_;
00373   STRING unichar_lengths_;
00374   bool unichar_info_present;
00375 
00376  private:
00377   void delete_blob_choices();
00378 };
00379 
00380 // Make WERD_CHOICE listable.
00381 ELISTIZEH (WERD_CHOICE)
00382 typedef GenericVector<BLOB_CHOICE_LIST *> BLOB_CHOICE_LIST_VECTOR;
00383 typedef GenericVector<WERD_CHOICE_LIST *> WERD_CHOICE_LIST_VECTOR;
00384 
00385 void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings);
00386 void print_ratings_list(
00387     const char *msg,                      // intro message
00388     BLOB_CHOICE_LIST *ratings,            // list of results
00389     const UNICHARSET &current_unicharset  // unicharset that can be used
00390                                           // for id-to-unichar conversion
00391     );
00392 void print_ratings_info(
00393     FILE *fp,                             // file to use
00394     BLOB_CHOICE_LIST *ratings,            // list of results
00395     const UNICHARSET &current_unicharset  // unicharset that can be used
00396                                           // for id-to-unichar conversion
00397     );
00398 void print_char_choices_list(
00399     const char *msg,
00400     const BLOB_CHOICE_LIST_VECTOR &char_choices,
00401     const UNICHARSET &current_unicharset,
00402     BOOL8 detailed
00403     );
00404 
00405 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines