Tesseract 3.01
|
00001 /********************************************************************** 00002 * File: ratngs.h (Formerly ratings.h) 00003 * Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes. 00004 * Author: Ray Smith 00005 * Created: Thu Apr 23 11:40:38 BST 1992 00006 * 00007 * (C) Copyright 1992, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 **********************************************************************/ 00019 00020 #ifndef RATNGS_H 00021 #define RATNGS_H 00022 00023 #include <assert.h> 00024 00025 #include "clst.h" 00026 #include "genericvector.h" 00027 #include "notdll.h" 00028 #include "unichar.h" 00029 #include "unicharset.h" 00030 #include "werd.h" 00031 00032 class BLOB_CHOICE: public ELIST_LINK 00033 { 00034 public: 00035 BLOB_CHOICE() { 00036 unichar_id_ = INVALID_UNICHAR_ID; 00037 fontinfo_id_ = -1; 00038 fontinfo_id2_ = -1; 00039 rating_ = MAX_FLOAT32; 00040 certainty_ = -MAX_FLOAT32; 00041 script_id_ = -1; 00042 language_model_state_ = NULL; 00043 } 00044 BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id 00045 float src_rating, // rating 00046 float src_cert, // certainty 00047 inT16 src_fontinfo_id, // font 00048 inT16 src_fontinfo_id2, // 2nd choice font 00049 int script_id); // script 00050 BLOB_CHOICE(const BLOB_CHOICE &other); 00051 ~BLOB_CHOICE() {} 00052 00053 UNICHAR_ID unichar_id() const { 00054 return unichar_id_; 00055 } 00056 float rating() const { 00057 return rating_; 00058 } 00059 float certainty() const { 00060 return certainty_; 00061 } 00062 inT16 fontinfo_id() const { 00063 return fontinfo_id_; 00064 } 00065 inT16 fontinfo_id2() const { 00066 return fontinfo_id2_; 00067 } 00068 int script_id() const { 00069 return script_id_; 00070 } 00071 void *language_model_state() { 00072 return language_model_state_; 00073 } 00074 inT16 xgap_before() { 00075 return xgap_before_; 00076 } 00077 inT16 xgap_after() { 00078 return xgap_after_; 00079 } 00080 00081 void set_unichar_id(UNICHAR_ID newunichar_id) { 00082 unichar_id_ = newunichar_id; 00083 } 00084 void set_rating(float newrat) { 00085 rating_ = newrat; 00086 } 00087 void set_certainty(float newrat) { 00088 certainty_ = newrat; 00089 } 00090 void set_fontinfo_id(inT16 newfont) { 00091 fontinfo_id_ = newfont; 00092 } 00093 void set_fontinfo_id2(inT16 newfont) { 00094 fontinfo_id2_ = newfont; 00095 } 00096 void set_script(int newscript_id) { 00097 script_id_ = newscript_id; 00098 } 00099 void set_language_model_state(void *language_model_state) { 00100 language_model_state_ = language_model_state; 00101 } 00102 void set_xgap_before(inT16 gap) { 00103 xgap_before_ = gap; 00104 } 00105 void set_xgap_after(inT16 gap) { 00106 xgap_after_ = gap; 00107 } 00108 static BLOB_CHOICE* deep_copy(const BLOB_CHOICE* src) { 00109 BLOB_CHOICE* choice = new BLOB_CHOICE; 00110 *choice = *src; 00111 return choice; 00112 } 00113 void print(const UNICHARSET *unicharset) { 00114 tprintf("r%.2f c%.2f : %d %s", rating_, certainty_, unichar_id_, 00115 (unicharset == NULL) ? "" : 00116 unicharset->debug_str(unichar_id_).string()); 00117 } 00118 00119 private: 00120 UNICHAR_ID unichar_id_; // unichar id 00121 inT16 fontinfo_id_; // char font information 00122 inT16 fontinfo_id2_; // 2nd choice font information 00123 float rating_; // size related 00124 float certainty_; // absolute 00125 int script_id_; 00126 // Stores language model information about this BLOB_CHOICE. Used during 00127 // the segmentation search for BLOB_CHOICEs in BLOB_CHOICE_LISTs that are 00128 // recorded in the ratings matrix. 00129 // The pointer is owned/managed by the segmentation search. 00130 void *language_model_state_; 00131 inT16 xgap_before_; 00132 inT16 xgap_after_; 00133 }; 00134 00135 // Make BLOB_CHOICE listable. 00136 ELISTIZEH (BLOB_CHOICE) CLISTIZEH (BLOB_CHOICE_LIST) 00137 00138 // Permuter codes used in WERD_CHOICEs. 00139 enum PermuterType { 00140 NO_PERM, // 0 00141 PUNC_PERM, // 1 00142 TOP_CHOICE_PERM, // 2 00143 LOWER_CASE_PERM, // 3 00144 UPPER_CASE_PERM, // 4 00145 NGRAM_PERM, // 5 00146 NUMBER_PERM, // 6 00147 USER_PATTERN_PERM, // 7 00148 SYSTEM_DAWG_PERM, // 8 00149 DOC_DAWG_PERM, // 9 00150 USER_DAWG_PERM, // 10 00151 FREQ_DAWG_PERM, // 11 00152 COMPOUND_PERM, // 12 00153 }; 00154 00155 class WERD_CHOICE { 00156 public: 00157 static const float kBadRating; 00158 00159 WERD_CHOICE() { this->init(8); } 00160 WERD_CHOICE(int reserved) { this->init(reserved); } 00161 WERD_CHOICE(const char *src_string, 00162 const char *src_lengths, 00163 float src_rating, 00164 float src_certainty, 00165 uinT8 src_permuter, 00166 const UNICHARSET &unicharset) { 00167 this->init(src_string, src_lengths, src_rating, 00168 src_certainty, src_permuter, unicharset); 00169 } 00170 WERD_CHOICE (const char *src_string, const UNICHARSET &unicharset); 00171 WERD_CHOICE(const WERD_CHOICE &word) { 00172 this->init(word.length()); 00173 this->operator=(word); 00174 } 00175 ~WERD_CHOICE(); 00176 00177 inline int length() const { 00178 return length_; 00179 } 00180 inline const UNICHAR_ID *unichar_ids() const { 00181 return unichar_ids_; 00182 } 00183 inline const UNICHAR_ID unichar_id(int index) const { 00184 assert(index < length_); 00185 return unichar_ids_[index]; 00186 } 00187 inline const char *fragment_lengths() const { 00188 return fragment_lengths_; 00189 } 00190 inline const char fragment_length(int index) const { 00191 assert(index < length_); 00192 return fragment_lengths_[index]; 00193 } 00194 inline float rating() const { 00195 return rating_; 00196 } 00197 inline float certainty() const { 00198 return certainty_; 00199 } 00200 inline uinT8 permuter() const { 00201 return permuter_; 00202 } 00203 inline bool fragment_mark() const { 00204 return fragment_mark_; 00205 } 00206 inline BLOB_CHOICE_LIST_CLIST* blob_choices() { 00207 return blob_choices_; 00208 } 00209 inline void set_unichar_id(UNICHAR_ID unichar_id, int index) { 00210 assert(index < length_); 00211 unichar_ids_[index] = unichar_id; 00212 } 00213 inline void set_fragment_length(char flen, int index) { 00214 assert(index < length_); 00215 fragment_lengths_[index] = flen; 00216 } 00217 inline void set_rating(float new_val) { 00218 rating_ = new_val; 00219 } 00220 inline void set_certainty(float new_val) { 00221 certainty_ = new_val; 00222 } 00223 inline void set_permuter(uinT8 perm) { 00224 permuter_ = perm; 00225 } 00226 inline void set_fragment_mark(bool new_fragment_mark) { 00227 fragment_mark_ = new_fragment_mark; 00228 } 00229 // Note: this function should only be used if all the fields 00230 // are populated manually with set_* functions (rather than 00231 // (copy)constructors and append_* functions). 00232 inline void set_length(int len) { 00233 ASSERT_HOST(reserved_ >= len); 00234 length_ = len; 00235 } 00236 void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices); 00237 00239 inline void double_the_size() { 00240 unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy( 00241 reserved_, unichar_ids_); 00242 fragment_lengths_ = GenericVector<char>::double_the_size_memcpy( 00243 reserved_, fragment_lengths_); 00244 reserved_ *= 2; 00245 } 00246 00249 inline void init(int reserved) { 00250 reserved_ = reserved; 00251 unichar_ids_ = new UNICHAR_ID[reserved]; 00252 fragment_lengths_ = new char[reserved]; 00253 length_ = 0; 00254 rating_ = 0.0; 00255 certainty_ = MAX_FLOAT32; 00256 permuter_ = NO_PERM; 00257 fragment_mark_ = false; 00258 blob_choices_ = NULL; 00259 unichar_string_ = ""; 00260 unichar_lengths_ = ""; 00261 } 00262 00268 void init(const char *src_string, const char *src_lengths, 00269 float src_rating, float src_certainty, 00270 uinT8 src_permuter, const UNICHARSET ¤t_unicharset); 00271 00273 inline void make_bad() { 00274 length_ = 0; 00275 rating_ = kBadRating; 00276 certainty_ = -MAX_FLOAT32; 00277 fragment_mark_ = false; 00278 unichar_string_ = ""; 00279 unichar_lengths_ = ""; 00280 } 00281 00285 inline void append_unichar_id_space_allocated( 00286 UNICHAR_ID unichar_id, char fragment_length, 00287 float rating, float certainty) { 00288 assert(reserved_ > length_); 00289 length_++; 00290 this->set_unichar_id(unichar_id, fragment_length, 00291 rating, certainty, length_-1); 00292 } 00293 00294 void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length, 00295 float rating, float certainty); 00296 00297 inline void set_unichar_id(UNICHAR_ID unichar_id, char fragment_length, 00298 float rating, float certainty, int index) { 00299 assert(index < length_); 00300 unichar_ids_[index] = unichar_id; 00301 fragment_lengths_[index] = fragment_length; 00302 rating_ += rating; 00303 if (certainty < certainty_) { 00304 certainty_ = certainty; 00305 } 00306 } 00307 00308 bool contains_unichar_id(UNICHAR_ID unichar_id) const; 00309 void remove_unichar_ids(int index, int num); 00310 inline void remove_last_unichar_id() { --length_; } 00311 inline void remove_unichar_id(int index) { this->remove_unichar_ids(index, 1); } 00312 void string_and_lengths(const UNICHARSET ¤t_unicharset, 00313 STRING *word_str, STRING *word_lengths_str) const; 00314 const STRING debug_string(const UNICHARSET ¤t_unicharset) const { 00315 STRING word_str; 00316 for (int i = 0; i < length_; ++i) { 00317 word_str += current_unicharset.debug_str(unichar_ids_[i]); 00318 word_str += " "; 00319 } 00320 return word_str; 00321 } 00325 void populate_unichars(const UNICHARSET ¤t_unicharset) { 00326 this->string_and_lengths(current_unicharset, &unichar_string_, 00327 &unichar_lengths_); 00328 } 00331 void depopulate_unichars() { 00332 unichar_string_ = ""; 00333 unichar_lengths_ = ""; 00334 } 00337 const STRING &unichar_string() const { 00338 assert(unichar_string_.length() <= 0 || 00339 unichar_string_.length() >= length_); // sanity check 00340 return unichar_string_; 00341 } 00344 const STRING &unichar_lengths() const { 00345 assert(unichar_lengths_.length() <= 0 || 00346 unichar_lengths_.length() == length_); // sanity check 00347 return unichar_lengths_; 00348 } 00349 const void print() const { this->print(""); } 00350 const void print(const char *msg) const; 00351 00352 WERD_CHOICE& operator+= ( // concatanate 00353 const WERD_CHOICE & second);// second on first 00354 00355 WERD_CHOICE& operator= (const WERD_CHOICE& source); 00356 00357 private: 00358 UNICHAR_ID *unichar_ids_; // unichar ids that represent the text of the word 00359 char *fragment_lengths_; // number of fragments in each unichar 00360 int reserved_; // size of the above arrays 00361 int length_; // word length 00362 float rating_; // size related 00363 float certainty_; // absolute 00364 uinT8 permuter_; // permuter code 00365 bool fragment_mark_; // if true, indicates that this choice 00366 // was chosen over a better one that 00367 // contained a fragment 00368 BLOB_CHOICE_LIST_CLIST *blob_choices_; // best choices for each blob 00369 00370 // The following variables are only populated by calling populate_unichars(). 00371 // They are not synchronized with the values in unichar_ids otherwise. 00372 STRING unichar_string_; 00373 STRING unichar_lengths_; 00374 bool unichar_info_present; 00375 00376 private: 00377 void delete_blob_choices(); 00378 }; 00379 00380 // Make WERD_CHOICE listable. 00381 ELISTIZEH (WERD_CHOICE) 00382 typedef GenericVector<BLOB_CHOICE_LIST *> BLOB_CHOICE_LIST_VECTOR; 00383 typedef GenericVector<WERD_CHOICE_LIST *> WERD_CHOICE_LIST_VECTOR; 00384 00385 void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings); 00386 void print_ratings_list( 00387 const char *msg, // intro message 00388 BLOB_CHOICE_LIST *ratings, // list of results 00389 const UNICHARSET ¤t_unicharset // unicharset that can be used 00390 // for id-to-unichar conversion 00391 ); 00392 void print_ratings_info( 00393 FILE *fp, // file to use 00394 BLOB_CHOICE_LIST *ratings, // list of results 00395 const UNICHARSET ¤t_unicharset // unicharset that can be used 00396 // for id-to-unichar conversion 00397 ); 00398 void print_char_choices_list( 00399 const char *msg, 00400 const BLOB_CHOICE_LIST_VECTOR &char_choices, 00401 const UNICHARSET ¤t_unicharset, 00402 BOOL8 detailed 00403 ); 00404 00405 #endif