Tesseract 3.01
|
00001 00002 // File: unicharset.h 00003 // Description: Unicode character/ligature set class. 00004 // Author: Thomas Kielbus 00005 // Created: Wed Jun 28 17:05:01 PDT 2006 00006 // 00007 // (C) Copyright 2006, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__ 00021 #define TESSERACT_CCUTIL_UNICHARSET_H__ 00022 00023 #include "assert.h" 00024 #include "strngs.h" 00025 #include "unichar.h" 00026 #include "unicharmap.h" 00027 #include "params.h" 00028 00029 class CHAR_FRAGMENT { 00030 public: 00031 // Minimum number of characters used for fragment representation. 00032 static const int kMinLen = 6; 00033 // Maximum number of characters used for fragment representation. 00034 static const int kMaxLen = 3 + UNICHAR_LEN + 2; 00035 // Special character used in representing character fragments. 00036 static const char kSeparator = '|'; 00037 // Maximum number of fragments per character. 00038 static const int kMaxChunks = 3; 00039 00040 // Setters and Getters. 00041 inline void set_all(const char *unichar, int pos, int total) { 00042 this->set_unichar(unichar); 00043 this->set_pos(pos); 00044 this->set_total(total); 00045 } 00046 inline void set_unichar(const char *uch) { 00047 strncpy(this->unichar, uch, UNICHAR_LEN); 00048 this->unichar[UNICHAR_LEN] = '\0'; 00049 } 00050 inline void set_pos(int p) { this->pos = p; } 00051 inline void set_total(int t) { this->total = t; } 00052 inline const char* get_unichar() const { return this->unichar; } 00053 inline int get_pos() const { return this->pos; } 00054 inline int get_total() const { return this->total; } 00055 00056 // Returns the string that represents a fragment 00057 // with the given unichar, pos and total. 00058 static STRING to_string(const char *unichar, int pos, int total) { 00059 if (total == 1) return STRING(unichar); 00060 STRING result = ""; 00061 result += kSeparator; 00062 result += unichar; 00063 char buffer[kMaxLen]; 00064 snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, kSeparator, total); 00065 result += buffer; 00066 return result; 00067 } 00068 // Returns the string that represents this fragment. 00069 STRING to_string() const { 00070 return to_string(this->unichar, this->pos, this->total); 00071 } 00072 00073 // Checks whether a fragment has the same unichar, 00074 // position and total as the given inputs. 00075 inline bool equals(const char *other_unichar, 00076 int other_pos, int other_total) const { 00077 return (strcmp(this->unichar, other_unichar) == 0 && 00078 this->pos == other_pos && this->total == other_total); 00079 } 00080 inline bool equals(const CHAR_FRAGMENT *other) const { 00081 return this->equals(other->get_unichar(), 00082 other->get_pos(), 00083 other->get_total()); 00084 } 00085 00086 // Checks whether a given fragment is a continuation of this fragment. 00087 // Assumes that the given fragment pointer is not NULL. 00088 inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const { 00089 return (strcmp(this->unichar, fragment->get_unichar()) == 0 && 00090 this->total == fragment->get_total() && 00091 this->pos == fragment->get_pos() + 1); 00092 } 00093 00094 // Returns true if this fragment is a beginning fragment. 00095 inline bool is_beginning() const { return this->pos == 0; } 00096 00097 // Returns true if this fragment is an ending fragment. 00098 inline bool is_ending() const { return this->pos == this->total-1; } 00099 00100 // Parses the string to see whether it represents a character fragment 00101 // (rather than a regular character). If so, allocates memory for a new 00102 // CHAR_FRAGMENT instance and fills it in with the corresponding fragment 00103 // information. Fragments are of the form: 00104 // |m|1|2, meaning chunk 1 of 2 of character m. 00105 // 00106 // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT 00107 // instance, otherwise (if the string does not represent a fragment or it 00108 // looks like it does, but parsing it as a fragment fails) returns NULL. 00109 // 00110 // Note: The caller is responsible for deallocating memory 00111 // associated with the returned pointer. 00112 static CHAR_FRAGMENT *parse_from_string(const char *str); 00113 00114 private: 00115 char unichar[UNICHAR_LEN + 1]; 00116 inT16 pos; // fragment position in the character 00117 inT16 total; // total number of fragments in the character 00118 }; 00119 00120 // The UNICHARSET class is an utility class for Tesseract that holds the 00121 // set of characters that are used by the engine. Each character is identified 00122 // by a unique number, from 0 to (size - 1). 00123 class UNICHARSET { 00124 public: 00125 // Create an empty UNICHARSET 00126 UNICHARSET(); 00127 00128 ~UNICHARSET(); 00129 00130 // Return the UNICHAR_ID of a given unichar representation within the 00131 // UNICHARSET. 00132 const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const; 00133 00134 // Return the UNICHAR_ID of a given unichar representation within the 00135 // UNICHARSET. Only the first length characters from unichar_repr are used. 00136 const UNICHAR_ID unichar_to_id(const char* const unichar_repr, 00137 int length) const; 00138 00139 // Return the minimum number of bytes that matches a legal UNICHAR_ID, 00140 // while leaving a legal UNICHAR_ID afterwards. In other words, if there 00141 // is both a short and a long match to the string, return the length that 00142 // ensures there is a legal match after it. 00143 int step(const char* str) const; 00144 00145 // Return the unichar representation corresponding to the given UNICHAR_ID 00146 // within the UNICHARSET. 00147 const char* const id_to_unichar(UNICHAR_ID id) const; 00148 00149 // Return a STRING that reformats the utf8 str into the str followed 00150 // by its hex unicodes. 00151 static STRING debug_utf8_str(const char* str); 00152 00153 // Return a STRING containing debug information on the unichar, including 00154 // the id_to_unichar, its hex unicodes and the properties. 00155 STRING debug_str(UNICHAR_ID id) const; 00156 STRING debug_str(const char * unichar_repr) const { 00157 return debug_str(unichar_to_id(unichar_repr)); 00158 } 00159 00160 // Add a unichar representation to the set. 00161 void unichar_insert(const char* const unichar_repr); 00162 00163 // Return true if the given unichar id exists within the set. 00164 // Relies on the fact that unichar ids are contiguous in the unicharset. 00165 bool contains_unichar_id(UNICHAR_ID unichar_id) const { 00166 return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used; 00167 } 00168 00169 // Return true if the given unichar representation exists within the set. 00170 bool contains_unichar(const char* const unichar_repr) const; 00171 bool contains_unichar(const char* const unichar_repr, int length) const; 00172 00173 // Return true if the given unichar representation corresponds to the given 00174 // UNICHAR_ID within the set. 00175 bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const; 00176 00177 // Delete CHAR_FRAGMENTs stored in properties of unichars array. 00178 void delete_pointers_in_unichars() { 00179 for (int i = 0; i < size_used; ++i) { 00180 if (unichars[i].properties.fragment != NULL) { 00181 delete unichars[i].properties.fragment; 00182 unichars[i].properties.fragment = NULL; 00183 } 00184 } 00185 } 00186 00187 // Clear the UNICHARSET (all the previous data is lost). 00188 void clear() { 00189 if (script_table != NULL) { 00190 for (int i = 0; i < script_table_size_used; ++i) 00191 delete[] script_table[i]; 00192 delete[] script_table; 00193 script_table = NULL; 00194 script_table_size_used = 0; 00195 } 00196 if (unichars != NULL) { 00197 delete_pointers_in_unichars(); 00198 delete[] unichars; 00199 unichars = NULL; 00200 } 00201 script_table_size_reserved = 0; 00202 size_reserved = 0; 00203 size_used = 0; 00204 ids.clear(); 00205 top_bottom_set_ = false; 00206 script_has_upper_lower_ = false; 00207 script_has_xheight_ = false; 00208 null_sid_ = 0; 00209 common_sid_ = 0; 00210 latin_sid_ = 0; 00211 cyrillic_sid_ = 0; 00212 greek_sid_ = 0; 00213 han_sid_ = 0; 00214 hiragana_sid_ = 0; 00215 katakana_sid_ = 0; 00216 } 00217 00218 // Return the size of the set (the number of different UNICHAR it holds). 00219 int size() const { 00220 return size_used; 00221 } 00222 00223 // Reserve enough memory space for the given number of UNICHARS 00224 void reserve(int unichars_number); 00225 00226 // Opens the file indicated by filename and saves unicharset to that file. 00227 // Returns true if the operation is successful. 00228 bool save_to_file(const char * const filename) const { 00229 FILE* file = fopen(filename, "w+b"); 00230 if (file == NULL) return false; 00231 bool result = save_to_file(file); 00232 fclose(file); 00233 return result; 00234 } 00235 00236 // Saves the content of the UNICHARSET to the given file. 00237 // Returns true if the operation is successful. 00238 bool save_to_file(FILE *file) const; 00239 00240 // Opens the file indicated by filename and loads the UNICHARSET 00241 // from the given file. The previous data is lost. 00242 // Returns true if the operation is successful. 00243 bool load_from_file(const char* const filename, bool skip_fragments) { 00244 FILE* file = fopen(filename, "rb"); 00245 if (file == NULL) return false; 00246 bool result = load_from_file(file, skip_fragments); 00247 fclose(file); 00248 return result; 00249 } 00250 bool load_from_file(const char* const filename) { 00251 return load_from_file(filename, false); 00252 } 00253 00254 // Loads the UNICHARSET from the given file. The previous data is lost. 00255 // Returns true if the operation is successful. 00256 bool load_from_file(FILE *file, bool skip_fragments); 00257 bool load_from_file(FILE *file) { return load_from_file(file, false); } 00258 00259 // Sets up internal data after loading the file, based on the char 00260 // properties. Called from load_from_file, but also needs to be run 00261 // during set_unicharset_properties. 00262 void post_load_setup(); 00263 00264 // Returns true if any script entry in the unicharset is for a 00265 // right_to_left language. 00266 bool any_right_to_left() const; 00267 00268 // Set a whitelist and/or blacklist of characters to recognize. 00269 // An empty or NULL whitelist enables everything (minus any blacklist). 00270 // An empty or NULL blacklist disables nothing. 00271 // The blacklist overrides the whitelist. 00272 // Each list is a string of utf8 character strings. Boundaries between 00273 // unicharset units are worked out automatically, and characters not in 00274 // the unicharset are silently ignored. 00275 void set_black_and_whitelist(const char* blacklist, const char* whitelist); 00276 00277 // Set the isalpha property of the given unichar to the given value. 00278 void set_isalpha(UNICHAR_ID unichar_id, bool value) { 00279 unichars[unichar_id].properties.isalpha = value; 00280 } 00281 00282 // Set the islower property of the given unichar to the given value. 00283 void set_islower(UNICHAR_ID unichar_id, bool value) { 00284 unichars[unichar_id].properties.islower = value; 00285 } 00286 00287 // Set the isupper property of the given unichar to the given value. 00288 void set_isupper(UNICHAR_ID unichar_id, bool value) { 00289 unichars[unichar_id].properties.isupper = value; 00290 } 00291 00292 // Set the isdigit property of the given unichar to the given value. 00293 void set_isdigit(UNICHAR_ID unichar_id, bool value) { 00294 unichars[unichar_id].properties.isdigit = value; 00295 } 00296 00297 // Set the ispunctuation property of the given unichar to the given value. 00298 void set_ispunctuation(UNICHAR_ID unichar_id, bool value) { 00299 unichars[unichar_id].properties.ispunctuation = value; 00300 } 00301 00302 // Set the isngram property of the given unichar to the given value. 00303 void set_isngram(UNICHAR_ID unichar_id, bool value) { 00304 unichars[unichar_id].properties.isngram = value; 00305 } 00306 00307 // Set the script name of the given unichar to the given value. 00308 // Value is copied and thus can be a temporary; 00309 void set_script(UNICHAR_ID unichar_id, const char* value) { 00310 unichars[unichar_id].properties.script_id = add_script(value); 00311 } 00312 00313 // Set other_case unichar id in the properties for the given unichar id. 00314 void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) { 00315 unichars[unichar_id].properties.other_case = other_case; 00316 } 00317 00318 // Return the isalpha property of the given unichar. 00319 bool get_isalpha(UNICHAR_ID unichar_id) const { 00320 return unichars[unichar_id].properties.isalpha; 00321 } 00322 00323 // Return the islower property of the given unichar. 00324 bool get_islower(UNICHAR_ID unichar_id) const { 00325 return unichars[unichar_id].properties.islower; 00326 } 00327 00328 // Return the isupper property of the given unichar. 00329 bool get_isupper(UNICHAR_ID unichar_id) const { 00330 return unichars[unichar_id].properties.isupper; 00331 } 00332 00333 // Return the isdigit property of the given unichar. 00334 bool get_isdigit(UNICHAR_ID unichar_id) const { 00335 return unichars[unichar_id].properties.isdigit; 00336 } 00337 00338 // Return the ispunctuation property of the given unichar. 00339 bool get_ispunctuation(UNICHAR_ID unichar_id) const { 00340 return unichars[unichar_id].properties.ispunctuation; 00341 } 00342 00343 // Return the isngram property of the given unichar. 00344 bool get_isngram(UNICHAR_ID unichar_id) const { 00345 return unichars[unichar_id].properties.isngram; 00346 } 00347 00348 // Returns true if the ids have useful min/max top/bottom values. 00349 bool top_bottom_useful() const { 00350 return top_bottom_set_; 00351 } 00352 // Returns the min and max bottom and top of the given unichar in 00353 // baseline-normalized coordinates, ie, where the baseline is 00354 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight 00355 // (See normalis.h for the definitions). 00356 void get_top_bottom(UNICHAR_ID unichar_id, 00357 int* min_bottom, int* max_bottom, 00358 int* min_top, int* max_top) const { 00359 *min_bottom = unichars[unichar_id].properties.min_bottom; 00360 *max_bottom = unichars[unichar_id].properties.max_bottom; 00361 *min_top = unichars[unichar_id].properties.min_top; 00362 *max_top = unichars[unichar_id].properties.max_top; 00363 } 00364 void set_top_bottom(UNICHAR_ID unichar_id, 00365 int min_bottom, int max_bottom, 00366 int min_top, int max_top) { 00367 unichars[unichar_id].properties.min_bottom = 00368 static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8)); 00369 unichars[unichar_id].properties.max_bottom = 00370 static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8)); 00371 unichars[unichar_id].properties.min_top = 00372 static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8)); 00373 unichars[unichar_id].properties.max_top = 00374 static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8)); 00375 } 00376 00377 // Return the script name of the given unichar. 00378 // The returned pointer will always be the same for the same script, it's 00379 // managed by unicharset and thus MUST NOT be deleted 00380 int get_script(UNICHAR_ID unichar_id) const { 00381 return unichars[unichar_id].properties.script_id; 00382 } 00383 00384 // Return the character properties, eg. alpha/upper/lower/digit/punct, 00385 // as a bit field of unsigned int. 00386 unsigned int get_properties(UNICHAR_ID unichar_id) const; 00387 00388 // Return the character property as a single char. If a character has 00389 // multiple attributes, the main property is defined by the following order: 00390 // upper_case : 'A' 00391 // lower_case : 'a' 00392 // alpha : 'x' 00393 // digit : '0' 00394 // punctuation: 'p' 00395 char get_chartype(UNICHAR_ID unichar_id) const; 00396 00397 // Get other_case unichar id in the properties for the given unichar id. 00398 UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const { 00399 return unichars[unichar_id].properties.other_case; 00400 } 00401 00402 // Returns UNICHAR_ID of the corresponding lower-case unichar. 00403 UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const { 00404 if (unichars[unichar_id].properties.islower) return unichar_id; 00405 return unichars[unichar_id].properties.other_case; 00406 } 00407 00408 // Returns UNICHAR_ID of the corresponding upper-case unichar. 00409 UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const { 00410 if (unichars[unichar_id].properties.isupper) return unichar_id; 00411 return unichars[unichar_id].properties.other_case; 00412 } 00413 00414 // Return a pointer to the CHAR_FRAGMENT class if the given 00415 // unichar id represents a character fragment. 00416 const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const { 00417 return unichars[unichar_id].properties.fragment; 00418 } 00419 00420 // Return the isalpha property of the given unichar representation. 00421 bool get_isalpha(const char* const unichar_repr) const { 00422 return get_isalpha(unichar_to_id(unichar_repr)); 00423 } 00424 00425 // Return the islower property of the given unichar representation. 00426 bool get_islower(const char* const unichar_repr) const { 00427 return get_islower(unichar_to_id(unichar_repr)); 00428 } 00429 00430 // Return the isupper property of the given unichar representation. 00431 bool get_isupper(const char* const unichar_repr) const { 00432 return get_isupper(unichar_to_id(unichar_repr)); 00433 } 00434 00435 // Return the isdigit property of the given unichar representation. 00436 bool get_isdigit(const char* const unichar_repr) const { 00437 return get_isdigit(unichar_to_id(unichar_repr)); 00438 } 00439 00440 // Return the ispunctuation property of the given unichar representation. 00441 bool get_ispunctuation(const char* const unichar_repr) const { 00442 return get_ispunctuation(unichar_to_id(unichar_repr)); 00443 } 00444 00445 // Return the character properties, eg. alpha/upper/lower/digit/punct, 00446 // of the given unichar representation 00447 unsigned int get_properties(const char* const unichar_repr) const { 00448 return get_properties(unichar_to_id(unichar_repr)); 00449 } 00450 00451 char get_chartype(const char* const unichar_repr) const { 00452 return get_chartype(unichar_to_id(unichar_repr)); 00453 } 00454 00455 // Return the script name of the given unichar representation. 00456 // The returned pointer will always be the same for the same script, it's 00457 // managed by unicharset and thus MUST NOT be deleted 00458 int get_script(const char* const unichar_repr) const { 00459 return get_script(unichar_to_id(unichar_repr)); 00460 } 00461 00462 // Return a pointer to the CHAR_FRAGMENT class struct if the given 00463 // unichar representation represents a character fragment. 00464 const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const { 00465 if (unichar_repr == NULL || unichar_repr[0] == '\0' || 00466 !ids.contains(unichar_repr)) { 00467 return NULL; 00468 } 00469 return get_fragment(unichar_to_id(unichar_repr)); 00470 } 00471 00472 // Return the isalpha property of the given unichar representation. 00473 // Only the first length characters from unichar_repr are used. 00474 bool get_isalpha(const char* const unichar_repr, 00475 int length) const { 00476 return get_isalpha(unichar_to_id(unichar_repr, length)); 00477 } 00478 00479 // Return the islower property of the given unichar representation. 00480 // Only the first length characters from unichar_repr are used. 00481 bool get_islower(const char* const unichar_repr, 00482 int length) const { 00483 return get_islower(unichar_to_id(unichar_repr, length)); 00484 } 00485 00486 // Return the isupper property of the given unichar representation. 00487 // Only the first length characters from unichar_repr are used. 00488 bool get_isupper(const char* const unichar_repr, 00489 int length) const { 00490 return get_isupper(unichar_to_id(unichar_repr, length)); 00491 } 00492 00493 // Return the isdigit property of the given unichar representation. 00494 // Only the first length characters from unichar_repr are used. 00495 bool get_isdigit(const char* const unichar_repr, 00496 int length) const { 00497 return get_isdigit(unichar_to_id(unichar_repr, length)); 00498 } 00499 00500 // Return the ispunctuation property of the given unichar representation. 00501 // Only the first length characters from unichar_repr are used. 00502 bool get_ispunctuation(const char* const unichar_repr, 00503 int length) const { 00504 return get_ispunctuation(unichar_to_id(unichar_repr, length)); 00505 } 00506 00507 // Return the script name of the given unichar representation. 00508 // Only the first length characters from unichar_repr are used. 00509 // The returned pointer will always be the same for the same script, it's 00510 // managed by unicharset and thus MUST NOT be deleted 00511 int get_script(const char* const unichar_repr, 00512 int length) const { 00513 return get_script(unichar_to_id(unichar_repr, length)); 00514 } 00515 00516 // Return the (current) number of scripts in the script table 00517 int get_script_table_size() const { 00518 return script_table_size_used; 00519 } 00520 00521 // Return the script string from its id 00522 const char* get_script_from_script_id(int id) const { 00523 if (id >= script_table_size_used || id < 0) 00524 return null_script; 00525 return script_table[id]; 00526 } 00527 00528 // Returns the id from the name of the script, or 0 if script is not found. 00529 // Note that this is an expensive operation since it involves iteratively 00530 // comparing strings in the script table. To avoid dependency on STL, we 00531 // won't use a hash. Instead, the calling function can use this to lookup 00532 // and save the ID for relevant scripts for fast comparisons later. 00533 int get_script_id_from_name(const char* script_name) const; 00534 00535 // Return true if the given script is the null script 00536 bool is_null_script(const char* script) const { 00537 return script == null_script; 00538 } 00539 00540 // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0, 00541 // then the returned pointer will be the same. 00542 // The script parameter is copied and thus can be a temporary. 00543 int add_script(const char* script); 00544 00545 // Return the enabled property of the given unichar. 00546 bool get_enabled(UNICHAR_ID unichar_id) const { 00547 return unichars[unichar_id].properties.enabled; 00548 } 00549 00550 00551 int null_sid() const { return null_sid_; } 00552 int common_sid() const { return common_sid_; } 00553 int latin_sid() const { return latin_sid_; } 00554 int cyrillic_sid() const { return cyrillic_sid_; } 00555 int greek_sid() const { return greek_sid_; } 00556 int han_sid() const { return han_sid_; } 00557 int hiragana_sid() const { return hiragana_sid_; } 00558 int katakana_sid() const { return katakana_sid_; } 00559 int default_sid() const { return default_sid_; } 00560 00561 // Returns true if the unicharset has the concept of upper/lower case. 00562 bool script_has_upper_lower() const { 00563 return script_has_upper_lower_; 00564 } 00565 // Returns true if the unicharset has the concept of x-height. 00566 // script_has_xheight can be true even if script_has_upper_lower is not, 00567 // when the script has a sufficiently predominant top line with ascenders, 00568 // such as Devanagari and Thai. 00569 bool script_has_xheight() const { 00570 return script_has_xheight_; 00571 } 00572 00573 private: 00574 00575 struct UNICHAR_PROPERTIES { 00576 UNICHAR_PROPERTIES(); 00577 void Init(); 00578 00579 bool isalpha; 00580 bool islower; 00581 bool isupper; 00582 bool isdigit; 00583 bool ispunctuation; 00584 bool isngram; 00585 bool enabled; 00586 // Possible limits of the top and bottom of the bounding box in 00587 // baseline-normalized coordinates, ie, where the baseline is 00588 // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight 00589 // (See normalis.h for the definitions). 00590 uinT8 min_bottom; 00591 uinT8 max_bottom; 00592 uinT8 min_top; 00593 uinT8 max_top; 00594 int script_id; 00595 UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar 00596 00597 // Contains meta information about the fragment if a unichar represents 00598 // a fragment of a character, otherwise should be set to NULL. 00599 // It is assumed that character fragments are added to the unicharset 00600 // after the corresponding 'base' characters. 00601 CHAR_FRAGMENT *fragment; 00602 }; 00603 00604 struct UNICHAR_SLOT { 00605 char representation[UNICHAR_LEN + 1]; 00606 UNICHAR_PROPERTIES properties; 00607 }; 00608 00609 UNICHAR_SLOT* unichars; 00610 UNICHARMAP ids; 00611 int size_used; 00612 int size_reserved; 00613 char** script_table; 00614 int script_table_size_used; 00615 int script_table_size_reserved; 00616 const char* null_script; 00617 // True if the unichars have their tops/bottoms set. 00618 bool top_bottom_set_; 00619 // True if the unicharset has significant upper/lower case chars. 00620 bool script_has_upper_lower_; 00621 // True if the unicharset has a significant mean-line with significant 00622 // ascenders above that. 00623 bool script_has_xheight_; 00624 00625 // A few convenient script name-to-id mapping without using hash. 00626 // These are initialized when unicharset file is loaded. Anything 00627 // missing from this list can be looked up using get_script_id_from_name. 00628 int null_sid_; 00629 int common_sid_; 00630 int latin_sid_; 00631 int cyrillic_sid_; 00632 int greek_sid_; 00633 int han_sid_; 00634 int hiragana_sid_; 00635 int katakana_sid_; 00636 // The most frequently occurring script in the charset. 00637 int default_sid_; 00638 }; 00639 00640 #endif // TESSERACT_CCUTIL_UNICHARSET_H__