Tesseract 3.01
/data/source/tesseract-ocr/ccutil/unicharset.h
Go to the documentation of this file.
00001 
00002 // File:        unicharset.h
00003 // Description: Unicode character/ligature set class.
00004 // Author:      Thomas Kielbus
00005 // Created:     Wed Jun 28 17:05:01 PDT 2006
00006 //
00007 // (C) Copyright 2006, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__
00021 #define TESSERACT_CCUTIL_UNICHARSET_H__
00022 
00023 #include "assert.h"
00024 #include "strngs.h"
00025 #include "unichar.h"
00026 #include "unicharmap.h"
00027 #include "params.h"
00028 
00029 class CHAR_FRAGMENT {
00030  public:
00031   // Minimum number of characters used for fragment representation.
00032   static const int kMinLen = 6;
00033   // Maximum number of characters used for fragment representation.
00034   static const int kMaxLen = 3 + UNICHAR_LEN + 2;
00035   // Special character used in representing character fragments.
00036   static const char kSeparator = '|';
00037   // Maximum number of fragments per character.
00038   static const int kMaxChunks = 3;
00039 
00040   // Setters and Getters.
00041   inline void set_all(const char *unichar, int pos, int total) {
00042     this->set_unichar(unichar);
00043     this->set_pos(pos);
00044     this->set_total(total);
00045   }
00046   inline void set_unichar(const char *uch) {
00047     strncpy(this->unichar, uch, UNICHAR_LEN);
00048     this->unichar[UNICHAR_LEN] = '\0';
00049   }
00050   inline void set_pos(int p) { this->pos = p; }
00051   inline void set_total(int t) { this->total = t; }
00052   inline const char* get_unichar() const { return this->unichar; }
00053   inline int get_pos() const { return this->pos; }
00054   inline int get_total() const { return this->total; }
00055 
00056   // Returns the string that represents a fragment
00057   // with the given unichar, pos and total.
00058   static STRING to_string(const char *unichar, int pos, int total) {
00059     if (total == 1) return STRING(unichar);
00060     STRING result = "";
00061     result += kSeparator;
00062     result += unichar;
00063     char buffer[kMaxLen];
00064     snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, kSeparator, total);
00065     result += buffer;
00066     return result;
00067   }
00068   // Returns the string that represents this fragment.
00069   STRING to_string() const {
00070     return to_string(this->unichar, this->pos, this->total);
00071   }
00072 
00073   // Checks whether a fragment has the same unichar,
00074   // position and total as the given inputs.
00075   inline bool equals(const char *other_unichar,
00076                      int other_pos, int other_total) const {
00077     return (strcmp(this->unichar, other_unichar) == 0 &&
00078             this->pos == other_pos && this->total == other_total);
00079   }
00080   inline bool equals(const CHAR_FRAGMENT *other) const {
00081     return this->equals(other->get_unichar(),
00082                         other->get_pos(),
00083                         other->get_total());
00084   }
00085 
00086   // Checks whether a given fragment is a continuation of this fragment.
00087   // Assumes that the given fragment pointer is not NULL.
00088   inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
00089     return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
00090             this->total == fragment->get_total() &&
00091             this->pos == fragment->get_pos() + 1);
00092   }
00093 
00094   // Returns true if this fragment is a beginning fragment.
00095   inline bool is_beginning() const { return this->pos == 0; }
00096 
00097   // Returns true if this fragment is an ending fragment.
00098   inline bool is_ending() const { return this->pos == this->total-1; }
00099 
00100   // Parses the string to see whether it represents a character fragment
00101   // (rather than a regular character). If so, allocates memory for a new
00102   // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
00103   // information. Fragments are of the form:
00104   // |m|1|2, meaning chunk 1 of 2 of character m.
00105   //
00106   // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
00107   // instance, otherwise (if the string does not represent a fragment or it
00108   // looks like it does, but parsing it as a fragment fails) returns NULL.
00109   //
00110   // Note: The caller is responsible for deallocating memory
00111   // associated with the returned pointer.
00112   static CHAR_FRAGMENT *parse_from_string(const char *str);
00113 
00114  private:
00115   char unichar[UNICHAR_LEN + 1];
00116   inT16 pos;    // fragment position in the character
00117   inT16 total;  // total number of fragments in the character
00118 };
00119 
00120 // The UNICHARSET class is an utility class for Tesseract that holds the
00121 // set of characters that are used by the engine. Each character is identified
00122 // by a unique number, from 0 to (size - 1).
00123 class UNICHARSET {
00124  public:
00125   // Create an empty UNICHARSET
00126   UNICHARSET();
00127 
00128   ~UNICHARSET();
00129 
00130   // Return the UNICHAR_ID of a given unichar representation within the
00131   // UNICHARSET.
00132   const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
00133 
00134   // Return the UNICHAR_ID of a given unichar representation within the
00135   // UNICHARSET. Only the first length characters from unichar_repr are used.
00136   const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
00137                                  int length) const;
00138 
00139   // Return the minimum number of bytes that matches a legal UNICHAR_ID,
00140   // while leaving a legal UNICHAR_ID afterwards. In other words, if there
00141   // is both a short and a long match to the string, return the length that
00142   // ensures there is a legal match after it.
00143   int step(const char* str) const;
00144 
00145   // Return the unichar representation corresponding to the given UNICHAR_ID
00146   // within the UNICHARSET.
00147   const char* const id_to_unichar(UNICHAR_ID id) const;
00148 
00149   // Return a STRING that reformats the utf8 str into the str followed
00150   // by its hex unicodes.
00151   static STRING debug_utf8_str(const char* str);
00152 
00153   // Return a STRING containing debug information on the unichar, including
00154   // the id_to_unichar, its hex unicodes and the properties.
00155   STRING debug_str(UNICHAR_ID id) const;
00156   STRING debug_str(const char * unichar_repr) const {
00157     return debug_str(unichar_to_id(unichar_repr));
00158   }
00159 
00160   // Add a unichar representation to the set.
00161   void unichar_insert(const char* const unichar_repr);
00162 
00163   // Return true if the given unichar id exists within the set.
00164   // Relies on the fact that unichar ids are contiguous in the unicharset.
00165   bool contains_unichar_id(UNICHAR_ID unichar_id) const {
00166     return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used;
00167   }
00168 
00169   // Return true if the given unichar representation exists within the set.
00170   bool contains_unichar(const char* const unichar_repr) const;
00171   bool contains_unichar(const char* const unichar_repr, int length) const;
00172 
00173   // Return true if the given unichar representation corresponds to the given
00174   // UNICHAR_ID within the set.
00175   bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
00176 
00177   // Delete CHAR_FRAGMENTs stored in properties of unichars array.
00178   void delete_pointers_in_unichars() {
00179     for (int i = 0; i < size_used; ++i) {
00180       if (unichars[i].properties.fragment != NULL) {
00181         delete unichars[i].properties.fragment;
00182         unichars[i].properties.fragment = NULL;
00183       }
00184     }
00185   }
00186 
00187   // Clear the UNICHARSET (all the previous data is lost).
00188   void clear() {
00189     if (script_table != NULL) {
00190       for (int i = 0; i < script_table_size_used; ++i)
00191         delete[] script_table[i];
00192       delete[] script_table;
00193       script_table = NULL;
00194       script_table_size_used = 0;
00195     }
00196     if (unichars != NULL) {
00197       delete_pointers_in_unichars();
00198       delete[] unichars;
00199       unichars = NULL;
00200     }
00201     script_table_size_reserved = 0;
00202     size_reserved = 0;
00203     size_used = 0;
00204     ids.clear();
00205     top_bottom_set_ = false;
00206     script_has_upper_lower_ = false;
00207     script_has_xheight_ = false;
00208     null_sid_ = 0;
00209     common_sid_ = 0;
00210     latin_sid_ = 0;
00211     cyrillic_sid_ = 0;
00212     greek_sid_ = 0;
00213     han_sid_ = 0;
00214     hiragana_sid_ = 0;
00215     katakana_sid_ = 0;
00216   }
00217 
00218   // Return the size of the set (the number of different UNICHAR it holds).
00219   int size() const {
00220     return size_used;
00221   }
00222 
00223   // Reserve enough memory space for the given number of UNICHARS
00224   void reserve(int unichars_number);
00225 
00226   // Opens the file indicated by filename and saves unicharset to that file.
00227   // Returns true if the operation is successful.
00228   bool save_to_file(const char * const filename) const {
00229     FILE* file = fopen(filename, "w+b");
00230     if (file == NULL) return false;
00231     bool result = save_to_file(file);
00232     fclose(file);
00233     return result;
00234   }
00235 
00236   // Saves the content of the UNICHARSET to the given file.
00237   // Returns true if the operation is successful.
00238   bool save_to_file(FILE *file) const;
00239 
00240   // Opens the file indicated by filename and loads the UNICHARSET
00241   // from the given file. The previous data is lost.
00242   // Returns true if the operation is successful.
00243   bool load_from_file(const char* const filename, bool skip_fragments) {
00244     FILE* file = fopen(filename, "rb");
00245     if (file == NULL) return false;
00246     bool result = load_from_file(file, skip_fragments);
00247     fclose(file);
00248     return result;
00249   }
00250   bool load_from_file(const char* const filename) {
00251     return load_from_file(filename, false);
00252   }
00253 
00254   // Loads the UNICHARSET from the given file. The previous data is lost.
00255   // Returns true if the operation is successful.
00256   bool load_from_file(FILE *file, bool skip_fragments);
00257   bool load_from_file(FILE *file) { return load_from_file(file, false); }
00258 
00259   // Sets up internal data after loading the file, based on the char
00260   // properties. Called from load_from_file, but also needs to be run
00261   // during set_unicharset_properties.
00262   void post_load_setup();
00263 
00264   // Returns true if any script entry in the unicharset is for a
00265   // right_to_left language.
00266   bool any_right_to_left() const;
00267 
00268   // Set a whitelist and/or blacklist of characters to recognize.
00269   // An empty or NULL whitelist enables everything (minus any blacklist).
00270   // An empty or NULL blacklist disables nothing.
00271   // The blacklist overrides the whitelist.
00272   // Each list is a string of utf8 character strings. Boundaries between
00273   // unicharset units are worked out automatically, and characters not in
00274   // the unicharset are silently ignored.
00275   void set_black_and_whitelist(const char* blacklist, const char* whitelist);
00276 
00277   // Set the isalpha property of the given unichar to the given value.
00278   void set_isalpha(UNICHAR_ID unichar_id, bool value) {
00279     unichars[unichar_id].properties.isalpha = value;
00280   }
00281 
00282   // Set the islower property of the given unichar to the given value.
00283   void set_islower(UNICHAR_ID unichar_id, bool value) {
00284     unichars[unichar_id].properties.islower = value;
00285   }
00286 
00287   // Set the isupper property of the given unichar to the given value.
00288   void set_isupper(UNICHAR_ID unichar_id, bool value) {
00289     unichars[unichar_id].properties.isupper = value;
00290   }
00291 
00292   // Set the isdigit property of the given unichar to the given value.
00293   void set_isdigit(UNICHAR_ID unichar_id, bool value) {
00294     unichars[unichar_id].properties.isdigit = value;
00295   }
00296 
00297   // Set the ispunctuation property of the given unichar to the given value.
00298   void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
00299     unichars[unichar_id].properties.ispunctuation = value;
00300   }
00301 
00302   // Set the isngram property of the given unichar to the given value.
00303   void set_isngram(UNICHAR_ID unichar_id, bool value) {
00304     unichars[unichar_id].properties.isngram = value;
00305   }
00306 
00307   // Set the script name of the given unichar to the given value.
00308   // Value is copied and thus can be a temporary;
00309   void set_script(UNICHAR_ID unichar_id, const char* value) {
00310     unichars[unichar_id].properties.script_id = add_script(value);
00311   }
00312 
00313   // Set other_case unichar id in the properties for the given unichar id.
00314   void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
00315     unichars[unichar_id].properties.other_case = other_case;
00316   }
00317 
00318   // Return the isalpha property of the given unichar.
00319   bool get_isalpha(UNICHAR_ID unichar_id) const {
00320     return unichars[unichar_id].properties.isalpha;
00321   }
00322 
00323   // Return the islower property of the given unichar.
00324   bool get_islower(UNICHAR_ID unichar_id) const {
00325     return unichars[unichar_id].properties.islower;
00326   }
00327 
00328   // Return the isupper property of the given unichar.
00329   bool get_isupper(UNICHAR_ID unichar_id) const {
00330     return unichars[unichar_id].properties.isupper;
00331   }
00332 
00333   // Return the isdigit property of the given unichar.
00334   bool get_isdigit(UNICHAR_ID unichar_id) const {
00335     return unichars[unichar_id].properties.isdigit;
00336   }
00337 
00338   // Return the ispunctuation property of the given unichar.
00339   bool get_ispunctuation(UNICHAR_ID unichar_id) const {
00340     return unichars[unichar_id].properties.ispunctuation;
00341   }
00342 
00343   // Return the isngram property of the given unichar.
00344   bool get_isngram(UNICHAR_ID unichar_id) const {
00345     return unichars[unichar_id].properties.isngram;
00346   }
00347 
00348   // Returns true if the ids have useful min/max top/bottom values.
00349   bool top_bottom_useful() const {
00350     return top_bottom_set_;
00351   }
00352   // Returns the min and max bottom and top of the given unichar in
00353   // baseline-normalized coordinates, ie, where the baseline is
00354   // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
00355   // (See normalis.h for the definitions).
00356   void get_top_bottom(UNICHAR_ID unichar_id,
00357                       int* min_bottom, int* max_bottom,
00358                       int* min_top, int* max_top) const {
00359     *min_bottom = unichars[unichar_id].properties.min_bottom;
00360     *max_bottom = unichars[unichar_id].properties.max_bottom;
00361     *min_top = unichars[unichar_id].properties.min_top;
00362     *max_top = unichars[unichar_id].properties.max_top;
00363   }
00364   void set_top_bottom(UNICHAR_ID unichar_id,
00365                       int min_bottom, int max_bottom,
00366                       int min_top, int max_top) {
00367     unichars[unichar_id].properties.min_bottom =
00368         static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
00369     unichars[unichar_id].properties.max_bottom =
00370         static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
00371     unichars[unichar_id].properties.min_top =
00372         static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
00373     unichars[unichar_id].properties.max_top =
00374         static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
00375   }
00376 
00377   // Return the script name of the given unichar.
00378   // The returned pointer will always be the same for the same script, it's
00379   // managed by unicharset and thus MUST NOT be deleted
00380   int get_script(UNICHAR_ID unichar_id) const {
00381     return unichars[unichar_id].properties.script_id;
00382   }
00383 
00384   // Return the character properties, eg. alpha/upper/lower/digit/punct,
00385   // as a bit field of unsigned int.
00386   unsigned int get_properties(UNICHAR_ID unichar_id) const;
00387 
00388   // Return the character property as a single char.  If a character has
00389   // multiple attributes, the main property is defined by the following order:
00390   //   upper_case : 'A'
00391   //   lower_case : 'a'
00392   //   alpha      : 'x'
00393   //   digit      : '0'
00394   //   punctuation: 'p'
00395   char get_chartype(UNICHAR_ID unichar_id) const;
00396 
00397   // Get other_case unichar id in the properties for the given unichar id.
00398   UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
00399     return unichars[unichar_id].properties.other_case;
00400   }
00401 
00402   // Returns UNICHAR_ID of the corresponding lower-case unichar.
00403   UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
00404     if (unichars[unichar_id].properties.islower) return unichar_id;
00405     return unichars[unichar_id].properties.other_case;
00406   }
00407 
00408   // Returns UNICHAR_ID of the corresponding upper-case unichar.
00409   UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
00410     if (unichars[unichar_id].properties.isupper) return unichar_id;
00411     return unichars[unichar_id].properties.other_case;
00412   }
00413 
00414   // Return a pointer to the CHAR_FRAGMENT class if the given
00415   // unichar id represents a character fragment.
00416   const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
00417     return unichars[unichar_id].properties.fragment;
00418   }
00419 
00420   // Return the isalpha property of the given unichar representation.
00421   bool get_isalpha(const char* const unichar_repr) const {
00422     return get_isalpha(unichar_to_id(unichar_repr));
00423   }
00424 
00425   // Return the islower property of the given unichar representation.
00426   bool get_islower(const char* const unichar_repr) const {
00427     return get_islower(unichar_to_id(unichar_repr));
00428   }
00429 
00430   // Return the isupper property of the given unichar representation.
00431   bool get_isupper(const char* const unichar_repr) const {
00432     return get_isupper(unichar_to_id(unichar_repr));
00433   }
00434 
00435   // Return the isdigit property of the given unichar representation.
00436   bool get_isdigit(const char* const unichar_repr) const {
00437     return get_isdigit(unichar_to_id(unichar_repr));
00438   }
00439 
00440   // Return the ispunctuation property of the given unichar representation.
00441   bool get_ispunctuation(const char* const unichar_repr) const {
00442     return get_ispunctuation(unichar_to_id(unichar_repr));
00443   }
00444 
00445   // Return the character properties, eg. alpha/upper/lower/digit/punct,
00446   // of the given unichar representation
00447   unsigned int get_properties(const char* const unichar_repr) const {
00448     return get_properties(unichar_to_id(unichar_repr));
00449   }
00450 
00451   char get_chartype(const char* const unichar_repr) const {
00452     return get_chartype(unichar_to_id(unichar_repr));
00453   }
00454 
00455   // Return the script name of the given unichar representation.
00456   // The returned pointer will always be the same for the same script, it's
00457   // managed by unicharset and thus MUST NOT be deleted
00458   int get_script(const char* const unichar_repr) const {
00459     return get_script(unichar_to_id(unichar_repr));
00460   }
00461 
00462   // Return a pointer to the CHAR_FRAGMENT class struct if the given
00463   // unichar representation represents a character fragment.
00464   const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
00465     if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
00466         !ids.contains(unichar_repr)) {
00467       return NULL;
00468     }
00469     return get_fragment(unichar_to_id(unichar_repr));
00470   }
00471 
00472   // Return the isalpha property of the given unichar representation.
00473   // Only the first length characters from unichar_repr are used.
00474   bool get_isalpha(const char* const unichar_repr,
00475                int length) const {
00476     return get_isalpha(unichar_to_id(unichar_repr, length));
00477   }
00478 
00479   // Return the islower property of the given unichar representation.
00480   // Only the first length characters from unichar_repr are used.
00481   bool get_islower(const char* const unichar_repr,
00482                int length) const {
00483     return get_islower(unichar_to_id(unichar_repr, length));
00484   }
00485 
00486   // Return the isupper property of the given unichar representation.
00487   // Only the first length characters from unichar_repr are used.
00488   bool get_isupper(const char* const unichar_repr,
00489                int length) const {
00490     return get_isupper(unichar_to_id(unichar_repr, length));
00491   }
00492 
00493   // Return the isdigit property of the given unichar representation.
00494   // Only the first length characters from unichar_repr are used.
00495   bool get_isdigit(const char* const unichar_repr,
00496                int length) const {
00497     return get_isdigit(unichar_to_id(unichar_repr, length));
00498   }
00499 
00500   // Return the ispunctuation property of the given unichar representation.
00501   // Only the first length characters from unichar_repr are used.
00502   bool get_ispunctuation(const char* const unichar_repr,
00503                           int length) const {
00504     return get_ispunctuation(unichar_to_id(unichar_repr, length));
00505   }
00506 
00507   // Return the script name of the given unichar representation.
00508   // Only the first length characters from unichar_repr are used.
00509   // The returned pointer will always be the same for the same script, it's
00510   // managed by unicharset and thus MUST NOT be deleted
00511   int get_script(const char* const unichar_repr,
00512                  int length) const {
00513     return get_script(unichar_to_id(unichar_repr, length));
00514   }
00515 
00516   // Return the (current) number of scripts in the script table
00517   int get_script_table_size() const {
00518     return script_table_size_used;
00519   }
00520 
00521   // Return the script string from its id
00522   const char* get_script_from_script_id(int id) const {
00523     if (id >= script_table_size_used || id < 0)
00524       return null_script;
00525     return script_table[id];
00526   }
00527 
00528   // Returns the id from the name of the script, or 0 if script is not found.
00529   // Note that this is an expensive operation since it involves iteratively
00530   // comparing strings in the script table.  To avoid dependency on STL, we
00531   // won't use a hash.  Instead, the calling function can use this to lookup
00532   // and save the ID for relevant scripts for fast comparisons later.
00533   int get_script_id_from_name(const char* script_name) const;
00534 
00535   // Return true if the given script is the null script
00536   bool is_null_script(const char* script) const {
00537     return script == null_script;
00538   }
00539 
00540   // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
00541   // then the returned pointer will be the same.
00542   // The script parameter is copied and thus can be a temporary.
00543   int add_script(const char* script);
00544 
00545   // Return the enabled property of the given unichar.
00546   bool get_enabled(UNICHAR_ID unichar_id) const {
00547     return unichars[unichar_id].properties.enabled;
00548   }
00549 
00550 
00551   int null_sid() const { return null_sid_; }
00552   int common_sid() const { return common_sid_; }
00553   int latin_sid() const { return latin_sid_; }
00554   int cyrillic_sid() const { return cyrillic_sid_; }
00555   int greek_sid() const { return greek_sid_; }
00556   int han_sid() const { return han_sid_; }
00557   int hiragana_sid() const { return hiragana_sid_; }
00558   int katakana_sid() const { return katakana_sid_; }
00559   int default_sid() const { return default_sid_; }
00560 
00561   // Returns true if the unicharset has the concept of upper/lower case.
00562   bool script_has_upper_lower() const {
00563     return script_has_upper_lower_;
00564   }
00565   // Returns true if the unicharset has the concept of x-height.
00566   // script_has_xheight can be true even if script_has_upper_lower is not,
00567   // when the script has a sufficiently predominant top line with ascenders,
00568   // such as Devanagari and Thai.
00569   bool script_has_xheight() const {
00570     return script_has_xheight_;
00571   }
00572 
00573  private:
00574 
00575   struct UNICHAR_PROPERTIES {
00576     UNICHAR_PROPERTIES();
00577     void Init();
00578 
00579     bool  isalpha;
00580     bool  islower;
00581     bool  isupper;
00582     bool  isdigit;
00583     bool  ispunctuation;
00584     bool  isngram;
00585     bool  enabled;
00586     // Possible limits of the top and bottom of the bounding box in
00587     // baseline-normalized coordinates, ie, where the baseline is
00588     // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
00589     // (See normalis.h for the definitions).
00590     uinT8 min_bottom;
00591     uinT8 max_bottom;
00592     uinT8 min_top;
00593     uinT8 max_top;
00594     int   script_id;
00595     UNICHAR_ID other_case;  // id of the corresponding upper/lower case unichar
00596 
00597     // Contains meta information about the fragment if a unichar represents
00598     // a fragment of a character, otherwise should be set to NULL.
00599     // It is assumed that character fragments are added to the unicharset
00600     // after the corresponding 'base' characters.
00601     CHAR_FRAGMENT *fragment;
00602   };
00603 
00604   struct UNICHAR_SLOT {
00605     char representation[UNICHAR_LEN + 1];
00606     UNICHAR_PROPERTIES properties;
00607   };
00608 
00609   UNICHAR_SLOT* unichars;
00610   UNICHARMAP ids;
00611   int size_used;
00612   int size_reserved;
00613   char** script_table;
00614   int script_table_size_used;
00615   int script_table_size_reserved;
00616   const char* null_script;
00617   // True if the unichars have their tops/bottoms set.
00618   bool top_bottom_set_;
00619   // True if the unicharset has significant upper/lower case chars.
00620   bool script_has_upper_lower_;
00621   // True if the unicharset has a significant mean-line with significant
00622   // ascenders above that.
00623   bool script_has_xheight_;
00624 
00625   // A few convenient script name-to-id mapping without using hash.
00626   // These are initialized when unicharset file is loaded.  Anything
00627   // missing from this list can be looked up using get_script_id_from_name.
00628   int null_sid_;
00629   int common_sid_;
00630   int latin_sid_;
00631   int cyrillic_sid_;
00632   int greek_sid_;
00633   int han_sid_;
00634   int hiragana_sid_;
00635   int katakana_sid_;
00636   // The most frequently occurring script in the charset.
00637   int default_sid_;
00638 };
00639 
00640 #endif  // TESSERACT_CCUTIL_UNICHARSET_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines