00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00019
00020 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__
00021 #define TESSERACT_CCUTIL_UNICHARSET_H__
00022
00023 #include "assert.h"
00024 #include "strngs.h"
00025 #include "unichar.h"
00026 #include "unicharmap.h"
00027 #include "varable.h"
00028
00029 class CHAR_FRAGMENT {
00030 public:
00031
00032 static const int kMinLen = 6;
00033
00034 static const int kMaxLen = 3 + UNICHAR_LEN + 2;
00035
00036 static const char kSeparator = '|';
00037
00038 static const int kMaxChunks = 3;
00039
00040
00041 inline void set_all(const char *unichar, int pos, int total) {
00042 this->set_unichar(unichar);
00043 this->set_pos(pos);
00044 this->set_total(total);
00045 }
00046 inline void set_unichar(const char *uch) {
00047 strncpy(this->unichar, uch, UNICHAR_LEN);
00048 this->unichar[UNICHAR_LEN] = '\0';
00049 }
00050 inline void set_pos(int p) { this->pos = p; }
00051 inline void set_total(int t) { this->total = t; }
00052 inline const char* get_unichar() const { return this->unichar; }
00053 inline int get_pos() const { return this->pos; }
00054 inline int get_total() const { return this->total; }
00055
00056
00057
00058 static STRING to_string(const char *unichar, int pos, int total) {
00059 STRING result = "";
00060 result += kSeparator;
00061 result += unichar;
00062 char buffer[kMaxLen];
00063 snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, kSeparator, total);
00064 result += buffer;
00065 return result;
00066 }
00067
00068 STRING to_string() const {
00069 return to_string(this->unichar, this->pos, this->total);
00070 }
00071
00072
00073
00074 inline bool equals(const char *other_unichar,
00075 int other_pos, int other_total) const {
00076 return (strcmp(this->unichar, other_unichar) == 0 &&
00077 this->pos == other_pos && this->total == other_total);
00078 }
00079 inline bool equals(const CHAR_FRAGMENT *other) const {
00080 return this->equals(other->get_unichar(),
00081 other->get_pos(),
00082 other->get_total());
00083 }
00084
00085
00086
00087 inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
00088 return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
00089 this->total == fragment->get_total() &&
00090 this->pos == fragment->get_pos() + 1);
00091 }
00092
00093
00094 inline bool is_beginning() const { return this->pos == 0; }
00095
00096
00097 inline bool is_ending() const { return this->pos == this->total-1; }
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111 static CHAR_FRAGMENT *parse_from_string(const char *str);
00112
00113 private:
00114 char unichar[UNICHAR_LEN + 1];
00115 inT16 pos;
00116 inT16 total;
00117 };
00118
00119
00120
00121
00122 class UNICHARSET {
00123 public:
00124
00125 UNICHARSET();
00126
00127 ~UNICHARSET();
00128
00129
00130
00131 const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
00132
00133
00134
00135 const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
00136 int length) const;
00137
00138
00139
00140
00141
00142 int step(const char* str) const;
00143
00144
00145
00146 const char* const id_to_unichar(UNICHAR_ID id) const;
00147
00148
00149
00150 static STRING debug_utf8_str(const char* str);
00151
00152
00153
00154 STRING debug_str(UNICHAR_ID id) const;
00155 STRING debug_str(const char * unichar_repr) const {
00156 return debug_str(unichar_to_id(unichar_repr));
00157 }
00158
00159
00160 void unichar_insert(const char* const unichar_repr);
00161
00162
00163
00164 bool contains_unichar_id(UNICHAR_ID unichar_id) const {
00165 return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used;
00166 }
00167
00168
00169 bool contains_unichar(const char* const unichar_repr) const;
00170 bool contains_unichar(const char* const unichar_repr, int length) const;
00171
00172
00173
00174 bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
00175
00176
00177 void delete_pointers_in_unichars() {
00178 for (int i = 0; i < size_used; ++i) {
00179 if (unichars[i].properties.fragment != NULL) {
00180 delete unichars[i].properties.fragment;
00181 unichars[i].properties.fragment = NULL;
00182 }
00183 }
00184 }
00185
00186
00187 void clear() {
00188 if (size_reserved > 0) {
00189 for (int i = 0; i < script_table_size_used; ++i)
00190 delete[] script_table[i];
00191 delete[] script_table;
00192 script_table = 0;
00193 script_table_size_reserved = 0;
00194 script_table_size_used = 0;
00195 delete_pointers_in_unichars();
00196 delete[] unichars;
00197 unichars = 0;
00198 size_reserved = 0;
00199 size_used = 0;
00200 }
00201 ids.clear();
00202 }
00203
00204
00205 int size() const {
00206 return size_used;
00207 }
00208
00209
00210 void reserve(int unichars_number);
00211
00212
00213
00214 bool save_to_file(const char * const filename) const {
00215 FILE* file = fopen(filename, "w+");
00216 if (file == NULL) return false;
00217 bool result = save_to_file(file);
00218 fclose(file);
00219 return result;
00220 }
00221
00222
00223
00224 bool save_to_file(FILE *file) const;
00225
00226
00227
00228
00229 bool load_from_file(const char* const filename) {
00230 FILE* file = fopen(filename, "r");
00231 if (file == NULL) return false;
00232 bool result = load_from_file(file);
00233 fclose(file);
00234 return result;
00235 }
00236
00237
00238
00239 bool load_from_file(FILE *file);
00240
00241
00242
00243
00244
00245
00246
00247
00248 void set_black_and_whitelist(const char* blacklist, const char* whitelist);
00249
00250
00251 void set_isalpha(UNICHAR_ID unichar_id, bool value) {
00252 unichars[unichar_id].properties.isalpha = value;
00253 }
00254
00255
00256 void set_islower(UNICHAR_ID unichar_id, bool value) {
00257 unichars[unichar_id].properties.islower = value;
00258 }
00259
00260
00261 void set_isupper(UNICHAR_ID unichar_id, bool value) {
00262 unichars[unichar_id].properties.isupper = value;
00263 }
00264
00265
00266 void set_isdigit(UNICHAR_ID unichar_id, bool value) {
00267 unichars[unichar_id].properties.isdigit = value;
00268 }
00269
00270
00271 void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
00272 unichars[unichar_id].properties.ispunctuation = value;
00273 }
00274
00275
00276 void set_isngram(UNICHAR_ID unichar_id, bool value) {
00277 unichars[unichar_id].properties.isngram = value;
00278 }
00279
00280
00281
00282 void set_script(UNICHAR_ID unichar_id, const char* value) {
00283 unichars[unichar_id].properties.script_id = add_script(value);
00284 }
00285
00286
00287 void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
00288 unichars[unichar_id].properties.other_case = other_case;
00289 }
00290
00291
00292 bool get_isalpha(UNICHAR_ID unichar_id) const {
00293 return unichars[unichar_id].properties.isalpha;
00294 }
00295
00296
00297 bool get_islower(UNICHAR_ID unichar_id) const {
00298 return unichars[unichar_id].properties.islower;
00299 }
00300
00301
00302 bool get_isupper(UNICHAR_ID unichar_id) const {
00303 return unichars[unichar_id].properties.isupper;
00304 }
00305
00306
00307 bool get_isdigit(UNICHAR_ID unichar_id) const {
00308 return unichars[unichar_id].properties.isdigit;
00309 }
00310
00311
00312 bool get_ispunctuation(UNICHAR_ID unichar_id) const {
00313 return unichars[unichar_id].properties.ispunctuation;
00314 }
00315
00316
00317 bool get_isngram(UNICHAR_ID unichar_id) const {
00318 return unichars[unichar_id].properties.isngram;
00319 }
00320
00321
00322
00323
00324 int get_script(UNICHAR_ID unichar_id) const {
00325 return unichars[unichar_id].properties.script_id;
00326 }
00327
00328
00329 UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
00330 return unichars[unichar_id].properties.other_case;
00331 }
00332
00333
00334 UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
00335 if (unichars[unichar_id].properties.islower) return unichar_id;
00336 return unichars[unichar_id].properties.other_case;
00337 }
00338
00339
00340 UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
00341 if (unichars[unichar_id].properties.isupper) return unichar_id;
00342 return unichars[unichar_id].properties.other_case;
00343 }
00344
00345
00346
00347 const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
00348 return unichars[unichar_id].properties.fragment;
00349 }
00350
00351
00352 bool get_isalpha(const char* const unichar_repr) const {
00353 return get_isalpha(unichar_to_id(unichar_repr));
00354 }
00355
00356
00357 bool get_islower(const char* const unichar_repr) const {
00358 return get_islower(unichar_to_id(unichar_repr));
00359 }
00360
00361
00362 bool get_isupper(const char* const unichar_repr) const {
00363 return get_isupper(unichar_to_id(unichar_repr));
00364 }
00365
00366
00367 bool get_isdigit(const char* const unichar_repr) const {
00368 return get_isdigit(unichar_to_id(unichar_repr));
00369 }
00370
00371
00372 bool get_ispunctuation(const char* const unichar_repr) const {
00373 return get_ispunctuation(unichar_to_id(unichar_repr));
00374 }
00375
00376
00377
00378
00379 int get_script(const char* const unichar_repr) const {
00380 return get_script(unichar_to_id(unichar_repr));
00381 }
00382
00383
00384
00385 const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
00386 if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
00387 !ids.contains(unichar_repr)) {
00388 return NULL;
00389 }
00390 return get_fragment(unichar_to_id(unichar_repr));
00391 }
00392
00393
00394
00395 bool get_isalpha(const char* const unichar_repr,
00396 int length) const {
00397 return get_isalpha(unichar_to_id(unichar_repr, length));
00398 }
00399
00400
00401
00402 bool get_islower(const char* const unichar_repr,
00403 int length) const {
00404 return get_islower(unichar_to_id(unichar_repr, length));
00405 }
00406
00407
00408
00409 bool get_isupper(const char* const unichar_repr,
00410 int length) const {
00411 return get_isupper(unichar_to_id(unichar_repr, length));
00412 }
00413
00414
00415
00416 bool get_isdigit(const char* const unichar_repr,
00417 int length) const {
00418 return get_isdigit(unichar_to_id(unichar_repr, length));
00419 }
00420
00421
00422
00423 bool get_ispunctuation(const char* const unichar_repr,
00424 int length) const {
00425 return get_ispunctuation(unichar_to_id(unichar_repr, length));
00426 }
00427
00428
00429
00430
00431
00432 int get_script(const char* const unichar_repr,
00433 int length) const {
00434 return get_script(unichar_to_id(unichar_repr, length));
00435 }
00436
00437
00438 int get_script_table_size() const {
00439 return script_table_size_used;
00440 }
00441
00442
00443 const char* get_script_from_script_id(int id) const {
00444 if (id >= script_table_size_used || id < 0)
00445 return null_script;
00446 return script_table[id];
00447 }
00448
00449
00450
00451
00452
00453
00454 int get_script_id_from_name(const char* script_name) const;
00455
00456
00457 bool is_null_script(const char* script) const {
00458 return script == null_script;
00459 }
00460
00461
00462
00463
00464 int add_script(const char* script);
00465
00466
00467 bool get_enabled(UNICHAR_ID unichar_id) const {
00468 return unichars[unichar_id].properties.enabled;
00469 }
00470
00471
00472 int null_sid() const { return null_sid_; }
00473 int common_sid() const { return common_sid_; }
00474 int latin_sid() const { return latin_sid_; }
00475 int cyrillic_sid() const { return cyrillic_sid_; }
00476 int greek_sid() const { return greek_sid_; }
00477 int han_sid() const { return han_sid_; }
00478
00479 private:
00480
00481 struct UNICHAR_PROPERTIES {
00482 bool isalpha;
00483 bool islower;
00484 bool isupper;
00485 bool isdigit;
00486 bool ispunctuation;
00487 bool isngram;
00488 bool enabled;
00489 int script_id;
00490 UNICHAR_ID other_case;
00491
00492
00493
00494
00495
00496 CHAR_FRAGMENT *fragment;
00497 };
00498
00499 struct UNICHAR_SLOT {
00500 char representation[UNICHAR_LEN + 1];
00501 UNICHAR_PROPERTIES properties;
00502 };
00503
00504 UNICHAR_SLOT* unichars;
00505 UNICHARMAP ids;
00506 int size_used;
00507 int size_reserved;
00508 char** script_table;
00509 int script_table_size_used;
00510 int script_table_size_reserved;
00511 const char* null_script;
00512
00513
00514
00515
00516 int null_sid_;
00517 int common_sid_;
00518 int latin_sid_;
00519 int cyrillic_sid_;
00520 int greek_sid_;
00521 int han_sid_;
00522 };
00523
00524 #endif // TESSERACT_CCUTIL_UNICHARSET_H__