00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00019
00020 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__
00021 #define TESSERACT_CCUTIL_UNICHARSET_H__
00022
00023 #include "assert.h"
00024 #include "strngs.h"
00025 #include "unichar.h"
00026 #include "unicharmap.h"
00027 #include "params.h"
00028
// Overall (strong) text direction of a script or run of text.
// The numeric values are spelled out explicitly and must stay stable.
enum StrongScriptDirection {
  DIR_NEUTRAL = 0,        // No strongly directional content.
  DIR_LEFT_TO_RIGHT = 1,  // Strongly left-to-right.
  DIR_RIGHT_TO_LEFT = 2,  // Strongly right-to-left.
  DIR_MIX = 3             // Both left-to-right and right-to-left present.
};
00036
00037 class CHAR_FRAGMENT {
00038 public:
00039
00040 static const int kMinLen = 6;
00041
00042 static const int kMaxLen = 3 + UNICHAR_LEN + 2;
00043
00044 static const int kMaxChunks = 5;
00045
00046
00047 inline void set_all(const char *unichar, int pos, int total, bool natural) {
00048 set_unichar(unichar);
00049 set_pos(pos);
00050 set_total(total);
00051 set_natural(natural);
00052 }
00053 inline void set_unichar(const char *uch) {
00054 strncpy(this->unichar, uch, UNICHAR_LEN);
00055 this->unichar[UNICHAR_LEN] = '\0';
00056 }
00057 inline void set_pos(int p) { this->pos = p; }
00058 inline void set_total(int t) { this->total = t; }
00059 inline const char* get_unichar() const { return this->unichar; }
00060 inline int get_pos() const { return this->pos; }
00061 inline int get_total() const { return this->total; }
00062
00063
00064
00065 static STRING to_string(const char *unichar, int pos, int total,
00066 bool natural);
00067
00068 STRING to_string() const {
00069 return to_string(unichar, pos, total, natural);
00070 }
00071
00072
00073
00074 inline bool equals(const char *other_unichar,
00075 int other_pos, int other_total) const {
00076 return (strcmp(this->unichar, other_unichar) == 0 &&
00077 this->pos == other_pos && this->total == other_total);
00078 }
00079 inline bool equals(const CHAR_FRAGMENT *other) const {
00080 return this->equals(other->get_unichar(),
00081 other->get_pos(),
00082 other->get_total());
00083 }
00084
00085
00086
00087 inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
00088 return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
00089 this->total == fragment->get_total() &&
00090 this->pos == fragment->get_pos() + 1);
00091 }
00092
00093
00094 inline bool is_beginning() const { return this->pos == 0; }
00095
00096
00097 inline bool is_ending() const { return this->pos == this->total-1; }
00098
00099
00100
00101
00102 inline bool is_natural() const { return natural; }
00103 void set_natural(bool value) { natural = value; }
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119 static CHAR_FRAGMENT *parse_from_string(const char *str);
00120
00121 private:
00122 char unichar[UNICHAR_LEN + 1];
00123
00124
00125
00126 bool natural;
00127 inT16 pos;
00128 inT16 total;
00129 };
00130
00131
00132
00133
00134 class UNICHARSET {
00135 public:
00136
00137
00138
00139 static const char* kCustomLigatures[][2];
00140
00141
00142 enum Direction {
00143 U_LEFT_TO_RIGHT = 0,
00144 U_RIGHT_TO_LEFT = 1,
00145 U_EUROPEAN_NUMBER = 2,
00146 U_EUROPEAN_NUMBER_SEPARATOR = 3,
00147 U_EUROPEAN_NUMBER_TERMINATOR = 4,
00148 U_ARABIC_NUMBER = 5,
00149 U_COMMON_NUMBER_SEPARATOR = 6,
00150 U_BLOCK_SEPARATOR = 7,
00151 U_SEGMENT_SEPARATOR = 8,
00152 U_WHITE_SPACE_NEUTRAL = 9,
00153 U_OTHER_NEUTRAL = 10,
00154 U_LEFT_TO_RIGHT_EMBEDDING = 11,
00155 U_LEFT_TO_RIGHT_OVERRIDE = 12,
00156 U_RIGHT_TO_LEFT_ARABIC = 13,
00157 U_RIGHT_TO_LEFT_EMBEDDING = 14,
00158 U_RIGHT_TO_LEFT_OVERRIDE = 15,
00159 U_POP_DIRECTIONAL_FORMAT = 16,
00160 U_DIR_NON_SPACING_MARK = 17,
00161 U_BOUNDARY_NEUTRAL = 18,
00162 U_CHAR_DIRECTION_COUNT
00163 };
00164
00165
00166 UNICHARSET();
00167
00168 ~UNICHARSET();
00169
00170
00171
00172 const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
00173
00174
00175
00176 const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
00177 int length) const;
00178
00179
00180
00181
00182
00183 int step(const char* str) const;
00184
00185
00186
00187
00188 bool encodable_string(const char *str, int *first_bad_position) const;
00189
00190
00191
00192 const char* const id_to_unichar(UNICHAR_ID id) const;
00193
00194
00195
00196
00197
00198 const char* const id_to_unichar_ext(UNICHAR_ID id) const;
00199
00200
00201
00202 static STRING debug_utf8_str(const char* str);
00203
00204
00205
00206 STRING debug_str(UNICHAR_ID id) const;
00207 STRING debug_str(const char * unichar_repr) const {
00208 return debug_str(unichar_to_id(unichar_repr));
00209 }
00210
00211
00212 void unichar_insert(const char* const unichar_repr);
00213
00214
00215
00216 bool contains_unichar_id(UNICHAR_ID unichar_id) const {
00217 return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
00218 unichar_id >= 0;
00219 }
00220
00221
00222 bool contains_unichar(const char* const unichar_repr) const;
00223 bool contains_unichar(const char* const unichar_repr, int length) const;
00224
00225
00226
00227 bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
00228
00229
00230 void delete_pointers_in_unichars() {
00231 for (int i = 0; i < size_used; ++i) {
00232 if (unichars[i].properties.fragment != NULL) {
00233 delete unichars[i].properties.fragment;
00234 unichars[i].properties.fragment = NULL;
00235 }
00236 }
00237 }
00238
00239
00240 void clear() {
00241 if (script_table != NULL) {
00242 for (int i = 0; i < script_table_size_used; ++i)
00243 delete[] script_table[i];
00244 delete[] script_table;
00245 script_table = NULL;
00246 script_table_size_used = 0;
00247 }
00248 if (unichars != NULL) {
00249 delete_pointers_in_unichars();
00250 delete[] unichars;
00251 unichars = NULL;
00252 }
00253 script_table_size_reserved = 0;
00254 size_reserved = 0;
00255 size_used = 0;
00256 ids.clear();
00257 top_bottom_set_ = false;
00258 script_has_upper_lower_ = false;
00259 script_has_xheight_ = false;
00260 null_sid_ = 0;
00261 common_sid_ = 0;
00262 latin_sid_ = 0;
00263 cyrillic_sid_ = 0;
00264 greek_sid_ = 0;
00265 han_sid_ = 0;
00266 hiragana_sid_ = 0;
00267 katakana_sid_ = 0;
00268 }
00269
00270
00271 int size() const {
00272 return size_used;
00273 }
00274
00275
00276 void reserve(int unichars_number);
00277
00278
00279
00280 bool save_to_file(const char * const filename) const {
00281 FILE* file = fopen(filename, "w+b");
00282 if (file == NULL) return false;
00283 bool result = save_to_file(file);
00284 fclose(file);
00285 return result;
00286 }
00287
00288
00289
00290 bool save_to_file(FILE *file) const;
00291
00292
00293
00294
00295 bool load_from_inmemory_file(const char* const memory, int mem_size,
00296 bool skip_fragments);
00297
00298 bool load_from_inmemory_file(const char* const memory, int mem_size) {
00299 return load_from_inmemory_file(memory, mem_size, false);
00300 }
00301
00302
00303
00304
00305 bool load_from_file(const char* const filename, bool skip_fragments) {
00306 FILE* file = fopen(filename, "rb");
00307 if (file == NULL) return false;
00308 bool result = load_from_file(file, skip_fragments);
00309 fclose(file);
00310 return result;
00311 }
00312
00313 bool load_from_file(const char* const filename) {
00314 return load_from_file(filename, false);
00315 }
00316
00317
00318
00319 bool load_from_file(FILE *file, bool skip_fragments);
00320 bool load_from_file(FILE *file) { return load_from_file(file, false); }
00321
00322
00323
00324
00325 void post_load_setup();
00326
00327
00328
00329
00330
00331 bool major_right_to_left() const;
00332
00333
00334
00335
00336
00337
00338
00339
00340 void set_black_and_whitelist(const char* blacklist, const char* whitelist);
00341
00342
00343 void set_isalpha(UNICHAR_ID unichar_id, bool value) {
00344 unichars[unichar_id].properties.isalpha = value;
00345 }
00346
00347
00348 void set_islower(UNICHAR_ID unichar_id, bool value) {
00349 unichars[unichar_id].properties.islower = value;
00350 }
00351
00352
00353 void set_isupper(UNICHAR_ID unichar_id, bool value) {
00354 unichars[unichar_id].properties.isupper = value;
00355 }
00356
00357
00358 void set_isdigit(UNICHAR_ID unichar_id, bool value) {
00359 unichars[unichar_id].properties.isdigit = value;
00360 }
00361
00362
00363 void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
00364 unichars[unichar_id].properties.ispunctuation = value;
00365 }
00366
00367
00368 void set_isngram(UNICHAR_ID unichar_id, bool value) {
00369 unichars[unichar_id].properties.isngram = value;
00370 }
00371
00372
00373
00374 void set_script(UNICHAR_ID unichar_id, const char* value) {
00375 unichars[unichar_id].properties.script_id = add_script(value);
00376 }
00377
00378
00379 void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
00380 unichars[unichar_id].properties.other_case = other_case;
00381 }
00382
00383
00384 void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) {
00385 unichars[unichar_id].properties.direction = value;
00386 }
00387
00388
00389 void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
00390 unichars[unichar_id].properties.mirror = mirror;
00391 }
00392
00393
00394 void set_normed(UNICHAR_ID unichar_id, const char* normed) {
00395 unichars[unichar_id].properties.normed = normed;
00396 }
00397
00398
00399 bool get_isalpha(UNICHAR_ID unichar_id) const {
00400 if (INVALID_UNICHAR_ID == unichar_id) return false;
00401 ASSERT_HOST(contains_unichar_id(unichar_id));
00402 return unichars[unichar_id].properties.isalpha;
00403 }
00404
00405
00406 bool get_islower(UNICHAR_ID unichar_id) const {
00407 if (INVALID_UNICHAR_ID == unichar_id) return false;
00408 ASSERT_HOST(contains_unichar_id(unichar_id));
00409 return unichars[unichar_id].properties.islower;
00410 }
00411
00412
00413 bool get_isupper(UNICHAR_ID unichar_id) const {
00414 if (INVALID_UNICHAR_ID == unichar_id) return false;
00415 ASSERT_HOST(contains_unichar_id(unichar_id));
00416 return unichars[unichar_id].properties.isupper;
00417 }
00418
00419
00420 bool get_isdigit(UNICHAR_ID unichar_id) const {
00421 if (INVALID_UNICHAR_ID == unichar_id) return false;
00422 ASSERT_HOST(contains_unichar_id(unichar_id));
00423 return unichars[unichar_id].properties.isdigit;
00424 }
00425
00426
00427 bool get_ispunctuation(UNICHAR_ID unichar_id) const {
00428 if (INVALID_UNICHAR_ID == unichar_id) return false;
00429 ASSERT_HOST(contains_unichar_id(unichar_id));
00430 return unichars[unichar_id].properties.ispunctuation;
00431 }
00432
00433
00434 bool get_isngram(UNICHAR_ID unichar_id) const {
00435 if (INVALID_UNICHAR_ID == unichar_id) return false;
00436 ASSERT_HOST(contains_unichar_id(unichar_id));
00437 return unichars[unichar_id].properties.isngram;
00438 }
00439
00440
00441
00442 bool get_isprivate(UNICHAR_ID unichar_id) const;
00443
00444
00445 bool top_bottom_useful() const {
00446 return top_bottom_set_;
00447 }
00448
00449 void set_ranges_empty();
00450
00451
00452
00453 void SetPropertiesFromOther(const UNICHARSET& src);
00454
00455
00456
00457 void ExpandRangesFromOther(const UNICHARSET& src);
00458
00459
00460
00461 void AppendOtherUnicharset(const UNICHARSET& src);
00462
00463
00464
00465
00466 void get_top_bottom(UNICHAR_ID unichar_id,
00467 int* min_bottom, int* max_bottom,
00468 int* min_top, int* max_top) const {
00469 if (INVALID_UNICHAR_ID == unichar_id) {
00470 *min_bottom = *min_top = 0;
00471 *max_bottom = *max_top = 256;
00472 return;
00473 }
00474 ASSERT_HOST(contains_unichar_id(unichar_id));
00475 *min_bottom = unichars[unichar_id].properties.min_bottom;
00476 *max_bottom = unichars[unichar_id].properties.max_bottom;
00477 *min_top = unichars[unichar_id].properties.min_top;
00478 *max_top = unichars[unichar_id].properties.max_top;
00479 }
00480 void set_top_bottom(UNICHAR_ID unichar_id,
00481 int min_bottom, int max_bottom,
00482 int min_top, int max_top) {
00483 unichars[unichar_id].properties.min_bottom =
00484 static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
00485 unichars[unichar_id].properties.max_bottom =
00486 static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
00487 unichars[unichar_id].properties.min_top =
00488 static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
00489 unichars[unichar_id].properties.max_top =
00490 static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
00491 }
00492
00493
00494
00495
00496 void get_width_range(UNICHAR_ID unichar_id,
00497 int* min_width, int* max_width) const {
00498 if (INVALID_UNICHAR_ID == unichar_id) {
00499 *min_width = 0;
00500 *max_width = 256;
00501 return;
00502 }
00503 ASSERT_HOST(contains_unichar_id(unichar_id));
00504 *min_width = unichars[unichar_id].properties.min_width;
00505 *max_width = unichars[unichar_id].properties.max_width;
00506 }
00507 void set_width_range(UNICHAR_ID unichar_id, int min_width, int max_width) {
00508 unichars[unichar_id].properties.min_width =
00509 static_cast<inT16>(ClipToRange(min_width, 0, MAX_INT16));
00510 unichars[unichar_id].properties.max_width =
00511 static_cast<inT16>(ClipToRange(max_width, 0, MAX_INT16));
00512 }
00513
00514
00515
00516
00517 void get_bearing_range(UNICHAR_ID unichar_id,
00518 int* min_bearing, int* max_bearing) const {
00519 if (INVALID_UNICHAR_ID == unichar_id) {
00520 *min_bearing = *max_bearing = 0;
00521 return;
00522 }
00523 ASSERT_HOST(contains_unichar_id(unichar_id));
00524 *min_bearing = unichars[unichar_id].properties.min_bearing;
00525 *max_bearing = unichars[unichar_id].properties.max_bearing;
00526 }
00527 void set_bearing_range(UNICHAR_ID unichar_id,
00528 int min_bearing, int max_bearing) {
00529 unichars[unichar_id].properties.min_bearing =
00530 static_cast<inT16>(ClipToRange(min_bearing, 0, MAX_INT16));
00531 unichars[unichar_id].properties.max_bearing =
00532 static_cast<inT16>(ClipToRange(max_bearing, 0, MAX_INT16));
00533 }
00534
00535
00536
00537
00538 void get_advance_range(UNICHAR_ID unichar_id,
00539 int* min_advance, int* max_advance) const {
00540 if (INVALID_UNICHAR_ID == unichar_id) {
00541 *min_advance = *max_advance = 0;
00542 return;
00543 }
00544 ASSERT_HOST(contains_unichar_id(unichar_id));
00545 *min_advance = unichars[unichar_id].properties.min_advance;
00546 *max_advance = unichars[unichar_id].properties.max_advance;
00547 }
00548 void set_advance_range(UNICHAR_ID unichar_id,
00549 int min_advance, int max_advance) {
00550 unichars[unichar_id].properties.min_advance =
00551 static_cast<inT16>(ClipToRange(min_advance, 0, MAX_INT16));
00552 unichars[unichar_id].properties.max_advance =
00553 static_cast<inT16>(ClipToRange(max_advance, 0, MAX_INT16));
00554 }
00555
00556
00557
00558
00559 int get_script(UNICHAR_ID unichar_id) const {
00560 if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
00561 ASSERT_HOST(contains_unichar_id(unichar_id));
00562 return unichars[unichar_id].properties.script_id;
00563 }
00564
00565
00566
00567 unsigned int get_properties(UNICHAR_ID unichar_id) const;
00568
00569
00570
00571
00572
00573
00574
00575
00576 char get_chartype(UNICHAR_ID unichar_id) const;
00577
00578
00579 UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
00580 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
00581 ASSERT_HOST(contains_unichar_id(unichar_id));
00582 return unichars[unichar_id].properties.other_case;
00583 }
00584
00585
00586 Direction get_direction(UNICHAR_ID unichar_id) const {
00587 if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
00588 ASSERT_HOST(contains_unichar_id(unichar_id));
00589 return unichars[unichar_id].properties.direction;
00590 }
00591
00592
00593 UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
00594 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
00595 ASSERT_HOST(contains_unichar_id(unichar_id));
00596 return unichars[unichar_id].properties.mirror;
00597 }
00598
00599
00600 UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
00601 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
00602 ASSERT_HOST(contains_unichar_id(unichar_id));
00603 if (unichars[unichar_id].properties.islower) return unichar_id;
00604 return unichars[unichar_id].properties.other_case;
00605 }
00606
00607
00608 UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
00609 if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
00610 ASSERT_HOST(contains_unichar_id(unichar_id));
00611 if (unichars[unichar_id].properties.isupper) return unichar_id;
00612 return unichars[unichar_id].properties.other_case;
00613 }
00614
00615
00616
00617 const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
00618 if (INVALID_UNICHAR_ID == unichar_id) return NULL;
00619 ASSERT_HOST(contains_unichar_id(unichar_id));
00620 return unichars[unichar_id].properties.fragment;
00621 }
00622
00623
00624 bool get_isalpha(const char* const unichar_repr) const {
00625 return get_isalpha(unichar_to_id(unichar_repr));
00626 }
00627
00628
00629 bool get_islower(const char* const unichar_repr) const {
00630 return get_islower(unichar_to_id(unichar_repr));
00631 }
00632
00633
00634 bool get_isupper(const char* const unichar_repr) const {
00635 return get_isupper(unichar_to_id(unichar_repr));
00636 }
00637
00638
00639 bool get_isdigit(const char* const unichar_repr) const {
00640 return get_isdigit(unichar_to_id(unichar_repr));
00641 }
00642
00643
00644 bool get_ispunctuation(const char* const unichar_repr) const {
00645 return get_ispunctuation(unichar_to_id(unichar_repr));
00646 }
00647
00648
00649
00650 unsigned int get_properties(const char* const unichar_repr) const {
00651 return get_properties(unichar_to_id(unichar_repr));
00652 }
00653
00654 char get_chartype(const char* const unichar_repr) const {
00655 return get_chartype(unichar_to_id(unichar_repr));
00656 }
00657
00658
00659
00660
00661 int get_script(const char* const unichar_repr) const {
00662 return get_script(unichar_to_id(unichar_repr));
00663 }
00664
00665
00666
00667 const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
00668 if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
00669 !ids.contains(unichar_repr)) {
00670 return NULL;
00671 }
00672 return get_fragment(unichar_to_id(unichar_repr));
00673 }
00674
00675
00676
00677 bool get_isalpha(const char* const unichar_repr,
00678 int length) const {
00679 return get_isalpha(unichar_to_id(unichar_repr, length));
00680 }
00681
00682
00683
00684 bool get_islower(const char* const unichar_repr,
00685 int length) const {
00686 return get_islower(unichar_to_id(unichar_repr, length));
00687 }
00688
00689
00690
00691 bool get_isupper(const char* const unichar_repr,
00692 int length) const {
00693 return get_isupper(unichar_to_id(unichar_repr, length));
00694 }
00695
00696
00697
00698 bool get_isdigit(const char* const unichar_repr,
00699 int length) const {
00700 return get_isdigit(unichar_to_id(unichar_repr, length));
00701 }
00702
00703
00704
00705 bool get_ispunctuation(const char* const unichar_repr,
00706 int length) const {
00707 return get_ispunctuation(unichar_to_id(unichar_repr, length));
00708 }
00709
00710
00711 const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
00712 return unichars[unichar_id].properties.normed.string();
00713 }
00714
00715
00716
00717
00718
00719 int get_script(const char* const unichar_repr,
00720 int length) const {
00721 return get_script(unichar_to_id(unichar_repr, length));
00722 }
00723
00724
00725 int get_script_table_size() const {
00726 return script_table_size_used;
00727 }
00728
00729
00730 const char* get_script_from_script_id(int id) const {
00731 if (id >= script_table_size_used || id < 0)
00732 return null_script;
00733 return script_table[id];
00734 }
00735
00736
00737
00738
00739
00740
00741 int get_script_id_from_name(const char* script_name) const;
00742
00743
00744 bool is_null_script(const char* script) const {
00745 return script == null_script;
00746 }
00747
00748
00749
00750
00751 int add_script(const char* script);
00752
00753
00754 bool get_enabled(UNICHAR_ID unichar_id) const {
00755 return unichars[unichar_id].properties.enabled;
00756 }
00757
00758
00759 int null_sid() const { return null_sid_; }
00760 int common_sid() const { return common_sid_; }
00761 int latin_sid() const { return latin_sid_; }
00762 int cyrillic_sid() const { return cyrillic_sid_; }
00763 int greek_sid() const { return greek_sid_; }
00764 int han_sid() const { return han_sid_; }
00765 int hiragana_sid() const { return hiragana_sid_; }
00766 int katakana_sid() const { return katakana_sid_; }
00767 int default_sid() const { return default_sid_; }
00768
00769
00770 bool script_has_upper_lower() const {
00771 return script_has_upper_lower_;
00772 }
00773
00774
00775
00776
00777 bool script_has_xheight() const {
00778 return script_has_xheight_;
00779 }
00780
00781 private:
00782
00783 struct UNICHAR_PROPERTIES {
00784 UNICHAR_PROPERTIES();
00785
00786 void Init();
00787
00788
00789 void SetRangesOpen();
00790
00791 void SetRangesEmpty();
00792
00793
00794 bool AnyRangeEmpty() const;
00795
00796 void ExpandRangesFrom(const UNICHAR_PROPERTIES& src);
00797
00798 void CopyFrom(const UNICHAR_PROPERTIES& src);
00799
00800 bool isalpha;
00801 bool islower;
00802 bool isupper;
00803 bool isdigit;
00804 bool ispunctuation;
00805 bool isngram;
00806 bool enabled;
00807
00808
00809
00810
00811 uinT8 min_bottom;
00812 uinT8 max_bottom;
00813 uinT8 min_top;
00814 uinT8 max_top;
00815
00816 inT16 min_width;
00817 inT16 max_width;
00818
00819 inT16 min_bearing;
00820 inT16 max_bearing;
00821 inT16 min_advance;
00822 inT16 max_advance;
00823 int script_id;
00824 UNICHAR_ID other_case;
00825 Direction direction;
00826
00827
00828
00829
00830
00831
00832 UNICHAR_ID mirror;
00833 STRING normed;
00834
00835
00836
00837
00838 CHAR_FRAGMENT *fragment;
00839 };
00840
00841 struct UNICHAR_SLOT {
00842 char representation[UNICHAR_LEN + 1];
00843 UNICHAR_PROPERTIES properties;
00844 };
00845
00846
00847
00848
00849
00850
00851 bool GetStrProperties(const char* utf8_str,
00852 UNICHAR_PROPERTIES* props) const;
00853
00854
00855
00856
00857 bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
00858 bool skip_fragments);
00859
00860 UNICHAR_SLOT* unichars;
00861 UNICHARMAP ids;
00862 int size_used;
00863 int size_reserved;
00864 char** script_table;
00865 int script_table_size_used;
00866 int script_table_size_reserved;
00867 const char* null_script;
00868
00869 bool top_bottom_set_;
00870
00871 bool script_has_upper_lower_;
00872
00873
00874 bool script_has_xheight_;
00875
00876
00877
00878
00879 int null_sid_;
00880 int common_sid_;
00881 int latin_sid_;
00882 int cyrillic_sid_;
00883 int greek_sid_;
00884 int han_sid_;
00885 int hiragana_sid_;
00886 int katakana_sid_;
00887
00888 int default_sid_;
00889 };
00890
00891 #endif // TESSERACT_CCUTIL_UNICHARSET_H__