tesseract
3.03
|
#include <unicharset.h>
Classes | |
struct | UNICHAR_PROPERTIES |
struct | UNICHAR_SLOT |
Public Types | |
enum | Direction { U_LEFT_TO_RIGHT = 0, U_RIGHT_TO_LEFT = 1, U_EUROPEAN_NUMBER = 2, U_EUROPEAN_NUMBER_SEPARATOR = 3, U_EUROPEAN_NUMBER_TERMINATOR = 4, U_ARABIC_NUMBER = 5, U_COMMON_NUMBER_SEPARATOR = 6, U_BLOCK_SEPARATOR = 7, U_SEGMENT_SEPARATOR = 8, U_WHITE_SPACE_NEUTRAL = 9, U_OTHER_NEUTRAL = 10, U_LEFT_TO_RIGHT_EMBEDDING = 11, U_LEFT_TO_RIGHT_OVERRIDE = 12, U_RIGHT_TO_LEFT_ARABIC = 13, U_RIGHT_TO_LEFT_EMBEDDING = 14, U_RIGHT_TO_LEFT_OVERRIDE = 15, U_POP_DIRECTIONAL_FORMAT = 16, U_DIR_NON_SPACING_MARK = 17, U_BOUNDARY_NEUTRAL = 18, U_CHAR_DIRECTION_COUNT } |
Public Member Functions | |
UNICHARSET () | |
~UNICHARSET () | |
const UNICHAR_ID | unichar_to_id (const char *const unichar_repr) const |
const UNICHAR_ID | unichar_to_id (const char *const unichar_repr, int length) const |
int | step (const char *str) const |
int | normed_step (const char *str) const |
bool | encodable_string (const char *str, int *first_bad_position) const |
bool | encode_string (const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const |
const char *const | id_to_unichar (UNICHAR_ID id) const |
const char *const | id_to_unichar_ext (UNICHAR_ID id) const |
STRING | debug_str (UNICHAR_ID id) const |
STRING | debug_str (const char *unichar_repr) const |
void | unichar_insert (const char *const unichar_repr) |
bool | contains_unichar_id (UNICHAR_ID unichar_id) const |
bool | contains_unichar (const char *const unichar_repr) const |
bool | contains_unichar (const char *const unichar_repr, int length) const |
bool | eq (UNICHAR_ID unichar_id, const char *const unichar_repr) const |
void | delete_pointers_in_unichars () |
void | clear () |
int | size () const |
void | reserve (int unichars_number) |
bool | save_to_file (const char *const filename) const |
bool | save_to_file (FILE *file) const |
bool | load_from_inmemory_file (const char *const memory, int mem_size, bool skip_fragments) |
bool | load_from_inmemory_file (const char *const memory, int mem_size) |
bool | load_from_file (const char *const filename, bool skip_fragments) |
bool | load_from_file (const char *const filename) |
bool | load_from_file (FILE *file, bool skip_fragments) |
bool | load_from_file (FILE *file) |
void | post_load_setup () |
bool | major_right_to_left () const |
void | set_black_and_whitelist (const char *blacklist, const char *whitelist) |
void | set_isalpha (UNICHAR_ID unichar_id, bool value) |
void | set_islower (UNICHAR_ID unichar_id, bool value) |
void | set_isupper (UNICHAR_ID unichar_id, bool value) |
void | set_isdigit (UNICHAR_ID unichar_id, bool value) |
void | set_ispunctuation (UNICHAR_ID unichar_id, bool value) |
void | set_isngram (UNICHAR_ID unichar_id, bool value) |
void | set_script (UNICHAR_ID unichar_id, const char *value) |
void | set_other_case (UNICHAR_ID unichar_id, UNICHAR_ID other_case) |
void | set_direction (UNICHAR_ID unichar_id, UNICHARSET::Direction value) |
void | set_mirror (UNICHAR_ID unichar_id, UNICHAR_ID mirror) |
void | set_normed (UNICHAR_ID unichar_id, const char *normed) |
void | set_normed_ids (UNICHAR_ID unichar_id) |
bool | get_isalpha (UNICHAR_ID unichar_id) const |
bool | get_islower (UNICHAR_ID unichar_id) const |
bool | get_isupper (UNICHAR_ID unichar_id) const |
bool | get_isdigit (UNICHAR_ID unichar_id) const |
bool | get_ispunctuation (UNICHAR_ID unichar_id) const |
bool | get_isngram (UNICHAR_ID unichar_id) const |
bool | get_isprivate (UNICHAR_ID unichar_id) const |
bool | top_bottom_useful () const |
void | set_ranges_empty () |
void | SetPropertiesFromOther (const UNICHARSET &src) |
void | PartialSetPropertiesFromOther (int start_index, const UNICHARSET &src) |
void | ExpandRangesFromOther (const UNICHARSET &src) |
void | CopyFrom (const UNICHARSET &src) |
void | AppendOtherUnicharset (const UNICHARSET &src) |
bool | SizesDistinct (UNICHAR_ID id1, UNICHAR_ID id2) const |
void | get_top_bottom (UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const |
void | set_top_bottom (UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top) |
void | get_width_range (UNICHAR_ID unichar_id, int *min_width, int *max_width) const |
void | set_width_range (UNICHAR_ID unichar_id, int min_width, int max_width) |
void | get_bearing_range (UNICHAR_ID unichar_id, int *min_bearing, int *max_bearing) const |
void | set_bearing_range (UNICHAR_ID unichar_id, int min_bearing, int max_bearing) |
void | get_advance_range (UNICHAR_ID unichar_id, int *min_advance, int *max_advance) const |
void | set_advance_range (UNICHAR_ID unichar_id, int min_advance, int max_advance) |
int | get_script (UNICHAR_ID unichar_id) const |
unsigned int | get_properties (UNICHAR_ID unichar_id) const |
char | get_chartype (UNICHAR_ID unichar_id) const |
UNICHAR_ID | get_other_case (UNICHAR_ID unichar_id) const |
Direction | get_direction (UNICHAR_ID unichar_id) const |
UNICHAR_ID | get_mirror (UNICHAR_ID unichar_id) const |
UNICHAR_ID | to_lower (UNICHAR_ID unichar_id) const |
UNICHAR_ID | to_upper (UNICHAR_ID unichar_id) const |
bool | has_special_codes () const |
const CHAR_FRAGMENT * | get_fragment (UNICHAR_ID unichar_id) const |
bool | get_isalpha (const char *const unichar_repr) const |
bool | get_islower (const char *const unichar_repr) const |
bool | get_isupper (const char *const unichar_repr) const |
bool | get_isdigit (const char *const unichar_repr) const |
bool | get_ispunctuation (const char *const unichar_repr) const |
unsigned int | get_properties (const char *const unichar_repr) const |
char | get_chartype (const char *const unichar_repr) const |
int | get_script (const char *const unichar_repr) const |
const CHAR_FRAGMENT * | get_fragment (const char *const unichar_repr) const |
bool | get_isalpha (const char *const unichar_repr, int length) const |
bool | get_islower (const char *const unichar_repr, int length) const |
bool | get_isupper (const char *const unichar_repr, int length) const |
bool | get_isdigit (const char *const unichar_repr, int length) const |
bool | get_ispunctuation (const char *const unichar_repr, int length) const |
const char * | get_normed_unichar (UNICHAR_ID unichar_id) const |
const GenericVector< UNICHAR_ID > & | normed_ids (UNICHAR_ID unichar_id) const |
int | get_script (const char *const unichar_repr, int length) const |
int | get_script_table_size () const |
const char * | get_script_from_script_id (int id) const |
int | get_script_id_from_name (const char *script_name) const |
bool | is_null_script (const char *script) const |
int | add_script (const char *script) |
bool | get_enabled (UNICHAR_ID unichar_id) const |
int | null_sid () const |
int | common_sid () const |
int | latin_sid () const |
int | cyrillic_sid () const |
int | greek_sid () const |
int | han_sid () const |
int | hiragana_sid () const |
int | katakana_sid () const |
int | default_sid () const |
bool | script_has_upper_lower () const |
bool | script_has_xheight () const |
Static Public Member Functions | |
static STRING | debug_utf8_str (const char *str) |
Static Public Attributes | |
static const char * | kCustomLigatures [][2] |
static const char * | kSpecialUnicharCodes [SPECIAL_UNICHAR_CODES_COUNT] |
Definition at line 138 of file unicharset.h.
Definition at line 149 of file unicharset.h.
{ U_LEFT_TO_RIGHT = 0, U_RIGHT_TO_LEFT = 1, U_EUROPEAN_NUMBER = 2, U_EUROPEAN_NUMBER_SEPARATOR = 3, U_EUROPEAN_NUMBER_TERMINATOR = 4, U_ARABIC_NUMBER = 5, U_COMMON_NUMBER_SEPARATOR = 6, U_BLOCK_SEPARATOR = 7, U_SEGMENT_SEPARATOR = 8, U_WHITE_SPACE_NEUTRAL = 9, U_OTHER_NEUTRAL = 10, U_LEFT_TO_RIGHT_EMBEDDING = 11, U_LEFT_TO_RIGHT_OVERRIDE = 12, U_RIGHT_TO_LEFT_ARABIC = 13, U_RIGHT_TO_LEFT_EMBEDDING = 14, U_RIGHT_TO_LEFT_OVERRIDE = 15, U_POP_DIRECTIONAL_FORMAT = 16, U_DIR_NON_SPACING_MARK = 17, U_BOUNDARY_NEUTRAL = 18, U_CHAR_DIRECTION_COUNT };
Definition at line 153 of file unicharset.cpp.
: unichars(NULL), ids(), size_used(0), size_reserved(0), script_table(NULL), script_table_size_used(0), null_script("NULL") { clear(); for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) { unichar_insert(kSpecialUnicharCodes[i]); if (i == UNICHAR_JOINED) set_isngram(i, true); } }
Definition at line 169 of file unicharset.cpp.
{ clear(); }
int UNICHARSET::add_script | ( | const char * | script | ) |
Definition at line 988 of file unicharset.cpp.
{ for (int i = 0; i < script_table_size_used; ++i) { if (strcmp(script, script_table[i]) == 0) return i; } if (script_table_size_reserved == 0) { script_table_size_reserved = 8; script_table = new char*[script_table_size_reserved]; } if (script_table_size_used + 1 >= script_table_size_reserved) { char** new_script_table = new char*[script_table_size_reserved * 2]; memcpy(new_script_table, script_table, script_table_size_reserved * sizeof(char*)); delete[] script_table; script_table = new_script_table; script_table_size_reserved = 2 * script_table_size_reserved; } script_table[script_table_size_used] = new char[strlen(script) + 1]; strcpy(script_table[script_table_size_used], script); return script_table_size_used++; }
void UNICHARSET::AppendOtherUnicharset | ( | const UNICHARSET & | src | ) |
Definition at line 461 of file unicharset.cpp.
{
  int initial_used = size_used;
  for (int ch = 0; ch < src.size_used; ++ch) {
    const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
    const char* utf8 = src.id_to_unichar(ch);
    if (strcmp(utf8, " ") != 0 && src_props.AnyRangeEmpty()) {
      // Only use fully valid entries.
      tprintf("Bad properties for index %d, char %s: "
              "%d,%d %d,%d %d,%d %d,%d %d,%d\n",
              ch, utf8, src_props.min_bottom, src_props.max_bottom,
              src_props.min_top, src_props.max_top,
              src_props.min_width, src_props.max_width,
              src_props.min_bearing, src_props.max_bearing,
              src_props.min_advance, src_props.max_advance);
      continue;
    }
    int id = size_used;
    if (contains_unichar(utf8)) {
      id = unichar_to_id(utf8);
      // Just expand current ranges.
      unichars[id].properties.ExpandRangesFrom(src_props);
    } else {
      unichar_insert(utf8);
      unichars[id].properties.SetRangesEmpty();
    }
  }
  // Set properties, including mirror and other_case, WITHOUT reordering
  // the unicharset.
  PartialSetPropertiesFromOther(initial_used, src);
}
void UNICHARSET::clear | ( | ) | [inline] |
Definition at line 270 of file unicharset.h.
{ if (script_table != NULL) { for (int i = 0; i < script_table_size_used; ++i) delete[] script_table[i]; delete[] script_table; script_table = NULL; script_table_size_used = 0; } if (unichars != NULL) { delete_pointers_in_unichars(); delete[] unichars; unichars = NULL; } script_table_size_reserved = 0; size_reserved = 0; size_used = 0; ids.clear(); top_bottom_set_ = false; script_has_upper_lower_ = false; script_has_xheight_ = false; null_sid_ = 0; common_sid_ = 0; latin_sid_ = 0; cyrillic_sid_ = 0; greek_sid_ = 0; han_sid_ = 0; hiragana_sid_ = 0; katakana_sid_ = 0; }
int UNICHARSET::common_sid | ( | ) | const [inline] |
Definition at line 819 of file unicharset.h.
{ return common_sid_; }
bool UNICHARSET::contains_unichar | ( | const char *const | unichar_repr | ) | const |
Definition at line 675 of file unicharset.cpp.
{ return ids.contains(unichar_repr); }
bool UNICHARSET::contains_unichar | ( | const char *const | unichar_repr, |
int | length | ||
) | const |
Definition at line 679 of file unicharset.cpp.
{ if (length == 0) { return false; } return ids.contains(unichar_repr, length); }
bool UNICHARSET::contains_unichar_id | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 246 of file unicharset.h.
{
return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
unichar_id >= 0;
}
void UNICHARSET::CopyFrom | ( | const UNICHARSET & | src | ) |
Definition at line 453 of file unicharset.cpp.
{ clear(); AppendOtherUnicharset(src); }
int UNICHARSET::cyrillic_sid | ( | ) | const [inline] |
Definition at line 821 of file unicharset.h.
{ return cyrillic_sid_; }
STRING UNICHARSET::debug_str | ( | UNICHAR_ID | id | ) | const |
Definition at line 340 of file unicharset.cpp.
{
  if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
  if (fragment) {
    return fragment->to_string();
  }
  const char* str = id_to_unichar(id);
  STRING result = debug_utf8_str(str);
  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
  if (get_isalpha(id)) {
    if (get_islower(id))
      result += "a";
    else if (get_isupper(id))
      result += "A";
    else
      result += "x";
  }
  // Append 0 if a digit.
  if (get_isdigit(id)) {
    result += "0";
  }
  // Append p if a punctuation symbol.
  if (get_ispunctuation(id)) {
    result += "p";
  }
  return result;
}
STRING UNICHARSET::debug_str | ( | const char * | unichar_repr | ) | const [inline] |
Definition at line 237 of file unicharset.h.
{ return debug_str(unichar_to_id(unichar_repr)); }
STRING UNICHARSET::debug_utf8_str | ( | const char * | str | ) | [static] |
Definition at line 316 of file unicharset.cpp.
{
  STRING result = str;
  result += " [";
  int step = 1;
  // Chop into unicodes and code each as hex.
  for (int i = 0; str[i] != '\0'; i += step) {
    char hex[sizeof(int) * 2 + 1];
    step = UNICHAR::utf8_step(str + i);
    if (step == 0) {
      step = 1;
      sprintf(hex, "%x", str[i]);
    } else {
      UNICHAR ch(str + i, step);
      sprintf(hex, "%x", ch.first_uni());
    }
    result += hex;
    result += " ";
  }
  result += "]";
  return result;
}
int UNICHARSET::default_sid | ( | ) | const [inline] |
Definition at line 826 of file unicharset.h.
{ return default_sid_; }
void UNICHARSET::delete_pointers_in_unichars | ( | ) | [inline] |
Definition at line 260 of file unicharset.h.
{ for (int i = 0; i < size_used; ++i) { if (unichars[i].properties.fragment != NULL) { delete unichars[i].properties.fragment; unichars[i].properties.fragment = NULL; } } }
bool UNICHARSET::encodable_string | ( | const char * | str, |
int * | first_bad_position | ||
) | const |
Definition at line 244 of file unicharset.cpp.
{ GenericVector<UNICHAR_ID> encoding; return encode_string(str, true, &encoding, NULL, first_bad_position); }
bool UNICHARSET::encode_string | ( | const char * | str, |
bool | give_up_on_failure, | ||
GenericVector< UNICHAR_ID > * | encoding, | ||
GenericVector< char > * | lengths, | ||
int * | encoded_length | ||
) | const |
Definition at line 256 of file unicharset.cpp.
{
  GenericVector<UNICHAR_ID> working_encoding;
  GenericVector<char> working_lengths;
  GenericVector<char> best_lengths;
  encoding->truncate(0);  // Just in case str is empty.
  int str_length = strlen(str);
  int str_pos = 0;
  bool perfect = true;
  while (str_pos < str_length) {
    encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
                  &str_pos, encoding, &best_lengths);
    if (str_pos < str_length) {
      // This is a non-match. Skip one utf-8 character.
      perfect = false;
      if (give_up_on_failure) break;
      int step = UNICHAR::utf8_step(str + str_pos);
      if (step == 0) step = 1;
      encoding->push_back(INVALID_UNICHAR_ID);
      best_lengths.push_back(step);
      str_pos += step;
      working_encoding = *encoding;
      working_lengths = best_lengths;
    }
  }
  if (lengths != NULL) *lengths = best_lengths;
  if (encoded_length != NULL) *encoded_length = str_pos;
  return perfect;
}
bool UNICHARSET::eq | ( | UNICHAR_ID | unichar_id, |
const char *const | unichar_repr | ||
) | const |
Definition at line 687 of file unicharset.cpp.
{ return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0; }
void UNICHARSET::ExpandRangesFromOther | ( | const UNICHARSET & | src | ) |
Definition at line 440 of file unicharset.cpp.
{
  for (int ch = 0; ch < size_used; ++ch) {
    const char* utf8 = id_to_unichar(ch);
    UNICHAR_PROPERTIES properties;
    if (src.GetStrProperties(utf8, &properties)) {
      // Expand just the ranges from properties.
      unichars[ch].properties.ExpandRangesFrom(properties);
    }
  }
}
void UNICHARSET::get_advance_range | ( | UNICHAR_ID | unichar_id, |
int * | min_advance, | ||
int * | max_advance | ||
) | const [inline] |
Definition at line 582 of file unicharset.h.
{ if (INVALID_UNICHAR_ID == unichar_id) { *min_advance = *max_advance = 0; return; } ASSERT_HOST(contains_unichar_id(unichar_id)); *min_advance = unichars[unichar_id].properties.min_advance; *max_advance = unichars[unichar_id].properties.max_advance; }
void UNICHARSET::get_bearing_range | ( | UNICHAR_ID | unichar_id, |
int * | min_bearing, | ||
int * | max_bearing | ||
) | const [inline] |
Definition at line 561 of file unicharset.h.
{ if (INVALID_UNICHAR_ID == unichar_id) { *min_bearing = *max_bearing = 0; return; } ASSERT_HOST(contains_unichar_id(unichar_id)); *min_bearing = unichars[unichar_id].properties.min_bearing; *max_bearing = unichars[unichar_id].properties.max_bearing; }
char UNICHARSET::get_chartype | ( | UNICHAR_ID | unichar_id | ) | const |
Definition at line 634 of file unicharset.cpp.
{ if (this->get_isupper(id)) return 'A'; if (this->get_islower(id)) return 'a'; if (this->get_isalpha(id)) return 'x'; if (this->get_isdigit(id)) return '0'; if (this->get_ispunctuation(id)) return 'p'; return 0; }
char UNICHARSET::get_chartype | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 707 of file unicharset.h.
{ return get_chartype(unichar_to_id(unichar_repr)); }
Direction UNICHARSET::get_direction | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 630 of file unicharset.h.
{ if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL; ASSERT_HOST(contains_unichar_id(unichar_id)); return unichars[unichar_id].properties.direction; }
bool UNICHARSET::get_enabled | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 813 of file unicharset.h.
{
return unichars[unichar_id].properties.enabled;
}
const CHAR_FRAGMENT* UNICHARSET::get_fragment | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 670 of file unicharset.h.
{ if (INVALID_UNICHAR_ID == unichar_id) return NULL; ASSERT_HOST(contains_unichar_id(unichar_id)); return unichars[unichar_id].properties.fragment; }
const CHAR_FRAGMENT* UNICHARSET::get_fragment | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 720 of file unicharset.h.
{ if (unichar_repr == NULL || unichar_repr[0] == '\0' || !ids.contains(unichar_repr)) { return NULL; } return get_fragment(unichar_to_id(unichar_repr)); }
bool UNICHARSET::get_isalpha | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 433 of file unicharset.h.
{ if (INVALID_UNICHAR_ID == unichar_id) return false; ASSERT_HOST(contains_unichar_id(unichar_id)); return unichars[unichar_id].properties.isalpha; }
bool UNICHARSET::get_isalpha | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 677 of file unicharset.h.
{ return get_isalpha(unichar_to_id(unichar_repr)); }
bool UNICHARSET::get_isalpha | ( | const char *const | unichar_repr, |
int | length | ||
) | const [inline] |
Definition at line 730 of file unicharset.h.
{ return get_isalpha(unichar_to_id(unichar_repr, length)); }
bool UNICHARSET::get_isdigit | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 454 of file unicharset.h.
{ if (INVALID_UNICHAR_ID == unichar_id) return false; ASSERT_HOST(contains_unichar_id(unichar_id)); return unichars[unichar_id].properties.isdigit; }
bool UNICHARSET::get_isdigit | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 692 of file unicharset.h.
{ return get_isdigit(unichar_to_id(unichar_repr)); }
bool UNICHARSET::get_isdigit | ( | const char *const | unichar_repr, |
int | length | ||
) | const [inline] |
Definition at line 751 of file unicharset.h.
{ return get_isdigit(unichar_to_id(unichar_repr, length)); }
bool UNICHARSET::get_islower | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 440 of file unicharset.h.
{ if (INVALID_UNICHAR_ID == unichar_id) return false; ASSERT_HOST(contains_unichar_id(unichar_id)); return unichars[unichar_id].properties.islower; }
bool UNICHARSET::get_islower | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 682 of file unicharset.h.
{ return get_islower(unichar_to_id(unichar_repr)); }
bool UNICHARSET::get_islower | ( | const char *const | unichar_repr, |
int | length | ||
) | const [inline] |
Definition at line 737 of file unicharset.h.
{ return get_islower(unichar_to_id(unichar_repr, length)); }
bool UNICHARSET::get_isngram | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 468 of file unicharset.h.
{ if (INVALID_UNICHAR_ID == unichar_id) return false; ASSERT_HOST(contains_unichar_id(unichar_id)); return unichars[unichar_id].properties.isngram; }
bool UNICHARSET::get_isprivate | ( | UNICHAR_ID | unichar_id | ) | const |
Definition at line 391 of file unicharset.cpp.
{ UNICHAR uc(id_to_unichar(unichar_id), -1); int uni = uc.first_uni(); return (uni >= 0xE000 && uni <= 0xF8FF); }
bool UNICHARSET::get_ispunctuation | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 461 of file unicharset.h.
{ if (INVALID_UNICHAR_ID == unichar_id) return false; ASSERT_HOST(contains_unichar_id(unichar_id)); return unichars[unichar_id].properties.ispunctuation; }
bool UNICHARSET::get_ispunctuation | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 697 of file unicharset.h.
{ return get_ispunctuation(unichar_to_id(unichar_repr)); }
bool UNICHARSET::get_ispunctuation | ( | const char *const | unichar_repr, |
int | length | ||
) | const [inline] |
Definition at line 758 of file unicharset.h.
{ return get_ispunctuation(unichar_to_id(unichar_repr, length)); }
bool UNICHARSET::get_isupper | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 447 of file unicharset.h.
{ if (INVALID_UNICHAR_ID == unichar_id) return false; ASSERT_HOST(contains_unichar_id(unichar_id)); return unichars[unichar_id].properties.isupper; }
bool UNICHARSET::get_isupper | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 687 of file unicharset.h.
{ return get_isupper(unichar_to_id(unichar_repr)); }
bool UNICHARSET::get_isupper | ( | const char *const | unichar_repr, |
int | length | ||
) | const [inline] |
Definition at line 744 of file unicharset.h.
{ return get_isupper(unichar_to_id(unichar_repr, length)); }
UNICHAR_ID UNICHARSET::get_mirror | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 637 of file unicharset.h.
{ if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; ASSERT_HOST(contains_unichar_id(unichar_id)); return unichars[unichar_id].properties.mirror; }
const char* UNICHARSET::get_normed_unichar | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 764 of file unicharset.h.
{
return unichars[unichar_id].properties.normed.string();
}
UNICHAR_ID UNICHARSET::get_other_case | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 623 of file unicharset.h.
{ if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID; ASSERT_HOST(contains_unichar_id(unichar_id)); return unichars[unichar_id].properties.other_case; }
unsigned int UNICHARSET::get_properties | ( | UNICHAR_ID | unichar_id | ) | const |
Definition at line 619 of file unicharset.cpp.
{ unsigned int properties = 0; if (this->get_isalpha(id)) properties |= ISALPHA_MASK; if (this->get_islower(id)) properties |= ISLOWER_MASK; if (this->get_isupper(id)) properties |= ISUPPER_MASK; if (this->get_isdigit(id)) properties |= ISDIGIT_MASK; if (this->get_ispunctuation(id)) properties |= ISPUNCTUATION_MASK; return properties; }
unsigned int UNICHARSET::get_properties | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 703 of file unicharset.h.
{ return get_properties(unichar_to_id(unichar_repr)); }
int UNICHARSET::get_script | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 603 of file unicharset.h.
{ if (INVALID_UNICHAR_ID == unichar_id) return null_sid_; ASSERT_HOST(contains_unichar_id(unichar_id)); return unichars[unichar_id].properties.script_id; }
int UNICHARSET::get_script | ( | const char *const | unichar_repr | ) | const [inline] |
Definition at line 714 of file unicharset.h.
{ return get_script(unichar_to_id(unichar_repr)); }
int UNICHARSET::get_script | ( | const char *const | unichar_repr, |
int | length | ||
) | const [inline] |
Definition at line 778 of file unicharset.h.
{ return get_script(unichar_to_id(unichar_repr, length)); }
const char* UNICHARSET::get_script_from_script_id | ( | int | id | ) | const [inline] |
Definition at line 789 of file unicharset.h.
{ if (id >= script_table_size_used || id < 0) return null_script; return script_table[id]; }
int UNICHARSET::get_script_id_from_name | ( | const char * | script_name | ) | const |
Definition at line 1066 of file unicharset.cpp.
{
  for (int i = 0; i < script_table_size_used; ++i) {
    if (strcmp(script_name, script_table[i]) == 0)
      return i;
  }
  return 0;  // 0 is always the null_script
}
int UNICHARSET::get_script_table_size | ( | ) | const [inline] |
Definition at line 784 of file unicharset.h.
{
return script_table_size_used;
}
void UNICHARSET::get_top_bottom | ( | UNICHAR_ID | unichar_id, |
int * | min_bottom, | ||
int * | max_bottom, | ||
int * | min_top, | ||
int * | max_top | ||
) | const [inline] |
Definition at line 510 of file unicharset.h.
{
  if (INVALID_UNICHAR_ID == unichar_id) {
    *min_bottom = *min_top = 0;
    *max_bottom = *max_top = 256;  // kBlnCellHeight
    return;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  *min_bottom = unichars[unichar_id].properties.min_bottom;
  *max_bottom = unichars[unichar_id].properties.max_bottom;
  *min_top = unichars[unichar_id].properties.min_top;
  *max_top = unichars[unichar_id].properties.max_top;
}
void UNICHARSET::get_width_range | ( | UNICHAR_ID | unichar_id, |
int * | min_width, | ||
int * | max_width | ||
) | const [inline] |
Definition at line 540 of file unicharset.h.
{
  if (INVALID_UNICHAR_ID == unichar_id) {
    *min_width = 0;
    *max_width = 256;  // kBlnCellHeight;
    return;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  *min_width = unichars[unichar_id].properties.min_width;
  *max_width = unichars[unichar_id].properties.max_width;
}
int UNICHARSET::greek_sid | ( | ) | const [inline] |
Definition at line 822 of file unicharset.h.
{ return greek_sid_; }
int UNICHARSET::han_sid | ( | ) | const [inline] |
Definition at line 823 of file unicharset.h.
{ return han_sid_; }
bool UNICHARSET::has_special_codes | ( | ) | const [inline] |
Definition at line 662 of file unicharset.h.
{ return get_fragment(UNICHAR_BROKEN) != NULL && strcmp(id_to_unichar(UNICHAR_BROKEN), kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0; }
int UNICHARSET::hiragana_sid | ( | ) | const [inline] |
Definition at line 824 of file unicharset.h.
{ return hiragana_sid_; }
const char *const UNICHARSET::id_to_unichar | ( | UNICHAR_ID | id | ) | const |
Definition at line 288 of file unicharset.cpp.
{ if (id == INVALID_UNICHAR_ID) { return INVALID_UNICHAR; } ASSERT_HOST(id < this->size()); return unichars[id].representation; }
const char *const UNICHARSET::id_to_unichar_ext | ( | UNICHAR_ID | id | ) | const |
Definition at line 296 of file unicharset.cpp.
{
  if (id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR;
  }
  ASSERT_HOST(id < this->size());
  // Resolve from the kCustomLigatures table if this is a private encoding.
  if (get_isprivate(id)) {
    const char* ch = id_to_unichar(id);
    for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) {
      if (!strcmp(ch, kCustomLigatures[i][1])) {
        return kCustomLigatures[i][0];
      }
    }
  }
  // Otherwise return the stored representation.
  return unichars[id].representation;
}
bool UNICHARSET::is_null_script | ( | const char * | script | ) | const [inline] |
Definition at line 803 of file unicharset.h.
{
return script == null_script;
}
int UNICHARSET::katakana_sid | ( | ) | const [inline] |
Definition at line 825 of file unicharset.h.
{ return katakana_sid_; }
int UNICHARSET::latin_sid | ( | ) | const [inline] |
Definition at line 820 of file unicharset.h.
{ return latin_sid_; }
bool UNICHARSET::load_from_file | ( | const char *const | filename, |
bool | skip_fragments | ||
) | [inline] |
Definition at line 335 of file unicharset.h.
{ FILE* file = fopen(filename, "rb"); if (file == NULL) return false; bool result = load_from_file(file, skip_fragments); fclose(file); return result; }
bool UNICHARSET::load_from_file | ( | const char *const | filename | ) | [inline] |
Definition at line 343 of file unicharset.h.
{ return load_from_file(filename, false); }
bool UNICHARSET::load_from_file | ( | FILE * | file, |
bool | skip_fragments | ||
) |
Definition at line 770 of file unicharset.cpp.
{ LocalFilePointer lfp(file); TessResultCallback2<char *, char *, int> *fgets_cb = NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets); bool success = load_via_fgets(fgets_cb, skip_fragments); delete fgets_cb; return success; }
bool UNICHARSET::load_from_file | ( | FILE * | file | ) | [inline] |
Definition at line 350 of file unicharset.h.
{ return load_from_file(file, false); }
bool UNICHARSET::load_from_inmemory_file | ( | const char *const | memory, |
int | mem_size, | ||
bool | skip_fragments | ||
) |
Definition at line 750 of file unicharset.cpp.
{ InMemoryFilePointer mem_fp(memory, mem_size); TessResultCallback2<char *, char *, int> *fgets_cb = NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets); bool success = load_via_fgets(fgets_cb, skip_fragments); delete fgets_cb; return success; }
bool UNICHARSET::load_from_inmemory_file | ( | const char *const | memory, |
int | mem_size | ||
) | [inline] |
Definition at line 328 of file unicharset.h.
{ return load_from_inmemory_file(memory, mem_size, false); }
bool UNICHARSET::major_right_to_left | ( | ) | const |
Definition at line 946 of file unicharset.cpp.
{ int ltr_count = 0; int rtl_count = 0; for (int id = 0; id < size_used; ++id) { int dir = get_direction(id); if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++; if (dir == UNICHARSET::U_RIGHT_TO_LEFT || dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC || dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++; } return rtl_count > ltr_count; }
const GenericVector<UNICHAR_ID>& UNICHARSET::normed_ids | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 770 of file unicharset.h.
{
return unichars[unichar_id].properties.normed_ids;
}
int UNICHARSET::normed_step | ( | const char * | str | ) | const |
Definition at line 216 of file unicharset.cpp.
{
  // Find the length of the first matching unicharset member.
  int length = ids.minmatch(str);
  if (length == 0)
    return 0;  // Empty string or illegal char.
  while (length <= UNICHAR_LEN) {
    if (ids.contains(str, length)) {
      int matched_id = unichar_to_id(str, length);
      const GenericVector<UNICHAR_ID>& matched_norms = normed_ids(matched_id);
      bool good_start = matched_norms.size() == 1 &&
          matched_norms[0] == matched_id;
      if (str[length] == '\0') {
        return good_start ? length : 0;
      }
      if (normed_step(str + length) > 0)
        return length;  // This length works!
    } else if (str[length] == '\0') {
      return 0;  // Ran out of string.
    }
    ++length;
  }
  return 0;
}
int UNICHARSET::null_sid | ( | ) | const [inline] |
Definition at line 818 of file unicharset.h.
{ return null_sid_; }
void UNICHARSET::PartialSetPropertiesFromOther | ( | int | start_index, |
const UNICHARSET & | src | ||
) |
Definition at line 408 of file unicharset.cpp.
{
  for (int ch = start_index; ch < size_used; ++ch) {
    const char* utf8 = id_to_unichar(ch);
    UNICHAR_PROPERTIES properties;
    if (src.GetStrProperties(utf8, &properties)) {
      // Setup the script_id, other_case, and mirror properly.
      const char* script = src.get_script_from_script_id(properties.script_id);
      properties.script_id = add_script(script);
      const char* other_case = src.id_to_unichar(properties.other_case);
      if (contains_unichar(other_case)) {
        properties.other_case = unichar_to_id(other_case);
      } else {
        properties.other_case = ch;
      }
      const char* mirror_str = src.id_to_unichar(properties.mirror);
      if (contains_unichar(mirror_str)) {
        properties.mirror = unichar_to_id(mirror_str);
      } else {
        properties.mirror = ch;
      }
      unichars[ch].properties.CopyFrom(properties);
      set_normed_ids(ch);
    } else {
      tprintf("Failed to get properties for index %d = %s\n", ch, utf8);
    }
  }
}
void UNICHARSET::post_load_setup | ( | ) |
Definition at line 882 of file unicharset.cpp.
{
  // Recomputes the summary state derived from the per-unichar properties:
  // top_bottom_set_, script_has_upper_lower_, script_has_xheight_, the
  // well-known script ids, default_sid_, and every unichar's normed_ids.

  // Number of alpha chars with the case property minus those without,
  // in order to determine that half the alpha chars have case.
  int net_case_alphas = 0;
  int x_height_alphas = 0;
  int cap_height_alphas = 0;
  top_bottom_set_ = false;
  for (UNICHAR_ID id = 0; id < size_used; ++id) {
    int min_bottom = 0;
    int max_bottom = MAX_UINT8;
    int min_top = 0;
    int max_top = MAX_UINT8;
    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
    // Any non-zero min_top marks the top/bottom data as set.
    if (min_top > 0)
      top_bottom_set_ = true;
    if (get_isalpha(id)) {
      if (get_islower(id) || get_isupper(id))
        ++net_case_alphas;
      else
        --net_case_alphas;
      // Count alphas whose whole top range lies below / above the meanline
      // threshold.
      if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
        ++x_height_alphas;
      else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
        ++cap_height_alphas;
    }
    set_normed_ids(id);
  }

  script_has_upper_lower_ = net_case_alphas > 0;
  script_has_xheight_ = script_has_upper_lower_ ||
      (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
       cap_height_alphas > x_height_alphas * kMinCapHeightFraction);

  // Look up the script ids of the scripts referred to by name.
  null_sid_ = get_script_id_from_name(null_script);
  ASSERT_HOST(null_sid_ == 0);
  common_sid_ = get_script_id_from_name("Common");
  latin_sid_ = get_script_id_from_name("Latin");
  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
  greek_sid_ = get_script_id_from_name("Greek");
  han_sid_ = get_script_id_from_name("Han");
  hiragana_sid_ = get_script_id_from_name("Hiragana");
  katakana_sid_ = get_script_id_from_name("Katakana");

  // Compute default script. Use the highest-counting alpha script, that is
  // not the common script, as that still contains some "alphas".
  int* script_counts = new int[script_table_size_used];
  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
  for (int id = 0; id < size_used; ++id) {
    if (get_isalpha(id)) {
      ++script_counts[get_script(id)];
    }
  }
  default_sid_ = 0;
  // Strict '>' means ties keep the lower script id.
  for (int s = 1; s < script_table_size_used; ++s) {
    if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
      default_sid_ = s;
  }
  delete [] script_counts;
}
void UNICHARSET::reserve | ( | int | unichars_number | ) |
Definition at line 173 of file unicharset.cpp.
{
  // Grows the slot array to hold unichars_number entries. A request that
  // does not exceed the current capacity leaves everything untouched.
  if (unichars_number <= size_reserved) return;

  UNICHAR_SLOT* grown = new UNICHAR_SLOT[unichars_number];

  // Carry over the slots already in use...
  int slot = 0;
  for (; slot < size_used; ++slot) {
    grown[slot] = unichars[slot];
  }
  // ...and give every spare slot the null script.
  for (; slot < unichars_number; ++slot) {
    grown[slot].properties.script_id = add_script(null_script);
  }

  delete[] unichars;
  unichars = grown;
  size_reserved = unichars_number;
}
bool UNICHARSET::save_to_file | ( | const char *const | filename | ) | const [inline] |
Definition at line 310 of file unicharset.h.
{
  // Opens filename with mode "w+b", writes the unicharset to it through the
  // FILE* overload and closes it again.
  // Returns false if the file cannot be opened, if the FILE* overload
  // reports failure, or if closing the file fails.
  FILE* file = fopen(filename, "w+b");
  if (file == NULL) return false;
  bool result = save_to_file(file);
  // fclose() performs the final flush of buffered output; if it fails the
  // data on disk may be incomplete, so that must not be reported as success.
  if (fclose(file) != 0) result = false;
  return result;
}
bool UNICHARSET::save_to_file | ( | FILE * | file | ) | const |
Definition at line 692 of file unicharset.cpp.
{
  // Writes the unicharset to file as text: a first line holding the number
  // of unichars, then one line per unichar. The unichar whose representation
  // is " " is written in a short form under the name "NULL"; every other
  // unichar gets its properties, metric ranges, script, other case,
  // direction, mirror, normed form and a trailing "# debug" comment.
  // Returns false as soon as any fprintf() reports an output error, true
  // once every line has been written.
  if (fprintf(file, "%d\n", this->size()) < 0) return false;
  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
    int min_bottom, max_bottom, min_top, max_top;
    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
    int min_width, max_width;
    get_width_range(id, &min_width, &max_width);
    int min_bearing, max_bearing;
    get_bearing_range(id, &min_bearing, &max_bearing);
    int min_advance, max_advance;
    get_advance_range(id, &min_advance, &max_advance);
    unsigned int properties = this->get_properties(id);
    int written;  // fprintf() result; negative on output error.
    if (strcmp(this->id_to_unichar(id), " ") == 0) {
      written = fprintf(file, "%s %x %s %d\n", "NULL", properties,
                        this->get_script_from_script_id(this->get_script(id)),
                        this->get_other_case(id));
    } else {
      written = fprintf(
          file,
          "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n",
          this->id_to_unichar(id), properties,
          min_bottom, max_bottom, min_top, max_top, min_width, max_width,
          min_bearing, max_bearing, min_advance, max_advance,
          this->get_script_from_script_id(this->get_script(id)),
          this->get_other_case(id), this->get_direction(id),
          this->get_mirror(id), this->get_normed_unichar(id),
          this->debug_str(id).string());
    }
    if (written < 0) return false;
  }
  return true;
}
bool UNICHARSET::script_has_upper_lower | ( | ) | const [inline] |
Definition at line 829 of file unicharset.h.
{
  // Returns the flag stored in script_has_upper_lower_.
  return script_has_upper_lower_;
}
bool UNICHARSET::script_has_xheight | ( | ) | const [inline] |
Definition at line 836 of file unicharset.h.
{
  // Returns the flag stored in script_has_xheight_.
  return script_has_xheight_;
}
void UNICHARSET::set_advance_range | ( | UNICHAR_ID | unichar_id, |
int | min_advance, | ||
int | max_advance | ||
) | [inline] |
Definition at line 592 of file unicharset.h.
{
  // Store the advance range, clamping each bound into [0, MAX_INT16] before
  // narrowing it to inT16.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.min_advance =
      static_cast<inT16>(ClipToRange(min_advance, 0, MAX_INT16));
  props.max_advance =
      static_cast<inT16>(ClipToRange(max_advance, 0, MAX_INT16));
}
void UNICHARSET::set_bearing_range | ( | UNICHAR_ID | unichar_id, |
int | min_bearing, | ||
int | max_bearing | ||
) | [inline] |
Definition at line 571 of file unicharset.h.
{
  // Store the bearing range, clamping each bound into [0, MAX_INT16] before
  // narrowing it to inT16.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.min_bearing =
      static_cast<inT16>(ClipToRange(min_bearing, 0, MAX_INT16));
  props.max_bearing =
      static_cast<inT16>(ClipToRange(max_bearing, 0, MAX_INT16));
}
void UNICHARSET::set_black_and_whitelist | ( | const char * | blacklist, |
const char * | whitelist | ||
) |
Definition at line 962 of file unicharset.cpp.
{
  // With no whitelist every unichar starts enabled; with a whitelist every
  // unichar starts disabled and only the whitelisted ones are switched on.
  // The blacklist is applied last, so it wins over the whitelist.
  const bool have_whitelist = whitelist != NULL && whitelist[0] != '\0';
  const bool have_blacklist = blacklist != NULL && blacklist[0] != '\0';

  // Set everything to default.
  const bool default_state = !have_whitelist;
  for (int ch = 0; ch < size_used; ++ch) {
    unichars[ch].properties.enabled = default_state;
  }

  if (have_whitelist) {
    // Enable the whitelist.
    GenericVector<UNICHAR_ID> allowed;
    encode_string(whitelist, false, &allowed, NULL, NULL);
    for (int i = 0; i < allowed.size(); ++i) {
      if (allowed[i] == INVALID_UNICHAR_ID) continue;
      unichars[allowed[i]].properties.enabled = true;
    }
  }

  if (have_blacklist) {
    // Disable the blacklist.
    GenericVector<UNICHAR_ID> banned;
    encode_string(blacklist, false, &banned, NULL, NULL);
    for (int i = 0; i < banned.size(); ++i) {
      if (banned[i] == INVALID_UNICHAR_ID) continue;
      unichars[banned[i]].properties.enabled = false;
    }
  }
}
void UNICHARSET::set_direction | ( | UNICHAR_ID | unichar_id, |
UNICHARSET::Direction | value | ||
) | [inline] |
Definition at line 414 of file unicharset.h.
{
  // Record the direction in this unichar's property record.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.direction = value;
}
void UNICHARSET::set_isalpha | ( | UNICHAR_ID | unichar_id, |
bool | value | ||
) | [inline] |
Definition at line 373 of file unicharset.h.
{
  // Record the isalpha flag in this unichar's property record.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.isalpha = value;
}
void UNICHARSET::set_isdigit | ( | UNICHAR_ID | unichar_id, |
bool | value | ||
) | [inline] |
Definition at line 388 of file unicharset.h.
{
  // Record the isdigit flag in this unichar's property record.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.isdigit = value;
}
void UNICHARSET::set_islower | ( | UNICHAR_ID | unichar_id, |
bool | value | ||
) | [inline] |
Definition at line 378 of file unicharset.h.
{
  // Record the islower flag in this unichar's property record.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.islower = value;
}
void UNICHARSET::set_isngram | ( | UNICHAR_ID | unichar_id, |
bool | value | ||
) | [inline] |
Definition at line 398 of file unicharset.h.
{
  // Record the isngram flag in this unichar's property record.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.isngram = value;
}
void UNICHARSET::set_ispunctuation | ( | UNICHAR_ID | unichar_id, |
bool | value | ||
) | [inline] |
Definition at line 393 of file unicharset.h.
{
  // Record the ispunctuation flag in this unichar's property record.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.ispunctuation = value;
}
void UNICHARSET::set_isupper | ( | UNICHAR_ID | unichar_id, |
bool | value | ||
) | [inline] |
Definition at line 383 of file unicharset.h.
{
  // Record the isupper flag in this unichar's property record.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.isupper = value;
}
void UNICHARSET::set_mirror | ( | UNICHAR_ID | unichar_id, |
UNICHAR_ID | mirror | ||
) | [inline] |
Definition at line 419 of file unicharset.h.
{
  // Record the id of the mirror unichar in this unichar's property record.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.mirror = mirror;
}
void UNICHARSET::set_normed | ( | UNICHAR_ID | unichar_id, |
const char * | normed | ||
) | [inline] |
Definition at line 424 of file unicharset.h.
{
  // Store the normalized string and drop any previously computed normed_ids,
  // which no longer correspond to it.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.normed = normed;
  props.normed_ids.truncate(0);
}
void UNICHARSET::set_normed_ids | ( | UNICHAR_ID | unichar_id | ) |
Definition at line 370 of file unicharset.cpp.
{
  // Rebuilds normed_ids for unichar_id by splitting its normed string into
  // unichars with normed_step(). If any part cannot be split, normed_ids
  // becomes just {unichar_id}.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  GenericVector<UNICHAR_ID>& normed_out = props.normed_ids;
  normed_out.truncate(0);

  const int length = props.normed.length();
  const char* normed_str = props.normed.string();

  int offset = 0;
  while (offset < length) {
    const int advance = normed_step(normed_str + offset);
    if (advance == 0) {
      // Not decomposable: fall back to the unichar standing for itself.
      normed_out.truncate(0);
      normed_out.push_back(unichar_id);
      break;
    }
    int normed_id = unichar_to_id(normed_str + offset, advance);
    ASSERT_HOST(normed_id >= 0);
    normed_out.push_back(normed_id);
    offset += advance;
  }
}
void UNICHARSET::set_other_case | ( | UNICHAR_ID | unichar_id, |
UNICHAR_ID | other_case | ||
) | [inline] |
Definition at line 409 of file unicharset.h.
{
  // Record the id of the other-case unichar in this unichar's property record.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.other_case = other_case;
}
void UNICHARSET::set_ranges_empty | ( | ) |
Definition at line 399 of file unicharset.cpp.
{
  // Call SetRangesEmpty() on the properties of every unichar in use.
  int id = 0;
  while (id < size_used) {
    unichars[id].properties.SetRangesEmpty();
    ++id;
  }
}
void UNICHARSET::set_script | ( | UNICHAR_ID | unichar_id, |
const char * | value | ||
) | [inline] |
Definition at line 404 of file unicharset.h.
{
  // Pass the script name through add_script() and store the id it returns.
  const int script_id = add_script(value);
  unichars[unichar_id].properties.script_id = script_id;
}
void UNICHARSET::set_top_bottom | ( | UNICHAR_ID | unichar_id, |
int | min_bottom, | ||
int | max_bottom, | ||
int | min_top, | ||
int | max_top | ||
) | [inline] |
Definition at line 524 of file unicharset.h.
{
  // Store the bottom and top ranges, clamping each bound into
  // [0, MAX_UINT8] before narrowing it to uinT8.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.min_bottom =
      static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
  props.max_bottom =
      static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
  props.min_top =
      static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
  props.max_top =
      static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
}
void UNICHARSET::set_width_range | ( | UNICHAR_ID | unichar_id, |
int | min_width, | ||
int | max_width | ||
) | [inline] |
Definition at line 551 of file unicharset.h.
{
  // Store the width range, clamping each bound into [0, MAX_INT16] before
  // narrowing it to inT16.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.min_width =
      static_cast<inT16>(ClipToRange(min_width, 0, MAX_INT16));
  props.max_width =
      static_cast<inT16>(ClipToRange(max_width, 0, MAX_INT16));
}
void UNICHARSET::SetPropertiesFromOther | ( | const UNICHARSET & | src | ) | [inline] |
Definition at line 487 of file unicharset.h.
{
  // Delegates to PartialSetPropertiesFromOther with start_index 0, so the
  // properties of every unichar in this set are taken from src.
  PartialSetPropertiesFromOther(0, src);
}
int UNICHARSET::size | ( | ) | const [inline] |
Definition at line 301 of file unicharset.h.
{
  // Number of unichar slots currently in use (size_used), not the reserved
  // capacity.
  return size_used;
}
bool UNICHARSET::SizesDistinct | ( | UNICHAR_ID | id1, |
UNICHAR_ID | id2 | ||
) | const |
Definition at line 494 of file unicharset.cpp.
{
  // Two unichars are size-distinct when their [min_top, max_top] ranges do
  // not overlap, i.e. the smaller max_top does not exceed the larger min_top.
  const UNICHAR_PROPERTIES& first = unichars[id1].properties;
  const UNICHAR_PROPERTIES& second = unichars[id2].properties;
  const int lowest_max_top = MIN(first.max_top, second.max_top);
  const int highest_min_top = MAX(first.min_top, second.min_top);
  return lowest_max_top - highest_min_top <= 0;
}
int UNICHARSET::step | ( | const char * | str | ) | const |
Definition at line 205 of file unicharset.cpp.
{
  // Encode str (giving up on failure) and report the byte length of its
  // first unichar, or 0 if nothing valid was encoded.
  GenericVector<UNICHAR_ID> encoding;
  GenericVector<char> lengths;
  encode_string(str, true, &encoding, &lengths, NULL);

  const bool has_first =
      !encoding.empty() && encoding[0] != INVALID_UNICHAR_ID;
  return has_first ? lengths[0] : 0;
}
UNICHAR_ID UNICHARSET::to_lower | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 644 of file unicharset.h.
{
  // INVALID_UNICHAR_ID passes straight through; any other id must belong to
  // this set. A unichar flagged islower is returned as is, otherwise its
  // other_case is returned.
  if (unichar_id == INVALID_UNICHAR_ID) return INVALID_UNICHAR_ID;
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.islower ? unichar_id : props.other_case;
}
UNICHAR_ID UNICHARSET::to_upper | ( | UNICHAR_ID | unichar_id | ) | const [inline] |
Definition at line 652 of file unicharset.h.
{
  // INVALID_UNICHAR_ID passes straight through; any other id must belong to
  // this set. A unichar flagged isupper is returned as is, otherwise its
  // other_case is returned.
  if (unichar_id == INVALID_UNICHAR_ID) return INVALID_UNICHAR_ID;
  ASSERT_HOST(contains_unichar_id(unichar_id));
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.isupper ? unichar_id : props.other_case;
}
bool UNICHARSET::top_bottom_useful | ( | ) | const [inline] |
Definition at line 479 of file unicharset.h.
{
  // Returns the flag stored in top_bottom_set_.
  return top_bottom_set_;
}
void UNICHARSET::unichar_insert | ( | const char *const | unichar_repr | ) |
Definition at line 643 of file unicharset.cpp.
{
  // Adds unichar_repr to the set unless it is already present. A
  // representation longer than UNICHAR_LEN bytes is reported on stderr and
  // not inserted.
  if (ids.contains(unichar_repr)) return;

  const size_t repr_len = strlen(unichar_repr);
  if (repr_len > UNICHAR_LEN) {
    fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
            int(repr_len), unichar_repr);
    return;
  }

  // Grow when full: 8 slots to start with, doubling afterwards.
  if (size_used == size_reserved) {
    reserve(size_used == 0 ? 8 : 2 * size_used);
  }

  strcpy(unichars[size_used].representation, unichar_repr);
  this->set_script(size_used, null_script);

  // If the given unichar_repr represents a fragmented character, set
  // fragment property to a pointer to CHAR_FRAGMENT class instance with
  // information parsed from the unichar representation. Use the script
  // of the base unichar for the fragmented character if possible.
  CHAR_FRAGMENT* frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
  this->unichars[size_used].properties.fragment = frag;
  const bool base_known =
      frag != NULL && this->contains_unichar(frag->get_unichar());
  if (base_known) {
    this->unichars[size_used].properties.script_id =
        this->get_script(frag->get_unichar());
  }

  this->unichars[size_used].properties.enabled = true;
  ids.insert(unichar_repr, size_used);
  ++size_used;
}
const UNICHAR_ID UNICHARSET::unichar_to_id | ( | const char *const | unichar_repr | ) | const |
Definition at line 188 of file unicharset.cpp.
{
  // Unknown representations map to INVALID_UNICHAR_ID.
  if (!ids.contains(unichar_repr)) return INVALID_UNICHAR_ID;
  return ids.unichar_to_id(unichar_repr);
}
const UNICHAR_ID UNICHARSET::unichar_to_id | ( | const char *const | unichar_repr, |
int | length | ||
) | const |
Definition at line 193 of file unicharset.cpp.
{
  // length must be in 1..UNICHAR_LEN. Unknown representations map to
  // INVALID_UNICHAR_ID.
  assert(length > 0 && length <= UNICHAR_LEN);
  if (!ids.contains(unichar_repr, length)) return INVALID_UNICHAR_ID;
  return ids.unichar_to_id(unichar_repr, length);
}
const char * UNICHARSET::kCustomLigatures [static] |
{
  // Each row pairs a character sequence with a single code point in the
  // U+E0xx range; the list ends with a {NULL, NULL} sentinel row.
  {"ct", "\uE003"},  // c + t -> U+E003
  {"ſh", "\uE006"},  // long-s + h -> U+E006
  {"ſi", "\uE007"},  // long-s + i -> U+E007
  {"ſl", "\uE008"},  // long-s + l -> U+E008
  {"ſſ", "\uE009"},  // long-s + long-s -> U+E009
  {NULL, NULL}
}
Definition at line 143 of file unicharset.h.
const char * UNICHARSET::kSpecialUnicharCodes [static] |
{
  " ",           // The representation that save_to_file() writes as "NULL".
  "Joined",
  "|Broken|0|1"  // NOTE(review): looks like a character-fragment style
                 // representation -- confirm against CHAR_FRAGMENT.
}
Definition at line 146 of file unicharset.h.