tesseract
3.03
|
00001 // This program reads a unicharset file, puts the result in a UNICHARSET 00002 // object, fills it with properties about the unichars it contains and writes 00003 // the result back to a file. 00004 00005 #include <stdlib.h> 00006 #include <string.h> 00007 #include <string> 00008 00009 #include "commandlineflags.h" 00010 #include "fileio.h" 00011 #include "genericvector.h" 00012 #include "icuerrorcode.h" 00013 #include "normstrngs.h" 00014 #include "strngs.h" 00015 #include "unicharset.h" 00016 #include "unicode/uchar.h" // from libicu 00017 #include "unicode/uscript.h" // from libicu 00018 00019 // The directory that is searched for universal script unicharsets. 00020 STRING_PARAM_FLAG(script_dir, "", 00021 "Directory name for input script unicharsets/xheights"); 00022 00023 // Flags from commontraining.cpp 00024 DECLARE_STRING_PARAM_FLAG(U); 00025 DECLARE_STRING_PARAM_FLAG(O); 00026 DECLARE_STRING_PARAM_FLAG(X); 00027 00028 namespace tesseract { 00029 00030 // Helper sets the character attribute properties and sets up the script table. 00031 // Does not set tops and bottoms. 00032 static void SetupBasicProperties(UNICHARSET* unicharset) { 00033 for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) { 00034 // Convert any custom ligatures. 00035 const char* unichar_str = unicharset->id_to_unichar(unichar_id); 00036 for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) { 00037 if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) { 00038 unichar_str = UNICHARSET::kCustomLigatures[i][0]; 00039 break; 00040 } 00041 } 00042 00043 // Convert the unichar to UTF32 representation 00044 GenericVector<char32> uni_vector; 00045 tesseract::UTF8ToUTF32(unichar_str, &uni_vector); 00046 00047 // Assume that if the property is true for any character in the string, 00048 // then it holds for the whole "character". 00049 bool unichar_isalpha = false; 00050 bool unichar_islower = false; 00051 bool unichar_isupper = false; 00052 bool unichar_isdigit = false; 00053 bool unichar_ispunct = false; 00054 00055 for (int i = 0; i < uni_vector.size(); ++i) { 00056 if (u_isalpha(uni_vector[i])) 00057 unichar_isalpha = true; 00058 if (u_islower(uni_vector[i])) 00059 unichar_islower = true; 00060 if (u_isupper(uni_vector[i])) 00061 unichar_isupper = true; 00062 if (u_isdigit(uni_vector[i])) 00063 unichar_isdigit = true; 00064 if (u_ispunct(uni_vector[i])) 00065 unichar_ispunct = true; 00066 } 00067 00068 unicharset->set_isalpha(unichar_id, unichar_isalpha); 00069 unicharset->set_islower(unichar_id, unichar_islower); 00070 unicharset->set_isupper(unichar_id, unichar_isupper); 00071 unicharset->set_isdigit(unichar_id, unichar_isdigit); 00072 unicharset->set_ispunctuation(unichar_id, unichar_ispunct); 00073 00074 tesseract::IcuErrorCode err; 00075 unicharset->set_script(unichar_id, uscript_getName( 00076 uscript_getScript(uni_vector[0], err))); 00077 00078 const int num_code_points = uni_vector.size(); 00079 // Obtain the lower/upper case if needed and record it in the properties. 00080 unicharset->set_other_case(unichar_id, unichar_id); 00081 if (unichar_islower || unichar_isupper) { 00082 GenericVector<char32> other_case(num_code_points, 0); 00083 for (int i = 0; i < num_code_points; ++i) { 00084 // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used. 00085 // However since they deal with UChars (so need a conversion function 00086 // from char32 or UTF8string) and require a meaningful locale string, 00087 // for now u_tolower()/u_toupper() are used. 00088 other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) : 00089 u_tolower(uni_vector[i]); 00090 } 00091 STRING other_case_uch; 00092 tesseract::UTF32ToUTF8(other_case, &other_case_uch); 00093 UNICHAR_ID other_case_id = 00094 unicharset->unichar_to_id(other_case_uch.c_str()); 00095 if (other_case_id != INVALID_UNICHAR_ID) { 00096 unicharset->set_other_case(unichar_id, other_case_id); 00097 } else { 00098 tprintf("Other case %s of %s is not in unicharset", 00099 other_case_uch.c_str(), unichar_str); 00100 } 00101 } 00102 00103 // Set RTL property and obtain mirror unichar ID from ICU. 00104 GenericVector<char32> mirrors(num_code_points, 0); 00105 for (int i = 0; i < num_code_points; ++i) { 00106 mirrors[i] = u_charMirror(uni_vector[i]); 00107 if (i == 0) { // set directionality to that of the 1st code point 00108 unicharset->set_direction(unichar_id, 00109 static_cast<UNICHARSET::Direction>( 00110 u_charDirection(uni_vector[i]))); 00111 } 00112 } 00113 STRING mirror_uch; 00114 tesseract::UTF32ToUTF8(mirrors, &mirror_uch); 00115 UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str()); 00116 if (mirror_uch_id != INVALID_UNICHAR_ID) { 00117 unicharset->set_mirror(unichar_id, mirror_uch_id); 00118 } else { 00119 tprintf("Mirror %s of %s is not in unicharset\n", 00120 mirror_uch.c_str(), unichar_str); 00121 } 00122 00123 // Record normalized version of this unichar. 00124 STRING normed_str = tesseract::NormalizeUTF8String(unichar_str); 00125 if (unichar_id != 0 && normed_str.length() > 0) { 00126 unicharset->set_normed(unichar_id, normed_str.c_str()); 00127 } else { 00128 unicharset->set_normed(unichar_id, unichar_str); 00129 } 00130 } 00131 unicharset->post_load_setup(); 00132 } 00133 00134 // Helper to set the properties for an input unicharset file, writes to the 00135 // output file. If an appropriate script unicharset can be found in the 00136 // script_dir directory, then the tops and bottoms are expanded using the 00137 // script unicharset. 00138 // If non-empty, xheight data for the fonts are written to the xheights_file. 00139 static void SetPropertiesForInputFile(const string& script_dir, 00140 const string& input_unicharset_file, 00141 const string& output_unicharset_file, 00142 const string& output_xheights_file) { 00143 UNICHARSET unicharset; 00144 00145 // Load the input unicharset 00146 unicharset.load_from_file(input_unicharset_file.c_str()); 00147 tprintf("Loaded unicharset of size %d from file %s", unicharset.size(), 00148 input_unicharset_file.c_str()); 00149 00150 // Set unichar properties 00151 tprintf("Setting unichar properties"); 00152 SetupBasicProperties(&unicharset); 00153 string xheights_str; 00154 for (int s = 0; s < unicharset.get_script_table_size(); ++s) { 00155 // Load the unicharset for the script if available. 00156 string filename = script_dir + "/" + 00157 unicharset.get_script_from_script_id(s) + ".unicharset"; 00158 UNICHARSET script_set; 00159 if (script_set.load_from_file(filename.c_str())) { 00160 unicharset.SetPropertiesFromOther(script_set); 00161 } 00162 // Load the xheights for the script if available. 00163 filename = script_dir + "/" + unicharset.get_script_from_script_id(s) + 00164 ".xheights"; 00165 string script_heights; 00166 if (File::ReadFileToString(filename, &script_heights)) 00167 xheights_str += script_heights; 00168 } 00169 if (!output_xheights_file.empty()) 00170 File::WriteStringToFileOrDie(xheights_str, output_xheights_file); 00171 00172 // Write the output unicharset 00173 tprintf("Writing unicharset to file %s", output_unicharset_file.c_str()); 00174 unicharset.save_to_file(output_unicharset_file.c_str()); 00175 } 00176 } // namespace tesseract 00177 00178 00179 int main(int argc, char** argv) { 00180 tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); 00181 00182 // Check validity of input flags. 00183 if (FLAGS_U.empty() || FLAGS_O.empty()) { 00184 tprintf("Specify both input and output unicharsets!"); 00185 exit(1); 00186 } 00187 if (FLAGS_script_dir.empty()) { 00188 tprintf("Must specify a script_dir!"); 00189 exit(1); 00190 } 00191 00192 tesseract::SetPropertiesForInputFile(FLAGS_script_dir.c_str(), 00193 FLAGS_U.c_str(), FLAGS_O.c_str(), 00194 FLAGS_X.c_str()); 00195 return 0; 00196 }