tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/training/set_unicharset_properties.cpp
Go to the documentation of this file.
00001 // This program reads a unicharset file, puts the result in a UNICHARSET
00002 // object, fills it with properties about the unichars it contains and writes
00003 // the result back to a file.
00004 
00005 #include <stdlib.h>
00006 #include <string.h>
00007 #include <string>
00008 
00009 #include "commandlineflags.h"
00010 #include "fileio.h"
00011 #include "genericvector.h"
00012 #include "icuerrorcode.h"
00013 #include "normstrngs.h"
00014 #include "strngs.h"
00015 #include "unicharset.h"
00016 #include "unicode/uchar.h"    // from libicu
00017 #include "unicode/uscript.h"  // from libicu
00018 
00019 // Directory searched for the per-script "<script>.unicharset" and
// "<script>.xheights" files, one pair per entry of the input unicharset's
// script table. Must be non-empty; main() exits otherwise.
00020 STRING_PARAM_FLAG(script_dir, "",
00021                   "Directory name for input script unicharsets/xheights");
00022 
00023 // Flags from commontraining.cpp. As used by main() in this program:
//   U - input unicharset file (required),
//   O - output unicharset file (required),
//   X - output xheights file (nothing is written when it is empty).
00024 DECLARE_STRING_PARAM_FLAG(U);
00025 DECLARE_STRING_PARAM_FLAG(O);
00026 DECLARE_STRING_PARAM_FLAG(X);
00027 
00028 namespace tesseract {
00029 
00030 // Helper sets the character attribute properties and sets up the script table.
00031 // Does not set tops and bottoms.
00032 static void SetupBasicProperties(UNICHARSET* unicharset) {
00033   for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
00034     // Convert any custom ligatures.
00035     const char* unichar_str = unicharset->id_to_unichar(unichar_id);
00036     for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) {
00037       if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
00038         unichar_str = UNICHARSET::kCustomLigatures[i][0];
00039         break;
00040       }
00041     }
00042 
00043     // Convert the unichar to UTF32 representation
00044     GenericVector<char32> uni_vector;
00045     tesseract::UTF8ToUTF32(unichar_str, &uni_vector);
00046 
00047     // Assume that if the property is true for any character in the string,
00048     // then it holds for the whole "character".
00049     bool unichar_isalpha = false;
00050     bool unichar_islower = false;
00051     bool unichar_isupper = false;
00052     bool unichar_isdigit = false;
00053     bool unichar_ispunct = false;
00054 
00055     for (int i = 0; i < uni_vector.size(); ++i) {
00056       if (u_isalpha(uni_vector[i]))
00057         unichar_isalpha = true;
00058       if (u_islower(uni_vector[i]))
00059         unichar_islower = true;
00060       if (u_isupper(uni_vector[i]))
00061         unichar_isupper = true;
00062       if (u_isdigit(uni_vector[i]))
00063         unichar_isdigit = true;
00064       if (u_ispunct(uni_vector[i]))
00065         unichar_ispunct = true;
00066     }
00067 
00068     unicharset->set_isalpha(unichar_id, unichar_isalpha);
00069     unicharset->set_islower(unichar_id, unichar_islower);
00070     unicharset->set_isupper(unichar_id, unichar_isupper);
00071     unicharset->set_isdigit(unichar_id, unichar_isdigit);
00072     unicharset->set_ispunctuation(unichar_id, unichar_ispunct);
00073 
00074     tesseract::IcuErrorCode err;
00075     unicharset->set_script(unichar_id, uscript_getName(
00076         uscript_getScript(uni_vector[0], err)));
00077 
00078     const int num_code_points = uni_vector.size();
00079     // Obtain the lower/upper case if needed and record it in the properties.
00080     unicharset->set_other_case(unichar_id, unichar_id);
00081     if (unichar_islower || unichar_isupper) {
00082       GenericVector<char32> other_case(num_code_points, 0);
00083       for (int i = 0; i < num_code_points; ++i) {
00084         // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
00085         // However since they deal with UChars (so need a conversion function
00086         // from char32 or UTF8string) and require a meaningful locale string,
00087         // for now u_tolower()/u_toupper() are used.
00088         other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
00089           u_tolower(uni_vector[i]);
00090       }
00091       STRING other_case_uch;
00092       tesseract::UTF32ToUTF8(other_case, &other_case_uch);
00093       UNICHAR_ID other_case_id =
00094           unicharset->unichar_to_id(other_case_uch.c_str());
00095       if (other_case_id != INVALID_UNICHAR_ID) {
00096         unicharset->set_other_case(unichar_id, other_case_id);
00097       } else {
00098         tprintf("Other case %s of %s is not in unicharset",
00099                 other_case_uch.c_str(), unichar_str);
00100       }
00101     }
00102 
00103     // Set RTL property and obtain mirror unichar ID from ICU.
00104     GenericVector<char32> mirrors(num_code_points, 0);
00105     for (int i = 0; i < num_code_points; ++i) {
00106       mirrors[i] = u_charMirror(uni_vector[i]);
00107       if (i == 0) {  // set directionality to that of the 1st code point
00108         unicharset->set_direction(unichar_id,
00109                                   static_cast<UNICHARSET::Direction>(
00110                                       u_charDirection(uni_vector[i])));
00111       }
00112     }
00113     STRING mirror_uch;
00114     tesseract::UTF32ToUTF8(mirrors, &mirror_uch);
00115     UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
00116     if (mirror_uch_id != INVALID_UNICHAR_ID) {
00117       unicharset->set_mirror(unichar_id, mirror_uch_id);
00118     } else {
00119       tprintf("Mirror %s of %s is not in unicharset\n",
00120               mirror_uch.c_str(), unichar_str);
00121     }
00122 
00123     // Record normalized version of this unichar.
00124     STRING normed_str = tesseract::NormalizeUTF8String(unichar_str);
00125     if (unichar_id != 0 && normed_str.length() > 0) {
00126       unicharset->set_normed(unichar_id, normed_str.c_str());
00127     } else {
00128       unicharset->set_normed(unichar_id, unichar_str);
00129     }
00130   }
00131   unicharset->post_load_setup();
00132 }
00133 
00134 // Helper to set the properties for an input unicharset file, writes to the
00135 // output file. If an appropriate script unicharset can be found in the
00136 // script_dir directory, then the tops and bottoms are expanded using the
00137 // script unicharset.
00138 // If non-empty, xheight data for the fonts are written to the xheights_file.
00139 static void SetPropertiesForInputFile(const string& script_dir,
00140                                       const string& input_unicharset_file,
00141                                       const string& output_unicharset_file,
00142                                       const string& output_xheights_file) {
00143   UNICHARSET unicharset;
00144 
00145   // Load the input unicharset
00146   unicharset.load_from_file(input_unicharset_file.c_str());
00147   tprintf("Loaded unicharset of size %d from file %s", unicharset.size(),
00148           input_unicharset_file.c_str());
00149 
00150   // Set unichar properties
00151   tprintf("Setting unichar properties");
00152   SetupBasicProperties(&unicharset);
00153   string xheights_str;
00154   for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
00155     // Load the unicharset for the script if available.
00156     string filename = script_dir + "/" +
00157         unicharset.get_script_from_script_id(s) + ".unicharset";
00158     UNICHARSET script_set;
00159     if (script_set.load_from_file(filename.c_str())) {
00160       unicharset.SetPropertiesFromOther(script_set);
00161     }
00162     // Load the xheights for the script if available.
00163     filename = script_dir + "/" + unicharset.get_script_from_script_id(s) +
00164         ".xheights";
00165     string script_heights;
00166     if (File::ReadFileToString(filename, &script_heights))
00167       xheights_str += script_heights;
00168   }
00169   if (!output_xheights_file.empty())
00170     File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
00171 
00172   // Write the output unicharset
00173   tprintf("Writing unicharset to file %s", output_unicharset_file.c_str());
00174   unicharset.save_to_file(output_unicharset_file.c_str());
00175 }
00176 }  // namespace tesseract
00177 
00178 
00179 int main(int argc, char** argv) {
00180   tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
00181 
00182   // Check validity of input flags.
00183   if (FLAGS_U.empty() || FLAGS_O.empty()) {
00184     tprintf("Specify both input and output unicharsets!");
00185     exit(1);
00186   }
00187   if (FLAGS_script_dir.empty()) {
00188     tprintf("Must specify a script_dir!");
00189     exit(1);
00190   }
00191 
00192   tesseract::SetPropertiesForInputFile(FLAGS_script_dir.c_str(),
00193                                        FLAGS_U.c_str(), FLAGS_O.c_str(),
00194                                        FLAGS_X.c_str());
00195   return 0;
00196 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines