tesseract
3.03
|
00001 00002 // File: unicharset_extractor.cpp 00003 // Description: Unicode character/ligature set extractor. 00004 // Author: Thomas Kielbus 00005 // Created: Wed Jun 28 17:05:01 PDT 2006 00006 // 00007 // (C) Copyright 2006, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 // Given a list of box files on the command line, this program generates a file 00021 // containing a unicharset, a list of all the characters used by Tesseract 00022 // 00023 // The file contains the size of the set on the first line, and then one 00024 // unichar per line. 00025 00026 #include <stdio.h> 00027 #if defined(HAVE_WCHAR_T) || defined(_WIN32) || defined(GOOGLE3) 00028 #include <wchar.h> 00029 #include <wctype.h> 00030 #define USING_WCTYPE 00031 #endif 00032 #include <locale.h> 00033 00034 #include "boxread.h" 00035 #include "rect.h" 00036 #include "strngs.h" 00037 #include "tessopt.h" 00038 #include "unichar.h" 00039 #include "unicharset.h" 00040 00041 static const char* const kUnicharsetFileName = "unicharset"; 00042 00043 UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) { 00044 UNICHAR uch(wc); 00045 char *unichar = uch.utf8_str(); 00046 UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar); 00047 delete[] unichar; 00048 return unichar_id; 00049 } 00050 00051 // Set character properties using wctype if we have it. 00052 // Contributed by piggy@gmail.com. 00053 // Modified by Ray to use UNICHAR for unicode conversion 00054 // and to check for wctype using autoconf/presence of windows. 00055 void set_properties(UNICHARSET *unicharset, const char* const c_string) { 00056 #ifdef USING_WCTYPE 00057 UNICHAR_ID id; 00058 int wc; 00059 00060 // Convert the string to a unichar id. 00061 id = unicharset->unichar_to_id(c_string); 00062 00063 // Set the other_case property to be this unichar id by default. 00064 unicharset->set_other_case(id, id); 00065 00066 int step = UNICHAR::utf8_step(c_string); 00067 if (step == 0) 00068 return; // Invalid utf-8. 00069 00070 // Get the next Unicode code point in the string. 00071 UNICHAR ch(c_string, step); 00072 wc = ch.first_uni(); 00073 00074 /* Copy the properties. */ 00075 if (iswalpha(wc)) { 00076 unicharset->set_isalpha(id, 1); 00077 if (iswlower(wc)) { 00078 unicharset->set_islower(id, 1); 00079 unicharset->set_other_case(id, wc_to_unichar_id(*unicharset, 00080 towupper(wc))); 00081 } 00082 if (iswupper(wc)) { 00083 unicharset->set_isupper(id, 1); 00084 unicharset->set_other_case(id, wc_to_unichar_id(*unicharset, 00085 towlower(wc))); 00086 } 00087 } 00088 if (iswdigit(wc)) 00089 unicharset->set_isdigit(id, 1); 00090 if(iswpunct(wc)) 00091 unicharset->set_ispunctuation(id, 1); 00092 00093 #endif 00094 } 00095 00096 int main(int argc, char** argv) { 00097 int option; 00098 const char* output_directory = "."; 00099 STRING unicharset_file_name; 00100 // Special characters are now included by default. 00101 UNICHARSET unicharset; 00102 00103 setlocale(LC_ALL, ""); 00104 00105 // Print usage 00106 if (argc <= 1) { 00107 printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]); 00108 exit(1); 00109 00110 } 00111 00112 // Parse arguments 00113 while ((option = tessopt(argc, argv, "D" )) != EOF) { 00114 switch (option) { 00115 case 'D': 00116 output_directory = tessoptarg; 00117 ++tessoptind; 00118 break; 00119 } 00120 } 00121 00122 // Save file name 00123 unicharset_file_name = output_directory; 00124 unicharset_file_name += "/"; 00125 unicharset_file_name += kUnicharsetFileName; 00126 00127 // Load box files 00128 for (; tessoptind < argc; ++tessoptind) { 00129 printf("Extracting unicharset from %s\n", argv[tessoptind]); 00130 00131 FILE* box_file = fopen(argv[tessoptind], "rb"); 00132 if (box_file == NULL) { 00133 printf("Cannot open box file %s\n", argv[tessoptind]); 00134 return -1; 00135 } 00136 00137 TBOX box; 00138 STRING unichar_string; 00139 int line_number = 0; 00140 while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) { 00141 unicharset.unichar_insert(unichar_string.string()); 00142 set_properties(&unicharset, unichar_string.string()); 00143 } 00144 } 00145 00146 // Write unicharset file 00147 if (unicharset.save_to_file(unicharset_file_name.string())) { 00148 printf("Wrote unicharset file %s.\n", unicharset_file_name.string()); 00149 } 00150 else { 00151 printf("Cannot save unicharset file %s.\n", unicharset_file_name.string()); 00152 return -1; 00153 } 00154 return 0; 00155 }