tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/training/unicharset_extractor.cpp
Go to the documentation of this file.
00001 
00002 // File:        unicharset_extractor.cpp
00003 // Description: Unicode character/ligature set extractor.
00004 // Author:      Thomas Kielbus
00005 // Created:     Wed Jun 28 17:05:01 PDT 2006
00006 //
00007 // (C) Copyright 2006, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 // Given a list of box files on the command line, this program generates a file
00021 // containing a unicharset, a list of all the characters used by Tesseract.
00022 //
00023 // The file contains the size of the set on the first line, and then one
00024 // unichar per line.
00025 
00026 #include <stdio.h>
00027 #if defined(HAVE_WCHAR_T) || defined(_WIN32) || defined(GOOGLE3)
00028 #include <wchar.h>
00029 #include <wctype.h>
00030 #define USING_WCTYPE
00031 #endif
00032 #include <locale.h>
00033 
00034 #include "boxread.h"
00035 #include "rect.h"
00036 #include "strngs.h"
00037 #include "tessopt.h"
00038 #include "unichar.h"
00039 #include "unicharset.h"
00040 
00041 static const char* const kUnicharsetFileName = "unicharset";
00042 
00043 UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) {
00044   UNICHAR uch(wc);
00045   char *unichar = uch.utf8_str();
00046   UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar);
00047   delete[] unichar;
00048   return unichar_id;
00049 }
00050 
00051 // Set character properties using wctype if we have it.
00052 // Contributed by piggy@gmail.com.
00053 // Modified by Ray to use UNICHAR for unicode conversion
00054 // and to check for wctype using autoconf/presence of windows.
00055 void set_properties(UNICHARSET *unicharset, const char* const c_string) {
00056 #ifdef USING_WCTYPE
00057   UNICHAR_ID id;
00058   int wc;
00059 
00060   // Convert the string to a unichar id.
00061   id = unicharset->unichar_to_id(c_string);
00062 
00063   // Set the other_case property to be this unichar id by default.
00064   unicharset->set_other_case(id, id);
00065 
00066   int step = UNICHAR::utf8_step(c_string);
00067   if (step == 0)
00068     return; // Invalid utf-8.
00069 
00070   // Get the next Unicode code point in the string.
00071   UNICHAR ch(c_string, step);
00072   wc = ch.first_uni();
00073 
00074   /* Copy the properties. */
00075   if (iswalpha(wc)) {
00076     unicharset->set_isalpha(id, 1);
00077     if (iswlower(wc)) {
00078       unicharset->set_islower(id, 1);
00079       unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
00080                                                       towupper(wc)));
00081     }
00082     if (iswupper(wc)) {
00083       unicharset->set_isupper(id, 1);
00084       unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
00085                                                       towlower(wc)));
00086     }
00087   }
00088   if (iswdigit(wc))
00089     unicharset->set_isdigit(id, 1);
00090   if(iswpunct(wc))
00091     unicharset->set_ispunctuation(id, 1);
00092 
00093 #endif
00094 }
00095 
00096 int main(int argc, char** argv) {
00097   int option;
00098   const char* output_directory = ".";
00099   STRING unicharset_file_name;
00100   // Special characters are now included by default.
00101   UNICHARSET unicharset;
00102 
00103   setlocale(LC_ALL, "");
00104 
00105   // Print usage
00106   if (argc <= 1) {
00107     printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]);
00108     exit(1);
00109 
00110   }
00111 
00112   // Parse arguments
00113   while ((option = tessopt(argc, argv, "D" )) != EOF) {
00114     switch (option) {
00115       case 'D':
00116         output_directory = tessoptarg;
00117         ++tessoptind;
00118         break;
00119     }
00120   }
00121 
00122   // Save file name
00123   unicharset_file_name = output_directory;
00124   unicharset_file_name += "/";
00125   unicharset_file_name += kUnicharsetFileName;
00126 
00127   // Load box files
00128   for (; tessoptind < argc; ++tessoptind) {
00129     printf("Extracting unicharset from %s\n", argv[tessoptind]);
00130 
00131     FILE* box_file = fopen(argv[tessoptind], "rb");
00132     if (box_file == NULL) {
00133       printf("Cannot open box file %s\n", argv[tessoptind]);
00134       return -1;
00135     }
00136 
00137     TBOX box;
00138     STRING unichar_string;
00139     int line_number = 0;
00140     while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) {
00141       unicharset.unichar_insert(unichar_string.string());
00142       set_properties(&unicharset, unichar_string.string());
00143     }
00144   }
00145 
00146   // Write unicharset file
00147   if (unicharset.save_to_file(unicharset_file_name.string())) {
00148     printf("Wrote unicharset file %s.\n", unicharset_file_name.string());
00149   }
00150   else {
00151     printf("Cannot save unicharset file %s.\n", unicharset_file_name.string());
00152     return -1;
00153   }
00154   return 0;
00155 }
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines