tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/wordrec/pieces.cpp
Go to the documentation of this file.
00001 /* -*-C-*-
00002  ********************************************************************************
00003  *
00004  * File:        pieces.cpp  (Formerly pieces.c)
00005  * Description:
00006  * Author:       Mark Seaman, OCR Technology
00007  * Created:      Fri Oct 16 14:37:00 1987
00008  * Modified:     Mon May 20 12:12:35 1991 (Mark Seaman) marks@hpgrlt
00009  * Language:     C
00010  * Package:      N/A
00011  * Status:       Reusable Software Component
00012  *
00013  * (c) Copyright 1987, Hewlett-Packard Company.
00014  ** Licensed under the Apache License, Version 2.0 (the "License");
00015  ** you may not use this file except in compliance with the License.
00016  ** You may obtain a copy of the License at
00017  ** http://www.apache.org/licenses/LICENSE-2.0
00018  ** Unless required by applicable law or agreed to in writing, software
00019  ** distributed under the License is distributed on an "AS IS" BASIS,
00020  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00021  ** See the License for the specific language governing permissions and
00022  ** limitations under the License.
00023  *
00024  *********************************************************************************/
00025 /*----------------------------------------------------------------------
00026           I n c l u d e s
00027 ----------------------------------------------------------------------*/
00028 
00029 #include "blobs.h"
00030 #include "freelist.h"
00031 #include "helpers.h"
00032 #include "matrix.h"
00033 #include "ndminx.h"
00034 #include "ratngs.h"
00035 #include "seam.h"
00036 #include "wordrec.h"
00037 
00038 // Include automatically generated configuration file if running autoconf.
00039 #ifdef HAVE_CONFIG_H
00040 #include "config_auto.h"
00041 #endif
00042 
00043 /*----------------------------------------------------------------------
00044           F u n c t i o n s
00045 ----------------------------------------------------------------------*/
00046 
00047 /**********************************************************************
00048  * classify_piece
00049  *
00050  * Create a larger piece from a collection of smaller ones.  Classify
00051  * it and return the results.  Take the large piece apart to leave
00052  * the collection of small pieces unmodified.
00053  **********************************************************************/
00054 namespace tesseract {
00055 BLOB_CHOICE_LIST *Wordrec::classify_piece(const GenericVector<SEAM*>& seams,
00056                                           inT16 start,
00057                                           inT16 end,
00058                                           const char* description,
00059                                           TWERD *word,
00060                                           BlamerBundle *blamer_bundle) {
00061   if (end > start) join_pieces(seams, start, end, word);
00062   BLOB_CHOICE_LIST *choices = classify_blob(word->blobs[start], description,
00063                                             White, blamer_bundle);
00064   // Set the matrix_cell_ entries in all the BLOB_CHOICES.
00065   BLOB_CHOICE_IT bc_it(choices);
00066   for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
00067     bc_it.data()->set_matrix_cell(start, end);
00068   }
00069 
00070   if (end > start) break_pieces(seams, start, end, word);
00071 
00072   return (choices);
00073 }
00074 
// qsort-style comparator.  Each argument points at a pointer to a
// BLOB_CHOICE; the result is the first choice's unichar id minus the
// second's, so sorting with it orders choices by increasing unichar id.
template<class BLOB_CHOICE>
int SortByUnicharID(const void *void1, const void *void2) {
  typedef const BLOB_CHOICE *ConstChoicePtr;
  const ConstChoicePtr lhs = *static_cast<const ConstChoicePtr *>(void1);
  const ConstChoicePtr rhs = *static_cast<const ConstChoicePtr *>(void2);
  return lhs->unichar_id() - rhs->unichar_id();
}
00082 
// qsort-style comparator.  Each argument points at a pointer to a
// BLOB_CHOICE.  Returns 1 when the first rating is lower than the second,
// -1 when it is higher, and 0 when neither is lower than the other, so
// sorting with it orders choices by decreasing rating.
//
// Equal ratings must compare as 0: a comparator that answers -1 for both
// (a, b) and (b, a) is inconsistent, and qsort-style sorts only promise a
// sensible result for consistent comparators.
template<class BLOB_CHOICE>
int SortByRating(const void *void1, const void *void2) {
  const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1);
  const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2);

  if (p1->rating() < p2->rating())
    return 1;
  if (p2->rating() < p1->rating())
    return -1;
  return 0;
}
00092 
00093 
00094 /**********************************************************************
00095  * fill_filtered_fragment_list
00096  *
00097  * Filter the fragment list so that the filtered_choices only contain
00098  * fragments that are in the correct position. choices is the list
00099  * that we are going to filter. fragment_pos is the position in the
00100  * fragment that we are looking for and num_frag_parts is the
00101  * total number of pieces. The result will be appended to
00102  * filtered_choices.
00103  **********************************************************************/
00104 void Wordrec::fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices,
00105                                           int fragment_pos,
00106                                           int num_frag_parts,
00107                                           BLOB_CHOICE_LIST *filtered_choices) {
00108   BLOB_CHOICE_IT filtered_choices_it(filtered_choices);
00109   BLOB_CHOICE_IT choices_it(choices);
00110 
00111   for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
00112        choices_it.forward()) {
00113     UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
00114     const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id);
00115 
00116     if (frag != NULL && frag->get_pos() == fragment_pos &&
00117         frag->get_total() == num_frag_parts) {
00118       // Recover the unichar_id of the unichar that this fragment is
00119       // a part of
00120       BLOB_CHOICE *b = new BLOB_CHOICE(*choices_it.data());
00121       int original_unichar = unicharset.unichar_to_id(frag->get_unichar());
00122       b->set_unichar_id(original_unichar);
00123       filtered_choices_it.add_to_end(b);
00124     }
00125   }
00126 
00127   filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>);
00128 }
00129 
00130 
00131 /**********************************************************************
00132  * merge_and_put_fragment_lists
00133  *
00134  * Merge the fragment lists in choice_lists and append the result to the
00135  * ratings matrix.
00136  **********************************************************************/
00137 void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column,
00138                                            inT16 num_frag_parts,
00139                                            BLOB_CHOICE_LIST *choice_lists,
00140                                            MATRIX *ratings) {
00141   BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
00142 
00143   for (int i = 0; i < num_frag_parts; i++) {
00144     choice_lists_it[i].set_to_list(&choice_lists[i]);
00145     choice_lists_it[i].mark_cycle_pt();
00146   }
00147 
00148   BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
00149   if (merged_choice == NULL)
00150     merged_choice = new BLOB_CHOICE_LIST;
00151 
00152   bool end_of_list = false;
00153   BLOB_CHOICE_IT merged_choice_it(merged_choice);
00154   while (!end_of_list) {
00155     // Find the maximum unichar_id of the current entry the iterators
00156     // are pointing at
00157     UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
00158     for (int i = 0; i < num_frag_parts; i++) {
00159       UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
00160       if (max_unichar_id < unichar_id) {
00161         max_unichar_id = unichar_id;
00162       }
00163     }
00164 
00165     // Move the each iterators until it gets to an entry that has a
00166     // value greater than or equal to max_unichar_id
00167     for (int i = 0; i < num_frag_parts; i++) {
00168       UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
00169       while (!choice_lists_it[i].cycled_list() &&
00170              unichar_id < max_unichar_id) {
00171         choice_lists_it[i].forward();
00172         unichar_id = choice_lists_it[i].data()->unichar_id();
00173       }
00174       if (choice_lists_it[i].cycled_list()) {
00175         end_of_list = true;
00176         break;
00177       }
00178     }
00179 
00180     if (end_of_list)
00181       break;
00182 
00183     // Checks if the fragments are parts of the same character
00184     UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
00185     bool same_unichar = true;
00186     for (int i = 1; i < num_frag_parts; i++) {
00187       UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
00188       if (unichar_id != first_unichar_id) {
00189         same_unichar = false;
00190         break;
00191       }
00192     }
00193 
00194     if (same_unichar) {
00195       // Add the merged character to the result
00196       UNICHAR_ID merged_unichar_id = first_unichar_id;
00197       inT16 merged_fontinfo_id = choice_lists_it[0].data()->fontinfo_id();
00198       inT16 merged_fontinfo_id2 = choice_lists_it[0].data()->fontinfo_id2();
00199       float merged_min_xheight = choice_lists_it[0].data()->min_xheight();
00200       float merged_max_xheight = choice_lists_it[0].data()->max_xheight();
00201       float positive_yshift = 0, negative_yshift = 0;
00202       int merged_script_id = choice_lists_it[0].data()->script_id();
00203       BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier();
00204 
00205       float merged_rating = 0, merged_certainty = 0;
00206       for (int i = 0; i < num_frag_parts; i++) {
00207         float rating = choice_lists_it[i].data()->rating();
00208         float certainty = choice_lists_it[i].data()->certainty();
00209 
00210         if (i == 0 || certainty < merged_certainty)
00211           merged_certainty = certainty;
00212         merged_rating += rating;
00213 
00214         choice_lists_it[i].forward();
00215         if (choice_lists_it[i].cycled_list())
00216           end_of_list = true;
00217         IntersectRange(choice_lists_it[i].data()->min_xheight(),
00218                        choice_lists_it[i].data()->max_xheight(),
00219                        &merged_min_xheight, &merged_max_xheight);
00220         float yshift = choice_lists_it[i].data()->yshift();
00221         if (yshift > positive_yshift) positive_yshift = yshift;
00222         if (yshift < negative_yshift) negative_yshift = yshift;
00223       }
00224 
00225       float merged_yshift = positive_yshift != 0
00226           ? (negative_yshift != 0 ? 0 : positive_yshift)
00227           : negative_yshift;
00228       merged_choice_it.add_to_end(new BLOB_CHOICE(merged_unichar_id,
00229                                                   merged_rating,
00230                                                   merged_certainty,
00231                                                   merged_fontinfo_id,
00232                                                   merged_fontinfo_id2,
00233                                                   merged_script_id,
00234                                                   merged_min_xheight,
00235                                                   merged_max_xheight,
00236                                                   merged_yshift,
00237                                                   classifier));
00238     }
00239   }
00240 
00241   if (classify_debug_level)
00242     print_ratings_list("Merged Fragments", merged_choice,
00243                        unicharset);
00244 
00245   if (merged_choice->empty())
00246     delete merged_choice;
00247   else
00248     ratings->put(row, column, merged_choice);
00249 
00250   delete [] choice_lists_it;
00251 }
00252 
00253 
00254 /**********************************************************************
00255  * get_fragment_lists
00256  *
00257  * Recursively go through the ratings matrix to find lists of fragments
00258  * to be merged in the function merge_and_put_fragment_lists.
00259  * current_frag is the position of the piece we are looking for.
00260  * current_row is the row in the rating matrix we are currently at.
00261  * start is the row we started initially, so that we can know where
00262  * to append the results to the matrix. num_frag_parts is the total
00263  * number of pieces we are looking for and num_blobs is the size of the
00264  * ratings matrix.
00265  **********************************************************************/
00266 void Wordrec::get_fragment_lists(inT16 current_frag, inT16 current_row,
00267                                  inT16 start, inT16 num_frag_parts,
00268                                  inT16 num_blobs, MATRIX *ratings,
00269                                  BLOB_CHOICE_LIST *choice_lists) {
00270   if (current_frag == num_frag_parts) {
00271     merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts,
00272                                  choice_lists, ratings);
00273     return;
00274   }
00275 
00276   for (inT16 x = current_row; x < num_blobs; x++) {
00277     BLOB_CHOICE_LIST *choices = ratings->get(current_row, x);
00278     if (choices == NULL)
00279       continue;
00280 
00281     fill_filtered_fragment_list(choices, current_frag, num_frag_parts,
00282                                 &choice_lists[current_frag]);
00283     if (!choice_lists[current_frag].empty()) {
00284       get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts,
00285                          num_blobs, ratings, choice_lists);
00286       choice_lists[current_frag].clear();
00287     }
00288   }
00289 }
00290 
00291 
00292 /**********************************************************************
00293  * merge_fragments
00294  *
00295  * Try to merge fragments in the ratings matrix and put the result in
00296  * the corresponding row and column
00297  **********************************************************************/
00298 void Wordrec::merge_fragments(MATRIX *ratings, inT16 num_blobs) {
00299   BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
00300   for (inT16 start = 0; start < num_blobs; start++) {
00301     for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
00302          frag_parts++) {
00303       get_fragment_lists(0, start, start, frag_parts, num_blobs,
00304                          ratings, choice_lists);
00305     }
00306   }
00307 
00308   // Delete fragments from the rating matrix
00309   for (inT16 x = 0; x < num_blobs; x++) {
00310     for (inT16 y = x; y < num_blobs; y++) {
00311       BLOB_CHOICE_LIST *choices = ratings->get(x, y);
00312       if (choices != NULL) {
00313         BLOB_CHOICE_IT choices_it(choices);
00314         for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
00315              choices_it.forward()) {
00316           UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
00317           const CHAR_FRAGMENT *frag =
00318               unicharset.get_fragment(choice_unichar_id);
00319           if (frag != NULL)
00320             delete choices_it.extract();
00321         }
00322       }
00323     }
00324   }
00325 }
00326 
00327 
00328 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines