tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/classify/shapetable.cpp
Go to the documentation of this file.
00001 // Copyright 2010 Google Inc. All Rights Reserved.
00002 // Author: rays@google.com (Ray Smith)
00004 // File:        shapetable.cpp
00005 // Description: Class to map a classifier shape index to unicharset
00006 //              indices and font indices.
00007 // Author:      Ray Smith
00008 // Created:     Tue Nov 02 15:31:32 PDT 2010
00009 //
00010 // (C) Copyright 2010, Google Inc.
00011 // Licensed under the Apache License, Version 2.0 (the "License");
00012 // you may not use this file except in compliance with the License.
00013 // You may obtain a copy of the License at
00014 // http://www.apache.org/licenses/LICENSE-2.0
00015 // Unless required by applicable law or agreed to in writing, software
00016 // distributed under the License is distributed on an "AS IS" BASIS,
00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00018 // See the License for the specific language governing permissions and
00019 // limitations under the License.
00020 //
00022 
00023 #include "shapetable.h"
00024 
00025 #include "bitvector.h"
00026 #include "fontinfo.h"
00027 #include "intfeaturespace.h"
00028 #include "strngs.h"
00029 #include "unicharset.h"
00030 #include "unicity_table.h"
00031 
00032 namespace tesseract {
00033 
00034 // Helper function to get the index of the first result with the required
00035 // unichar_id. If the results are sorted by rating, this will also be the
00036 // best result with the required unichar_id.
00037 // Returns -1 if the unichar_id is not found
00038 int ShapeRating::FirstResultWithUnichar(
00039     const GenericVector<ShapeRating>& results,
00040     const ShapeTable& shape_table,
00041     UNICHAR_ID unichar_id) {
00042   for (int r = 0; r < results.size(); ++r) {
00043     int shape_id = results[r].shape_id;
00044     const Shape& shape = shape_table.GetShape(shape_id);
00045     if (shape.ContainsUnichar(unichar_id)) {
00046       return r;
00047     }
00048   }
00049   return -1;
00050 }
00051 
00052 // Helper function to get the index of the first result with the required
00053 // unichar_id. If the results are sorted by rating, this will also be the
00054 // best result with the required unichar_id.
00055 // Returns -1 if the unichar_id is not found
00056 int UnicharRating::FirstResultWithUnichar(
00057     const GenericVector<UnicharRating>& results,
00058     UNICHAR_ID unichar_id) {
00059   for (int r = 0; r < results.size(); ++r) {
00060     if (results[r].unichar_id == unichar_id)
00061       return r;
00062   }
00063   return -1;
00064 }
00065 
00066 // Writes to the given file. Returns false in case of error.
00067 bool UnicharAndFonts::Serialize(FILE* fp) const {
00068   if (fwrite(&unichar_id, sizeof(unichar_id), 1, fp) != 1) return false;
00069   if (!font_ids.Serialize(fp)) return false;
00070   return true;
00071 }
00072 // Reads from the given file. Returns false in case of error.
00073 // If swap is true, assumes a big/little-endian swap is needed.
00074 bool UnicharAndFonts::DeSerialize(bool swap, FILE* fp) {
00075   if (fread(&unichar_id, sizeof(unichar_id), 1, fp) != 1) return false;
00076   if (swap)
00077     ReverseN(&unichar_id, sizeof(unichar_id));
00078   if (!font_ids.DeSerialize(swap, fp)) return false;
00079   return true;
00080 }
00081 
00082 // Sort function to sort a pair of UnicharAndFonts by unichar_id.
00083 int UnicharAndFonts::SortByUnicharId(const void* v1, const void* v2) {
00084   const UnicharAndFonts* p1 = reinterpret_cast<const UnicharAndFonts*>(v1);
00085   const UnicharAndFonts* p2 = reinterpret_cast<const UnicharAndFonts*>(v2);
00086   return p1->unichar_id - p2->unichar_id;
00087 }
00088 
00089 // Writes to the given file. Returns false in case of error.
00090 bool Shape::Serialize(FILE* fp) const {
00091   uinT8 sorted = unichars_sorted_;
00092   if (fwrite(&sorted, sizeof(sorted), 1, fp) != 1)
00093     return false;
00094   if (!unichars_.SerializeClasses(fp)) return false;
00095   return true;
00096 }
00097 // Reads from the given file. Returns false in case of error.
00098 // If swap is true, assumes a big/little-endian swap is needed.
00099 bool Shape::DeSerialize(bool swap, FILE* fp) {
00100   uinT8 sorted;
00101   if (fread(&sorted, sizeof(sorted), 1, fp) != 1)
00102     return false;
00103   unichars_sorted_ = sorted != 0;
00104   if (!unichars_.DeSerializeClasses(swap, fp)) return false;
00105   return true;
00106 }
00107 
00108 // Adds a font_id for the given unichar_id. If the unichar_id is not
00109 // in the shape, it is added.
00110 void Shape::AddToShape(int unichar_id, int font_id) {
00111   for (int c = 0; c < unichars_.size(); ++c) {
00112     if (unichars_[c].unichar_id == unichar_id) {
00113       // Found the unichar in the shape table.
00114       GenericVector<int>& font_list = unichars_[c].font_ids;
00115       for (int f = 0; f < font_list.size(); ++f) {
00116         if (font_list[f] == font_id)
00117           return;  // Font is already there.
00118       }
00119       font_list.push_back(font_id);
00120       return;
00121     }
00122   }
00123   // Unichar_id is not in shape, so add it to shape.
00124   unichars_.push_back(UnicharAndFonts(unichar_id, font_id));
00125   unichars_sorted_ =  unichars_.size() <= 1;
00126 }
00127 
00128 // Adds everything in other to this.
00129 void Shape::AddShape(const Shape& other) {
00130   for (int c = 0; c < other.unichars_.size(); ++c) {
00131     for (int f = 0; f < other.unichars_[c].font_ids.size(); ++f) {
00132       AddToShape(other.unichars_[c].unichar_id,
00133                  other.unichars_[c].font_ids[f]);
00134     }
00135   }
00136   unichars_sorted_ =  unichars_.size() <= 1;
00137 }
00138 
00139 // Returns true if the shape contains the given unichar_id, font_id pair.
00140 bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const {
00141   for (int c = 0; c < unichars_.size(); ++c) {
00142     if (unichars_[c].unichar_id == unichar_id) {
00143       // Found the unichar, so look for the font.
00144       GenericVector<int>& font_list = unichars_[c].font_ids;
00145       for (int f = 0; f < font_list.size(); ++f) {
00146         if (font_list[f] == font_id)
00147           return true;
00148       }
00149       return false;
00150     }
00151   }
00152   return false;
00153 }
00154 
00155 // Returns true if the shape contains the given unichar_id, ignoring font.
00156 bool Shape::ContainsUnichar(int unichar_id) const {
00157   for (int c = 0; c < unichars_.size(); ++c) {
00158     if (unichars_[c].unichar_id == unichar_id) {
00159       return true;
00160     }
00161   }
00162   return false;
00163 }
00164 
00165 // Returns true if the shape contains the given font, ignoring unichar_id.
00166 bool Shape::ContainsFont(int font_id) const {
00167   for (int c = 0; c < unichars_.size(); ++c) {
00168     GenericVector<int>& font_list = unichars_[c].font_ids;
00169     for (int f = 0; f < font_list.size(); ++f) {
00170       if (font_list[f] == font_id)
00171         return true;
00172     }
00173   }
00174   return false;
00175 }
00176 // Returns true if the shape contains the given font properties, ignoring
00177 // unichar_id.
00178 bool Shape::ContainsFontProperties(const FontInfoTable& font_table,
00179                                    uinT32 properties) const {
00180   for (int c = 0; c < unichars_.size(); ++c) {
00181     GenericVector<int>& font_list = unichars_[c].font_ids;
00182     for (int f = 0; f < font_list.size(); ++f) {
00183       if (font_table.get(font_list[f]).properties == properties)
00184         return true;
00185     }
00186   }
00187   return false;
00188 }
00189 // Returns true if the shape contains multiple different font properties,
00190 // ignoring unichar_id.
00191 bool Shape::ContainsMultipleFontProperties(
00192     const FontInfoTable& font_table) const {
00193   uinT32 properties = font_table.get(unichars_[0].font_ids[0]).properties;
00194   for (int c = 0; c < unichars_.size(); ++c) {
00195     GenericVector<int>& font_list = unichars_[c].font_ids;
00196     for (int f = 0; f < font_list.size(); ++f) {
00197       if (font_table.get(font_list[f]).properties != properties)
00198         return true;
00199     }
00200   }
00201   return false;
00202 }
00203 
00204 // Returns true if this shape is equal to other (ignoring order of unichars
00205 // and fonts).
00206 bool Shape::operator==(const Shape& other) const {
00207   return IsSubsetOf(other) && other.IsSubsetOf(*this);
00208 }
00209 
00210 // Returns true if this is a subset (including equal) of other.
00211 bool Shape::IsSubsetOf(const Shape& other) const {
00212   for (int c = 0; c < unichars_.size(); ++c) {
00213     int unichar_id = unichars_[c].unichar_id;
00214     const GenericVector<int>& font_list = unichars_[c].font_ids;
00215     for (int f = 0; f < font_list.size(); ++f) {
00216       if (!other.ContainsUnicharAndFont(unichar_id, font_list[f]))
00217         return false;
00218     }
00219   }
00220   return true;
00221 }
00222 
00223 // Returns true if the lists of unichar ids are the same in this and other,
00224 // ignoring fonts.
00225 // NOT const, as it will sort the unichars on demand.
00226 bool Shape::IsEqualUnichars(Shape* other) {
00227   if (unichars_.size() != other->unichars_.size()) return false;
00228   if (!unichars_sorted_) SortUnichars();
00229   if (!other->unichars_sorted_) other->SortUnichars();
00230   for (int c = 0; c < unichars_.size(); ++c) {
00231     if (unichars_[c].unichar_id != other->unichars_[c].unichar_id)
00232       return false;
00233   }
00234   return true;
00235 }
00236 
00237 // Sorts the unichars_ vector by unichar.
00238 void Shape::SortUnichars() {
00239   unichars_.sort(UnicharAndFonts::SortByUnicharId);
00240   unichars_sorted_ = true;
00241 }
00242 
00243 ShapeTable::ShapeTable() : unicharset_(NULL), num_fonts_(0) {
00244 }
00245 ShapeTable::ShapeTable(const UNICHARSET& unicharset)
00246   : unicharset_(&unicharset), num_fonts_(0) {
00247 }
00248 
00249 // Writes to the given file. Returns false in case of error.
00250 bool ShapeTable::Serialize(FILE* fp) const {
00251   if (!shape_table_.Serialize(fp)) return false;
00252   return true;
00253 }
00254 // Reads from the given file. Returns false in case of error.
00255 // If swap is true, assumes a big/little-endian swap is needed.
00256 bool ShapeTable::DeSerialize(bool swap, FILE* fp) {
00257   if (!shape_table_.DeSerialize(swap, fp)) return false;
00258   num_fonts_ = 0;
00259   return true;
00260 }
00261 
00262 // Returns the number of fonts used in this ShapeTable, computing it if
00263 // necessary.
00264 int ShapeTable::NumFonts() const {
00265   if (num_fonts_ <= 0) {
00266     for (int shape_id = 0; shape_id < shape_table_.size(); ++shape_id) {
00267       const Shape& shape = *shape_table_[shape_id];
00268       for (int c = 0; c < shape.size(); ++c) {
00269         for (int f = 0; f < shape[c].font_ids.size(); ++f) {
00270           if (shape[c].font_ids[f] >= num_fonts_)
00271             num_fonts_ = shape[c].font_ids[f] + 1;
00272         }
00273       }
00274     }
00275   }
00276   return num_fonts_;
00277 }
00278 
00279 // Re-indexes the class_ids in the shapetable according to the given map.
00280 // Useful in conjunction with set_unicharset.
00281 void ShapeTable::ReMapClassIds(const GenericVector<int>& unicharset_map) {
00282   for (int shape_id = 0; shape_id < shape_table_.size(); ++shape_id) {
00283     Shape* shape = shape_table_[shape_id];
00284     for (int c = 0; c < shape->size(); ++c) {
00285       shape->SetUnicharId(c, unicharset_map[(*shape)[c].unichar_id]);
00286     }
00287   }
00288 }
00289 
00290 // Returns a string listing the classes/fonts in a shape.
00291 STRING ShapeTable::DebugStr(int shape_id) const {
00292   if (shape_id < 0 || shape_id >= shape_table_.size())
00293     return STRING("INVALID_UNICHAR_ID");
00294   const Shape& shape = GetShape(shape_id);
00295   STRING result;
00296   result.add_str_int("Shape", shape_id);
00297   if (shape.size() > 100) {
00298     result.add_str_int(" Num unichars=", shape.size());
00299     return result;
00300   }
00301   for (int c = 0; c < shape.size(); ++c) {
00302     result.add_str_int(" c_id=", shape[c].unichar_id);
00303     result += "=";
00304     result += unicharset_->id_to_unichar(shape[c].unichar_id);
00305     if (shape.size() < 10) {
00306       result.add_str_int(", ", shape[c].font_ids.size());
00307       result += " fonts =";
00308       int num_fonts = shape[c].font_ids.size();
00309       if (num_fonts > 10) {
00310         result.add_str_int(" ", shape[c].font_ids[0]);
00311         result.add_str_int(" ... ", shape[c].font_ids[num_fonts - 1]);
00312       } else {
00313         for (int f = 0; f < num_fonts; ++f) {
00314           result.add_str_int(" ", shape[c].font_ids[f]);
00315         }
00316       }
00317     }
00318   }
00319   return result;
00320 }
00321 
00322 // Returns a debug string summarizing the table.
00323 STRING ShapeTable::SummaryStr() const {
00324   int max_unichars = 0;
00325   int num_multi_shapes = 0;
00326   int num_master_shapes = 0;
00327   for (int s = 0; s < shape_table_.size(); ++s) {
00328     if (MasterDestinationIndex(s) != s) continue;
00329     ++num_master_shapes;
00330     int shape_size = GetShape(s).size();
00331     if (shape_size > 1)
00332       ++num_multi_shapes;
00333     if (shape_size > max_unichars)
00334       max_unichars = shape_size;
00335   }
00336   STRING result;
00337   result.add_str_int("Number of shapes = ", num_master_shapes);
00338   result.add_str_int(" max unichars = ", max_unichars);
00339   result.add_str_int(" number with multiple unichars = ", num_multi_shapes);
00340   return result;
00341 }
00342 
00343 
00344 // Adds a new shape starting with the given unichar_id and font_id.
00345 // Returns the assigned index.
00346 int ShapeTable::AddShape(int unichar_id, int font_id) {
00347   int index = shape_table_.size();
00348   Shape* shape = new Shape;
00349   shape->AddToShape(unichar_id, font_id);
00350   shape_table_.push_back(shape);
00351   num_fonts_ = MAX(num_fonts_, font_id + 1);
00352   return index;
00353 }
00354 
00355 // Adds a copy of the given shape unless it is already present.
00356 // Returns the assigned index or index of existing shape if already present.
00357 int ShapeTable::AddShape(const Shape& other) {
00358   int index;
00359   for (index = 0; index < shape_table_.size() &&
00360        !(other == *shape_table_[index]); ++index)
00361     continue;
00362   if (index == shape_table_.size()) {
00363     Shape* shape = new Shape(other);
00364     shape_table_.push_back(shape);
00365   }
00366   num_fonts_ = 0;
00367   return index;
00368 }
00369 
00370 // Removes the shape given by the shape index.
00371 void ShapeTable::DeleteShape(int shape_id) {
00372   delete shape_table_[shape_id];
00373   shape_table_[shape_id] = NULL;
00374   shape_table_.remove(shape_id);
00375 }
00376 
00377 // Adds a font_id to the given existing shape index for the given
00378 // unichar_id. If the unichar_id is not in the shape, it is added.
00379 void ShapeTable::AddToShape(int shape_id, int unichar_id, int font_id) {
00380   Shape& shape = *shape_table_[shape_id];
00381   shape.AddToShape(unichar_id, font_id);
00382   num_fonts_ = MAX(num_fonts_, font_id + 1);
00383 }
00384 
00385 // Adds the given shape to the existing shape with the given index.
00386 void ShapeTable::AddShapeToShape(int shape_id, const Shape& other) {
00387   Shape& shape = *shape_table_[shape_id];
00388   shape.AddShape(other);
00389   num_fonts_ = 0;
00390 }
00391 
00392 // Returns the id of the shape that contains the given unichar and font.
00393 // If not found, returns -1.
00394 // If font_id < 0, the font_id is ignored and the first shape that matches
00395 // the unichar_id is returned.
00396 int ShapeTable::FindShape(int unichar_id, int font_id) const {
00397   for (int s = 0; s < shape_table_.size(); ++s) {
00398     const Shape& shape = GetShape(s);
00399     for (int c = 0; c < shape.size(); ++c) {
00400       if (shape[c].unichar_id == unichar_id) {
00401         if (font_id < 0)
00402           return s;  // We don't care about the font.
00403         for (int f = 0; f < shape[c].font_ids.size(); ++f) {
00404           if (shape[c].font_ids[f] == font_id)
00405             return s;
00406         }
00407       }
00408     }
00409   }
00410   return -1;
00411 }
00412 
00413 // Returns the first unichar_id and font_id in the given shape.
00414 void ShapeTable::GetFirstUnicharAndFont(int shape_id,
00415                                         int* unichar_id, int* font_id) const {
00416   const UnicharAndFonts& unichar_and_fonts = (*shape_table_[shape_id])[0];
00417   *unichar_id = unichar_and_fonts.unichar_id;
00418   *font_id = unichar_and_fonts.font_ids[0];
00419 }
00420 
00421 // Expands all the classes/fonts in the shape individually to build
00422 // a ShapeTable.
00423 int ShapeTable::BuildFromShape(const Shape& shape,
00424                                const ShapeTable& master_shapes) {
00425   BitVector shape_map(master_shapes.NumShapes());
00426   for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
00427     for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
00428       int c = shape[u_ind].unichar_id;
00429       int f = shape[u_ind].font_ids[f_ind];
00430       int master_id = master_shapes.FindShape(c, f);
00431       if (master_id >= 0) {
00432         shape_map.SetBit(master_id);
00433       } else if (FindShape(c, f) < 0) {
00434         AddShape(c, f);
00435       }
00436     }
00437   }
00438   int num_masters = 0;
00439   for (int s = 0; s < master_shapes.NumShapes(); ++s) {
00440     if (shape_map[s]) {
00441       AddShape(master_shapes.GetShape(s));
00442       ++num_masters;
00443     }
00444   }
00445   return num_masters;
00446 }
00447 
00448 // Returns true if the shapes are already merged.
00449 bool ShapeTable::AlreadyMerged(int shape_id1, int shape_id2) const {
00450   return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2);
00451 }
00452 
00453 // Returns true if any shape contains multiple unichars.
00454 bool ShapeTable::AnyMultipleUnichars() const {
00455   int num_shapes = NumShapes();
00456   for (int s1 = 0; s1 < num_shapes; ++s1) {
00457     if (MasterDestinationIndex(s1) != s1) continue;
00458     if (GetShape(s1).size() > 1)
00459       return true;
00460   }
00461   return false;
00462 }
00463 
00464 // Returns the maximum number of unichars over all shapes.
00465 int ShapeTable::MaxNumUnichars() const {
00466   int max_num_unichars = 0;
00467   int num_shapes = NumShapes();
00468   for (int s = 0; s < num_shapes; ++s) {
00469     if (GetShape(s).size() > max_num_unichars)
00470       max_num_unichars = GetShape(s).size();
00471   }
00472   return max_num_unichars;
00473 }
00474 
00475 
00476 // Merges shapes with a common unichar over the [start, end) interval.
00477 // Assumes single unichar per shape.
00478 void ShapeTable::ForceFontMerges(int start, int end) {
00479   for (int s1 = start; s1 < end; ++s1) {
00480     if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {
00481       int unichar_id = GetShape(s1)[0].unichar_id;
00482       for (int s2 = s1 + 1; s2 < end; ++s2) {
00483         if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&
00484             unichar_id == GetShape(s2)[0].unichar_id) {
00485           MergeShapes(s1, s2);
00486         }
00487       }
00488     }
00489   }
00490   ShapeTable compacted(*unicharset_);
00491   compacted.AppendMasterShapes(*this, NULL);
00492   *this = compacted;
00493 }
00494 
00495 // Returns the number of unichars in the master shape.
00496 int ShapeTable::MasterUnicharCount(int shape_id) const {
00497   int master_id = MasterDestinationIndex(shape_id);
00498   return GetShape(master_id).size();
00499 }
00500 
00501 // Returns the sum of the font counts in the master shape.
00502 int ShapeTable::MasterFontCount(int shape_id) const {
00503   int master_id = MasterDestinationIndex(shape_id);
00504   const Shape& shape = GetShape(master_id);
00505   int font_count = 0;
00506   for (int c = 0; c < shape.size(); ++c) {
00507     font_count += shape[c].font_ids.size();
00508   }
00509   return font_count;
00510 }
00511 
00512 // Returns the number of unichars that would result from merging the shapes.
00513 int ShapeTable::MergedUnicharCount(int shape_id1, int shape_id2) const {
00514   // Do it the easy way for now.
00515   int master_id1 = MasterDestinationIndex(shape_id1);
00516   int master_id2 = MasterDestinationIndex(shape_id2);
00517   Shape combined_shape(*shape_table_[master_id1]);
00518   combined_shape.AddShape(*shape_table_[master_id2]);
00519   return combined_shape.size();
00520 }
00521 
00522 // Merges two shape_ids, leaving shape_id2 marked as merged.
00523 void ShapeTable::MergeShapes(int shape_id1, int shape_id2) {
00524   int master_id1 = MasterDestinationIndex(shape_id1);
00525   int master_id2 = MasterDestinationIndex(shape_id2);
00526   // Point master_id2 (and all merged shapes) to master_id1.
00527   shape_table_[master_id2]->set_destination_index(master_id1);
00528   // Add all the shapes of master_id2 to master_id1.
00529   shape_table_[master_id1]->AddShape(*shape_table_[master_id2]);
00530 }
00531 
00532 // Swaps two shape_ids.
00533 void ShapeTable::SwapShapes(int shape_id1, int shape_id2) {
00534   Shape* tmp = shape_table_[shape_id1];
00535   shape_table_[shape_id1] = shape_table_[shape_id2];
00536   shape_table_[shape_id2] = tmp;
00537 }
00538 
00539 // Returns the destination of this shape, (if merged), taking into account
00540 // the fact that the destination may itself have been merged.
00541 int ShapeTable::MasterDestinationIndex(int shape_id) const {
00542   int dest_id = shape_table_[shape_id]->destination_index();
00543   if (dest_id == shape_id || dest_id < 0)
00544     return shape_id;  // Is master already.
00545   int master_id = shape_table_[dest_id]->destination_index();
00546   if (master_id == dest_id || master_id < 0)
00547     return dest_id;  // Dest is the master and shape_id points to it.
00548   master_id = MasterDestinationIndex(master_id);
00549   return master_id;
00550 }
00551 
00552 // Returns false if the unichars in neither shape is a subset of the other.
00553 bool ShapeTable::SubsetUnichar(int shape_id1, int shape_id2) const {
00554   const Shape& shape1 = GetShape(shape_id1);
00555   const Shape& shape2 = GetShape(shape_id2);
00556   int c1, c2;
00557   for (c1 = 0; c1 < shape1.size(); ++c1) {
00558     int unichar_id1 = shape1[c1].unichar_id;
00559     if (!shape2.ContainsUnichar(unichar_id1))
00560       break;
00561   }
00562   for (c2 = 0; c2 < shape2.size(); ++c2) {
00563     int unichar_id2 = shape2[c2].unichar_id;
00564     if (!shape1.ContainsUnichar(unichar_id2))
00565       break;
00566   }
00567   return c1 == shape1.size() || c2 == shape2.size();
00568 }
00569 
00570 // Returns false if the unichars in neither shape is a subset of the other.
00571 bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2,
00572                                     int shape_id) const {
00573   const Shape& merge1 = GetShape(merge_id1);
00574   const Shape& merge2 = GetShape(merge_id2);
00575   const Shape& shape = GetShape(shape_id);
00576   int cm1, cm2, cs;
00577   for (cs = 0; cs < shape.size(); ++cs) {
00578     int unichar_id = shape[cs].unichar_id;
00579     if (!merge1.ContainsUnichar(unichar_id) &&
00580         !merge2.ContainsUnichar(unichar_id))
00581       break;  // Shape is not a subset of the merge.
00582   }
00583   for (cm1 = 0; cm1 < merge1.size(); ++cm1) {
00584     int unichar_id1 = merge1[cm1].unichar_id;
00585     if (!shape.ContainsUnichar(unichar_id1))
00586       break;  // Merge is not a subset of shape
00587   }
00588   for (cm2 = 0; cm2 < merge2.size(); ++cm2) {
00589     int unichar_id2 = merge2[cm2].unichar_id;
00590     if (!shape.ContainsUnichar(unichar_id2))
00591       break;  // Merge is not a subset of shape
00592   }
00593   return cs == shape.size() || (cm1 == merge1.size() && cm2 == merge2.size());
00594 }
00595 
00596 // Returns true if the unichar sets are equal between the shapes.
00597 bool ShapeTable::EqualUnichars(int shape_id1, int shape_id2) const {
00598   const Shape& shape1 = GetShape(shape_id1);
00599   const Shape& shape2 = GetShape(shape_id2);
00600   for (int c1 = 0; c1 < shape1.size(); ++c1) {
00601     int unichar_id1 = shape1[c1].unichar_id;
00602     if (!shape2.ContainsUnichar(unichar_id1))
00603       return false;
00604   }
00605   for (int c2 = 0; c2 < shape2.size(); ++c2) {
00606     int unichar_id2 = shape2[c2].unichar_id;
00607     if (!shape1.ContainsUnichar(unichar_id2))
00608       return false;
00609   }
00610   return true;
00611 }
00612 
00613 // Returns true if the unichar sets are equal between the shapes.
00614 bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2,
00615                                     int shape_id) const {
00616   const Shape& merge1 = GetShape(merge_id1);
00617   const Shape& merge2 = GetShape(merge_id2);
00618   const Shape& shape = GetShape(shape_id);
00619   for (int cs = 0; cs < shape.size(); ++cs) {
00620     int unichar_id = shape[cs].unichar_id;
00621     if (!merge1.ContainsUnichar(unichar_id) &&
00622         !merge2.ContainsUnichar(unichar_id))
00623       return false;  // Shape has a unichar that appears in neither merge.
00624   }
00625   for (int cm1 = 0; cm1 < merge1.size(); ++cm1) {
00626     int unichar_id1 = merge1[cm1].unichar_id;
00627     if (!shape.ContainsUnichar(unichar_id1))
00628       return false;  // Merge has a unichar that is not in shape.
00629   }
00630   for (int cm2 = 0; cm2 < merge2.size(); ++cm2) {
00631     int unichar_id2 = merge2[cm2].unichar_id;
00632     if (!shape.ContainsUnichar(unichar_id2))
00633       return false;  // Merge has a unichar that is not in shape.
00634   }
00635   return true;
00636 }
00637 
00638 // Returns true if there is a common unichar between the shapes.
00639 bool ShapeTable::CommonUnichars(int shape_id1, int shape_id2) const {
00640   const Shape& shape1 = GetShape(shape_id1);
00641   const Shape& shape2 = GetShape(shape_id2);
00642   for (int c1 = 0; c1 < shape1.size(); ++c1) {
00643     int unichar_id1 = shape1[c1].unichar_id;
00644     if (shape2.ContainsUnichar(unichar_id1))
00645       return true;
00646   }
00647   return false;
00648 }
00649 
00650 // Returns true if there is a common font id between the shapes.
00651 bool ShapeTable::CommonFont(int shape_id1, int shape_id2) const {
00652   const Shape& shape1 = GetShape(shape_id1);
00653   const Shape& shape2 = GetShape(shape_id2);
00654   for (int c1 = 0; c1 < shape1.size(); ++c1) {
00655     const GenericVector<int>& font_list1 = shape1[c1].font_ids;
00656     for (int f = 0; f < font_list1.size(); ++f) {
00657       if (shape2.ContainsFont(font_list1[f]))
00658         return true;
00659     }
00660   }
00661   return false;
00662 }
00663 
00664 // Appends the master shapes from other to this.
00665 // If not NULL, shape_map is set to map other shape_ids to this's shape_ids.
00666 void ShapeTable::AppendMasterShapes(const ShapeTable& other,
00667                                     GenericVector<int>* shape_map) {
00668   if (shape_map != NULL)
00669     shape_map->init_to_size(other.NumShapes(), -1);
00670   for (int s = 0; s < other.shape_table_.size(); ++s) {
00671     if (other.shape_table_[s]->destination_index() < 0) {
00672       int index = AddShape(*other.shape_table_[s]);
00673       if (shape_map != NULL)
00674         (*shape_map)[s] = index;
00675     }
00676   }
00677 }
00678 
00679 // Returns the number of master shapes remaining after merging.
00680 int ShapeTable::NumMasterShapes() const {
00681   int num_shapes = 0;
00682   for (int s = 0; s < shape_table_.size(); ++s) {
00683     if (shape_table_[s]->destination_index() < 0)
00684       ++num_shapes;
00685   }
00686   return num_shapes;
00687 }
00688 
00689 
00690 // Adds the unichars of the given shape_id to the vector of results. Any
00691 // unichar_id that is already present just has the fonts added to the
00692 // font set for that result without adding a new entry in the vector.
00693 // NOTE: it is assumed that the results are given to this function in order
00694 // of decreasing rating.
00695 // The unichar_map vector indicates the index of the results entry containing
00696 // each unichar, or -1 if the unichar is not yet included in results.
00697 void ShapeTable::AddShapeToResults(const ShapeRating& shape_rating,
00698                                    GenericVector<int>* unichar_map,
00699                                    GenericVector<UnicharRating>* results)const {
00700   if (shape_rating.joined) {
00701     AddUnicharToResults(UNICHAR_JOINED, shape_rating.rating, unichar_map,
00702                         results);
00703   }
00704   if (shape_rating.broken) {
00705     AddUnicharToResults(UNICHAR_BROKEN, shape_rating.rating, unichar_map,
00706                         results);
00707   }
00708   const Shape& shape = GetShape(shape_rating.shape_id);
00709   for (int u = 0; u < shape.size(); ++u) {
00710     int result_index = AddUnicharToResults(shape[u].unichar_id,
00711                                            shape_rating.rating,
00712                                            unichar_map, results);
00713     (*results)[result_index].fonts += shape[u].font_ids;
00714   }
00715 }
00716 
00717 // Adds the given unichar_id to the results if needed, updating unichar_map
00718 // and returning the index of unichar in results.
00719 int ShapeTable::AddUnicharToResults(
00720     int unichar_id, float rating, GenericVector<int>* unichar_map,
00721     GenericVector<UnicharRating>* results) const {
00722   int result_index = unichar_map->get(unichar_id);
00723   if (result_index < 0) {
00724     UnicharRating result(unichar_id, rating);
00725     result_index = results->push_back(result);
00726     (*unichar_map)[unichar_id] = result_index;
00727   }
00728   return result_index;
00729 }
00730 
00731 
00732 }  // namespace tesseract
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines