tesseract
3.03
|
00001 // Copyright 2010 Google Inc. All Rights Reserved. 00002 // Author: rays@google.com (Ray Smith) 00004 // File: shapetable.cpp 00005 // Description: Class to map a classifier shape index to unicharset 00006 // indices and font indices. 00007 // Author: Ray Smith 00008 // Created: Tue Nov 02 15:31:32 PDT 2010 00009 // 00010 // (C) Copyright 2010, Google Inc. 00011 // Licensed under the Apache License, Version 2.0 (the "License"); 00012 // you may not use this file except in compliance with the License. 00013 // You may obtain a copy of the License at 00014 // http://www.apache.org/licenses/LICENSE-2.0 00015 // Unless required by applicable law or agreed to in writing, software 00016 // distributed under the License is distributed on an "AS IS" BASIS, 00017 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00018 // See the License for the specific language governing permissions and 00019 // limitations under the License. 00020 // 00022 00023 #include "shapetable.h" 00024 00025 #include "bitvector.h" 00026 #include "fontinfo.h" 00027 #include "intfeaturespace.h" 00028 #include "strngs.h" 00029 #include "unicharset.h" 00030 #include "unicity_table.h" 00031 00032 namespace tesseract { 00033 00034 // Helper function to get the index of the first result with the required 00035 // unichar_id. If the results are sorted by rating, this will also be the 00036 // best result with the required unichar_id. 00037 // Returns -1 if the unichar_id is not found 00038 int ShapeRating::FirstResultWithUnichar( 00039 const GenericVector<ShapeRating>& results, 00040 const ShapeTable& shape_table, 00041 UNICHAR_ID unichar_id) { 00042 for (int r = 0; r < results.size(); ++r) { 00043 int shape_id = results[r].shape_id; 00044 const Shape& shape = shape_table.GetShape(shape_id); 00045 if (shape.ContainsUnichar(unichar_id)) { 00046 return r; 00047 } 00048 } 00049 return -1; 00050 } 00051 00052 // Helper function to get the index of the first result with the required 00053 // unichar_id. If the results are sorted by rating, this will also be the 00054 // best result with the required unichar_id. 00055 // Returns -1 if the unichar_id is not found 00056 int UnicharRating::FirstResultWithUnichar( 00057 const GenericVector<UnicharRating>& results, 00058 UNICHAR_ID unichar_id) { 00059 for (int r = 0; r < results.size(); ++r) { 00060 if (results[r].unichar_id == unichar_id) 00061 return r; 00062 } 00063 return -1; 00064 } 00065 00066 // Writes to the given file. Returns false in case of error. 00067 bool UnicharAndFonts::Serialize(FILE* fp) const { 00068 if (fwrite(&unichar_id, sizeof(unichar_id), 1, fp) != 1) return false; 00069 if (!font_ids.Serialize(fp)) return false; 00070 return true; 00071 } 00072 // Reads from the given file. Returns false in case of error. 00073 // If swap is true, assumes a big/little-endian swap is needed. 00074 bool UnicharAndFonts::DeSerialize(bool swap, FILE* fp) { 00075 if (fread(&unichar_id, sizeof(unichar_id), 1, fp) != 1) return false; 00076 if (swap) 00077 ReverseN(&unichar_id, sizeof(unichar_id)); 00078 if (!font_ids.DeSerialize(swap, fp)) return false; 00079 return true; 00080 } 00081 00082 // Sort function to sort a pair of UnicharAndFonts by unichar_id. 00083 int UnicharAndFonts::SortByUnicharId(const void* v1, const void* v2) { 00084 const UnicharAndFonts* p1 = reinterpret_cast<const UnicharAndFonts*>(v1); 00085 const UnicharAndFonts* p2 = reinterpret_cast<const UnicharAndFonts*>(v2); 00086 return p1->unichar_id - p2->unichar_id; 00087 } 00088 00089 // Writes to the given file. Returns false in case of error. 00090 bool Shape::Serialize(FILE* fp) const { 00091 uinT8 sorted = unichars_sorted_; 00092 if (fwrite(&sorted, sizeof(sorted), 1, fp) != 1) 00093 return false; 00094 if (!unichars_.SerializeClasses(fp)) return false; 00095 return true; 00096 } 00097 // Reads from the given file. Returns false in case of error. 00098 // If swap is true, assumes a big/little-endian swap is needed. 00099 bool Shape::DeSerialize(bool swap, FILE* fp) { 00100 uinT8 sorted; 00101 if (fread(&sorted, sizeof(sorted), 1, fp) != 1) 00102 return false; 00103 unichars_sorted_ = sorted != 0; 00104 if (!unichars_.DeSerializeClasses(swap, fp)) return false; 00105 return true; 00106 } 00107 00108 // Adds a font_id for the given unichar_id. If the unichar_id is not 00109 // in the shape, it is added. 00110 void Shape::AddToShape(int unichar_id, int font_id) { 00111 for (int c = 0; c < unichars_.size(); ++c) { 00112 if (unichars_[c].unichar_id == unichar_id) { 00113 // Found the unichar in the shape table. 00114 GenericVector<int>& font_list = unichars_[c].font_ids; 00115 for (int f = 0; f < font_list.size(); ++f) { 00116 if (font_list[f] == font_id) 00117 return; // Font is already there. 00118 } 00119 font_list.push_back(font_id); 00120 return; 00121 } 00122 } 00123 // Unichar_id is not in shape, so add it to shape. 00124 unichars_.push_back(UnicharAndFonts(unichar_id, font_id)); 00125 unichars_sorted_ = unichars_.size() <= 1; 00126 } 00127 00128 // Adds everything in other to this. 00129 void Shape::AddShape(const Shape& other) { 00130 for (int c = 0; c < other.unichars_.size(); ++c) { 00131 for (int f = 0; f < other.unichars_[c].font_ids.size(); ++f) { 00132 AddToShape(other.unichars_[c].unichar_id, 00133 other.unichars_[c].font_ids[f]); 00134 } 00135 } 00136 unichars_sorted_ = unichars_.size() <= 1; 00137 } 00138 00139 // Returns true if the shape contains the given unichar_id, font_id pair. 00140 bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const { 00141 for (int c = 0; c < unichars_.size(); ++c) { 00142 if (unichars_[c].unichar_id == unichar_id) { 00143 // Found the unichar, so look for the font. 00144 GenericVector<int>& font_list = unichars_[c].font_ids; 00145 for (int f = 0; f < font_list.size(); ++f) { 00146 if (font_list[f] == font_id) 00147 return true; 00148 } 00149 return false; 00150 } 00151 } 00152 return false; 00153 } 00154 00155 // Returns true if the shape contains the given unichar_id, ignoring font. 00156 bool Shape::ContainsUnichar(int unichar_id) const { 00157 for (int c = 0; c < unichars_.size(); ++c) { 00158 if (unichars_[c].unichar_id == unichar_id) { 00159 return true; 00160 } 00161 } 00162 return false; 00163 } 00164 00165 // Returns true if the shape contains the given font, ignoring unichar_id. 00166 bool Shape::ContainsFont(int font_id) const { 00167 for (int c = 0; c < unichars_.size(); ++c) { 00168 GenericVector<int>& font_list = unichars_[c].font_ids; 00169 for (int f = 0; f < font_list.size(); ++f) { 00170 if (font_list[f] == font_id) 00171 return true; 00172 } 00173 } 00174 return false; 00175 } 00176 // Returns true if the shape contains the given font properties, ignoring 00177 // unichar_id. 00178 bool Shape::ContainsFontProperties(const FontInfoTable& font_table, 00179 uinT32 properties) const { 00180 for (int c = 0; c < unichars_.size(); ++c) { 00181 GenericVector<int>& font_list = unichars_[c].font_ids; 00182 for (int f = 0; f < font_list.size(); ++f) { 00183 if (font_table.get(font_list[f]).properties == properties) 00184 return true; 00185 } 00186 } 00187 return false; 00188 } 00189 // Returns true if the shape contains multiple different font properties, 00190 // ignoring unichar_id. 00191 bool Shape::ContainsMultipleFontProperties( 00192 const FontInfoTable& font_table) const { 00193 uinT32 properties = font_table.get(unichars_[0].font_ids[0]).properties; 00194 for (int c = 0; c < unichars_.size(); ++c) { 00195 GenericVector<int>& font_list = unichars_[c].font_ids; 00196 for (int f = 0; f < font_list.size(); ++f) { 00197 if (font_table.get(font_list[f]).properties != properties) 00198 return true; 00199 } 00200 } 00201 return false; 00202 } 00203 00204 // Returns true if this shape is equal to other (ignoring order of unichars 00205 // and fonts). 00206 bool Shape::operator==(const Shape& other) const { 00207 return IsSubsetOf(other) && other.IsSubsetOf(*this); 00208 } 00209 00210 // Returns true if this is a subset (including equal) of other. 00211 bool Shape::IsSubsetOf(const Shape& other) const { 00212 for (int c = 0; c < unichars_.size(); ++c) { 00213 int unichar_id = unichars_[c].unichar_id; 00214 const GenericVector<int>& font_list = unichars_[c].font_ids; 00215 for (int f = 0; f < font_list.size(); ++f) { 00216 if (!other.ContainsUnicharAndFont(unichar_id, font_list[f])) 00217 return false; 00218 } 00219 } 00220 return true; 00221 } 00222 00223 // Returns true if the lists of unichar ids are the same in this and other, 00224 // ignoring fonts. 00225 // NOT const, as it will sort the unichars on demand. 00226 bool Shape::IsEqualUnichars(Shape* other) { 00227 if (unichars_.size() != other->unichars_.size()) return false; 00228 if (!unichars_sorted_) SortUnichars(); 00229 if (!other->unichars_sorted_) other->SortUnichars(); 00230 for (int c = 0; c < unichars_.size(); ++c) { 00231 if (unichars_[c].unichar_id != other->unichars_[c].unichar_id) 00232 return false; 00233 } 00234 return true; 00235 } 00236 00237 // Sorts the unichars_ vector by unichar. 00238 void Shape::SortUnichars() { 00239 unichars_.sort(UnicharAndFonts::SortByUnicharId); 00240 unichars_sorted_ = true; 00241 } 00242 00243 ShapeTable::ShapeTable() : unicharset_(NULL), num_fonts_(0) { 00244 } 00245 ShapeTable::ShapeTable(const UNICHARSET& unicharset) 00246 : unicharset_(&unicharset), num_fonts_(0) { 00247 } 00248 00249 // Writes to the given file. Returns false in case of error. 00250 bool ShapeTable::Serialize(FILE* fp) const { 00251 if (!shape_table_.Serialize(fp)) return false; 00252 return true; 00253 } 00254 // Reads from the given file. Returns false in case of error. 00255 // If swap is true, assumes a big/little-endian swap is needed. 00256 bool ShapeTable::DeSerialize(bool swap, FILE* fp) { 00257 if (!shape_table_.DeSerialize(swap, fp)) return false; 00258 num_fonts_ = 0; 00259 return true; 00260 } 00261 00262 // Returns the number of fonts used in this ShapeTable, computing it if 00263 // necessary. 00264 int ShapeTable::NumFonts() const { 00265 if (num_fonts_ <= 0) { 00266 for (int shape_id = 0; shape_id < shape_table_.size(); ++shape_id) { 00267 const Shape& shape = *shape_table_[shape_id]; 00268 for (int c = 0; c < shape.size(); ++c) { 00269 for (int f = 0; f < shape[c].font_ids.size(); ++f) { 00270 if (shape[c].font_ids[f] >= num_fonts_) 00271 num_fonts_ = shape[c].font_ids[f] + 1; 00272 } 00273 } 00274 } 00275 } 00276 return num_fonts_; 00277 } 00278 00279 // Re-indexes the class_ids in the shapetable according to the given map. 00280 // Useful in conjunction with set_unicharset. 00281 void ShapeTable::ReMapClassIds(const GenericVector<int>& unicharset_map) { 00282 for (int shape_id = 0; shape_id < shape_table_.size(); ++shape_id) { 00283 Shape* shape = shape_table_[shape_id]; 00284 for (int c = 0; c < shape->size(); ++c) { 00285 shape->SetUnicharId(c, unicharset_map[(*shape)[c].unichar_id]); 00286 } 00287 } 00288 } 00289 00290 // Returns a string listing the classes/fonts in a shape. 00291 STRING ShapeTable::DebugStr(int shape_id) const { 00292 if (shape_id < 0 || shape_id >= shape_table_.size()) 00293 return STRING("INVALID_UNICHAR_ID"); 00294 const Shape& shape = GetShape(shape_id); 00295 STRING result; 00296 result.add_str_int("Shape", shape_id); 00297 if (shape.size() > 100) { 00298 result.add_str_int(" Num unichars=", shape.size()); 00299 return result; 00300 } 00301 for (int c = 0; c < shape.size(); ++c) { 00302 result.add_str_int(" c_id=", shape[c].unichar_id); 00303 result += "="; 00304 result += unicharset_->id_to_unichar(shape[c].unichar_id); 00305 if (shape.size() < 10) { 00306 result.add_str_int(", ", shape[c].font_ids.size()); 00307 result += " fonts ="; 00308 int num_fonts = shape[c].font_ids.size(); 00309 if (num_fonts > 10) { 00310 result.add_str_int(" ", shape[c].font_ids[0]); 00311 result.add_str_int(" ... ", shape[c].font_ids[num_fonts - 1]); 00312 } else { 00313 for (int f = 0; f < num_fonts; ++f) { 00314 result.add_str_int(" ", shape[c].font_ids[f]); 00315 } 00316 } 00317 } 00318 } 00319 return result; 00320 } 00321 00322 // Returns a debug string summarizing the table. 00323 STRING ShapeTable::SummaryStr() const { 00324 int max_unichars = 0; 00325 int num_multi_shapes = 0; 00326 int num_master_shapes = 0; 00327 for (int s = 0; s < shape_table_.size(); ++s) { 00328 if (MasterDestinationIndex(s) != s) continue; 00329 ++num_master_shapes; 00330 int shape_size = GetShape(s).size(); 00331 if (shape_size > 1) 00332 ++num_multi_shapes; 00333 if (shape_size > max_unichars) 00334 max_unichars = shape_size; 00335 } 00336 STRING result; 00337 result.add_str_int("Number of shapes = ", num_master_shapes); 00338 result.add_str_int(" max unichars = ", max_unichars); 00339 result.add_str_int(" number with multiple unichars = ", num_multi_shapes); 00340 return result; 00341 } 00342 00343 00344 // Adds a new shape starting with the given unichar_id and font_id. 00345 // Returns the assigned index. 00346 int ShapeTable::AddShape(int unichar_id, int font_id) { 00347 int index = shape_table_.size(); 00348 Shape* shape = new Shape; 00349 shape->AddToShape(unichar_id, font_id); 00350 shape_table_.push_back(shape); 00351 num_fonts_ = MAX(num_fonts_, font_id + 1); 00352 return index; 00353 } 00354 00355 // Adds a copy of the given shape unless it is already present. 00356 // Returns the assigned index or index of existing shape if already present. 00357 int ShapeTable::AddShape(const Shape& other) { 00358 int index; 00359 for (index = 0; index < shape_table_.size() && 00360 !(other == *shape_table_[index]); ++index) 00361 continue; 00362 if (index == shape_table_.size()) { 00363 Shape* shape = new Shape(other); 00364 shape_table_.push_back(shape); 00365 } 00366 num_fonts_ = 0; 00367 return index; 00368 } 00369 00370 // Removes the shape given by the shape index. 00371 void ShapeTable::DeleteShape(int shape_id) { 00372 delete shape_table_[shape_id]; 00373 shape_table_[shape_id] = NULL; 00374 shape_table_.remove(shape_id); 00375 } 00376 00377 // Adds a font_id to the given existing shape index for the given 00378 // unichar_id. If the unichar_id is not in the shape, it is added. 00379 void ShapeTable::AddToShape(int shape_id, int unichar_id, int font_id) { 00380 Shape& shape = *shape_table_[shape_id]; 00381 shape.AddToShape(unichar_id, font_id); 00382 num_fonts_ = MAX(num_fonts_, font_id + 1); 00383 } 00384 00385 // Adds the given shape to the existing shape with the given index. 00386 void ShapeTable::AddShapeToShape(int shape_id, const Shape& other) { 00387 Shape& shape = *shape_table_[shape_id]; 00388 shape.AddShape(other); 00389 num_fonts_ = 0; 00390 } 00391 00392 // Returns the id of the shape that contains the given unichar and font. 00393 // If not found, returns -1. 00394 // If font_id < 0, the font_id is ignored and the first shape that matches 00395 // the unichar_id is returned. 00396 int ShapeTable::FindShape(int unichar_id, int font_id) const { 00397 for (int s = 0; s < shape_table_.size(); ++s) { 00398 const Shape& shape = GetShape(s); 00399 for (int c = 0; c < shape.size(); ++c) { 00400 if (shape[c].unichar_id == unichar_id) { 00401 if (font_id < 0) 00402 return s; // We don't care about the font. 00403 for (int f = 0; f < shape[c].font_ids.size(); ++f) { 00404 if (shape[c].font_ids[f] == font_id) 00405 return s; 00406 } 00407 } 00408 } 00409 } 00410 return -1; 00411 } 00412 00413 // Returns the first unichar_id and font_id in the given shape. 00414 void ShapeTable::GetFirstUnicharAndFont(int shape_id, 00415 int* unichar_id, int* font_id) const { 00416 const UnicharAndFonts& unichar_and_fonts = (*shape_table_[shape_id])[0]; 00417 *unichar_id = unichar_and_fonts.unichar_id; 00418 *font_id = unichar_and_fonts.font_ids[0]; 00419 } 00420 00421 // Expands all the classes/fonts in the shape individually to build 00422 // a ShapeTable. 00423 int ShapeTable::BuildFromShape(const Shape& shape, 00424 const ShapeTable& master_shapes) { 00425 BitVector shape_map(master_shapes.NumShapes()); 00426 for (int u_ind = 0; u_ind < shape.size(); ++u_ind) { 00427 for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) { 00428 int c = shape[u_ind].unichar_id; 00429 int f = shape[u_ind].font_ids[f_ind]; 00430 int master_id = master_shapes.FindShape(c, f); 00431 if (master_id >= 0) { 00432 shape_map.SetBit(master_id); 00433 } else if (FindShape(c, f) < 0) { 00434 AddShape(c, f); 00435 } 00436 } 00437 } 00438 int num_masters = 0; 00439 for (int s = 0; s < master_shapes.NumShapes(); ++s) { 00440 if (shape_map[s]) { 00441 AddShape(master_shapes.GetShape(s)); 00442 ++num_masters; 00443 } 00444 } 00445 return num_masters; 00446 } 00447 00448 // Returns true if the shapes are already merged. 00449 bool ShapeTable::AlreadyMerged(int shape_id1, int shape_id2) const { 00450 return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2); 00451 } 00452 00453 // Returns true if any shape contains multiple unichars. 00454 bool ShapeTable::AnyMultipleUnichars() const { 00455 int num_shapes = NumShapes(); 00456 for (int s1 = 0; s1 < num_shapes; ++s1) { 00457 if (MasterDestinationIndex(s1) != s1) continue; 00458 if (GetShape(s1).size() > 1) 00459 return true; 00460 } 00461 return false; 00462 } 00463 00464 // Returns the maximum number of unichars over all shapes. 00465 int ShapeTable::MaxNumUnichars() const { 00466 int max_num_unichars = 0; 00467 int num_shapes = NumShapes(); 00468 for (int s = 0; s < num_shapes; ++s) { 00469 if (GetShape(s).size() > max_num_unichars) 00470 max_num_unichars = GetShape(s).size(); 00471 } 00472 return max_num_unichars; 00473 } 00474 00475 00476 // Merges shapes with a common unichar over the [start, end) interval. 00477 // Assumes single unichar per shape. 00478 void ShapeTable::ForceFontMerges(int start, int end) { 00479 for (int s1 = start; s1 < end; ++s1) { 00480 if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) { 00481 int unichar_id = GetShape(s1)[0].unichar_id; 00482 for (int s2 = s1 + 1; s2 < end; ++s2) { 00483 if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 && 00484 unichar_id == GetShape(s2)[0].unichar_id) { 00485 MergeShapes(s1, s2); 00486 } 00487 } 00488 } 00489 } 00490 ShapeTable compacted(*unicharset_); 00491 compacted.AppendMasterShapes(*this, NULL); 00492 *this = compacted; 00493 } 00494 00495 // Returns the number of unichars in the master shape. 00496 int ShapeTable::MasterUnicharCount(int shape_id) const { 00497 int master_id = MasterDestinationIndex(shape_id); 00498 return GetShape(master_id).size(); 00499 } 00500 00501 // Returns the sum of the font counts in the master shape. 00502 int ShapeTable::MasterFontCount(int shape_id) const { 00503 int master_id = MasterDestinationIndex(shape_id); 00504 const Shape& shape = GetShape(master_id); 00505 int font_count = 0; 00506 for (int c = 0; c < shape.size(); ++c) { 00507 font_count += shape[c].font_ids.size(); 00508 } 00509 return font_count; 00510 } 00511 00512 // Returns the number of unichars that would result from merging the shapes. 00513 int ShapeTable::MergedUnicharCount(int shape_id1, int shape_id2) const { 00514 // Do it the easy way for now. 00515 int master_id1 = MasterDestinationIndex(shape_id1); 00516 int master_id2 = MasterDestinationIndex(shape_id2); 00517 Shape combined_shape(*shape_table_[master_id1]); 00518 combined_shape.AddShape(*shape_table_[master_id2]); 00519 return combined_shape.size(); 00520 } 00521 00522 // Merges two shape_ids, leaving shape_id2 marked as merged. 00523 void ShapeTable::MergeShapes(int shape_id1, int shape_id2) { 00524 int master_id1 = MasterDestinationIndex(shape_id1); 00525 int master_id2 = MasterDestinationIndex(shape_id2); 00526 // Point master_id2 (and all merged shapes) to master_id1. 00527 shape_table_[master_id2]->set_destination_index(master_id1); 00528 // Add all the shapes of master_id2 to master_id1. 00529 shape_table_[master_id1]->AddShape(*shape_table_[master_id2]); 00530 } 00531 00532 // Swaps two shape_ids. 00533 void ShapeTable::SwapShapes(int shape_id1, int shape_id2) { 00534 Shape* tmp = shape_table_[shape_id1]; 00535 shape_table_[shape_id1] = shape_table_[shape_id2]; 00536 shape_table_[shape_id2] = tmp; 00537 } 00538 00539 // Returns the destination of this shape, (if merged), taking into account 00540 // the fact that the destination may itself have been merged. 00541 int ShapeTable::MasterDestinationIndex(int shape_id) const { 00542 int dest_id = shape_table_[shape_id]->destination_index(); 00543 if (dest_id == shape_id || dest_id < 0) 00544 return shape_id; // Is master already. 00545 int master_id = shape_table_[dest_id]->destination_index(); 00546 if (master_id == dest_id || master_id < 0) 00547 return dest_id; // Dest is the master and shape_id points to it. 00548 master_id = MasterDestinationIndex(master_id); 00549 return master_id; 00550 } 00551 00552 // Returns false if the unichars in neither shape is a subset of the other. 00553 bool ShapeTable::SubsetUnichar(int shape_id1, int shape_id2) const { 00554 const Shape& shape1 = GetShape(shape_id1); 00555 const Shape& shape2 = GetShape(shape_id2); 00556 int c1, c2; 00557 for (c1 = 0; c1 < shape1.size(); ++c1) { 00558 int unichar_id1 = shape1[c1].unichar_id; 00559 if (!shape2.ContainsUnichar(unichar_id1)) 00560 break; 00561 } 00562 for (c2 = 0; c2 < shape2.size(); ++c2) { 00563 int unichar_id2 = shape2[c2].unichar_id; 00564 if (!shape1.ContainsUnichar(unichar_id2)) 00565 break; 00566 } 00567 return c1 == shape1.size() || c2 == shape2.size(); 00568 } 00569 00570 // Returns false if the unichars in neither shape is a subset of the other. 00571 bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2, 00572 int shape_id) const { 00573 const Shape& merge1 = GetShape(merge_id1); 00574 const Shape& merge2 = GetShape(merge_id2); 00575 const Shape& shape = GetShape(shape_id); 00576 int cm1, cm2, cs; 00577 for (cs = 0; cs < shape.size(); ++cs) { 00578 int unichar_id = shape[cs].unichar_id; 00579 if (!merge1.ContainsUnichar(unichar_id) && 00580 !merge2.ContainsUnichar(unichar_id)) 00581 break; // Shape is not a subset of the merge. 00582 } 00583 for (cm1 = 0; cm1 < merge1.size(); ++cm1) { 00584 int unichar_id1 = merge1[cm1].unichar_id; 00585 if (!shape.ContainsUnichar(unichar_id1)) 00586 break; // Merge is not a subset of shape 00587 } 00588 for (cm2 = 0; cm2 < merge2.size(); ++cm2) { 00589 int unichar_id2 = merge2[cm2].unichar_id; 00590 if (!shape.ContainsUnichar(unichar_id2)) 00591 break; // Merge is not a subset of shape 00592 } 00593 return cs == shape.size() || (cm1 == merge1.size() && cm2 == merge2.size()); 00594 } 00595 00596 // Returns true if the unichar sets are equal between the shapes. 00597 bool ShapeTable::EqualUnichars(int shape_id1, int shape_id2) const { 00598 const Shape& shape1 = GetShape(shape_id1); 00599 const Shape& shape2 = GetShape(shape_id2); 00600 for (int c1 = 0; c1 < shape1.size(); ++c1) { 00601 int unichar_id1 = shape1[c1].unichar_id; 00602 if (!shape2.ContainsUnichar(unichar_id1)) 00603 return false; 00604 } 00605 for (int c2 = 0; c2 < shape2.size(); ++c2) { 00606 int unichar_id2 = shape2[c2].unichar_id; 00607 if (!shape1.ContainsUnichar(unichar_id2)) 00608 return false; 00609 } 00610 return true; 00611 } 00612 00613 // Returns true if the unichar sets are equal between the shapes. 00614 bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2, 00615 int shape_id) const { 00616 const Shape& merge1 = GetShape(merge_id1); 00617 const Shape& merge2 = GetShape(merge_id2); 00618 const Shape& shape = GetShape(shape_id); 00619 for (int cs = 0; cs < shape.size(); ++cs) { 00620 int unichar_id = shape[cs].unichar_id; 00621 if (!merge1.ContainsUnichar(unichar_id) && 00622 !merge2.ContainsUnichar(unichar_id)) 00623 return false; // Shape has a unichar that appears in neither merge. 00624 } 00625 for (int cm1 = 0; cm1 < merge1.size(); ++cm1) { 00626 int unichar_id1 = merge1[cm1].unichar_id; 00627 if (!shape.ContainsUnichar(unichar_id1)) 00628 return false; // Merge has a unichar that is not in shape. 00629 } 00630 for (int cm2 = 0; cm2 < merge2.size(); ++cm2) { 00631 int unichar_id2 = merge2[cm2].unichar_id; 00632 if (!shape.ContainsUnichar(unichar_id2)) 00633 return false; // Merge has a unichar that is not in shape. 00634 } 00635 return true; 00636 } 00637 00638 // Returns true if there is a common unichar between the shapes. 00639 bool ShapeTable::CommonUnichars(int shape_id1, int shape_id2) const { 00640 const Shape& shape1 = GetShape(shape_id1); 00641 const Shape& shape2 = GetShape(shape_id2); 00642 for (int c1 = 0; c1 < shape1.size(); ++c1) { 00643 int unichar_id1 = shape1[c1].unichar_id; 00644 if (shape2.ContainsUnichar(unichar_id1)) 00645 return true; 00646 } 00647 return false; 00648 } 00649 00650 // Returns true if there is a common font id between the shapes. 00651 bool ShapeTable::CommonFont(int shape_id1, int shape_id2) const { 00652 const Shape& shape1 = GetShape(shape_id1); 00653 const Shape& shape2 = GetShape(shape_id2); 00654 for (int c1 = 0; c1 < shape1.size(); ++c1) { 00655 const GenericVector<int>& font_list1 = shape1[c1].font_ids; 00656 for (int f = 0; f < font_list1.size(); ++f) { 00657 if (shape2.ContainsFont(font_list1[f])) 00658 return true; 00659 } 00660 } 00661 return false; 00662 } 00663 00664 // Appends the master shapes from other to this. 00665 // If not NULL, shape_map is set to map other shape_ids to this's shape_ids. 00666 void ShapeTable::AppendMasterShapes(const ShapeTable& other, 00667 GenericVector<int>* shape_map) { 00668 if (shape_map != NULL) 00669 shape_map->init_to_size(other.NumShapes(), -1); 00670 for (int s = 0; s < other.shape_table_.size(); ++s) { 00671 if (other.shape_table_[s]->destination_index() < 0) { 00672 int index = AddShape(*other.shape_table_[s]); 00673 if (shape_map != NULL) 00674 (*shape_map)[s] = index; 00675 } 00676 } 00677 } 00678 00679 // Returns the number of master shapes remaining after merging. 00680 int ShapeTable::NumMasterShapes() const { 00681 int num_shapes = 0; 00682 for (int s = 0; s < shape_table_.size(); ++s) { 00683 if (shape_table_[s]->destination_index() < 0) 00684 ++num_shapes; 00685 } 00686 return num_shapes; 00687 } 00688 00689 00690 // Adds the unichars of the given shape_id to the vector of results. Any 00691 // unichar_id that is already present just has the fonts added to the 00692 // font set for that result without adding a new entry in the vector. 00693 // NOTE: it is assumed that the results are given to this function in order 00694 // of decreasing rating. 00695 // The unichar_map vector indicates the index of the results entry containing 00696 // each unichar, or -1 if the unichar is not yet included in results. 00697 void ShapeTable::AddShapeToResults(const ShapeRating& shape_rating, 00698 GenericVector<int>* unichar_map, 00699 GenericVector<UnicharRating>* results)const { 00700 if (shape_rating.joined) { 00701 AddUnicharToResults(UNICHAR_JOINED, shape_rating.rating, unichar_map, 00702 results); 00703 } 00704 if (shape_rating.broken) { 00705 AddUnicharToResults(UNICHAR_BROKEN, shape_rating.rating, unichar_map, 00706 results); 00707 } 00708 const Shape& shape = GetShape(shape_rating.shape_id); 00709 for (int u = 0; u < shape.size(); ++u) { 00710 int result_index = AddUnicharToResults(shape[u].unichar_id, 00711 shape_rating.rating, 00712 unichar_map, results); 00713 (*results)[result_index].fonts += shape[u].font_ids; 00714 } 00715 } 00716 00717 // Adds the given unichar_id to the results if needed, updating unichar_map 00718 // and returning the index of unichar in results. 00719 int ShapeTable::AddUnicharToResults( 00720 int unichar_id, float rating, GenericVector<int>* unichar_map, 00721 GenericVector<UnicharRating>* results) const { 00722 int result_index = unichar_map->get(unichar_id); 00723 if (result_index < 0) { 00724 UnicharRating result(unichar_id, rating); 00725 result_index = results->push_back(result); 00726 (*unichar_map)[unichar_id] = result_index; 00727 } 00728 return result_index; 00729 } 00730 00731 00732 } // namespace tesseract