tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccmain/osdetect.cpp
Go to the documentation of this file.
00001 
00002 // File:        osdetect.cpp
00003 // Description: Orientation and script detection.
00004 // Author:      Samuel Charron
00005 //              Ranjith Unnikrishnan
00006 //
00007 // (C) Copyright 2008, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #include "osdetect.h"
00021 
00022 #include "blobbox.h"
00023 #include "blread.h"
00024 #include "colfind.h"
00025 #include "fontinfo.h"
00026 #include "imagefind.h"
00027 #include "linefind.h"
00028 #include "oldlist.h"
00029 #include "qrsequence.h"
00030 #include "ratngs.h"
00031 #include "strngs.h"
00032 #include "tabvector.h"
00033 #include "tesseractclass.h"
00034 #include "textord.h"
00035 
// Sampling limits for os_detect_blobs(): a page with fewer than half of
// kMinCharactersToTry usable blobs is skipped, early stopping is only allowed
// once more than kMinCharactersToTry blobs have been tried, and at most
// kMaxCharactersToTry blobs are ever classified.
const int kMinCharactersToTry = 50;
const int kMaxCharactersToTry = kMinCharactersToTry * 5;

// Blob filter used by os_detect(): blobs whose longer side exceeds the shorter
// side by more than this factor are rejected, as are blobs shorter than the
// minimum acceptable height.
const float kSizeRatioToReject = 2.0;
const int kMinAcceptableBlobHeight = 10;

// Ratio thresholds used in the orientation stopping test and in the script
// confidence computation respectively.
const float kOrientationAcceptRatio = 1.3;
const float kScriptAcceptRatio = 1.3;

// Weight that a single unambiguous Han character contributes to the Korean
// and Japanese pseudo-script scores.
const float kHanRatioInKorean = 0.7;
const float kHanRatioInJapanese = 0.3;

// A second script whose (negated) certainty lies within this margin of the
// best one makes the character ambiguous.
const float kNonAmbiguousMargin = 1.0;
00049 
00050 // General scripts
00051 static const char* han_script = "Han";
00052 static const char* latin_script = "Latin";
00053 static const char* katakana_script = "Katakana";
00054 static const char* hiragana_script = "Hiragana";
00055 static const char* hangul_script = "Hangul";
00056 
00057 // Pseudo-scripts Name
00058 const char* ScriptDetector::korean_script_ = "Korean";
00059 const char* ScriptDetector::japanese_script_ = "Japanese";
00060 const char* ScriptDetector::fraktur_script_ = "Fraktur";
00061 
// Lowest image resolution (as reported by the image) that is taken at face
// value.
const int kMinCredibleResolution = 70;
// Resolution assumed instead when the reported one is not credible.
const int kDefaultResolution = 300;
00066 
00067 void OSResults::update_best_orientation() {
00068   float first = orientations[0];
00069   float second = orientations[1];
00070   best_result.orientation_id = 0;
00071   if (orientations[0] < orientations[1]) {
00072     first = orientations[1];
00073     second = orientations[0];
00074     best_result.orientation_id = 1;
00075   }
00076   for (int i = 2; i < 4; ++i) {
00077     if (orientations[i] > first) {
00078       second = first;
00079       first = orientations[i];
00080       best_result.orientation_id = i;
00081     } else if (orientations[i] > second) {
00082       second = orientations[i];
00083     }
00084   }
00085   // Store difference of top two orientation scores.
00086   best_result.oconfidence = first - second;
00087 }
00088 
00089 void OSResults::set_best_orientation(int orientation_id) {
00090   best_result.orientation_id = orientation_id;
00091   best_result.oconfidence = 0;
00092 }
00093 
00094 void OSResults::update_best_script(int orientation) {
00095   // We skip index 0 to ignore the "Common" script.
00096   float first = scripts_na[orientation][1];
00097   float second = scripts_na[orientation][2];
00098   best_result.script_id = 1;
00099   if (scripts_na[orientation][1] < scripts_na[orientation][2]) {
00100     first = scripts_na[orientation][2];
00101     second = scripts_na[orientation][1];
00102     best_result.script_id = 2;
00103   }
00104   for (int i = 3; i < kMaxNumberOfScripts; ++i) {
00105     if (scripts_na[orientation][i] > first) {
00106       best_result.script_id = i;
00107       second = first;
00108       first = scripts_na[orientation][i];
00109     } else if (scripts_na[orientation][i] > second) {
00110       second = scripts_na[orientation][i];
00111     }
00112   }
00113   best_result.sconfidence =
00114       (first / second - 1.0) / (kScriptAcceptRatio - 1.0);
00115 }
00116 
00117 int OSResults::get_best_script(int orientation_id) const {
00118   int max_id = -1;
00119   for (int j = 0; j < kMaxNumberOfScripts; ++j) {
00120     const char *script = unicharset->get_script_from_script_id(j);
00121     if (strcmp(script, "Common") && strcmp(script, "NULL")) {
00122       if (max_id == -1 ||
00123           scripts_na[orientation_id][j] > scripts_na[orientation_id][max_id])
00124         max_id = j;
00125     }
00126   }
00127   return max_id;
00128 }
00129 
00130 // Print the script scores for all possible orientations.
00131 void OSResults::print_scores(void) const {
00132   for (int i = 0; i < 4; ++i) {
00133     tprintf("Orientation id #%d", i);
00134     print_scores(i);
00135   }
00136 }
00137 
00138 // Print the script scores for the given candidate orientation.
00139 void OSResults::print_scores(int orientation_id) const {
00140   for (int j = 0; j < kMaxNumberOfScripts; ++j) {
00141     if (scripts_na[orientation_id][j]) {
00142       tprintf("%12s\t: %f\n", unicharset->get_script_from_script_id(j),
00143              scripts_na[orientation_id][j]);
00144     }
00145   }
00146 }
00147 
00148 // Accumulate scores with given OSResults instance and update the best script.
00149 void OSResults::accumulate(const OSResults& osr) {
00150   for (int i = 0; i < 4; ++i) {
00151     orientations[i] += osr.orientations[i];
00152     for (int j = 0; j < kMaxNumberOfScripts; ++j)
00153       scripts_na[i][j] += osr.scripts_na[i][j];
00154   }
00155   unicharset = osr.unicharset;
00156   update_best_orientation();
00157   update_best_script(best_result.orientation_id);
00158 }
00159 
00160 // Detect and erase horizontal/vertical lines and picture regions from the
00161 // image, so that non-text blobs are removed from consideration.
00162 void remove_nontext_regions(tesseract::Tesseract *tess, BLOCK_LIST *blocks,
00163                             TO_BLOCK_LIST *to_blocks) {
00164   Pix *pix = tess->pix_binary();
00165   ASSERT_HOST(pix != NULL);
00166   int vertical_x = 0;
00167   int vertical_y = 1;
00168   tesseract::TabVector_LIST v_lines;
00169   tesseract::TabVector_LIST h_lines;
00170   const int kMinCredibleResolution = 70;
00171   int resolution = (kMinCredibleResolution > pixGetXRes(pix)) ?
00172       kMinCredibleResolution : pixGetXRes(pix);
00173 
00174   tesseract::LineFinder::FindAndRemoveLines(resolution, false, pix,
00175                                             &vertical_x, &vertical_y,
00176                                             NULL, &v_lines, &h_lines);
00177   Pix* im_pix = tesseract::ImageFind::FindImages(pix);
00178   if (im_pix != NULL) {
00179     pixSubtract(pix, pix, im_pix);
00180     pixDestroy(&im_pix);
00181   }
00182   tess->mutable_textord()->find_components(tess->pix_binary(),
00183                                            blocks, to_blocks);
00184 }
00185 
00186 // Find connected components in the page and process a subset until finished or
00187 // a stopping criterion is met.
00188 // Returns the number of blobs used in making the estimate. 0 implies failure.
00189 int orientation_and_script_detection(STRING& filename,
00190                                      OSResults* osr,
00191                                      tesseract::Tesseract* tess) {
00192   STRING name = filename;        //truncated name
00193   const char *lastdot;           //of name
00194   TBOX page_box;
00195 
00196   lastdot = strrchr (name.string (), '.');
00197   if (lastdot != NULL)
00198     name[lastdot-name.string()] = '\0';
00199 
00200   ASSERT_HOST(tess->pix_binary() != NULL)
00201   int width = pixGetWidth(tess->pix_binary());
00202   int height = pixGetHeight(tess->pix_binary());
00203   int resolution = pixGetXRes(tess->pix_binary());
00204   // Zero resolution messes up the algorithms, so make sure it is credible.
00205   if (resolution < kMinCredibleResolution)
00206     resolution = kDefaultResolution;
00207 
00208   BLOCK_LIST blocks;
00209   if (!read_unlv_file(name, width, height, &blocks))
00210     FullPageBlock(width, height, &blocks);
00211 
00212   // Try to remove non-text regions from consideration.
00213   TO_BLOCK_LIST land_blocks, port_blocks;
00214   remove_nontext_regions(tess, &blocks, &port_blocks);
00215 
00216   if (port_blocks.empty()) {
00217     // page segmentation did not succeed, so we need to find_components first.
00218     tess->mutable_textord()->find_components(tess->pix_binary(),
00219                                              &blocks, &port_blocks);
00220   } else {
00221     page_box.set_left(0);
00222     page_box.set_bottom(0);
00223     page_box.set_right(width);
00224     page_box.set_top(height);
00225     // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
00226     tess->mutable_textord()->filter_blobs(page_box.topright(),
00227                                           &port_blocks, true);
00228   }
00229 
00230   return os_detect(&port_blocks, osr, tess);
00231 }
00232 
00233 // Filter and sample the blobs.
00234 // Returns a non-zero number of blobs if the page was successfully processed, or
00235 // zero if the page had too few characters to be reliable
00236 int os_detect(TO_BLOCK_LIST* port_blocks, OSResults* osr,
00237               tesseract::Tesseract* tess) {
00238   int blobs_total = 0;
00239   TO_BLOCK_IT block_it;
00240   block_it.set_to_list(port_blocks);
00241 
00242   BLOBNBOX_CLIST filtered_list;
00243   BLOBNBOX_C_IT filtered_it(&filtered_list);
00244 
00245   for (block_it.mark_cycle_pt(); !block_it.cycled_list();
00246        block_it.forward ()) {
00247     TO_BLOCK* to_block = block_it.data();
00248     if (to_block->block->poly_block() &&
00249         !to_block->block->poly_block()->IsText()) continue;
00250     BLOBNBOX_IT bbox_it;
00251     bbox_it.set_to_list(&to_block->blobs);
00252     for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list ();
00253          bbox_it.forward ()) {
00254       BLOBNBOX* bbox = bbox_it.data();
00255       C_BLOB*   blob = bbox->cblob();
00256       TBOX      box = blob->bounding_box();
00257       ++blobs_total;
00258 
00259       float y_x = fabs((box.height() * 1.0) / box.width());
00260       float x_y = 1.0f / y_x;
00261       // Select a >= 1.0 ratio
00262       float ratio = x_y > y_x ? x_y : y_x;
00263       // Blob is ambiguous
00264       if (ratio > kSizeRatioToReject) continue;
00265       if (box.height() < kMinAcceptableBlobHeight) continue;
00266       filtered_it.add_to_end(bbox);
00267     }
00268   }
00269   return os_detect_blobs(&filtered_list, osr, tess);
00270 }
00271 
00272 // Detect orientation and script from a list of blobs.
00273 // Returns a non-zero number of blobs if the list was successfully processed, or
00274 // zero if the list had too few characters to be reliable
00275 int os_detect_blobs(BLOBNBOX_CLIST* blob_list, OSResults* osr,
00276                     tesseract::Tesseract* tess) {
00277   OSResults osr_;
00278   if (osr == NULL)
00279     osr = &osr_;
00280 
00281   osr->unicharset = &tess->unicharset;
00282   OrientationDetector o(osr);
00283   ScriptDetector s(osr, tess);
00284 
00285   BLOBNBOX_C_IT filtered_it(blob_list);
00286   int real_max = MIN(filtered_it.length(), kMaxCharactersToTry);
00287   // tprintf("Total blobs found = %d\n", blobs_total);
00288   // tprintf("Number of blobs post-filtering = %d\n", filtered_it.length());
00289   // tprintf("Number of blobs to try = %d\n", real_max);
00290 
00291   // If there are too few characters, skip this page entirely.
00292   if (real_max < kMinCharactersToTry / 2) {
00293     tprintf("Too few characters. Skipping this page\n");
00294     return 0;
00295   }
00296 
00297   BLOBNBOX** blobs = new BLOBNBOX*[filtered_it.length()];
00298   int number_of_blobs = 0;
00299   for (filtered_it.mark_cycle_pt (); !filtered_it.cycled_list ();
00300        filtered_it.forward ()) {
00301     blobs[number_of_blobs++] = (BLOBNBOX*)filtered_it.data();
00302   }
00303   QRSequenceGenerator sequence(number_of_blobs);
00304   int num_blobs_evaluated = 0;
00305   for (int i = 0; i < real_max; ++i) {
00306     if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess)
00307         && i > kMinCharactersToTry) {
00308       break;
00309     }
00310     ++num_blobs_evaluated;
00311   }
00312   delete [] blobs;
00313 
00314   // Make sure the best_result is up-to-date
00315   int orientation = o.get_orientation();
00316   osr->update_best_script(orientation);
00317   return num_blobs_evaluated;
00318 }
00319 
00320 // Processes a single blob to estimate script and orientation.
00321 // Return true if estimate of orientation and script satisfies stopping
00322 // criteria.
00323 bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o,
00324                     ScriptDetector* s, OSResults* osr,
00325                     tesseract::Tesseract* tess) {
00326   tess->tess_cn_matching.set_value(true); // turn it on
00327   tess->tess_bn_matching.set_value(false);
00328   C_BLOB* blob = bbox->cblob();
00329   TBLOB* tblob = TBLOB::PolygonalCopy(tess->poly_allow_detailed_fx, blob);
00330   TBOX box = tblob->bounding_box();
00331   FCOORD current_rotation(1.0f, 0.0f);
00332   FCOORD rotation90(0.0f, 1.0f);
00333   BLOB_CHOICE_LIST ratings[4];
00334   // Test the 4 orientations
00335   for (int i = 0; i < 4; ++i) {
00336     // Normalize the blob. Set the origin to the place we want to be the
00337     // bottom-middle after rotation.
00338     // Scaling is to make the rotated height the x-height.
00339     float scaling = static_cast<float>(kBlnXHeight) / box.height();
00340     float x_origin = (box.left() + box.right()) / 2.0f;
00341     float y_origin = (box.bottom() + box.top()) / 2.0f;
00342     if (i == 0 || i == 2) {
00343       // Rotation is 0 or 180.
00344       y_origin = i == 0 ? box.bottom() : box.top();
00345     } else {
00346       // Rotation is 90 or 270.
00347       scaling = static_cast<float>(kBlnXHeight) / box.width();
00348       x_origin = i == 1 ? box.left() : box.right();
00349     }
00350     TBLOB* rotated_blob = new TBLOB(*tblob);
00351     rotated_blob->Normalize(NULL, &current_rotation, NULL,
00352                             x_origin, y_origin, scaling, scaling,
00353                             0.0f, static_cast<float>(kBlnBaselineOffset),
00354                             false, NULL);
00355     tess->AdaptiveClassifier(rotated_blob, ratings + i);
00356     delete rotated_blob;
00357     current_rotation.rotate(rotation90);
00358   }
00359   delete tblob;
00360 
00361   bool stop = o->detect_blob(ratings);
00362   s->detect_blob(ratings);
00363   int orientation = o->get_orientation();
00364   stop = s->must_stop(orientation) && stop;
00365   return stop;
00366 }
00367 
00368 
00369 OrientationDetector::OrientationDetector(OSResults* osr) {
00370   osr_ = osr;
00371 }
00372 
00373 // Score the given blob and return true if it is now sure of the orientation
00374 // after adding this block.
00375 bool OrientationDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
00376   float blob_o_score[4] = {0.0, 0.0, 0.0, 0.0};
00377   float total_blob_o_score = 0.0;
00378 
00379   for (int i = 0; i < 4; ++i) {
00380     BLOB_CHOICE_IT choice_it;
00381     choice_it.set_to_list(scores + i);
00382     if (!choice_it.empty()) {
00383       // The certainty score ranges between [-20,0]. This is converted here to
00384       // [0,1], with 1 indicating best match.
00385       blob_o_score[i] = 1 + 0.05 * choice_it.data()->certainty();
00386       total_blob_o_score += blob_o_score[i];
00387     }
00388   }
00389   // Normalize the orientation scores for the blob and use them to
00390   // update the aggregated orientation score.
00391   for (int i = 0; total_blob_o_score != 0 && i < 4; ++i) {
00392     osr_->orientations[i] += log(blob_o_score[i] / total_blob_o_score);
00393   }
00394 
00395   float first = -1;
00396   float second = -1;
00397 
00398   for (int i = 0; i < 4; ++i) {
00399     if (osr_->orientations[i] > first) {
00400       second = first;
00401       first = osr_->orientations[i];
00402     } else if (osr_->orientations[i] > second) {
00403       second = osr_->orientations[i];
00404     }
00405   }
00406 
00407   return first / second > kOrientationAcceptRatio;
00408 }
00409 
00410 int OrientationDetector::get_orientation() {
00411   osr_->update_best_orientation();
00412   return osr_->best_result.orientation_id;
00413 }
00414 
00415 
00416 ScriptDetector::ScriptDetector(OSResults* osr, tesseract::Tesseract* tess) {
00417   osr_ = osr;
00418   tess_ = tess;
00419   katakana_id_ = tess_->unicharset.add_script(katakana_script);
00420   hiragana_id_ = tess_->unicharset.add_script(hiragana_script);
00421   han_id_ = tess_->unicharset.add_script(han_script);
00422   hangul_id_ = tess_->unicharset.add_script(hangul_script);
00423   japanese_id_ = tess_->unicharset.add_script(japanese_script_);
00424   korean_id_ = tess_->unicharset.add_script(korean_script_);
00425   latin_id_ = tess_->unicharset.add_script(latin_script);
00426   fraktur_id_ = tess_->unicharset.add_script(fraktur_script_);
00427 }
00428 
00429 
00430 // Score the given blob and return true if it is now sure of the script after
00431 // adding this blob.
00432 void ScriptDetector::detect_blob(BLOB_CHOICE_LIST* scores) {
00433   bool done[kMaxNumberOfScripts];
00434   for (int i = 0; i < 4; ++i) {
00435     for (int j = 0; j < kMaxNumberOfScripts; ++j)
00436       done[j] = false;
00437 
00438     BLOB_CHOICE_IT choice_it;
00439     choice_it.set_to_list(scores + i);
00440 
00441     float prev_score = -1;
00442     int script_count = 0;
00443     int prev_id = -1;
00444     int prev_fontinfo_id = -1;
00445     const char* prev_unichar = "";
00446     const char* unichar = "";
00447 
00448     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
00449          choice_it.forward()) {
00450       BLOB_CHOICE* choice = choice_it.data();
00451       int id = choice->script_id();
00452       // Script already processed before.
00453       if (done[id]) continue;
00454       done[id] = true;
00455 
00456       unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
00457       // Save data from the first match
00458       if (prev_score < 0) {
00459         prev_score = -choice->certainty();
00460         script_count = 1;
00461         prev_id = id;
00462         prev_unichar = unichar;
00463         prev_fontinfo_id = choice->fontinfo_id();
00464       } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
00465         ++script_count;
00466       }
00467 
00468       if (strlen(prev_unichar) == 1)
00469         if (unichar[0] >= '0' && unichar[0] <= '9')
00470           break;
00471 
00472       // if script_count is >= 2, character is ambiguous, skip other matches
00473       // since they are useless.
00474       if (script_count >= 2)
00475         break;
00476     }
00477     // Character is non ambiguous
00478     if (script_count == 1) {
00479       // Update the score of the winning script
00480       osr_->scripts_na[i][prev_id] += 1.0;
00481 
00482       // Workaround for Fraktur
00483       if (prev_id == latin_id_) {
00484         if (prev_fontinfo_id >= 0) {
00485           const tesseract::FontInfo &fi =
00486               tess_->get_fontinfo_table().get(prev_fontinfo_id);
00487           //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
00488           //       fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
00489           //       fi.is_serif(), fi.is_fraktur(),
00490           //       prev_unichar);
00491           if (fi.is_fraktur()) {
00492             osr_->scripts_na[i][prev_id] -= 1.0;
00493             osr_->scripts_na[i][fraktur_id_] += 1.0;
00494           }
00495         }
00496       }
00497 
00498       // Update Japanese / Korean pseudo-scripts
00499       if (prev_id == katakana_id_)
00500         osr_->scripts_na[i][japanese_id_] += 1.0;
00501       if (prev_id == hiragana_id_)
00502         osr_->scripts_na[i][japanese_id_] += 1.0;
00503       if (prev_id == hangul_id_)
00504         osr_->scripts_na[i][korean_id_] += 1.0;
00505       if (prev_id == han_id_)
00506         osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
00507       if (prev_id == han_id_)
00508         osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
00509     }
00510   }  // iterate over each orientation
00511 }
00512 
00513 bool ScriptDetector::must_stop(int orientation) {
00514   osr_->update_best_script(orientation);
00515   return osr_->best_result.sconfidence > 1;
00516 }
00517 
// Converts an orientation index (0..3) to degrees: the amount of clockwise
// rotation that must be applied for the text to be upright (readable).
// Returns -1 for any index outside 0..3.
const int OrientationIdToValue(const int& id) {
  static const int kDegreesById[4] = {0, 270, 180, 90};
  if (id < 0 || id > 3) {
    return -1;
  }
  return kDegreesById[id];
}
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines