tesseract
3.03
|
00001 // Copyright 2011 Google Inc. All Rights Reserved. 00002 // Author: rays@google.com (Ray Smith) 00003 // 00004 // Licensed under the Apache License, Version 2.0 (the "License"); 00005 // you may not use this file except in compliance with the License. 00006 // You may obtain a copy of the License at 00007 // http://www.apache.org/licenses/LICENSE-2.0 00008 // Unless required by applicable law or agreed to in writing, software 00009 // distributed under the License is distributed on an "AS IS" BASIS, 00010 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00011 // See the License for the specific language governing permissions and 00012 // limitations under the License. 00013 00014 #ifdef HAVE_CONFIG_H 00015 #include "config_auto.h" 00016 #endif 00017 00018 #include "textlineprojection.h" 00019 #include "allheaders.h" 00020 #include "bbgrid.h" // Base class. 00021 #include "blobbox.h" // BlobNeighourDir. 00022 #include "blobs.h" 00023 #include "colpartition.h" 00024 #include "normalis.h" 00025 00026 // Padding factor to use on definitely oriented blobs 00027 const int kOrientedPadFactor = 8; 00028 // Padding factor to use on not definitely oriented blobs. 00029 const int kDefaultPadFactor = 2; 00030 // Penalty factor for going away from the line center. 00031 const int kWrongWayPenalty = 4; 00032 // Ratio between parallel gap and perpendicular gap used to measure total 00033 // distance of a box from a target box in curved textline space. 00034 // parallel-gap is treated more favorably by this factor to allow catching 00035 // quotes and elipsis at the end of textlines. 00036 const int kParaPerpDistRatio = 4; 00037 // Multiple of scale_factor_ that the inter-line gap must be before we start 00038 // padding the increment box perpendicular to the text line. 00039 const int kMinLineSpacingFactor = 4; 00040 // Maximum tab-stop overrun for horizontal padding, in projection pixels. 00041 const int kMaxTabStopOverrun = 6; 00042 00043 namespace tesseract { 00044 00045 TextlineProjection::TextlineProjection(int resolution) 00046 : x_origin_(0), y_origin_(0), pix_(NULL) { 00047 // The projection map should be about 100 ppi, whatever the input. 00048 scale_factor_ = IntCastRounded(resolution / 100.0); 00049 if (scale_factor_ < 1) scale_factor_ = 1; 00050 } 00051 TextlineProjection::~TextlineProjection() { 00052 pixDestroy(&pix_); 00053 } 00054 00055 // Build the projection profile given the input_block containing lists of 00056 // blobs, a rotation to convert to image coords, 00057 // and a full-resolution nontext_map, marking out areas to avoid. 00058 // During construction, we have the following assumptions: 00059 // The rotation is a multiple of 90 degrees, ie no deskew yet. 00060 // The blobs have had their left and right rules set to also limit 00061 // the range of projection. 00062 void TextlineProjection::ConstructProjection(TO_BLOCK* input_block, 00063 const FCOORD& rotation, 00064 Pix* nontext_map) { 00065 pixDestroy(&pix_); 00066 TBOX image_box(0, 0, pixGetWidth(nontext_map), pixGetHeight(nontext_map)); 00067 x_origin_ = 0; 00068 y_origin_ = image_box.height(); 00069 int width = (image_box.width() + scale_factor_ - 1) / scale_factor_; 00070 int height = (image_box.height() + scale_factor_ - 1) / scale_factor_; 00071 00072 pix_ = pixCreate(width, height, 8); 00073 ProjectBlobs(&input_block->blobs, rotation, image_box, nontext_map); 00074 ProjectBlobs(&input_block->large_blobs, rotation, image_box, nontext_map); 00075 Pix* final_pix = pixBlockconv(pix_, 1, 1); 00076 // Pix* final_pix = pixBlockconv(pix_, 2, 2); 00077 pixDestroy(&pix_); 00078 pix_ = final_pix; 00079 } 00080 00081 // Display the blobs in the window colored according to textline quality. 00082 void TextlineProjection::PlotGradedBlobs(BLOBNBOX_LIST* blobs, 00083 ScrollView* win) { 00084 #ifndef GRAPHICS_DISABLED 00085 BLOBNBOX_IT it(blobs); 00086 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00087 BLOBNBOX* blob = it.data(); 00088 const TBOX& box = blob->bounding_box(); 00089 bool bad_box = BoxOutOfHTextline(box, NULL, false); 00090 if (blob->UniquelyVertical()) 00091 win->Pen(ScrollView::YELLOW); 00092 else 00093 win->Pen(bad_box ? ScrollView::RED : ScrollView::BLUE); 00094 win->Rectangle(box.left(), box.bottom(), box.right(), box.top()); 00095 } 00096 win->Update(); 00097 #endif // GRAPHICS_DISABLED 00098 } 00099 00100 // Moves blobs that look like they don't sit well on a textline from the 00101 // input blobs list to the output small_blobs list. 00102 // This gets them away from initial textline finding to stop diacritics 00103 // from forming incorrect textlines. (Introduced mainly to fix Thai.) 00104 void TextlineProjection::MoveNonTextlineBlobs( 00105 BLOBNBOX_LIST* blobs, BLOBNBOX_LIST* small_blobs) const { 00106 BLOBNBOX_IT it(blobs); 00107 BLOBNBOX_IT small_it(small_blobs); 00108 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { 00109 BLOBNBOX* blob = it.data(); 00110 const TBOX& box = blob->bounding_box(); 00111 bool debug = AlignedBlob::WithinTestRegion(2, box.left(), 00112 box.bottom()); 00113 if (BoxOutOfHTextline(box, NULL, debug) && !blob->UniquelyVertical()) { 00114 blob->ClearNeighbours(); 00115 small_it.add_to_end(it.extract()); 00116 } 00117 } 00118 } 00119 00120 // Create a window and display the projection in it. 00121 void TextlineProjection::DisplayProjection() const { 00122 int width = pixGetWidth(pix_); 00123 int height = pixGetHeight(pix_); 00124 Pix* pixc = pixCreate(width, height, 32); 00125 int src_wpl = pixGetWpl(pix_); 00126 int col_wpl = pixGetWpl(pixc); 00127 uinT32* src_data = pixGetData(pix_); 00128 uinT32* col_data = pixGetData(pixc); 00129 for (int y = 0; y < height; ++y, src_data += src_wpl, col_data += col_wpl) { 00130 for (int x = 0; x < width; ++x) { 00131 int pixel = GET_DATA_BYTE(src_data, x); 00132 l_uint32 result; 00133 if (pixel <= 17) 00134 composeRGBPixel(0, 0, pixel * 15, &result); 00135 else if (pixel <= 145) 00136 composeRGBPixel(0, (pixel - 17) * 2, 255, &result); 00137 else 00138 composeRGBPixel((pixel - 145) * 2, 255, 255, &result); 00139 col_data[x] = result; 00140 } 00141 } 00142 #if 0 00143 // TODO(rays) uncomment when scrollview can display non-binary images. 00144 ScrollView* win = new ScrollView("Projection", 0, 0, 00145 width, height, width, height); 00146 win->Image(pixc, 0, 0); 00147 win->Update(); 00148 #else 00149 pixWrite("projection.png", pixc, IFF_PNG); 00150 #endif 00151 pixDestroy(&pixc); 00152 } 00153 00154 // Compute the distance of the box from the partition using curved projection 00155 // space. As DistanceOfBoxFromBox, except that the direction is taken from 00156 // the ColPartition and the median bounds of the ColPartition are used as 00157 // the to_box. 00158 int TextlineProjection::DistanceOfBoxFromPartition(const TBOX& box, 00159 const ColPartition& part, 00160 const DENORM* denorm, 00161 bool debug) const { 00162 // Compute a partition box that uses the median top/bottom of the blobs 00163 // within and median left/right for vertical. 00164 TBOX part_box = part.bounding_box(); 00165 if (part.IsHorizontalType()) { 00166 part_box.set_top(part.median_top()); 00167 part_box.set_bottom(part.median_bottom()); 00168 } else { 00169 part_box.set_left(part.median_left()); 00170 part_box.set_right(part.median_right()); 00171 } 00172 // Now use DistanceOfBoxFromBox to make the actual calculation. 00173 return DistanceOfBoxFromBox(box, part_box, part.IsHorizontalType(), 00174 denorm, debug); 00175 } 00176 00177 // Compute the distance from the from_box to the to_box using curved 00178 // projection space. Separation that involves a decrease in projection 00179 // density (moving from the from_box to the to_box) is weighted more heavily 00180 // than constant density, and an increase is weighted less. 00181 // If horizontal_textline is true, then curved space is used vertically, 00182 // as for a diacritic on the edge of a textline. 00183 // The projection uses original image coords, so denorm is used to get 00184 // back to the image coords from box/part space. 00185 // How the calculation works: Think of a diacritic near a textline. 00186 // Distance is measured from the far side of the from_box to the near side of 00187 // the to_box. Shown is the horizontal textline case. 00188 // |------^-----| 00189 // | from | box | 00190 // |------|-----| 00191 // perpendicular | 00192 // <------v-------->|--------------------| 00193 // parallel | to box | 00194 // |--------------------| 00195 // Perpendicular distance uses "curved space" See VerticalDistance below. 00196 // Parallel distance is linear. 00197 // Result is perpendicular_gap + parallel_gap / kParaPerpDistRatio. 00198 int TextlineProjection::DistanceOfBoxFromBox(const TBOX& from_box, 00199 const TBOX& to_box, 00200 bool horizontal_textline, 00201 const DENORM* denorm, 00202 bool debug) const { 00203 // The parallel_gap is the horizontal gap between a horizontal textline and 00204 // the box. Analogous for vertical. 00205 int parallel_gap = 0; 00206 // start_pt is the box end of the line to be modified for curved space. 00207 TPOINT start_pt; 00208 // end_pt is the partition end of the line to be modified for curved space. 00209 TPOINT end_pt; 00210 if (horizontal_textline) { 00211 parallel_gap = from_box.x_gap(to_box) + from_box.width(); 00212 start_pt.x = (from_box.left() + from_box.right()) / 2; 00213 end_pt.x = start_pt.x; 00214 if (from_box.top() - to_box.top() >= to_box.bottom() - from_box.bottom()) { 00215 start_pt.y = from_box.top(); 00216 end_pt.y = MIN(to_box.top(), start_pt.y); 00217 } else { 00218 start_pt.y = from_box.bottom(); 00219 end_pt.y = MAX(to_box.bottom(), start_pt.y); 00220 } 00221 } else { 00222 parallel_gap = from_box.y_gap(to_box) + from_box.height(); 00223 if (from_box.right() - to_box.right() >= to_box.left() - from_box.left()) { 00224 start_pt.x = from_box.right(); 00225 end_pt.x = MIN(to_box.right(), start_pt.x); 00226 } else { 00227 start_pt.x = from_box.left(); 00228 end_pt.x = MAX(to_box.left(), start_pt.x); 00229 } 00230 start_pt.y = (from_box.bottom() + from_box.top()) / 2; 00231 end_pt.y = start_pt.y; 00232 } 00233 // The perpendicular gap is the max vertical distance gap out of: 00234 // top of from_box to to_box top and bottom of from_box to to_box bottom. 00235 // This value is then modified for curved projection space. 00236 // Analogous for vertical. 00237 int perpendicular_gap = 0; 00238 // If start_pt == end_pt, then the from_box lies entirely within the to_box 00239 // (in the perpendicular direction), so we don't need to calculate the 00240 // perpendicular_gap. 00241 if (start_pt.x != end_pt.x || start_pt.y != end_pt.y) { 00242 if (denorm != NULL) { 00243 // Denormalize the start and end. 00244 denorm->DenormTransform(NULL, start_pt, &start_pt); 00245 denorm->DenormTransform(NULL, end_pt, &end_pt); 00246 } 00247 if (abs(start_pt.y - end_pt.y) >= abs(start_pt.x - end_pt.x)) { 00248 perpendicular_gap = VerticalDistance(debug, start_pt.x, start_pt.y, 00249 end_pt.y); 00250 } else { 00251 perpendicular_gap = HorizontalDistance(debug, start_pt.x, end_pt.x, 00252 start_pt.y); 00253 } 00254 } 00255 // The parallel_gap weighs less than the perpendicular_gap. 00256 return perpendicular_gap + parallel_gap / kParaPerpDistRatio; 00257 } 00258 00259 // Compute the distance between (x, y1) and (x, y2) using the rule that 00260 // a decrease in textline density is weighted more heavily than an increase. 00261 // The coordinates are in source image space, ie processed by any denorm 00262 // already, but not yet scaled by scale_factor_. 00263 // Going from the outside of a textline to the inside should measure much 00264 // less distance than going from the inside of a textline to the outside. 00265 // How it works: 00266 // An increase is cheap (getting closer to a textline). 00267 // Constant costs unity. 00268 // A decrease is expensive (getting further from a textline). 00269 // Pixels in projection map Counted distance 00270 // 2 00271 // 3 1/x 00272 // 3 1 00273 // 2 x 00274 // 5 1/x 00275 // 7 1/x 00276 // Total: 1 + x + 3/x where x = kWrongWayPenalty. 00277 int TextlineProjection::VerticalDistance(bool debug, int x, 00278 int y1, int y2) const { 00279 x = ImageXToProjectionX(x); 00280 y1 = ImageYToProjectionY(y1); 00281 y2 = ImageYToProjectionY(y2); 00282 if (y1 == y2) return 0; 00283 int wpl = pixGetWpl(pix_); 00284 int step = y1 < y2 ? 1 : -1; 00285 uinT32* data = pixGetData(pix_) + y1 * wpl; 00286 wpl *= step; 00287 int prev_pixel = GET_DATA_BYTE(data, x); 00288 int distance = 0; 00289 int right_way_steps = 0; 00290 for (int y = y1; y != y2; y += step) { 00291 data += wpl; 00292 int pixel = GET_DATA_BYTE(data, x); 00293 if (debug) 00294 tprintf("At (%d,%d), pix = %d, prev=%d\n", 00295 x, y + step, pixel, prev_pixel); 00296 if (pixel < prev_pixel) 00297 distance += kWrongWayPenalty; 00298 else if (pixel > prev_pixel) 00299 ++right_way_steps; 00300 else 00301 ++distance; 00302 prev_pixel = pixel; 00303 } 00304 return distance * scale_factor_ + 00305 right_way_steps * scale_factor_ / kWrongWayPenalty; 00306 } 00307 00308 // Compute the distance between (x1, y) and (x2, y) using the rule that 00309 // a decrease in textline density is weighted more heavily than an increase. 00310 int TextlineProjection::HorizontalDistance(bool debug, int x1, int x2, 00311 int y) const { 00312 x1 = ImageXToProjectionX(x1); 00313 x2 = ImageXToProjectionX(x2); 00314 y = ImageYToProjectionY(y); 00315 if (x1 == x2) return 0; 00316 int wpl = pixGetWpl(pix_); 00317 int step = x1 < x2 ? 1 : -1; 00318 uinT32* data = pixGetData(pix_) + y * wpl; 00319 int prev_pixel = GET_DATA_BYTE(data, x1); 00320 int distance = 0; 00321 int right_way_steps = 0; 00322 for (int x = x1; x != x2; x += step) { 00323 int pixel = GET_DATA_BYTE(data, x + step); 00324 if (debug) 00325 tprintf("At (%d,%d), pix = %d, prev=%d\n", 00326 x + step, y, pixel, prev_pixel); 00327 if (pixel < prev_pixel) 00328 distance += kWrongWayPenalty; 00329 else if (pixel > prev_pixel) 00330 ++right_way_steps; 00331 else 00332 ++distance; 00333 prev_pixel = pixel; 00334 } 00335 return distance * scale_factor_ + 00336 right_way_steps * scale_factor_ / kWrongWayPenalty; 00337 } 00338 00339 // Returns true if the blob appears to be outside of a textline. 00340 // Such blobs are potentially diacritics (even if large in Thai) and should 00341 // be kept away from initial textline finding. 00342 bool TextlineProjection::BoxOutOfHTextline(const TBOX& box, 00343 const DENORM* denorm, 00344 bool debug) const { 00345 int grad1 = 0; 00346 int grad2 = 0; 00347 EvaluateBoxInternal(box, denorm, debug, &grad1, &grad2, NULL, NULL); 00348 int worst_result = MIN(grad1, grad2); 00349 int total_result = grad1 + grad2; 00350 if (total_result >= 6) return false; // Strongly in textline. 00351 // Medium strength: if either gradient is negative, it is likely outside 00352 // the body of the textline. 00353 if (worst_result < 0) 00354 return true; 00355 return false; 00356 } 00357 00358 // Evaluates the textlineiness of a ColPartition. Uses EvaluateBox below, 00359 // but uses the median top/bottom for horizontal and median left/right for 00360 // vertical instead of the bounding box edges. 00361 // Evaluates for both horizontal and vertical and returns the best result, 00362 // with a positive value for horizontal and a negative value for vertical. 00363 int TextlineProjection::EvaluateColPartition(const ColPartition& part, 00364 const DENORM* denorm, 00365 bool debug) const { 00366 if (part.IsSingleton()) 00367 return EvaluateBox(part.bounding_box(), denorm, debug); 00368 // Test vertical orientation. 00369 TBOX box = part.bounding_box(); 00370 // Use the partition median for left/right. 00371 box.set_left(part.median_left()); 00372 box.set_right(part.median_right()); 00373 int vresult = EvaluateBox(box, denorm, debug); 00374 00375 // Test horizontal orientation. 00376 box = part.bounding_box(); 00377 // Use the partition median for top/bottom. 00378 box.set_top(part.median_top()); 00379 box.set_bottom(part.median_bottom()); 00380 int hresult = EvaluateBox(box, denorm, debug); 00381 if (debug) { 00382 tprintf("Partition hresult=%d, vresult=%d from:", hresult, vresult); 00383 part.bounding_box().print(); 00384 part.Print(); 00385 } 00386 return hresult >= -vresult ? hresult : vresult; 00387 } 00388 00389 // Computes the mean projection gradients over the horizontal and vertical 00390 // edges of the box: 00391 // -h-h-h-h-h-h 00392 // |------------| mean=htop -v|+v--------+v|-v 00393 // |+h+h+h+h+h+h| -v|+v +v|-v 00394 // | | -v|+v +v|-v 00395 // | box | -v|+v box +v|-v 00396 // | | -v|+v +v|-v 00397 // |+h+h+h+h+h+h| -v|+v +v|-v 00398 // |------------| mean=hbot -v|+v--------+v|-v 00399 // -h-h-h-h-h-h 00400 // mean=vleft mean=vright 00401 // 00402 // Returns MAX(htop,hbot) - MAX(vleft,vright), which is a positive number 00403 // for a horizontal textline, a negative number for a vertical textline, 00404 // and near zero for undecided. Undecided is most likely non-text. 00405 // All the gradients are truncated to remain non-negative, since negative 00406 // horizontal gradients don't give any indication of being vertical and 00407 // vice versa. 00408 // Additional complexity: The coordinates have to be transformed to original 00409 // image coordinates with denorm (if not null), scaled to match the projection 00410 // pix, and THEN step out 2 pixels each way from the edge to compute the 00411 // gradient, and tries 3 positions, each measuring the gradient over a 00412 // 4-pixel spread: (+3/-1), (+2/-2), (+1/-3). This complexity is handled by 00413 // several layers of helpers below. 00414 int TextlineProjection::EvaluateBox(const TBOX& box, const DENORM* denorm, 00415 bool debug) const { 00416 return EvaluateBoxInternal(box, denorm, debug, NULL, NULL, NULL, NULL); 00417 } 00418 00419 // Internal version of EvaluateBox returns the unclipped gradients as well 00420 // as the result of EvaluateBox. 00421 // hgrad1 and hgrad2 are the gradients for the horizontal textline. 00422 int TextlineProjection::EvaluateBoxInternal(const TBOX& box, 00423 const DENORM* denorm, bool debug, 00424 int* hgrad1, int* hgrad2, 00425 int* vgrad1, int* vgrad2) const { 00426 int top_gradient = BestMeanGradientInRow(denorm, box.left(), box.right(), 00427 box.top(), true); 00428 int bottom_gradient = -BestMeanGradientInRow(denorm, box.left(), box.right(), 00429 box.bottom(), false); 00430 int left_gradient = BestMeanGradientInColumn(denorm, box.left(), box.bottom(), 00431 box.top(), true); 00432 int right_gradient = -BestMeanGradientInColumn(denorm, box.right(), 00433 box.bottom(), box.top(), 00434 false); 00435 int top_clipped = MAX(top_gradient, 0); 00436 int bottom_clipped = MAX(bottom_gradient, 0); 00437 int left_clipped = MAX(left_gradient, 0); 00438 int right_clipped = MAX(right_gradient, 0); 00439 if (debug) { 00440 tprintf("Gradients: top = %d, bottom = %d, left= %d, right= %d for box:", 00441 top_gradient, bottom_gradient, left_gradient, right_gradient); 00442 box.print(); 00443 } 00444 int result = MAX(top_clipped, bottom_clipped) - 00445 MAX(left_clipped, right_clipped); 00446 if (hgrad1 != NULL && hgrad2 != NULL) { 00447 *hgrad1 = top_gradient; 00448 *hgrad2 = bottom_gradient; 00449 } 00450 if (vgrad1 != NULL && vgrad2 != NULL) { 00451 *vgrad1 = left_gradient; 00452 *vgrad2 = right_gradient; 00453 } 00454 return result; 00455 } 00456 00457 // Helper returns the mean gradient value for the horizontal row at the given 00458 // y, (in the external coordinates) by subtracting the mean of the transformed 00459 // row 2 pixels above from the mean of the transformed row 2 pixels below. 00460 // This gives a positive value for a good top edge and negative for bottom. 00461 // Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge. 00462 int TextlineProjection::BestMeanGradientInRow(const DENORM* denorm, 00463 inT16 min_x, inT16 max_x, inT16 y, 00464 bool best_is_max) const { 00465 TPOINT start_pt(min_x, y); 00466 TPOINT end_pt(max_x, y); 00467 int upper = MeanPixelsInLineSegment(denorm, -2, start_pt, end_pt); 00468 int lower = MeanPixelsInLineSegment(denorm, 2, start_pt, end_pt); 00469 int best_gradient = lower - upper; 00470 upper = MeanPixelsInLineSegment(denorm, -1, start_pt, end_pt); 00471 lower = MeanPixelsInLineSegment(denorm, 3, start_pt, end_pt); 00472 int gradient = lower - upper; 00473 if ((gradient > best_gradient) == best_is_max) 00474 best_gradient = gradient; 00475 upper = MeanPixelsInLineSegment(denorm, -3, start_pt, end_pt); 00476 lower = MeanPixelsInLineSegment(denorm, 1, start_pt, end_pt); 00477 gradient = lower - upper; 00478 if ((gradient > best_gradient) == best_is_max) 00479 best_gradient = gradient; 00480 return best_gradient; 00481 } 00482 00483 // Helper returns the mean gradient value for the vertical column at the 00484 // given x, (in the external coordinates) by subtracting the mean of the 00485 // transformed column 2 pixels left from the mean of the transformed column 00486 // 2 pixels to the right. 00487 // This gives a positive value for a good left edge and negative for right. 00488 // Returns the best result out of +2/-2, +3/-1, +1/-3 pixels from the edge. 00489 int TextlineProjection::BestMeanGradientInColumn(const DENORM* denorm, inT16 x, 00490 inT16 min_y, inT16 max_y, 00491 bool best_is_max) const { 00492 TPOINT start_pt(x, min_y); 00493 TPOINT end_pt(x, max_y); 00494 int left = MeanPixelsInLineSegment(denorm, -2, start_pt, end_pt); 00495 int right = MeanPixelsInLineSegment(denorm, 2, start_pt, end_pt); 00496 int best_gradient = right - left; 00497 left = MeanPixelsInLineSegment(denorm, -1, start_pt, end_pt); 00498 right = MeanPixelsInLineSegment(denorm, 3, start_pt, end_pt); 00499 int gradient = right - left; 00500 if ((gradient > best_gradient) == best_is_max) 00501 best_gradient = gradient; 00502 left = MeanPixelsInLineSegment(denorm, -3, start_pt, end_pt); 00503 right = MeanPixelsInLineSegment(denorm, 1, start_pt, end_pt); 00504 gradient = right - left; 00505 if ((gradient > best_gradient) == best_is_max) 00506 best_gradient = gradient; 00507 return best_gradient; 00508 } 00509 00510 // Helper returns the mean pixel value over the line between the start_pt and 00511 // end_pt (inclusive), but shifted perpendicular to the line in the projection 00512 // image by offset pixels. For simplicity, it is assumed that the vector is 00513 // either nearly horizontal or nearly vertical. It works on skewed textlines! 00514 // The end points are in external coordinates, and will be denormalized with 00515 // the denorm if not NULL before further conversion to pix coordinates. 00516 // After all the conversions, the offset is added to the direction 00517 // perpendicular to the line direction. The offset is thus in projection image 00518 // coordinates, which allows the caller to get a guaranteed displacement 00519 // between pixels used to calculate gradients. 00520 int TextlineProjection::MeanPixelsInLineSegment(const DENORM* denorm, 00521 int offset, 00522 TPOINT start_pt, 00523 TPOINT end_pt) const { 00524 TransformToPixCoords(denorm, &start_pt); 00525 TransformToPixCoords(denorm, &end_pt); 00526 TruncateToImageBounds(&start_pt); 00527 TruncateToImageBounds(&end_pt); 00528 int wpl = pixGetWpl(pix_); 00529 uinT32* data = pixGetData(pix_); 00530 int total = 0; 00531 int count = 0; 00532 int x_delta = end_pt.x - start_pt.x; 00533 int y_delta = end_pt.y - start_pt.y; 00534 if (abs(x_delta) >= abs(y_delta)) { 00535 if (x_delta == 0) 00536 return 0; 00537 // Horizontal line. Add the offset vertically. 00538 int x_step = x_delta > 0 ? 1 : -1; 00539 // Correct offset for rotation, keeping it anti-clockwise of the delta. 00540 offset *= x_step; 00541 start_pt.y += offset; 00542 end_pt.y += offset; 00543 TruncateToImageBounds(&start_pt); 00544 TruncateToImageBounds(&end_pt); 00545 x_delta = end_pt.x - start_pt.x; 00546 y_delta = end_pt.y - start_pt.y; 00547 count = x_delta * x_step + 1; 00548 for (int x = start_pt.x; x != end_pt.x; x += x_step) { 00549 int y = start_pt.y + DivRounded(y_delta * (x - start_pt.x), x_delta); 00550 total += GET_DATA_BYTE(data + wpl * y, x); 00551 } 00552 } else { 00553 // Vertical line. Add the offset horizontally. 00554 int y_step = y_delta > 0 ? 1 : -1; 00555 // Correct offset for rotation, keeping it anti-clockwise of the delta. 00556 // Pix holds the image with y=0 at the top, so the offset is negated. 00557 offset *= -y_step; 00558 start_pt.x += offset; 00559 end_pt.x += offset; 00560 TruncateToImageBounds(&start_pt); 00561 TruncateToImageBounds(&end_pt); 00562 x_delta = end_pt.x - start_pt.x; 00563 y_delta = end_pt.y - start_pt.y; 00564 count = y_delta * y_step + 1; 00565 for (int y = start_pt.y; y != end_pt.y; y += y_step) { 00566 int x = start_pt.x + DivRounded(x_delta * (y - start_pt.y), y_delta); 00567 total += GET_DATA_BYTE(data + wpl * y, x); 00568 } 00569 } 00570 return DivRounded(total, count); 00571 } 00572 00573 // Given an input pix, and a box, the sides of the box are shrunk inwards until 00574 // they bound any black pixels found within the original box. 00575 // The function converts between tesseract coords and the pix coords assuming 00576 // that this pix is full resolution equal in size to the original image. 00577 // Returns an empty box if there are no black pixels in the source box. 00578 static TBOX BoundsWithinBox(Pix* pix, const TBOX& box) { 00579 int im_height = pixGetHeight(pix); 00580 Box* input_box = boxCreate(box.left(), im_height - box.top(), 00581 box.width(), box.height()); 00582 Box* output_box = NULL; 00583 pixClipBoxToForeground(pix, input_box, NULL, &output_box); 00584 TBOX result_box; 00585 if (output_box != NULL) { 00586 l_int32 x, y, width, height; 00587 boxGetGeometry(output_box, &x, &y, &width, &height); 00588 result_box.set_left(x); 00589 result_box.set_right(x + width); 00590 result_box.set_top(im_height - y); 00591 result_box.set_bottom(result_box.top() - height); 00592 boxDestroy(&output_box); 00593 } 00594 boxDestroy(&input_box); 00595 return result_box; 00596 } 00597 00598 // Splits the given box in half at x_middle or y_middle according to split_on_x 00599 // and checks for nontext_map pixels in each half. Reduces the bbox so that it 00600 // still includes the middle point, but does not touch any fg pixels in 00601 // nontext_map. An empty box may be returned if there is no such box. 00602 static void TruncateBoxToMissNonText(int x_middle, int y_middle, 00603 bool split_on_x, Pix* nontext_map, 00604 TBOX* bbox) { 00605 TBOX box1(*bbox); 00606 TBOX box2(*bbox); 00607 TBOX im_box; 00608 if (split_on_x) { 00609 box1.set_right(x_middle); 00610 im_box = BoundsWithinBox(nontext_map, box1); 00611 if (!im_box.null_box()) box1.set_left(im_box.right()); 00612 box2.set_left(x_middle); 00613 im_box = BoundsWithinBox(nontext_map, box2); 00614 if (!im_box.null_box()) box2.set_right(im_box.left()); 00615 } else { 00616 box1.set_bottom(y_middle); 00617 im_box = BoundsWithinBox(nontext_map, box1); 00618 if (!im_box.null_box()) box1.set_top(im_box.bottom()); 00619 box2.set_top(y_middle); 00620 im_box = BoundsWithinBox(nontext_map, box2); 00621 if (!im_box.null_box()) box2.set_bottom(im_box.top()); 00622 } 00623 box1 += box2; 00624 *bbox = box1; 00625 } 00626 00627 00628 // Helper function to add 1 to a rectangle in source image coords to the 00629 // internal projection pix_. 00630 void TextlineProjection::IncrementRectangle8Bit(const TBOX& box) { 00631 int scaled_left = ImageXToProjectionX(box.left()); 00632 int scaled_top = ImageYToProjectionY(box.top()); 00633 int scaled_right = ImageXToProjectionX(box.right()); 00634 int scaled_bottom = ImageYToProjectionY(box.bottom()); 00635 int wpl = pixGetWpl(pix_); 00636 uinT32* data = pixGetData(pix_) + scaled_top * wpl; 00637 for (int y = scaled_top; y <= scaled_bottom; ++y) { 00638 for (int x = scaled_left; x <= scaled_right; ++x) { 00639 int pixel = GET_DATA_BYTE(data, x); 00640 if (pixel < 255) 00641 SET_DATA_BYTE(data, x, pixel + 1); 00642 } 00643 data += wpl; 00644 } 00645 } 00646 00647 // Inserts a list of blobs into the projection. 00648 // Rotation is a multiple of 90 degrees to get from blob coords to 00649 // nontext_map coords, nontext_map_box is the bounds of the nontext_map. 00650 // Blobs are spread horizontally or vertically according to their internal 00651 // flags, but the spreading is truncated by set pixels in the nontext_map 00652 // and also by the horizontal rule line limits on the blobs. 00653 void TextlineProjection::ProjectBlobs(BLOBNBOX_LIST* blobs, 00654 const FCOORD& rotation, 00655 const TBOX& nontext_map_box, 00656 Pix* nontext_map) { 00657 BLOBNBOX_IT blob_it(blobs); 00658 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { 00659 BLOBNBOX* blob = blob_it.data(); 00660 TBOX bbox = blob->bounding_box(); 00661 ICOORD middle((bbox.left() + bbox.right()) / 2, 00662 (bbox.bottom() + bbox.top()) / 2); 00663 bool spreading_horizontally = PadBlobBox(blob, &bbox); 00664 // Rotate to match the nontext_map. 00665 bbox.rotate(rotation); 00666 middle.rotate(rotation); 00667 if (rotation.x() == 0.0f) 00668 spreading_horizontally = !spreading_horizontally; 00669 // Clip to the image before applying the increments. 00670 bbox &= nontext_map_box; // This is in-place box intersection. 00671 // Check for image pixels before spreading. 00672 TruncateBoxToMissNonText(middle.x(), middle.y(), spreading_horizontally, 00673 nontext_map, &bbox); 00674 if (bbox.area() > 0) { 00675 IncrementRectangle8Bit(bbox); 00676 } 00677 } 00678 } 00679 00680 // Pads the bounding box of the given blob according to whether it is on 00681 // a horizontal or vertical text line, taking into account tab-stops near 00682 // the blob. Returns true if padding was in the horizontal direction. 00683 bool TextlineProjection::PadBlobBox(BLOBNBOX* blob, TBOX* bbox) { 00684 // Determine which direction to spread. 00685 // If text is well spaced out, it can be useful to pad perpendicular to 00686 // the textline direction, so as to ensure diacritics get absorbed 00687 // correctly, but if the text is tightly spaced, this will destroy the 00688 // blank space between textlines in the projection map, and that would 00689 // be very bad. 00690 int pad_limit = scale_factor_ * kMinLineSpacingFactor; 00691 int xpad = 0; 00692 int ypad = 0; 00693 bool padding_horizontally = false; 00694 if (blob->UniquelyHorizontal()) { 00695 xpad = bbox->height() * kOrientedPadFactor; 00696 padding_horizontally = true; 00697 // If the text appears to be very well spaced, pad the other direction by a 00698 // single pixel in the projection profile space to help join diacritics to 00699 // the textline. 00700 if ((blob->neighbour(BND_ABOVE) == NULL || 00701 bbox->y_gap(blob->neighbour(BND_ABOVE)->bounding_box()) > pad_limit) && 00702 (blob->neighbour(BND_BELOW) == NULL || 00703 bbox->y_gap(blob->neighbour(BND_BELOW)->bounding_box()) > pad_limit)) { 00704 ypad = scale_factor_; 00705 } 00706 } else if (blob->UniquelyVertical()) { 00707 ypad = bbox->width() * kOrientedPadFactor; 00708 if ((blob->neighbour(BND_LEFT) == NULL || 00709 bbox->x_gap(blob->neighbour(BND_LEFT)->bounding_box()) > pad_limit) && 00710 (blob->neighbour(BND_RIGHT) == NULL || 00711 bbox->x_gap(blob->neighbour(BND_RIGHT)->bounding_box()) > pad_limit)) { 00712 xpad = scale_factor_; 00713 } 00714 } else { 00715 if ((blob->neighbour(BND_ABOVE) != NULL && 00716 blob->neighbour(BND_ABOVE)->neighbour(BND_BELOW) == blob) || 00717 (blob->neighbour(BND_BELOW) != NULL && 00718 blob->neighbour(BND_BELOW)->neighbour(BND_ABOVE) == blob)) { 00719 ypad = bbox->width() * kDefaultPadFactor; 00720 } 00721 if ((blob->neighbour(BND_RIGHT) != NULL && 00722 blob->neighbour(BND_RIGHT)->neighbour(BND_LEFT) == blob) || 00723 (blob->neighbour(BND_LEFT) != NULL && 00724 blob->neighbour(BND_LEFT)->neighbour(BND_RIGHT) == blob)) { 00725 xpad = bbox->height() * kDefaultPadFactor; 00726 padding_horizontally = true; 00727 } 00728 } 00729 bbox->pad(xpad, ypad); 00730 pad_limit = scale_factor_ * kMaxTabStopOverrun; 00731 // Now shrink horizontally to avoid stepping more than pad_limit over a 00732 // tab-stop. 00733 if (bbox->left() < blob->left_rule() - pad_limit) { 00734 bbox->set_left(blob->left_rule() - pad_limit); 00735 } 00736 if (bbox->right() > blob->right_rule() + pad_limit) { 00737 bbox->set_right(blob->right_rule() + pad_limit); 00738 } 00739 return padding_horizontally; 00740 } 00741 00742 // Helper denormalizes the TPOINT with the denorm if not NULL, then 00743 // converts to pix_ coordinates. 00744 void TextlineProjection::TransformToPixCoords(const DENORM* denorm, 00745 TPOINT* pt) const { 00746 if (denorm != NULL) { 00747 // Denormalize the point. 00748 denorm->DenormTransform(NULL, *pt, pt); 00749 } 00750 pt->x = ImageXToProjectionX(pt->x); 00751 pt->y = ImageYToProjectionY(pt->y); 00752 } 00753 00754 // Helper truncates the TPOINT to be within the pix_. 00755 void TextlineProjection::TruncateToImageBounds(TPOINT* pt) const { 00756 pt->x = ClipToRange<int>(pt->x, 0, pixGetWidth(pix_) - 1); 00757 pt->y = ClipToRange<int>(pt->y, 0, pixGetHeight(pix_) - 1); 00758 } 00759 00760 // Transform tesseract image coordinates to coordinates used in the projection. 00761 int TextlineProjection::ImageXToProjectionX(int x) const { 00762 x = ClipToRange((x - x_origin_) / scale_factor_, 0, pixGetWidth(pix_) - 1); 00763 return x; 00764 } 00765 int TextlineProjection::ImageYToProjectionY(int y) const { 00766 y = ClipToRange((y_origin_ - y) / scale_factor_, 0, pixGetHeight(pix_) - 1); 00767 return y; 00768 } 00769 00770 } // namespace tesseract.