tesseract
3.03
|
00001 00002 #ifndef _OCL_KERNEL_H_ 00003 #define _OCL_KERNEL_H_ 00004 #ifndef USE_EXTERNAL_KERNEL 00005 #define KERNEL( ... )# __VA_ARGS__ "\n" 00006 // Double precision is a default of spreadsheets 00007 // cl_khr_fp64: Khronos extension 00008 // cl_amd_fp64: AMD extension 00009 // use build option outside to define fp_t 00011 const char *kernel_src = KERNEL( 00012 \n#ifdef KHR_DP_EXTENSION\n 00013 \n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n 00014 \n#elif AMD_DP_EXTENSION\n 00015 \n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n 00016 \n#else\n 00017 \n#endif\n 00018 __kernel void composeRGBPixel(__global uint *tiffdata, int w, int h,int wpl, __global uint *output) 00019 { 00020 int i = get_global_id(1); 00021 int j = get_global_id(0); 00022 int tiffword,rval,gval,bval; 00023 00024 //Ignore the excess 00025 if ((i >= h) || (j >= w)) 00026 return; 00027 00028 tiffword = tiffdata[i * w + j]; 00029 rval = ((tiffword) & 0xff); 00030 gval = (((tiffword) >> 8) & 0xff); 00031 bval = (((tiffword) >> 16) & 0xff); 00032 output[i*wpl+j] = (rval << (8 * (sizeof(uint) - 1 - 0))) | (gval << (8 * (sizeof(uint) - 1 - 1))) | (bval << (8 * (sizeof(uint) - 1 - 2))); 00033 } 00034 ) 00035 00036 KERNEL( 00037 \n__kernel void pixSubtract_inplace(__global int *dword, __global int *sword, 00038 const int wpl, const int h) 00039 { 00040 const unsigned int row = get_global_id(1); 00041 const unsigned int col = get_global_id(0); 00042 const unsigned int pos = row * wpl + col; 00043 00044 //Ignore the execss 00045 if (row >= h || col >= wpl) 00046 return; 00047 00048 *(dword + pos) &= ~(*(sword + pos)); 00049 }\n 00050 ) 00051 00052 KERNEL( 00053 \n__kernel void pixSubtract(__global int *dword, __global int *sword, 00054 const int wpl, const int h, __global int *outword) 00055 { 00056 const unsigned int row = get_global_id(1); 00057 const unsigned int col = get_global_id(0); 00058 const unsigned int pos = row * wpl + col; 00059 00060 //Ignore the execss 00061 if (row >= h || col >= 
wpl) 00062 return; 00063 00064 *(outword + pos) = *(dword + pos) & ~(*(sword + pos)); 00065 }\n 00066 ) 00067 00068 KERNEL( 00069 \n__kernel void pixAND(__global int *dword, __global int *sword, __global int *outword, 00070 const int wpl, const int h) 00071 { 00072 const unsigned int row = get_global_id(1); 00073 const unsigned int col = get_global_id(0); 00074 const unsigned int pos = row * wpl + col; 00075 00076 //Ignore the execss 00077 if (row >= h || col >= wpl) 00078 return; 00079 00080 *(outword + pos) = *(dword + pos) & (*(sword + pos)); 00081 }\n 00082 ) 00083 00084 KERNEL( 00085 \n__kernel void pixOR(__global int *dword, __global int *sword, __global int *outword, 00086 const int wpl, const int h) 00087 { 00088 const unsigned int row = get_global_id(1); 00089 const unsigned int col = get_global_id(0); 00090 const unsigned int pos = row * wpl + col; 00091 00092 //Ignore the execss 00093 if (row >= h || col >= wpl) 00094 return; 00095 00096 *(outword + pos) = *(dword + pos) | (*(sword + pos)); 00097 }\n 00098 ) 00099 00100 KERNEL( 00101 \n__kernel void morphoDilateHor_5x5(__global int *sword,__global int *dword, 00102 const int wpl, const int h) 00103 { 00104 const unsigned int pos = get_global_id(0); 00105 unsigned int prevword, nextword, currword,tempword; 00106 unsigned int destword; 00107 const int col = pos % wpl; 00108 00109 //Ignore the execss 00110 if (pos >= (wpl * h)) 00111 return; 00112 00113 00114 currword = *(sword + pos); 00115 destword = currword; 00116 00117 //Handle boundary conditions 00118 if(col==0) 00119 prevword=0; 00120 else 00121 prevword = *(sword + pos - 1); 00122 00123 if(col==(wpl - 1)) 00124 nextword=0; 00125 else 00126 nextword = *(sword + pos + 1); 00127 00128 //Loop unrolled 00129 00130 //1 bit to left and 1 bit to right 00131 //Get the max value on LHS of every pixel 00132 tempword = (prevword << (31)) | ((currword >> 1)); 00133 destword |= tempword; 00134 //Get max value on RHS of every pixel 00135 tempword = (currword << 
1) | (nextword >> (31)); 00136 destword |= tempword; 00137 00138 //2 bit to left and 2 bit to right 00139 //Get the max value on LHS of every pixel 00140 tempword = (prevword << (30)) | ((currword >> 2)); 00141 destword |= tempword; 00142 //Get max value on RHS of every pixel 00143 tempword = (currword << 2) | (nextword >> (30)); 00144 destword |= tempword; 00145 00146 00147 *(dword + pos) = destword; 00148 00149 }\n 00150 ) 00151 00152 KERNEL( 00153 \n__kernel void morphoDilateVer_5x5(__global int *sword,__global int *dword, 00154 const int wpl, const int h) 00155 { 00156 const int col = get_global_id(0); 00157 const int row = get_global_id(1); 00158 const unsigned int pos = row * wpl + col; 00159 unsigned int tempword; 00160 unsigned int destword; 00161 int i; 00162 00163 //Ignore the execss 00164 if (row >= h || col >= wpl) 00165 return; 00166 00167 destword = *(sword + pos); 00168 00169 //2 words above 00170 i = (row - 2) < 0 ? row : (row - 2); 00171 tempword = *(sword + i*wpl + col); 00172 destword |= tempword; 00173 00174 //1 word above 00175 i = (row - 1) < 0 ? row : (row - 1); 00176 tempword = *(sword + i*wpl + col); 00177 destword |= tempword; 00178 00179 //1 word below 00180 i = (row >= (h - 1)) ? row : (row + 1); 00181 tempword = *(sword + i*wpl + col); 00182 destword |= tempword; 00183 00184 //2 words below 00185 i = (row >= (h - 2)) ? 
row : (row + 2); 00186 tempword = *(sword + i*wpl + col); 00187 destword |= tempword; 00188 00189 *(dword + pos) = destword; 00190 }\n 00191 ) 00192 00193 KERNEL( 00194 \n__kernel void morphoDilateHor(__global int *sword,__global int *dword,const int xp, const int xn, const int wpl, const int h) 00195 { 00196 const int col = get_global_id(0); 00197 const int row = get_global_id(1); 00198 const unsigned int pos = row * wpl + col; 00199 unsigned int parbitsxp, parbitsxn, nwords; 00200 unsigned int destword, tempword, lastword, currword; 00201 unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword; 00202 int i, j, siter, eiter; 00203 00204 //Ignore the execss 00205 if (pos >= (wpl*h) || (xn < 1 && xp < 1)) 00206 return; 00207 00208 currword = *(sword + pos); 00209 destword = currword; 00210 00211 parbitsxp = xp & 31; 00212 parbitsxn = xn & 31; 00213 nwords = xp >> 5; 00214 00215 if (parbitsxp > 0) 00216 nwords += 1; 00217 else 00218 parbitsxp = 31; 00219 00220 siter = (col - nwords); 00221 eiter = (col + nwords); 00222 00223 //Get prev word 00224 if (col==0) 00225 firstword = 0x0; 00226 else 00227 firstword = *(sword + pos - 1); 00228 00229 //Get next word 00230 if (col == (wpl - 1)) 00231 secondword = 0x0; 00232 else 00233 secondword = *(sword + pos + 1); 00234 00235 //Last partial bits on either side 00236 for (i = 1; i <= parbitsxp; i++) 00237 { 00238 //Get the max value on LHS of every pixel 00239 tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 
0x0 : (firstword << (32-i)) | ((currword >> i)); 00240 00241 destword |= tempword; 00242 00243 //Get max value on RHS of every pixel 00244 tempword = (currword << i) | (secondword >> (32 - i)); 00245 destword |= tempword; 00246 } 00247 00248 //Return if halfwidth <= 1 word 00249 if (nwords == 1) 00250 { 00251 if (xn == 32) 00252 { 00253 destword |= firstword; 00254 } 00255 if (xp == 32) 00256 { 00257 destword |= secondword; 00258 } 00259 00260 *(dword + pos) = destword; 00261 return; 00262 } 00263 00264 if (siter < 0) 00265 firstword = 0x0; 00266 else 00267 firstword = *(sword + row*wpl + siter); 00268 00269 if (eiter >= wpl) 00270 lastword = 0x0; 00271 else 00272 lastword = *(sword + row*wpl + eiter); 00273 00274 for ( i = 1; i < nwords; i++) 00275 { 00276 //Gets LHS words 00277 if ((siter + i) < 0) 00278 secondword = 0x0; 00279 else 00280 secondword = *(sword + row*wpl + siter + i); 00281 00282 lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn; 00283 00284 firstword = secondword; 00285 00286 if ((siter + i + 1) < 0) 00287 secondword = 0x0; 00288 else 00289 secondword = *(sword + row*wpl + siter + i + 1); 00290 00291 lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn; 00292 00293 //Gets RHS words 00294 if ((eiter - i) >= wpl) 00295 firstword = 0x0; 00296 else 00297 firstword = *(sword + row*wpl + eiter - i); 00298 00299 rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp); 00300 00301 lastword = firstword; 00302 if ((eiter - i - 1) >= wpl) 00303 firstword = 0x0; 00304 else 00305 firstword = *(sword + row*wpl + eiter - i - 1); 00306 00307 rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp); 00308 00309 for (j = 1; j < 32; j++) 00310 { 00311 //OR LHS full words 00312 tempword = (lprevword << j) | (lnextword >> (32 - j)); 00313 destword |= tempword; 00314 00315 //OR RHS full words 00316 tempword = (rprevword << j) | (rnextword >> (32 - j)); 00317 destword |= tempword; 00318 } 00319 00320 destword |= 
lprevword; 00321 destword |= lnextword; 00322 destword |= rprevword; 00323 destword |= rnextword; 00324 00325 lastword = firstword; 00326 firstword = secondword; 00327 } 00328 00329 *(dword + pos) = destword; 00330 }\n 00331 ) 00332 00333 KERNEL( 00334 \n__kernel void morphoDilateHor_32word(__global int *sword,__global int *dword, 00335 const int halfwidth, 00336 const int wpl, const int h, 00337 const char isEven) 00338 { 00339 const int col = get_global_id(0); 00340 const int row = get_global_id(1); 00341 const unsigned int pos = row * wpl + col; 00342 unsigned int prevword, nextword, currword,tempword; 00343 unsigned int destword; 00344 int i; 00345 00346 //Ignore the execss 00347 if (pos >= (wpl * h)) 00348 return; 00349 00350 currword = *(sword + pos); 00351 destword = currword; 00352 00353 //Handle boundary conditions 00354 if(col==0) 00355 prevword=0; 00356 else 00357 prevword = *(sword + pos - 1); 00358 00359 if(col==(wpl - 1)) 00360 nextword=0; 00361 else 00362 nextword = *(sword + pos + 1); 00363 00364 for (i = 1; i <= halfwidth; i++) 00365 { 00366 //Get the max value on LHS of every pixel 00367 if (i == halfwidth && isEven) 00368 { 00369 tempword = 0x0; 00370 } 00371 else 00372 { 00373 tempword = (prevword << (32-i)) | ((currword >> i)); 00374 } 00375 00376 destword |= tempword; 00377 00378 //Get max value on RHS of every pixel 00379 tempword = (currword << i) | (nextword >> (32 - i)); 00380 00381 destword |= tempword; 00382 } 00383 00384 *(dword + pos) = destword; 00385 }\n 00386 ) 00387 00388 KERNEL( 00389 \n__kernel void morphoDilateVer(__global int *sword,__global int *dword, 00390 const int yp, 00391 const int wpl, const int h, 00392 const int yn) 00393 { 00394 const int col = get_global_id(0); 00395 const int row = get_global_id(1); 00396 const unsigned int pos = row * wpl + col; 00397 unsigned int tempword; 00398 unsigned int destword; 00399 int i, siter, eiter; 00400 00401 //Ignore the execss 00402 if (row >= h || col >= wpl) 00403 return; 00404 
00405 destword = *(sword + pos); 00406 00407 //Set start position and end position considering the boundary conditions 00408 siter = (row - yn) < 0 ? 0 : (row - yn); 00409 eiter = (row >= (h - yp)) ? (h - 1) : (row + yp); 00410 00411 for (i = siter; i <= eiter; i++) 00412 { 00413 tempword = *(sword + i*wpl + col); 00414 00415 destword |= tempword; 00416 } 00417 00418 *(dword + pos) = destword; 00419 }\n 00420 ) 00421 00422 KERNEL( 00423 \n__kernel void morphoErodeHor_5x5(__global int *sword,__global int *dword, 00424 const int wpl, const int h) 00425 { 00426 const unsigned int pos = get_global_id(0); 00427 unsigned int prevword, nextword, currword,tempword; 00428 unsigned int destword; 00429 const int col = pos % wpl; 00430 00431 //Ignore the execss 00432 if (pos >= (wpl * h)) 00433 return; 00434 00435 currword = *(sword + pos); 00436 destword = currword; 00437 00438 //Handle boundary conditions 00439 if(col==0) 00440 prevword=0xffffffff; 00441 else 00442 prevword = *(sword + pos - 1); 00443 00444 if(col==(wpl - 1)) 00445 nextword=0xffffffff; 00446 else 00447 nextword = *(sword + pos + 1); 00448 00449 //Loop unrolled 00450 00451 //1 bit to left and 1 bit to right 00452 //Get the min value on LHS of every pixel 00453 tempword = (prevword << (31)) | ((currword >> 1)); 00454 destword &= tempword; 00455 //Get min value on RHS of every pixel 00456 tempword = (currword << 1) | (nextword >> (31)); 00457 destword &= tempword; 00458 00459 //2 bit to left and 2 bit to right 00460 //Get the min value on LHS of every pixel 00461 tempword = (prevword << (30)) | ((currword >> 2)); 00462 destword &= tempword; 00463 //Get min value on RHS of every pixel 00464 tempword = (currword << 2) | (nextword >> (30)); 00465 destword &= tempword; 00466 00467 00468 *(dword + pos) = destword; 00469 00470 }\n 00471 ) 00472 00473 KERNEL( 00474 \n__kernel void morphoErodeVer_5x5(__global int *sword,__global int *dword, 00475 const int wpl, const int h, 00476 const int fwmask, const int lwmask) 
00477 { 00478 const int col = get_global_id(0); 00479 const int row = get_global_id(1); 00480 const unsigned int pos = row * wpl + col; 00481 unsigned int tempword; 00482 unsigned int destword; 00483 int i; 00484 00485 //Ignore the execss 00486 if (row >= h || col >= wpl) 00487 return; 00488 00489 destword = *(sword + pos); 00490 00491 if (row < 2 || row >= (h - 2)) 00492 { 00493 destword = 0x0; 00494 } 00495 else 00496 { 00497 //2 words above 00498 //i = (row - 2) < 0 ? row : (row - 2); 00499 i = (row - 2); 00500 tempword = *(sword + i*wpl + col); 00501 destword &= tempword; 00502 00503 //1 word above 00504 //i = (row - 1) < 0 ? row : (row - 1); 00505 i = (row - 1); 00506 tempword = *(sword + i*wpl + col); 00507 destword &= tempword; 00508 00509 //1 word below 00510 //i = (row >= (h - 1)) ? row : (row + 1); 00511 i = (row + 1); 00512 tempword = *(sword + i*wpl + col); 00513 destword &= tempword; 00514 00515 //2 words below 00516 //i = (row >= (h - 2)) ? row : (row + 2); 00517 i = (row + 2); 00518 tempword = *(sword + i*wpl + col); 00519 destword &= tempword; 00520 00521 if (col == 0) 00522 { 00523 destword &= fwmask; 00524 } 00525 if (col == (wpl - 1)) 00526 { 00527 destword &= lwmask; 00528 } 00529 } 00530 00531 00532 *(dword + pos) = destword; 00533 }\n 00534 ) 00535 00536 KERNEL( 00537 \n__kernel void morphoErodeHor(__global int *sword,__global int *dword, const int xp, const int xn, const int wpl, 00538 const int h, const char isAsymmetric, const int rwmask, const int lwmask) 00539 { 00540 const int col = get_global_id(0); 00541 const int row = get_global_id(1); 00542 const unsigned int pos = row * wpl + col; 00543 unsigned int parbitsxp, parbitsxn, nwords; 00544 unsigned int destword, tempword, lastword, currword; 00545 unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword; 00546 int i, j, siter, eiter; 00547 00548 //Ignore the execss 00549 if (pos >= (wpl*h) || (xn < 1 && xp < 1)) 00550 return; 00551 00552 currword = *(sword + pos); 
00553 destword = currword; 00554 00555 parbitsxp = xp & 31; 00556 parbitsxn = xn & 31; 00557 nwords = xp >> 5; 00558 00559 if (parbitsxp > 0) 00560 nwords += 1; 00561 else 00562 parbitsxp = 31; 00563 00564 siter = (col - nwords); 00565 eiter = (col + nwords); 00566 00567 //Get prev word 00568 if (col==0) 00569 firstword = 0xffffffff; 00570 else 00571 firstword = *(sword + pos - 1); 00572 00573 //Get next word 00574 if (col == (wpl - 1)) 00575 secondword = 0xffffffff; 00576 else 00577 secondword = *(sword + pos + 1); 00578 00579 //Last partial bits on either side 00580 for (i = 1; i <= parbitsxp; i++) 00581 { 00582 //Get the max value on LHS of every pixel 00583 tempword = (firstword << (32-i)) | ((currword >> i)); 00584 destword &= tempword; 00585 00586 //Get max value on RHS of every pixel 00587 tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword << i) | (secondword >> (32 - i)); 00588 00589 //tempword = (currword << i) | (secondword >> (32 - i)); 00590 destword &= tempword; 00591 } 00592 00593 //Return if halfwidth <= 1 word 00594 if (nwords == 1) 00595 { 00596 if (xp == 32) 00597 { 00598 destword &= firstword; 00599 } 00600 if (xn == 32) 00601 { 00602 destword &= secondword; 00603 } 00604 00605 //Clear boundary pixels 00606 if (isAsymmetric) 00607 { 00608 if (col == 0) 00609 destword &= rwmask; 00610 if (col == (wpl - 1)) 00611 destword &= lwmask; 00612 } 00613 00614 *(dword + pos) = destword; 00615 return; 00616 } 00617 00618 if (siter < 0) 00619 firstword = 0xffffffff; 00620 else 00621 firstword = *(sword + row*wpl + siter); 00622 00623 if (eiter >= wpl) 00624 lastword = 0xffffffff; 00625 else 00626 lastword = *(sword + row*wpl + eiter); 00627 00628 00629 for ( i = 1; i < nwords; i++) 00630 { 00631 //Gets LHS words 00632 if ((siter + i) < 0) 00633 secondword = 0xffffffff; 00634 else 00635 secondword = *(sword + row*wpl + siter + i); 00636 00637 lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp); 00638 00639 
firstword = secondword; 00640 00641 if ((siter + i + 1) < 0) 00642 secondword = 0xffffffff; 00643 else 00644 secondword = *(sword + row*wpl + siter + i + 1); 00645 00646 lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp); 00647 00648 //Gets RHS words 00649 if ((eiter - i) >= wpl) 00650 firstword = 0xffffffff; 00651 else 00652 firstword = *(sword + row*wpl + eiter - i); 00653 00654 rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn); 00655 00656 lastword = firstword; 00657 if ((eiter - i - 1) >= wpl) 00658 firstword = 0xffffffff; 00659 else 00660 firstword = *(sword + row*wpl + eiter - i - 1); 00661 00662 rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn); 00663 00664 for (j = 0; j < 32; j++) 00665 { 00666 //OR LHS full words 00667 tempword = (lprevword << j) | (lnextword >> (32 - j)); 00668 destword &= tempword; 00669 00670 //OR RHS full words 00671 tempword = (rprevword << j) | (rnextword >> (32 - j)); 00672 destword &= tempword; 00673 } 00674 00675 destword &= lprevword; 00676 destword &= lnextword; 00677 destword &= rprevword; 00678 destword &= rnextword; 00679 00680 lastword = firstword; 00681 firstword = secondword; 00682 } 00683 00684 if (isAsymmetric) 00685 { 00686 //Clear boundary pixels 00687 if (col < (nwords - 1)) 00688 destword = 0x0; 00689 else if (col == (nwords - 1)) 00690 destword &= rwmask; 00691 else if (col > (wpl - nwords)) 00692 destword = 0x0; 00693 else if (col == (wpl - nwords)) 00694 destword &= lwmask; 00695 } 00696 00697 *(dword + pos) = destword; 00698 }\n 00699 ) 00700 00701 KERNEL( 00702 \n__kernel void morphoErodeHor_32word(__global int *sword,__global int *dword, 00703 const int halfwidth, const int wpl, 00704 const int h, const char clearBoundPixH, 00705 const int rwmask, const int lwmask, 00706 const char isEven) 00707 { 00708 const int col = get_global_id(0); 00709 const int row = get_global_id(1); 00710 const unsigned int pos = row * wpl + col; 00711 unsigned int prevword, nextword, 
currword,tempword, destword; 00712 int i; 00713 00714 //Ignore the execss 00715 if (pos >= (wpl * h)) 00716 return; 00717 00718 currword = *(sword + pos); 00719 destword = currword; 00720 00721 //Handle boundary conditions 00722 if(col==0) 00723 prevword=0xffffffff; 00724 else 00725 prevword = *(sword + pos - 1); 00726 00727 if(col==(wpl - 1)) 00728 nextword=0xffffffff; 00729 else 00730 nextword = *(sword + pos + 1); 00731 00732 for (i = 1; i <= halfwidth; i++) 00733 { 00734 //Get the min value on LHS of every pixel 00735 tempword = (prevword << (32-i)) | ((currword >> i)); 00736 00737 destword &= tempword; 00738 00739 //Get min value on RHS of every pixel 00740 if (i == halfwidth && isEven) 00741 { 00742 tempword = 0xffffffff; 00743 } 00744 else 00745 { 00746 tempword = (currword << i) | (nextword >> (32 - i)); 00747 } 00748 00749 destword &= tempword; 00750 } 00751 00752 if (clearBoundPixH) 00753 { 00754 if (col == 0) 00755 { 00756 destword &= rwmask; 00757 } 00758 else if (col == (wpl - 1)) 00759 { 00760 destword &= lwmask; 00761 } 00762 } 00763 00764 *(dword + pos) = destword; 00765 }\n 00766 ) 00767 00768 KERNEL( 00769 \n__kernel void morphoErodeVer(__global int *sword,__global int *dword, 00770 const int yp, 00771 const int wpl, const int h, 00772 const char clearBoundPixV, const int yn) 00773 { 00774 const int col = get_global_id(0); 00775 const int row = get_global_id(1); 00776 const unsigned int pos = row * wpl + col; 00777 unsigned int tempword, destword; 00778 int i, siter, eiter; 00779 00780 //Ignore the execss 00781 if (row >= h || col >= wpl) 00782 return; 00783 00784 destword = *(sword + pos); 00785 00786 //Set start position and end position considering the boundary conditions 00787 siter = (row - yp) < 0 ? 0 : (row - yp); 00788 eiter = (row >= (h - yn)) ? 
(h - 1) : (row + yn); 00789 00790 for (i = siter; i <= eiter; i++) 00791 { 00792 tempword = *(sword + i*wpl + col); 00793 00794 destword &= tempword; 00795 } 00796 00797 //Clear boundary pixels 00798 if (clearBoundPixV && ((row < yp) || ((h - row) <= yn))) 00799 { 00800 destword = 0x0; 00801 } 00802 00803 *(dword + pos) = destword; 00804 }\n 00805 ) 00806 00807 // HistogramRect Kernel: Accumulate 00808 // assumes 4 channels, i.e., bytes_per_pixel = 4 00809 // assumes number of pixels is multiple of 8 00810 // data is layed out as 00811 // ch0 ch1 ... 00812 // bin0 bin1 bin2... bin0... 00813 // rpt0,1,2...256 rpt0,1,2... 00814 KERNEL( 00815 \n#define HIST_REDUNDANCY 256\n 00816 \n#define GROUP_SIZE 256\n 00817 \n#define HIST_SIZE 256\n 00818 \n#define NUM_CHANNELS 4\n 00819 \n#define HR_UNROLL_SIZE 8 \n 00820 \n#define HR_UNROLL_TYPE uchar8 \n 00821 00822 __attribute__((reqd_work_group_size(256, 1, 1))) 00823 __kernel 00824 void kernel_HistogramRectAllChannels( 00825 __global const uchar8 *data, 00826 uint numPixels, 00827 __global uint *histBuffer) { 00828 00829 // declare variables 00830 uchar8 pixels; 00831 int threadOffset = get_global_id(0)%HIST_REDUNDANCY; 00832 00833 // for each pixel/channel, accumulate in global memory 00834 for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS/HR_UNROLL_SIZE; pc += get_global_size(0) ) { 00835 pixels = data[pc]; 00836 // channel bin thread 00837 atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + pixels.s0*HIST_REDUNDANCY + threadOffset ]); // ch0 00838 atomic_inc( &histBuffer[ 0*HIST_SIZE*HIST_REDUNDANCY + pixels.s4*HIST_REDUNDANCY + threadOffset ]); // ch0 00839 atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + pixels.s1*HIST_REDUNDANCY + threadOffset ]); // ch1 00840 atomic_inc( &histBuffer[ 1*HIST_SIZE*HIST_REDUNDANCY + pixels.s5*HIST_REDUNDANCY + threadOffset ]); // ch1 00841 atomic_inc( &histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + pixels.s2*HIST_REDUNDANCY + threadOffset ]); // ch2 00842 atomic_inc( 
&histBuffer[ 2*HIST_SIZE*HIST_REDUNDANCY + pixels.s6*HIST_REDUNDANCY + threadOffset ]); // ch2 00843 atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + pixels.s3*HIST_REDUNDANCY + threadOffset ]); // ch3 00844 atomic_inc( &histBuffer[ 3*HIST_SIZE*HIST_REDUNDANCY + pixels.s7*HIST_REDUNDANCY + threadOffset ]); // ch3 00845 } 00846 } 00847 ) 00848 00849 KERNEL( 00850 // NUM_CHANNELS = 1 00851 __attribute__((reqd_work_group_size(256, 1, 1))) 00852 __kernel 00853 void kernel_HistogramRectOneChannel( 00854 __global const uchar8 *data, 00855 uint numPixels, 00856 __global uint *histBuffer) { 00857 00858 // declare variables 00859 uchar8 pixels; 00860 int threadOffset = get_global_id(0)%HIST_REDUNDANCY; 00861 00862 // for each pixel/channel, accumulate in global memory 00863 for ( uint pc = get_global_id(0); pc < numPixels/HR_UNROLL_SIZE; pc += get_global_size(0) ) { 00864 pixels = data[pc]; 00865 // bin thread 00866 atomic_inc( &histBuffer[ pixels.s0*HIST_REDUNDANCY + threadOffset ]); 00867 atomic_inc( &histBuffer[ pixels.s1*HIST_REDUNDANCY + threadOffset ]); 00868 atomic_inc( &histBuffer[ pixels.s2*HIST_REDUNDANCY + threadOffset ]); 00869 atomic_inc( &histBuffer[ pixels.s3*HIST_REDUNDANCY + threadOffset ]); 00870 atomic_inc( &histBuffer[ pixels.s4*HIST_REDUNDANCY + threadOffset ]); 00871 atomic_inc( &histBuffer[ pixels.s5*HIST_REDUNDANCY + threadOffset ]); 00872 atomic_inc( &histBuffer[ pixels.s6*HIST_REDUNDANCY + threadOffset ]); 00873 atomic_inc( &histBuffer[ pixels.s7*HIST_REDUNDANCY + threadOffset ]); 00874 } 00875 } 00876 ) 00877 00878 00879 KERNEL( 00880 // unused 00881 \n __attribute__((reqd_work_group_size(256, 1, 1))) 00882 \n __kernel 00883 \n void kernel_HistogramRectAllChannels_Grey( 00884 \n __global const uchar* data, 00885 \n uint numPixels, 00886 \n __global uint *histBuffer) { // each wg will write HIST_SIZE*NUM_CHANNELS into this result; cpu will accumulate across wg's 00887 \n 00888 \n /* declare variables */ 00889 \n 00890 \n // work indices 00891 
\n size_t groupId = get_group_id(0); 00892 \n size_t localId = get_local_id(0); // 0 -> 256-1 00893 \n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1 00894 \n uint numThreads = get_global_size(0); 00895 \n 00896 \n /* accumulate in global memory */ 00897 \n for ( uint pc = get_global_id(0); pc < numPixels; pc += get_global_size(0) ) { 00898 \n uchar value = data[ pc ]; 00899 \n int idx = value * get_global_size(0) + get_global_id(0); 00900 \n histBuffer[ idx ]++; 00901 \n 00902 \n } 00903 \n 00904 \n } // kernel_HistogramRectAllChannels_Grey 00905 00906 ) 00907 00908 // HistogramRect Kernel: Reduction 00909 // only supports 4 channels 00910 // each work group handles a single channel of a single histogram bin 00911 KERNEL( 00912 __attribute__((reqd_work_group_size(256, 1, 1))) 00913 __kernel 00914 void kernel_HistogramRectAllChannelsReduction( 00915 int n, // unused pixel redundancy 00916 __global uint *histBuffer, 00917 __global int* histResult) { 00918 00919 // declare variables 00920 int channel = get_group_id(0)/HIST_SIZE; 00921 int bin = get_group_id(0)%HIST_SIZE; 00922 int value = 0; 00923 00924 // accumulate in register 00925 for ( uint i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) { 00926 value += histBuffer[ channel*HIST_SIZE*HIST_REDUNDANCY+bin*HIST_REDUNDANCY+i]; 00927 } 00928 00929 // reduction in local memory 00930 __local int localHist[GROUP_SIZE]; 00931 localHist[get_local_id(0)] = value; 00932 barrier(CLK_LOCAL_MEM_FENCE); 00933 for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) { 00934 if (get_local_id(0) < stride) { 00935 value = localHist[ get_local_id(0)+stride]; 00936 } 00937 barrier(CLK_LOCAL_MEM_FENCE); 00938 if (get_local_id(0) < stride) { 00939 localHist[ get_local_id(0)] += value; 00940 } 00941 barrier(CLK_LOCAL_MEM_FENCE); 00942 } 00943 00944 // write reduction to final result 00945 if (get_local_id(0) == 0) { 00946 histResult[get_group_id(0)] = localHist[0]; 00947 } 00948 } // 
kernel_HistogramRectAllChannels 00949 ) 00950 00951 00952 KERNEL( 00953 // NUM_CHANNELS = 1 00954 __attribute__((reqd_work_group_size(256, 1, 1))) 00955 __kernel 00956 void kernel_HistogramRectOneChannelReduction( 00957 int n, // unused pixel redundancy 00958 __global uint *histBuffer, 00959 __global int* histResult) { 00960 00961 // declare variables 00962 // int channel = get_group_id(0)/HIST_SIZE; 00963 int bin = get_group_id(0)%HIST_SIZE; 00964 int value = 0; 00965 00966 // accumulate in register 00967 for ( int i = get_local_id(0); i < HIST_REDUNDANCY; i+=GROUP_SIZE) { 00968 value += histBuffer[ bin*HIST_REDUNDANCY+i]; 00969 } 00970 00971 // reduction in local memory 00972 __local int localHist[GROUP_SIZE]; 00973 localHist[get_local_id(0)] = value; 00974 barrier(CLK_LOCAL_MEM_FENCE); 00975 for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) { 00976 if (get_local_id(0) < stride) { 00977 value = localHist[ get_local_id(0)+stride]; 00978 } 00979 barrier(CLK_LOCAL_MEM_FENCE); 00980 if (get_local_id(0) < stride) { 00981 localHist[ get_local_id(0)] += value; 00982 } 00983 barrier(CLK_LOCAL_MEM_FENCE); 00984 } 00985 00986 // write reduction to final result 00987 if (get_local_id(0) == 0) { 00988 histResult[get_group_id(0)] = localHist[0]; 00989 } 00990 } // kernel_HistogramRectOneChannelReduction 00991 ) 00992 00993 00994 KERNEL( 00995 // unused 00996 // each work group (x256) handles a histogram bin 00997 \n __attribute__((reqd_work_group_size(256, 1, 1))) 00998 \n __kernel 00999 \n void kernel_HistogramRectAllChannelsReduction_Grey( 01000 \n int n, // pixel redundancy that needs to be accumulated 01001 \n __global uint *histBuffer, 01002 \n __global uint* histResult) { // each wg accumulates 1 bin 01003 \n 01004 \n /* declare variables */ 01005 \n 01006 \n // work indices 01007 \n size_t groupId = get_group_id(0); 01008 \n size_t localId = get_local_id(0); // 0 -> 256-1 01009 \n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1 01010 \n uint 
numThreads = get_global_size(0);
\n    unsigned int hist = 0;
\n
\n    /* accumulate in global memory */
\n    for ( uint p = 0; p < n; p+=GROUP_SIZE) {
\n        hist += histBuffer[ (get_group_id(0)*n + p)];
\n    }
\n
\n    /* reduction in local memory */
\n    // populate local memory
\n    __local unsigned int localHist[GROUP_SIZE];

\n    localHist[localId] = hist;
\n    barrier(CLK_LOCAL_MEM_FENCE);
\n
\n    for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
\n        if (localId < stride) {
\n            hist = localHist[ (localId+stride)];
\n        }
\n        barrier(CLK_LOCAL_MEM_FENCE);
\n        if (localId < stride) {
\n            localHist[ localId] += hist;
\n        }
\n        barrier(CLK_LOCAL_MEM_FENCE);
\n    }
\n
\n    if (localId == 0)
\n        histResult[get_group_id(0)] = localHist[0];
\n
\n} // kernel_HistogramRectAllChannelsReduction_Grey

)

// ThresholdRectToPix kernel
//   Thresholds an image with NUM_CHANNELS uchar values per pixel into a
//   packed image with one bit per pixel.
//   imageData is the input image, pix is the output image (1-bit/pixel).
//   Original note: "only supports 4 channels", "imageData is input image
//   (24-bits/pixel)".
//   NOTE(review): four uchar channels would be 32 bits/pixel, not 24 -
//   confirm the NUM_CHANNELS value this kernel is built with.
KERNEL(
// Number of uchar lanes in the uchar8 vector type used for loads.
\n#define CHAR_VEC_WIDTH 8 \n
// Number of pixels packed into one output word (one bit each).
\n#define PIXELS_PER_WORD 32 \n
// Number of pixels fetched together before their bits are computed.
\n#define PIXELS_PER_BURST 8 \n
// Bursts needed to fill one output word (32/8 = 4).
\n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n
// One burst of pixels, viewed either as individual channel bytes (s) or as
// the uchar8 vectors (v) in which it is loaded from imageData.
typedef union {
    uchar s[PIXELS_PER_BURST*NUM_CHANNELS];
    uchar8 v[(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH];
} charVec;

// Thresholds the input into packed output words.
//
// Each work-item handles the output words w = get_global_id(0),
// w + get_global_size(0), ... while w < wpl*height. For every word it reads
// BURSTS_PER_WORD bursts of PIXELS_PER_BURST pixels and sets one bit per
// pixel; the first pixel of the word lands in the most significant bit.
//
// A pixel bit is set when at least one channel c has hi_values[c] >= 0 and
// (pixel[c] > thresholds[c]) == (hi_values[c] == 0). So hi_values[c] == 0
// selects values above the threshold, hi_values[c] > 0 selects values at or
// below it, and a negative hi_values[c] makes channel c never set a bit.
//
// Parameters:
//   imageData  - input pixels, read as uchar8 vectors
//   height     - row count; only used in the word count wpl*height
//   width      - not referenced by the kernel body
//   wpl        - words per line of the output image
//   thresholds - per-channel thresholds; NUM_CHANNELS entries are read
//   hi_values  - per-channel selectors; NUM_CHANNELS entries are read
//   pix        - output words; indices below wpl*height are written
//
// NOTE(review): the input index depends only on w, i.e. the input is
// addressed as if every row held exactly wpl*32 pixels - presumably the
// host pads rows accordingly; confirm against the caller.
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_ThresholdRectToPix(
    __global const uchar8 *imageData,
    int height,
    int width,
    int wpl, // words per line
    __global int *thresholds,
    __global int *hi_values,
    __global int *pix) {

    // declare variables
    // (function-scope copies of the per-channel parameters)
    int pThresholds[NUM_CHANNELS];
    int pHi_Values[NUM_CHANNELS];
    for ( int i = 0; i < NUM_CHANNELS; i++) {
        pThresholds[i] = thresholds[i];
        pHi_Values[i] = hi_values[i];
    }

    // for each word (32 pixels) in output image
    for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
        unsigned int word = 0; // all bits start at zero

        // for each burst in word
        for ( int b = 0; b < BURSTS_PER_WORD; b++) {

            // load burst
            // (index = vectors per word * w + vectors per burst * b + i)
            charVec pixels;
            for ( int i = 0; i < (PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH; i++ ) {
                pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + i];
            }

            // for each pixel in burst
            for ( int p = 0; p < PIXELS_PER_BURST; p++) {
                for ( int c = 0; c < NUM_CHANNELS; c++) {
                    unsigned char pixChan = pixels.s[p*NUM_CHANNELS + c];
                    if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
                        // b*PIXELS_PER_BURST+p runs over 0..31 here, so the
                        // &31 mask never changes it; pixel k sets bit 31-k.
                        word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
                    }
                }
            }
        }
        pix[w] = word;
    }
}

// only supports 1 channel
// One burst of single-channel pixels: eight bytes (s) or one uchar8 (v).
typedef union {
    uchar s[PIXELS_PER_BURST];
    uchar8 v[(PIXELS_PER_BURST)/CHAR_VEC_WIDTH];
} charVec1;

// Single-channel variant of kernel_ThresholdRectToPix.
//
// Same word/burst/pixel traversal and the same bit rule, but with the
// channel count fixed at 1: only thresholds[0] and hi_values[0] are read and
// every pixel is one uchar. The loops with bound 1 run exactly once.
//
// Parameters:
//   imageData  - input pixels, read as uchar8 vectors (8 pixels each)
//   height     - row count; only used in the word count wpl*height
//   width      - not referenced by the kernel body
//   wpl        - words per line of the output image
//   thresholds - threshold; only element 0 is read
//   hi_values  - selector; only element 0 is read
//   pix        - output words; indices below wpl*height are written
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_ThresholdRectToPix_OneChan(
    __global const uchar8 *imageData,
    int height,
    int width,
    int wpl, // words per line
    __global int *thresholds,
    __global int *hi_values,
    __global int *pix) {

    // declare variables
    int pThresholds[1];
    int pHi_Values[1];
    for ( int i = 0; i < 1; i++) {
        pThresholds[i] = thresholds[i];
        pHi_Values[i] = hi_values[i];
    }

    // for each word (32 pixels) in output image
    for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
        unsigned int word = 0; // all bits start at zero

        // for each burst in word
        for ( int b = 0; b < BURSTS_PER_WORD; b++) {

            // load burst
            // (with the macro values above the index is w*4 + b + i)
            charVec1 pixels;
            for ( int i = 0; i < (PIXELS_PER_BURST)/CHAR_VEC_WIDTH; i++ ) {
                pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + i];
            }

            // for each pixel in burst
            for ( int p = 0; p < PIXELS_PER_BURST; p++) {
                for ( int c = 0; c < 1; c++) {
                    unsigned char pixChan = pixels.s[p + c];
                    if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) {
                        // pixel k of the word sets bit 31-k
                        word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
                    }
                }
            }
        }
        pix[w] = word;
    }
}
)

; // close char*

#endif // USE_EXTERNAL_KERNEL
#endif //_OCL_KERNEL_H_
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

// Alternative histogram kernel written to use uchar and a different,
// scattered global-memory write pattern. It was a little better on Intel
// platforms but still not faster than native serial code.
#if 0
/* data laid out as
bin0    bin1    bin2...
01167 r,g,b,a,r,g,b,a,r,g,b,a nthreads/4 copies 01168 */ 01169 \n__attribute__((reqd_work_group_size(256, 1, 1))) 01170 \n __kernel 01171 \n void kernel_HistogramRectAllChannels_uchar( 01172 \n volatile __global const uchar *data, 01173 \n uint numPixels, 01174 \n volatile __global uint *histBuffer) { 01175 \n 01176 \n // for each pixel/channel, accumulate in global memory 01177 \n for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS; pc += get_global_size(0) ) { 01178 \n uchar value = data[pc]; 01179 \n int idx = value*get_global_size(0) + get_global_id(0); 01180 \n histBuffer[ idx ]++; // coalesced if same value 01181 \n } 01182 \n } // kernel_HistogramRectAllChannels 01183 \n 01184 \n __attribute__((reqd_work_group_size(256, 1, 1))) 01185 \n __kernel 01186 \n void kernel_HistogramRectAllChannelsReduction_uchar( 01187 \n int n, // pixel redundancy that needs to be accumulated = nthreads/4 01188 \n __global uint4 *histBuffer, 01189 \n __global uint* histResult) { // each wg accumulates 1 bin (all channels within it 01190 \n 01191 \n // declare variables 01192 \n int binIdx = get_group_id(0); 01193 \n size_t groupId = get_group_id(0); 01194 \n size_t localId = get_local_id(0); // 0 -> 256-1 01195 \n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1 01196 \n uint numThreads = get_global_size(0); 01197 \n uint4 hist = {0, 0, 0, 0}; 01198 \n 01199 \n // accumulate in register 01200 \n for ( uint p = get_local_id(0); p < n; p+=GROUP_SIZE) { 01201 \n hist += histBuffer[binIdx*n+p]; 01202 \n } 01203 \n 01204 \n // reduction in local memory 01205 \n __local uint4 localHist[GROUP_SIZE]; 01206 \n localHist[localId] = hist; 01207 \n barrier(CLK_LOCAL_MEM_FENCE); 01208 \n 01209 \n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) { 01210 \n if (localId < stride) { 01211 \n hist = localHist[ localId+stride]; 01212 \n } 01213 \n barrier(CLK_LOCAL_MEM_FENCE); 01214 \n if (localId < stride) { 01215 \n localHist[ localId] += hist; 01216 \n } 01217 \n 
barrier(CLK_LOCAL_MEM_FENCE); 01218 \n } 01219 \n 01220 \n // write reduction to final result 01221 \n if (localId == 0) { 01222 \n histResult[0*HIST_SIZE+binIdx] = localHist[0].s0; 01223 \n histResult[1*HIST_SIZE+binIdx] = localHist[0].s1; 01224 \n histResult[2*HIST_SIZE+binIdx] = localHist[0].s2; 01225 \n histResult[3*HIST_SIZE+binIdx] = localHist[0].s3; 01226 \n } 01227 \n 01228 \n } // kernel_HistogramRectAllChannels 01229 #endif