tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/opencl/oclkernels.h
Go to the documentation of this file.
00001 
00002 #ifndef _OCL_KERNEL_H_
00003 #define _OCL_KERNEL_H_
00004 #ifndef USE_EXTERNAL_KERNEL
00005 #define KERNEL( ... )# __VA_ARGS__ "\n"
00006 // Double precision is a default of spreadsheets
00007 // cl_khr_fp64: Khronos extension
00008 // cl_amd_fp64: AMD extension
00009 // use build option outside to define fp_t
00011 const char *kernel_src = KERNEL(
00012 \n#ifdef KHR_DP_EXTENSION\n
00013 \n#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n
00014 \n#elif AMD_DP_EXTENSION\n
00015 \n#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n
00016 \n#else\n
00017 \n#endif\n
// Repacks one pixel per work-item. The three low bytes of the source word
// (bits 0-7, 8-15 and 16-23) are written, in that order, to the three most
// significant bytes of the destination word. Source rows are w words apart,
// destination rows are wpl words apart.
__kernel void composeRGBPixel(__global uint *tiffdata, int w, int h,int wpl, __global uint *output)
{
    const int y = get_global_id(1);
    const int x = get_global_id(0);

    // Work-items that fall outside the w x h image have nothing to do
    if (y >= h || x >= w)
        return;

    const int packed = tiffdata[y * w + x];
    const int red   = packed & 0xff;
    const int green = (packed >> 8) & 0xff;
    const int blue  = (packed >> 16) & 0xff;

    // Byte positions are counted from the most significant byte of the word
    output[y * wpl + x] = (red   << (8 * (sizeof(uint) - 1 - 0)))
                        | (green << (8 * (sizeof(uint) - 1 - 1)))
                        | (blue  << (8 * (sizeof(uint) - 1 - 2)));
}
00034 )
00035 
00036 KERNEL(
// In-place set difference over a wpl x h grid of words, one word per
// work-item: every bit that is set in sword is cleared in the matching
// word of dword.
\n__kernel void pixSubtract_inplace(__global int *dword, __global int *sword,
                            const int wpl, const int h)
{
    const unsigned int y = get_global_id(1);
    const unsigned int x = get_global_id(0);
    const unsigned int idx = y * wpl + x;

    // Only work-items that map onto the grid touch memory
    if (y < h && x < wpl)
        dword[idx] &= ~sword[idx];
}\n
00050 )
00051 
00052 KERNEL(
// Set difference over a wpl x h grid of words, one word per work-item:
// outword = dword & ~sword. Neither input is modified.
\n__kernel void pixSubtract(__global int *dword, __global int *sword,
                            const int wpl, const int h, __global int *outword)
{
    const unsigned int y = get_global_id(1);
    const unsigned int x = get_global_id(0);
    const unsigned int idx = y * wpl + x;

    // Only work-items that map onto the grid touch memory
    if (y < h && x < wpl)
        outword[idx] = dword[idx] & ~sword[idx];
}\n
00066 )
00067 
00068 KERNEL(
// Bitwise AND over a wpl x h grid of words, one word per work-item:
// outword = dword & sword.
\n__kernel void pixAND(__global int *dword, __global int *sword, __global int *outword,
                            const int wpl, const int h)
{
    const unsigned int y = get_global_id(1);
    const unsigned int x = get_global_id(0);
    const unsigned int idx = y * wpl + x;

    // Only work-items that map onto the grid touch memory
    if (y < h && x < wpl)
        outword[idx] = dword[idx] & sword[idx];
}\n
00082 )
00083 
00084 KERNEL(
// Bitwise OR over a wpl x h grid of words, one word per work-item:
// outword = dword | sword.
\n__kernel void pixOR(__global int *dword, __global int *sword, __global int *outword,
                            const int wpl, const int h)
{
    const unsigned int y = get_global_id(1);
    const unsigned int x = get_global_id(0);
    const unsigned int idx = y * wpl + x;

    // Only work-items that map onto the grid touch memory
    if (y < h && x < wpl)
        outword[idx] = dword[idx] | sword[idx];
}\n
00098 )
00099 
00100 KERNEL(
// Horizontal dilation by two bits on either side, one 32-bit word per
// work-item of a 1-D launch over wpl * h words. Each output word is the OR of
// the source word with copies of itself shifted by 1 and 2 bits in both
// directions. Bits shifted in come from the neighbouring words of the same
// row; past the first or last word of a row they are zero.
\n__kernel void morphoDilateHor_5x5(__global int *sword,__global int *dword,
                            const int wpl, const int h)
{
    const unsigned int pos = get_global_id(0);
    const int col = pos % wpl;
    int shift;

    //Ignore the excess
    if (pos >= (wpl * h))
        return;

    const unsigned int centre = sword[pos];

    //Neighbouring words; zero beyond either end of the row
    const unsigned int left  = (col == 0)         ? 0 : sword[pos - 1];
    const unsigned int right = (col == (wpl - 1)) ? 0 : sword[pos + 1];

    unsigned int result = centre;
    for (shift = 1; shift <= 2; shift++)
    {
        //Source shifted right by shift bits, topped up from the left word
        result |= (left << (32 - shift)) | (centre >> shift);
        //Source shifted left by shift bits, topped up from the right word
        result |= (centre << shift) | (right >> (32 - shift));
    }

    dword[pos] = result;
}\n
00150 )
00151 
00152 KERNEL(
// Vertical dilation over five rows, one word per work-item at (col, row):
// the output word is the OR of the source words in the same column of rows
// row-2 .. row+2. A neighbour row that would fall outside [0, h) is replaced
// by the current row.
\n__kernel void morphoDilateVer_5x5(__global int *sword,__global int *dword,
                            const int wpl, const int h)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const unsigned int pos = row * wpl + col;

    //Ignore the excess
    if (row >= h || col >= wpl)
        return;

    int above2 = row - 2;
    int above1 = row - 1;
    int below1 = row + 1;
    int below2 = row + 2;

    //Fall back to the current row wherever a neighbour is out of range
    if (above2 < 0)
        above2 = row;
    if (above1 < 0)
        above1 = row;
    if (row >= (h - 1))
        below1 = row;
    if (row >= (h - 2))
        below2 = row;

    unsigned int result = sword[pos];
    result |= sword[above2 * wpl + col];
    result |= sword[above1 * wpl + col];
    result |= sword[below1 * wpl + col];
    result |= sword[below2 * wpl + col];

    dword[pos] = result;
}\n
00191 )
00192 
00193 KERNEL(
// Horizontal dilation of arbitrary width, one 32-bit word per work-item at
// (col, row). The output word is the OR of the source word with shifted
// copies of itself and of neighbouring words in the same row; words outside
// the row are taken as zero.
//   xp, xn : extents of the structuring element on the two sides; only
//            xp determines how many neighbouring words are visited
//   wpl, h : words per row and number of rows
// Nothing is written when pos is out of range or when both xp and xn are < 1.
00194 \n__kernel void morphoDilateHor(__global int *sword,__global int *dword,const int xp, const int xn, const int wpl, const int h)
00195 {
00196     const int col = get_global_id(0);
00197     const int row = get_global_id(1);
00198     const unsigned int pos = row * wpl + col;
00199     unsigned int parbitsxp, parbitsxn, nwords;
00200     unsigned int destword, tempword, lastword, currword;
00201     unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
00202     int i, j, siter, eiter;
00203     
00204     //Ignore the excess
00205     if (pos >= (wpl*h) || (xn < 1 && xp < 1))
00206         return;
00207 
00208     currword = *(sword + pos);
00209     destword = currword;
00210 
    //parbitsxp / parbitsxn: xp / xn modulo 32
    //nwords: xp / 32, rounded up when xp is not a multiple of 32
00211     parbitsxp = xp & 31;
00212     parbitsxn = xn & 31;
00213     nwords = xp >> 5;
00214 
00215     if (parbitsxp > 0)
00216         nwords += 1;
00217     else
00218         parbitsxp = 31;
00219 
    //First and last word columns reached on either side of this word
00220     siter = (col - nwords);
00221     eiter = (col + nwords);
00222 
00223     //Get prev word
00224     if (col==0)
00225         firstword = 0x0;
00226     else
00227         firstword = *(sword + pos - 1);
00228     
00229     //Get next word
00230     if (col == (wpl - 1))
00231         secondword = 0x0;
00232     else
00233         secondword = *(sword + pos + 1);
00234 
00235     //Last partial bits on either side
00236     for (i = 1; i <= parbitsxp; i++)
00237     {
00238         //Get the max value on LHS of every pixel
        //(the final right shift is skipped when the two partial widths differ)
00239         tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0x0 : (firstword << (32-i)) | ((currword >> i));
00240         
00241         destword |= tempword;
00242 
00243         //Get max value on RHS of every pixel
00244         tempword = (currword << i) | (secondword >> (32 - i));
00245         destword |= tempword;
00246     }
00247 
00248     //Return if halfwidth <= 1 word
00249     if (nwords == 1)
00250     {
        //An extent of exactly 32 bits also covers the whole adjacent word
00251         if (xn == 32)
00252         {
00253             destword |= firstword;
00254         }
00255         if (xp == 32)
00256         {
00257             destword |= secondword;
00258         }
00259 
00260         *(dword + pos) = destword;
00261         return;
00262     }
00263 
    //Outermost words on each side; zero when they lie outside the row
00264     if (siter < 0)
00265         firstword = 0x0;
00266     else
00267         firstword = *(sword + row*wpl + siter);
00268 
00269     if (eiter >= wpl)   
00270         lastword = 0x0;
00271     else
00272         lastword = *(sword + row*wpl + eiter);
00273     
00274     for ( i = 1; i < nwords; i++)
00275     {
00276         //Gets LHS words
00277         if ((siter + i) < 0)
00278             secondword = 0x0;
00279         else
00280             secondword = *(sword + row*wpl + siter + i);
00281 
        //NOTE(review): when parbitsxn is 0 the left shift below is by 32 bits - confirm this case cannot occur or is intended
00282         lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
00283         
00284         firstword = secondword;
00285 
00286         if ((siter + i + 1) < 0)
00287             secondword = 0x0;
00288         else
00289             secondword = *(sword + row*wpl + siter + i + 1);
00290         
00291         lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn;
00292 
00293         //Gets RHS words
00294         if ((eiter - i) >= wpl)
00295             firstword = 0x0;
00296         else
00297             firstword = *(sword + row*wpl + eiter - i);
00298             
00299         rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
00300 
00301         lastword = firstword;
00302         if ((eiter - i - 1) >= wpl)
00303             firstword = 0x0;
00304         else
00305             firstword = *(sword + row*wpl + eiter - i - 1);
00306 
00307         rprevword = firstword << parbitsxp | lastword >> (32 - parbitsxp);
00308 
        //Shifts of 1..31 bits; the unshifted words are ORed in after the loop
00309         for (j = 1; j < 32; j++)
00310         {
00311             //OR LHS full words
00312             tempword = (lprevword << j) | (lnextword >> (32 - j));
00313             destword |= tempword;
00314 
00315             //OR RHS full words
00316             tempword = (rprevword << j) | (rnextword >> (32 - j));
00317             destword |= tempword;
00318         }
00319 
00320         destword |= lprevword;
00321         destword |= lnextword;
00322         destword |= rprevword;
00323         destword |= rnextword;
00324 
        //Carry the innermost words read so far into the next iteration
00325         lastword = firstword;
00326         firstword = secondword;
00327     }
00328     
00329     *(dword + pos) = destword;
00330 }\n
00331 )
00332 
00333 KERNEL(
// Horizontal dilation that only looks at the immediately adjacent words, one
// 32-bit word per work-item at (col, row). The source word is ORed with
// copies of itself shifted by 1..halfwidth bits in both directions; bits
// shifted in come from the previous/next word of the row (zero outside the
// row). When isEven is non-zero the right shift by exactly halfwidth bits is
// left out.
\n__kernel void morphoDilateHor_32word(__global int *sword,__global int *dword,
                            const int halfwidth,
                            const int wpl, const int h,
                            const char isEven)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const unsigned int pos = row * wpl + col;
    int shift;

    //Ignore the excess
    if (pos >= (wpl * h))
        return;

    const unsigned int centre = sword[pos];

    //Neighbouring words; zero beyond either end of the row
    const unsigned int left  = (col == 0)         ? 0 : sword[pos - 1];
    const unsigned int right = (col == (wpl - 1)) ? 0 : sword[pos + 1];

    unsigned int result = centre;
    for (shift = 1; shift <= halfwidth; shift++)
    {
        //Right-shifted copy, skipped on the last step of an even-sized element
        if (!(shift == halfwidth && isEven))
            result |= (left << (32 - shift)) | (centre >> shift);

        //Left-shifted copy
        result |= (centre << shift) | (right >> (32 - shift));
    }

    dword[pos] = result;
}\n
00386 )
00387 
00388 KERNEL(
// Vertical dilation, one word per work-item at (col, row): the output word is
// the OR of the source words in the same column of rows row-yn .. row+yp,
// with that range clamped to [0, h-1].
\n__kernel void morphoDilateVer(__global int *sword,__global int *dword,
                            const int yp,
                            const int wpl, const int h,
                            const int yn)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const unsigned int pos = row * wpl + col;
    int first, last, r;

    //Ignore the excess
    if (row >= h || col >= wpl)
        return;

    //Clamp the row range to the image
    if ((row - yn) < 0)
        first = 0;
    else
        first = row - yn;

    if (row >= (h - yp))
        last = h - 1;
    else
        last = row + yp;

    unsigned int result = sword[pos];
    r = first;
    while (r <= last)
    {
        result |= sword[r * wpl + col];
        r++;
    }

    dword[pos] = result;
}\n
00420 )
00421 
00422 KERNEL(
// Horizontal erosion by two bits on either side, one 32-bit word per
// work-item of a 1-D launch over wpl * h words. Each output word is the AND of
// the source word with copies of itself shifted by 1 and 2 bits in both
// directions. Bits shifted in come from the neighbouring words of the same
// row; past the first or last word of a row they are all ones.
\n__kernel void morphoErodeHor_5x5(__global int *sword,__global int *dword,
                            const int wpl, const int h)
{
    const unsigned int pos = get_global_id(0);
    const int col = pos % wpl;
    int shift;

    //Ignore the excess
    if (pos >= (wpl * h))
        return;

    const unsigned int centre = sword[pos];

    //Neighbouring words; all ones beyond either end of the row
    const unsigned int left  = (col == 0)         ? 0xffffffff : sword[pos - 1];
    const unsigned int right = (col == (wpl - 1)) ? 0xffffffff : sword[pos + 1];

    unsigned int result = centre;
    for (shift = 1; shift <= 2; shift++)
    {
        //Source shifted right by shift bits, topped up from the left word
        result &= (left << (32 - shift)) | (centre >> shift);
        //Source shifted left by shift bits, topped up from the right word
        result &= (centre << shift) | (right >> (32 - shift));
    }

    dword[pos] = result;
}\n
00471 )
00472 
00473 KERNEL(
// Vertical erosion over five rows, one word per work-item at (col, row).
// Rows closer than two rows to the top or bottom edge are written as zero.
// Every other output word is the AND of the source words in the same column
// of rows row-2 .. row+2; the first word of a row is additionally masked with
// fwmask and the last word with lwmask.
\n__kernel void morphoErodeVer_5x5(__global int *sword,__global int *dword,
                            const int wpl, const int h,
                            const int fwmask, const int lwmask)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const unsigned int pos = row * wpl + col;
    unsigned int result;
    int dy;

    //Ignore the excess
    if (row >= h || col >= wpl)
        return;

    //Not enough rows above or below: the whole word is cleared
    if (row < 2 || row >= (h - 2))
    {
        dword[pos] = 0x0;
        return;
    }

    result = sword[pos];
    for (dy = -2; dy <= 2; dy++)
    {
        //The current row is already in result
        if (dy != 0)
            result &= sword[(row + dy) * wpl + col];
    }

    if (col == 0)
        result &= fwmask;
    if (col == (wpl - 1))
        result &= lwmask;

    dword[pos] = result;
}\n
00534 )
00535 
00536 KERNEL(
// Horizontal erosion of arbitrary width, one 32-bit word per work-item at
// (col, row). The output word is the AND of the source word with shifted
// copies of itself and of neighbouring words in the same row; words outside
// the row are taken as all ones.
//   xp, xn         : extents of the structuring element on the two sides;
//                    only xp determines how many neighbouring words are visited
//   wpl, h         : words per row and number of rows
//   isAsymmetric   : when non-zero, words near the row ends are masked with
//                    rwmask / lwmask or cleared
//   rwmask, lwmask : masks applied to the boundary words
// Nothing is written when pos is out of range or when both xp and xn are < 1.
\n__kernel void morphoErodeHor(__global int *sword,__global int *dword, const int xp, const int xn, const int wpl,
                                const int h, const char isAsymmetric, const int rwmask, const int lwmask)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const unsigned int pos = row * wpl + col;
    unsigned int parbitsxp, parbitsxn, nwords;
    unsigned int destword, tempword, lastword, currword;
    unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword;
    int i, j, siter, eiter;

    //Ignore the excess
    if (pos >= (wpl*h) || (xn < 1 && xp < 1))
        return;

    currword = *(sword + pos);
    destword = currword;

    //parbitsxp / parbitsxn: xp / xn modulo 32
    //nwords: xp / 32, rounded up when xp is not a multiple of 32
    parbitsxp = xp & 31;
    parbitsxn = xn & 31;
    nwords = xp >> 5;

    if (parbitsxp > 0)
        nwords += 1;
    else
        parbitsxp = 31;

    //First and last word columns reached on either side of this word
    siter = (col - nwords);
    eiter = (col + nwords);

    //Get prev word
    if (col==0)
        firstword = 0xffffffff;
    else
        firstword = *(sword + pos - 1);

    //Get next word
    if (col == (wpl - 1))
        secondword = 0xffffffff;
    else
        secondword = *(sword + pos + 1);

    //Last partial bits on either side
    for (i = 1; i <= parbitsxp; i++)
    {
        //Get the min value on LHS of every pixel
        tempword = (firstword << (32-i)) | ((currword >> i));
        destword &= tempword;

        //Get min value on RHS of every pixel
        //(the final left shift is skipped when the two partial widths differ)
        tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 0xffffffff : (currword << i) | (secondword >> (32 - i));
        destword &= tempword;
    }

    //Return if halfwidth <= 1 word
    if (nwords == 1)
    {
        //An extent of exactly 32 bits also covers the whole adjacent word
        if (xp == 32)
        {
            destword &= firstword;
        }
        if (xn == 32)
        {
            destword &= secondword;
        }

        //Clear boundary pixels
        if (isAsymmetric)
        {
            if (col == 0)
                destword &= rwmask;
            if (col == (wpl - 1))
                destword &= lwmask;
        }

        *(dword + pos) = destword;
        return;
    }

    //Outermost words on each side; all ones when they lie outside the row
    if (siter < 0)
        firstword = 0xffffffff;
    else
        firstword = *(sword + row*wpl + siter);

    if (eiter >= wpl)
        lastword = 0xffffffff;
    else
        lastword = *(sword + row*wpl + eiter);

    for ( i = 1; i < nwords; i++)
    {
        //Gets LHS words
        if ((siter + i) < 0)
            secondword = 0xffffffff;
        else
            secondword = *(sword + row*wpl + siter + i);

        lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);

        firstword = secondword;

        if ((siter + i + 1) < 0)
            secondword = 0xffffffff;
        else
            secondword = *(sword + row*wpl + siter + i + 1);

        lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp);

        //Gets RHS words
        if ((eiter - i) >= wpl)
            firstword = 0xffffffff;
        else
            firstword = *(sword + row*wpl + eiter - i);

        //NOTE(review): when parbitsxn is 0 the right shift below is by 32 bits - confirm this case cannot occur or is intended
        rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn);

        lastword = firstword;
        if ((eiter - i - 1) >= wpl)
            firstword = 0xffffffff;
        else
            firstword = *(sword + row*wpl + eiter - i - 1);

        rprevword = firstword << parbitsxn | lastword >> (32 - parbitsxn);

        //Shifts of 1..31 bits only. A pass with j == 0 would shift right by
        //the full word width and could not change the result, because the
        //unshifted words are ANDed in right after the loop.
        for (j = 1; j < 32; j++)
        {
            //AND LHS full words
            tempword = (lprevword << j) | (lnextword >> (32 - j));
            destword &= tempword;

            //AND RHS full words
            tempword = (rprevword << j) | (rnextword >> (32 - j));
            destword &= tempword;
        }

        destword &= lprevword;
        destword &= lnextword;
        destword &= rprevword;
        destword &= rnextword;

        //Carry the innermost words read so far into the next iteration
        lastword = firstword;
        firstword = secondword;
    }

    if (isAsymmetric)
    {
        //Clear boundary pixels
        if (col < (nwords - 1))
            destword = 0x0;
        else if (col == (nwords - 1))
            destword &= rwmask;
        else if (col > (wpl - nwords))
            destword = 0x0;
        else if (col == (wpl - nwords))
            destword &= lwmask;
    }

    *(dword + pos) = destword;
}\n
00699 )
00700 
00701 KERNEL(
// Horizontal erosion that only looks at the immediately adjacent words, one
// 32-bit word per work-item at (col, row). The source word is ANDed with
// copies of itself shifted by 1..halfwidth bits in both directions; bits
// shifted in come from the previous/next word of the row (all ones outside
// the row). When isEven is non-zero the left shift by exactly halfwidth bits
// is left out. When clearBoundPixH is non-zero the first word of a row is
// masked with rwmask and the last word with lwmask.
\n__kernel void morphoErodeHor_32word(__global int *sword,__global int *dword,
                            const int halfwidth, const int wpl,
                            const int h, const char clearBoundPixH,
                            const int rwmask, const int lwmask,
                            const char isEven)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const unsigned int pos = row * wpl + col;
    int shift;

    //Ignore the excess
    if (pos >= (wpl * h))
        return;

    const unsigned int centre = sword[pos];

    //Neighbouring words; all ones beyond either end of the row
    const unsigned int left  = (col == 0)         ? 0xffffffff : sword[pos - 1];
    const unsigned int right = (col == (wpl - 1)) ? 0xffffffff : sword[pos + 1];

    unsigned int result = centre;
    for (shift = 1; shift <= halfwidth; shift++)
    {
        //Right-shifted copy
        result &= (left << (32 - shift)) | (centre >> shift);

        //Left-shifted copy, skipped on the last step of an even-sized element
        if (!(shift == halfwidth && isEven))
            result &= (centre << shift) | (right >> (32 - shift));
    }

    if (clearBoundPixH)
    {
        if (col == 0)
            result &= rwmask;
        else if (col == (wpl - 1))
            result &= lwmask;
    }

    dword[pos] = result;
}\n
00766 )
00767 
00768 KERNEL(
// Vertical erosion, one word per work-item at (col, row): the output word is
// the AND of the source words in the same column of rows row-yp .. row+yn,
// with that range clamped to [0, h-1]. When clearBoundPixV is non-zero, rows
// with row < yp or h - row <= yn are written as zero instead.
\n__kernel void morphoErodeVer(__global int *sword,__global int *dword,
                            const int yp,
                            const int wpl, const int h,
                            const char clearBoundPixV, const int yn)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const unsigned int pos = row * wpl + col;
    int first, last, r;

    //Ignore the excess
    if (row >= h || col >= wpl)
        return;

    //Clamp the row range to the image
    if ((row - yp) < 0)
        first = 0;
    else
        first = row - yp;

    if (row >= (h - yn))
        last = h - 1;
    else
        last = row + yn;

    unsigned int result = sword[pos];
    r = first;
    while (r <= last)
    {
        result &= sword[r * wpl + col];
        r++;
    }

    //Clear boundary pixels
    if (clearBoundPixV && ((row < yp) || ((h - row) <= yn)))
        result = 0x0;

    dword[pos] = result;
}\n
00805 )
00806 
00807 // HistogramRect Kernel: Accumulate
00808 // assumes 4 channels, i.e., bytes_per_pixel = 4
00809 // assumes number of pixels is multiple of 8
00810 // data is laid out as
00811 // ch0                                           ch1 ...
00812 // bin0          bin1            bin2...         bin0...
00813 // rpt0,1,2...255  rpt0,1,2...
00814 KERNEL(
00815 \n#define HIST_REDUNDANCY 256\n
00816 \n#define GROUP_SIZE 256\n
00817 \n#define HIST_SIZE 256\n
00818 \n#define NUM_CHANNELS 4\n
00819 \n#define HR_UNROLL_SIZE 8 \n
00820 \n#define HR_UNROLL_TYPE uchar8 \n
00821 
// Accumulation pass of the 4-channel histogram. Each uchar8 element of data
// holds two pixels of four bytes each (s0-s3 and s4-s7, channel 0 first).
// histBuffer is indexed as [channel][bin][copy] with HIST_SIZE bins per
// channel and HIST_REDUNDANCY copies per bin; the copy is chosen from the
// global id, so work-items whose ids are equal modulo HIST_REDUNDANCY share a
// counter and the increments are atomic.
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectAllChannels(
    __global const uchar8 *data,
    uint numPixels,
    __global uint *histBuffer) {

    // copy of each bin this work-item counts into
    const int slot = get_global_id(0)%HIST_REDUNDANCY;
    // distance between the first counters of consecutive channels
    const int channelStride = HIST_SIZE*HIST_REDUNDANCY;
    uint pc = get_global_id(0);

    // walk the input in steps of the global size, two pixels at a time
    while ( pc < numPixels*NUM_CHANNELS/HR_UNROLL_SIZE ) {
        const uchar8 px = data[pc];

        atomic_inc( &histBuffer[ 0*channelStride + px.s0*HIST_REDUNDANCY + slot ]); // ch0
        atomic_inc( &histBuffer[ 0*channelStride + px.s4*HIST_REDUNDANCY + slot ]); // ch0
        atomic_inc( &histBuffer[ 1*channelStride + px.s1*HIST_REDUNDANCY + slot ]); // ch1
        atomic_inc( &histBuffer[ 1*channelStride + px.s5*HIST_REDUNDANCY + slot ]); // ch1
        atomic_inc( &histBuffer[ 2*channelStride + px.s2*HIST_REDUNDANCY + slot ]); // ch2
        atomic_inc( &histBuffer[ 2*channelStride + px.s6*HIST_REDUNDANCY + slot ]); // ch2
        atomic_inc( &histBuffer[ 3*channelStride + px.s3*HIST_REDUNDANCY + slot ]); // ch3
        atomic_inc( &histBuffer[ 3*channelStride + px.s7*HIST_REDUNDANCY + slot ]); // ch3

        pc += get_global_size(0);
    }
}
00847 )
00848 
00849 KERNEL(
// NUM_CHANNELS = 1
// Accumulation pass of the single-channel histogram. Each uchar8 element of
// data holds eight one-byte pixels. histBuffer is indexed as [bin][copy] with
// HIST_REDUNDANCY copies per bin; the copy is chosen from the global id, so
// work-items whose ids are equal modulo HIST_REDUNDANCY share a counter and
// the increments are atomic.
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectOneChannel(
    __global const uchar8 *data,
    uint numPixels,
    __global uint *histBuffer) {

    // copy of each bin this work-item counts into
    const int slot = get_global_id(0)%HIST_REDUNDANCY;
    uint pc = get_global_id(0);

    // walk the input in steps of the global size, eight pixels at a time
    while ( pc < numPixels/HR_UNROLL_SIZE ) {
        const uchar8 px = data[pc];

        atomic_inc( &histBuffer[ px.s0*HIST_REDUNDANCY + slot ]);
        atomic_inc( &histBuffer[ px.s1*HIST_REDUNDANCY + slot ]);
        atomic_inc( &histBuffer[ px.s2*HIST_REDUNDANCY + slot ]);
        atomic_inc( &histBuffer[ px.s3*HIST_REDUNDANCY + slot ]);
        atomic_inc( &histBuffer[ px.s4*HIST_REDUNDANCY + slot ]);
        atomic_inc( &histBuffer[ px.s5*HIST_REDUNDANCY + slot ]);
        atomic_inc( &histBuffer[ px.s6*HIST_REDUNDANCY + slot ]);
        atomic_inc( &histBuffer[ px.s7*HIST_REDUNDANCY + slot ]);

        pc += get_global_size(0);
    }
}
00876 )
00877 
00878 
00879 KERNEL(
// unused
// Histogram of one-byte samples with a private set of counters per work-item.
// histBuffer is indexed as [value][global id], so it holds get_global_size(0)
// counters for each byte value and no two work-items write the same counter.
\n  __attribute__((reqd_work_group_size(256, 1, 1)))
\n  __kernel
\n  void kernel_HistogramRectAllChannels_Grey(
\n      __global const uchar* data,
\n      uint numPixels,
\n        __global uint *histBuffer) {
\n
\n      /* accumulate in global memory */
\n      // walk the input in steps of the global size, one sample at a time
\n      for ( uint pc = get_global_id(0); pc < numPixels; pc += get_global_size(0) ) {
\n          uchar value = data[ pc ];
\n          // counter owned by this work-item for this byte value
\n          int idx = value * get_global_size(0) + get_global_id(0);
\n          histBuffer[ idx ]++;
\n      }
\n
\n  } // kernel_HistogramRectAllChannels_Grey
00905 
00906 )
00907 
00908 // HistogramRect Kernel: Reduction
00909 // only supports 4 channels
00910 // each work group handles a single channel of a single histogram bin
00911 KERNEL(
// Reduction pass of the 4-channel histogram. Work-group g handles channel
// g / HIST_SIZE and bin g % HIST_SIZE: its work-items add up the
// HIST_REDUNDANCY copies of that counter in histBuffer, combine their partial
// sums in local memory and work-item 0 stores the total in histResult[g].
__attribute__((reqd_work_group_size(256, 1, 1)))
__kernel
void kernel_HistogramRectAllChannelsReduction(
    int n, // unused pixel redundancy
    __global uint *histBuffer,
    __global int* histResult) {

    const size_t lid = get_local_id(0);
    const size_t group = get_group_id(0);

    int channel = group/HIST_SIZE;
    int bin     = group%HIST_SIZE;
    int value = 0;

    // each work-item sums its share of the copies in a register
    for ( uint i = lid; i < HIST_REDUNDANCY; i+=GROUP_SIZE) {
        value += histBuffer[ channel*HIST_SIZE*HIST_REDUNDANCY+bin*HIST_REDUNDANCY+i];
    }

    // tree reduction in local memory: the active half halves every round
    __local int localHist[GROUP_SIZE];
    localHist[lid] = value;
    barrier(CLK_LOCAL_MEM_FENCE);

    int stride = GROUP_SIZE/2;
    while (stride >= 1) {
        const int active = (lid < stride);

        // read the partner value first, add it only after everyone has read
        if (active) {
            value = localHist[ lid+stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if (active) {
            localHist[ lid] += value;
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        stride /= 2;
    }

    // write reduction to final result
    if (lid == 0) {
        histResult[group] = localHist[0];
    }
} // kernel_HistogramRectAllChannelsReduction
00949 )
00950 
00951 
00952 KERNEL(
00953 // NUM_CHANNELS = 1
00954 // Reduction pass of the single-channel histogram: work-group g sums the
00955 // HIST_REDUNDANCY partial counts of bin g % HIST_SIZE from histBuffer and
00956 // stores the total in histResult[g]. The argument n is never read.
00957 __attribute__((reqd_work_group_size(256, 1, 1)))
00958 __kernel void kernel_HistogramRectOneChannelReduction(
00959     int n,
00960     __global uint *histBuffer,
00961     __global int* histResult) {
00962 
00963     const size_t lane  = get_local_id(0);
00964     const size_t group = get_group_id(0);
00965     int bin = group%HIST_SIZE;
00966 
00967     // private running sum over every GROUP_SIZE-th partial count of the bin
00968     int partial = 0;
00969     int i = lane;
00970     while (i < HIST_REDUNDANCY) {
00971         partial += histBuffer[ bin*HIST_REDUNDANCY+i];
00972         i += GROUP_SIZE;
00973     }
00974 
00975     // halving reduction through local memory with barriers between phases
00976     __local int localHist[GROUP_SIZE];
00977     localHist[lane] = partial;
00978     barrier(CLK_LOCAL_MEM_FENCE);
00979     for (int stride = GROUP_SIZE/2; stride > 0; stride >>= 1) {
00980         const bool active = lane < stride;
00981         if (active) { partial = localHist[lane+stride]; }
00982         barrier(CLK_LOCAL_MEM_FENCE);
00983         if (active) { localHist[lane] += partial; }
00984         barrier(CLK_LOCAL_MEM_FENCE);
00985     }
00986     // the first work-item publishes the finished total
00987     if (lane == 0) {
00988         histResult[group] = localHist[0];
00989     }
00990 } // kernel_HistogramRectOneChannelReduction
00991 )
00992 
00993 
00994 KERNEL(
00995 // unused
00996 // Reduction pass for the grey histogram. Each work group (x256) handles one
00997 // histogram bin: it sums the n partial counts histBuffer[bin*n + 0 .. n-1]
00998 // and stores the total in histResult[bin].
00999 \n  __attribute__((reqd_work_group_size(256, 1, 1)))
01000 \n  __kernel
01001 \n  void kernel_HistogramRectAllChannelsReduction_Grey(
01002 \n      int n, // pixel redundancy that needs to be accumulated
01003 \n      __global uint *histBuffer,
01004 \n      __global uint* histResult) { // each wg accumulates 1 bin
01005 \n
01006 \n      // work indices
01007 \n      const size_t groupId = get_group_id(0);
01008 \n      const size_t localId = get_local_id(0); // 0 -> 256-1
01009 \n      unsigned int hist = 0;
01010 \n
01011 \n      // accumulate in a private register: this work-item takes entries
01012 \n      // localId, localId+GROUP_SIZE, ... of its bin. Starting every item
01013 \n      // at 0 would make all of them add the same entries, so the
01014 \n      // reduction below would count those repeatedly and skip the rest.
01015 \n      for ( uint p = localId; p < n; p+=GROUP_SIZE) {
01016 \n          hist += histBuffer[ (groupId*n + p)];
01017 \n      }
01018 \n
01019 \n      /* reduction in local memory */
01020 \n      __local unsigned int localHist[GROUP_SIZE];
01021 \n      localHist[localId] = hist;
01022 \n      barrier(CLK_LOCAL_MEM_FENCE);
01023 \n
01024 \n      // halve the active range each step; barriers separate read and write
01025 \n      for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
01026 \n          if (localId < stride) {
01027 \n              hist = localHist[ (localId+stride)];
01028 \n          }
01029 \n          barrier(CLK_LOCAL_MEM_FENCE);
01030 \n          if (localId < stride) {
01031 \n              localHist[ localId] += hist;
01032 \n          }
01033 \n          barrier(CLK_LOCAL_MEM_FENCE);
01034 \n      }
01035 \n
01036 \n      // work-item 0 publishes the bin total
01037 \n      if (localId == 0) {
01038 \n          histResult[groupId] = localHist[0];
01039 \n      }
01040 \n  } // kernel_HistogramRectAllChannelsReduction_Grey
01041 )
01042 
01043 // ThresholdRectToPix Kernel
01044 // only supports 4 channels
01045 // imageData is input image (24-bits/pixel)
01046 // pix is output image (1-bit/pixel)
01047 KERNEL(
01048 \n#define CHAR_VEC_WIDTH 8 \n // number of uchar lanes in the uchar8 vectors used below
01049 \n#define PIXELS_PER_WORD 32 \n // pixels packed into one output word
01050 \n#define PIXELS_PER_BURST 8 \n // pixels gathered by one burst load
01051 \n#define BURSTS_PER_WORD (PIXELS_PER_WORD/PIXELS_PER_BURST) \n // burst loads needed per output word
01052  typedef union { // one burst of pixel data, viewable two ways
01053   uchar s[PIXELS_PER_BURST*NUM_CHANNELS]; // byte view: one entry per pixel channel
01054   uchar8 v[(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH]; // same bytes as uchar8 vectors
01055  } charVec;
01056 
01057 // Binarises a packed multi-channel image. Every output word of pix holds
01058 // 32 one-bit pixels, first pixel in the most significant bit. A pixel's bit
01059 // is set when, for any channel c with hi_values[c] >= 0,
01060 // (sample > thresholds[c]) == (hi_values[c] == 0). width is not read.
01061 __attribute__((reqd_work_group_size(256, 1, 1)))
01062 __kernel void kernel_ThresholdRectToPix(
01063     __global const uchar8 *imageData,
01064     int height,
01065     int width,
01066     int wpl, // words per line
01067     __global int *thresholds,
01068     __global int *hi_values,
01069     __global int *pix) {
01070 
01071     // private copies of the per-channel parameters
01072     int limit[NUM_CHANNELS];
01073     int hiValue[NUM_CHANNELS];
01074     for ( int c = 0; c < NUM_CHANNELS; c++) {
01075         limit[c] = thresholds[c];
01076         hiValue[c] = hi_values[c];
01077     }
01078 
01079     // this work-item fills output words id, id + global size, ...
01080     for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
01081         unsigned int word = 0;
01082         for ( int b = 0; b < BURSTS_PER_WORD; b++) {
01083             // fetch one burst as uchar8 vectors, then inspect it bytewise
01084             charVec pixels;
01085             for ( int i = 0; i < (PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH; i++ ) {
01086                 pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH)  + i];
01087             }
01088             // test every channel of every pixel in the burst
01089             for ( int p = 0; p < PIXELS_PER_BURST; p++) {
01090                 // output bit belonging to pixel p of burst b
01091                 const unsigned int mask = 0x80000000 >> ((b*PIXELS_PER_BURST+p)&31);
01092                 for ( int c = 0; c < NUM_CHANNELS; c++) {
01093                     unsigned char sample = pixels.s[p*NUM_CHANNELS + c];
01094                     if (hiValue[c] < 0) { continue; } // channel disabled
01095                     if ((sample > limit[c]) == (hiValue[c] == 0)) { word |= mask; }
01096                 }
01097             }
01098         }
01099         pix[w] = word;
01100     }
01101 }
01102 
01103 // only supports 1 channel: one burst of single-channel pixels, viewable two ways
01104  typedef union {
01105   uchar s[PIXELS_PER_BURST]; // byte view: one entry per pixel
01106   uchar8 v[(PIXELS_PER_BURST)/CHAR_VEC_WIDTH]; // same bytes as uchar8 vectors
01107  } charVec1;
01108 
01109 // Single-channel variant of the threshold kernel: thresholds[0] and
01110 // hi_values[0] are applied to every 8-bit sample. Each output word of pix
01111 // packs 32 one-bit pixels, first pixel in the most significant bit; a bit
01112 // is set when hi_values[0] >= 0 and
01113 // (sample > thresholds[0]) == (hi_values[0] == 0). width is not read.
01114 __attribute__((reqd_work_group_size(256, 1, 1)))
01115 __kernel void kernel_ThresholdRectToPix_OneChan(
01116     __global const uchar8 *imageData,
01117     int height,
01118     int width,
01119     int wpl, // words per line
01120     __global int *thresholds,
01121     __global int *hi_values,
01122     __global int *pix) {
01123 
01124     // private copies of the only channel's parameters
01125     const int limit = thresholds[0];
01126     const int hiValue = hi_values[0];
01127 
01128     // this work-item fills output words id, id + global size, ...
01129     for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) {
01130         unsigned int word = 0;
01131         for ( int b = 0; b < BURSTS_PER_WORD; b++) {
01132             // fetch the burst as uchar8 vectors, then inspect it bytewise
01133             charVec1 pixels;
01134             for ( int i = 0; i < (PIXELS_PER_BURST)/CHAR_VEC_WIDTH; i++ ) {
01135                 pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST)/CHAR_VEC_WIDTH)  + i];
01136             }
01137             for ( int p = 0; p < PIXELS_PER_BURST; p++) {
01138                 unsigned char sample = pixels.s[p];
01139                 if (hiValue >= 0 && (sample > limit) == (hiValue == 0)) {
01140                     word |=  (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31));
01141                 }
01142             }
01143         }
01144         pix[w] = word;
01145     }
01146 }
01154 )
01155 
01156  ; // close char*
01157 
01158 #endif // USE_EXTERNAL_KERNEL
01159 #endif //_OCL_KERNEL_H_
01160 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
01161 
01162 // Alternative histogram kernel written to use uchar and a different scattered write to global memory.
01163 // It was a little better on Intel platforms but still not faster than native serial code.
01164 #if 0
01165 /*  data laid out as
01166     bin0                                        bin1                            bin2...
01167     r,g,b,a,r,g,b,a,r,g,b,a nthreads/4 copies
01168 */
01169 \n__attribute__((reqd_work_group_size(256, 1, 1)))
01170 \n  __kernel // disabled (#if 0) pass that counts byte values of data into histBuffer
01171 \n  void kernel_HistogramRectAllChannels_uchar(
01172 \n      volatile __global const uchar  *data, // input bytes; numPixels*NUM_CHANNELS of them are read
01173 \n                              uint   numPixels, // pixel count; the loop bound multiplies it by NUM_CHANNELS
01174 \n      volatile __global       uint   *histBuffer) { // get_global_size(0) counters per byte value, incremented in place
01175 \n      // work-item g visits bytes g, g + global size, ... and only touches counter [value*global_size + g]
01176 \n      // for each pixel/channel, accumulate in global memory
01177 \n      for ( uint pc = get_global_id(0); pc < numPixels*NUM_CHANNELS; pc += get_global_size(0) ) {
01178 \n          uchar value = data[pc];
01179 \n          int idx = value*get_global_size(0) + get_global_id(0);
01180 \n          histBuffer[ idx ]++; // coalesced if same value
01181 \n      }
01182 \n  } // kernel_HistogramRectAllChannels_uchar
01183 \n
01184 \n  __attribute__((reqd_work_group_size(256, 1, 1)))
01185 \n  __kernel // disabled (#if 0) reduction: work-group b sums n uint4 partial counts of bin b
01186 \n  void kernel_HistogramRectAllChannelsReduction_uchar(
01187 \n      int n, // pixel redundancy that needs to be accumulated = nthreads/4
01188 \n      __global uint4 *histBuffer,
01189 \n      __global uint* histResult) { // each wg accumulates 1 bin (all channels within it)
01190 \n  
01191 \n      // declare variables (groupId, globalId and numThreads are never read below)
01192 \n      int binIdx     = get_group_id(0);
01193 \n      size_t groupId = get_group_id(0);
01194 \n      size_t localId = get_local_id(0); // 0 -> 256-1
01195 \n      size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1
01196 \n      uint numThreads = get_global_size(0);
01197 \n      uint4 hist = {0, 0, 0, 0};
01198 \n
01199 \n      // accumulate in register: entries localId, localId+GROUP_SIZE, ... of this bin
01200 \n      for ( uint p = get_local_id(0); p < n; p+=GROUP_SIZE) {
01201 \n          hist += histBuffer[binIdx*n+p];
01202 \n      }
01203 \n  
01204 \n      // reduction in local memory
01205 \n      __local uint4 localHist[GROUP_SIZE];
01206 \n      localHist[localId] = hist;
01207 \n      barrier(CLK_LOCAL_MEM_FENCE);
01208 \n      // halve the active range each step; barriers separate the read and write phases
01209 \n      for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) {
01210 \n          if (localId < stride) {
01211 \n              hist = localHist[ localId+stride];
01212 \n          }
01213 \n          barrier(CLK_LOCAL_MEM_FENCE);
01214 \n          if (localId < stride) {
01215 \n              localHist[ localId] += hist;
01216 \n          }
01217 \n          barrier(CLK_LOCAL_MEM_FENCE);
01218 \n      }
01219 \n
01220 \n      // write reduction to final result: component k goes to histResult[k*HIST_SIZE + binIdx]
01221 \n      if (localId == 0) {
01222 \n          histResult[0*HIST_SIZE+binIdx] = localHist[0].s0;
01223 \n          histResult[1*HIST_SIZE+binIdx] = localHist[0].s1;
01224 \n          histResult[2*HIST_SIZE+binIdx] = localHist[0].s2;
01225 \n          histResult[3*HIST_SIZE+binIdx] = localHist[0].s3;
01226 \n      }
01227 \n  
01228 \n  } // kernel_HistogramRectAllChannelsReduction_uchar
01229 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines