tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/opencl/openclwrapper.cpp
Go to the documentation of this file.
00001 #ifdef _WIN32
00002 #include <Windows.h>
00003 #include <io.h>
00004 #else
00005 #include <sys/types.h>
00006 #include <unistd.h>
00007 #endif
00008 #include <float.h>
00009 
00010 #include "openclwrapper.h"
00011 #include "oclkernels.h"
00012 
00013 // for micro-benchmark
00014 #include "otsuthr.h"
00015 #include "thresholder.h"
00016 
00017 #ifdef USE_OPENCL
00018 
00019 #include "opencl_device_selection.h"
00020 #ifdef _MSC_VER
00021 int LeptMsgSeverity = 3;  // L_SEVERITY_INFO
00022 #endif  // _MSC_VER
00023 GPUEnv OpenclDevice::gpuEnv;
00024 
00025 #if USE_DEVICE_SELECTION
00026 bool OpenclDevice::deviceIsSelected = false;
00027 ds_device OpenclDevice::selectedDevice;
00028 #endif
00029 
00030 int OpenclDevice::isInited =0;
00031 
/* Flip/rotation steps needed to transform an image. */
struct tiff_transform {
    /* if non-zero, image needs a vertical flip */
    int vflip;
    /* if non-zero, image needs a horizontal flip */
    int hflip;
    /* -1 -> counterclockwise 90-degree rotation,
        0 -> no rotation
        1 -> clockwise 90-degree rotation */
    int rotate;
};

/* Table of transforms, one row per orientation.
   NOTE(review): presumably indexed by (TIFF orientation tag - 1) -- confirm
   against the code that reads this table. */
static tiff_transform tiff_orientation_transforms[] = {
    /* vflip, hflip, rotate */
    { 0, 0,  0 },
    { 0, 1,  0 },
    { 1, 1,  0 },
    { 1, 0,  0 },
    { 0, 1, -1 },
    { 0, 0,  1 },
    { 0, 1,  1 },
    { 0, 0, -1 }
};
00050 
00051 static const l_int32  MAX_PAGES_IN_TIFF_FILE = 3000; 
00052 
00053 cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate; //Morph operations buffers
00054 cl_mem pixThBuffer; //output from thresholdtopix calculation
00055 cl_int clStatus;
00056 KernelEnv rEnv;
00057 
// Substitutes characters that are awkward in file names with '_', in place.
//
// The replaced characters are: / ? : * " > < | and the space character
// (space is valid but can cause headaches).
// NOTE(review): "\?" in the literal below is the escape sequence for '?', so
// a backslash is NOT part of the set; confirm whether '\\' was intended.
//
// fileName: NUL-terminated, writable string; a NULL pointer is ignored.
void legalizeFileName( char *fileName) {
    if (fileName == NULL)
        return;

    static const char invalidChars[] = "/\?:*\"><| ";

    // strpbrk() finds the next occurrence of any character of the set, so the
    // string is scanned once instead of once per invalid character.
    for (char *pos = strpbrk(fileName, invalidChars); pos != NULL;
         pos = strpbrk(pos + 1, invalidChars)) {
        *pos = '_';
    }
}
00079 
00080 void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) {
00081     //printf("[DS] populateGPUEnvFromDevice\n");
00082     size_t size;
00083     gpuInfo->mnIsUserCreated = 1;
00084     // device
00085     gpuInfo->mpDevID = device;
00086     gpuInfo->mpArryDevsID = new cl_device_id[1];
00087     gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
00088     clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE       , sizeof(cl_device_type), (void *) &gpuInfo->mDevType       , &size);
00089     CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
00090     // platform
00091     clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM   , sizeof(cl_platform_id), (void *) &gpuInfo->mpPlatformID   , &size);
00092     CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
00093     // context
00094     cl_context_properties props[3];
00095     props[0] = CL_CONTEXT_PLATFORM;
00096     props[1] = (cl_context_properties) gpuInfo->mpPlatformID;
00097     props[2] = 0;
00098     gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, NULL, NULL, &clStatus);
00099     CHECK_OPENCL( clStatus, "populateGPUEnv::createContext");
00100     // queue
00101     cl_command_queue_properties queueProperties = 0;
00102     gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus );
00103     CHECK_OPENCL( clStatus, "populateGPUEnv::createCommandQueue");
00104     
00105 }
00106 
00107 int OpenclDevice::LoadOpencl()
00108 {
00109 #ifdef WIN32
00110     HINSTANCE HOpenclDll = NULL;
00111   void * OpenclDll = NULL;
00112     //fprintf(stderr, " LoadOpenclDllxx... \n");
00113     OpenclDll = static_cast<HINSTANCE>( HOpenclDll );
00114     OpenclDll = LoadLibrary( "openCL.dll" );
00115     if ( !static_cast<HINSTANCE>( OpenclDll ) )
00116     {
00117         fprintf(stderr, "[OD] Load opencl.dll failed!\n");
00118         FreeLibrary( static_cast<HINSTANCE>( OpenclDll ) );
00119         return 0;
00120         
00121     }
00122     fprintf(stderr, "[OD] Load opencl.dll successful!\n");
00123 #endif
00124     return 1;
00125 }
00126 int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
00127 {
00128     envInfo->mpkContext = gpuEnv.mpContext;
00129     envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
00130     envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
00131 
00132     return 1;
00133 }
00134 
00135 cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer, size_t nElements, cl_mem_flags flags, cl_int *pStatus)
00136 {
00137     cl_mem membuffer = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (flags),
00138                                         nElements * sizeof(l_uint32), hostbuffer, pStatus);
00139 
00140     return membuffer;
00141 }
00142 
00143 PIX* mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX* pixd, PIX* pixs, int elements, cl_mem_flags flags, bool memcopy = false, bool sync = true)
00144 {   
00145     PROCNAME("mapOutputCLBuffer");
00146     if (!pixd)
00147     {
00148         if (memcopy)
00149         {
00150             if ((pixd = pixCreateTemplate(pixs)) == NULL)
00151                 (PIX *)ERROR_PTR("pixd not made", procName, NULL);
00152         }
00153         else
00154         {
00155             if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) == NULL)
00156                 (PIX *)ERROR_PTR("pixd not made", procName, NULL);
00157         }
00158     }
00159     l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
00160                                                     elements * sizeof(l_uint32), 0, NULL, NULL, NULL );
00161     
00162     if (memcopy)
00163     {
00164         memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
00165     }
00166     else
00167     {
00168         pixSetData(pixd, pValues);
00169     }
00170 
00171     clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,clbuffer,pValues,0,NULL,NULL);
00172     
00173     if (sync)
00174     {
00175         clFinish( rEnv.mpkCmdQueue );
00176     }
00177 
00178     return pixd;
00179 }
00180 
00181  cl_mem allocateIntBuffer( KernelEnv rEnv, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus , bool sync = false)
00182 {
00183     cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE),
00184         nElements * sizeof(l_int32), NULL, pStatus);
00185 
00186     if (_pValues != NULL)
00187     {
00188         l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
00189             nElements * sizeof(l_int32), 0, NULL, NULL, NULL );
00190 
00191         memcpy(pValues, _pValues, nElements * sizeof(l_int32));
00192 
00193         clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
00194 
00195         if (sync)
00196             clFinish( rEnv.mpkCmdQueue );
00197     }
00198 
00199     return xValues;
00200 }
00201 
00202 int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo )
00203 {
00204     size_t length;
00205     cl_int clStatus;
00206     cl_uint numPlatforms, numDevices;
00207     cl_platform_id *platforms;
00208     cl_context_properties cps[3];
00209     char platformName[256];
00210     unsigned int i;
00211 
00212 
00213     // Have a look at the available platforms.
00214 
00215     if ( !gpuInfo->mnIsUserCreated )
00216     {
00217         clStatus = clGetPlatformIDs( 0, NULL, &numPlatforms );
00218         if ( clStatus != CL_SUCCESS )
00219         {
00220             return 1;
00221         }
00222         gpuInfo->mpPlatformID = NULL;
00223 
00224         if ( 0 < numPlatforms )
00225         {
00226             platforms = (cl_platform_id*) malloc( numPlatforms * sizeof( cl_platform_id ) );
00227             if ( platforms == (cl_platform_id*) NULL )
00228             {
00229                 return 1;
00230             }
00231             clStatus = clGetPlatformIDs( numPlatforms, platforms, NULL );
00232 
00233             if ( clStatus != CL_SUCCESS )
00234             {
00235                 return 1;
00236             }
00237 
00238             for ( i = 0; i < numPlatforms; i++ )
00239             {
00240                 clStatus = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,
00241                     sizeof( platformName ), platformName, NULL );
00242 
00243                 if ( clStatus != CL_SUCCESS )
00244                 {
00245                     return 1;
00246                 }
00247                 gpuInfo->mpPlatformID = platforms[i];
00248 
00249                 //if (!strcmp(platformName, "Intel(R) Coporation"))
00250                 //if( !strcmp( platformName, "Advanced Micro Devices, Inc." ))
00251                 {
00252                     gpuInfo->mpPlatformID = platforms[i];
00253                     
00254                     if ( getenv("SC_OPENCLCPU") )
00255                     {
00256                         clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
00257                                                   CL_DEVICE_TYPE_CPU,    // device_type for CPU device
00258                                                   0,                     // num_entries
00259                                                   NULL,                  // devices
00260                                                   &numDevices);
00261                         printf("Selecting OpenCL device: CPU (a)\n");
00262                     }
00263                     else
00264                     {
00265                           clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform
00266                                                   CL_DEVICE_TYPE_GPU,      // device_type for GPU device
00267                                                   0,                       // num_entries
00268                                                   NULL,                    // devices
00269                                                   &numDevices);
00270                           printf("Selecting OpenCL device: GPU (a)\n");
00271                     }
00272                     if ( clStatus != CL_SUCCESS )
00273                         continue;
00274 
00275                     if ( numDevices )
00276                         break;
00277                 }
00278             }
00279             if ( clStatus != CL_SUCCESS )
00280                 return 1;
00281             free( platforms );
00282         }
00283         if ( NULL == gpuInfo->mpPlatformID )
00284             return 1;
00285 
00286         // Use available platform.
00287         cps[0] = CL_CONTEXT_PLATFORM;
00288         cps[1] = (cl_context_properties) gpuInfo->mpPlatformID;
00289         cps[2] = 0;
00290         // Set device type for OpenCL
00291         
00292         if ( getenv("SC_OPENCLCPU") )
00293         {
00294             gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
00295             printf("Selecting OpenCL device: CPU (b)\n");
00296         }
00297         else
00298         {
00299             gpuInfo->mDevType = CL_DEVICE_TYPE_GPU;
00300             printf("Selecting OpenCL device: GPU (b)\n");
00301         }
00302 
00303         gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
00304 
00305         if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
00306         {
00307             gpuInfo->mDevType = CL_DEVICE_TYPE_CPU;
00308             gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
00309             printf("Selecting OpenCL device: CPU (c)\n");
00310         }
00311         if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
00312         {
00313             gpuInfo->mDevType = CL_DEVICE_TYPE_DEFAULT;
00314             gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus );
00315             printf("Selecting OpenCL device: DEFAULT (c)\n");
00316         }
00317         if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) )
00318             return 1;
00319         // Detect OpenCL devices.
00320         // First, get the size of device list data
00321         clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, 0, NULL, &length );
00322         if ( ( clStatus != CL_SUCCESS ) || ( length == 0 ) )
00323             return 1;
00324         // Now allocate memory for device list based on the size we got earlier
00325         gpuInfo->mpArryDevsID = (cl_device_id*) malloc( length );
00326         if ( gpuInfo->mpArryDevsID == (cl_device_id*) NULL )
00327             return 1;
00328         // Now, get the device list data
00329         clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, length,
00330                        gpuInfo->mpArryDevsID, NULL );
00331         if ( clStatus != CL_SUCCESS )
00332             return 1;
00333 
00334         // Create OpenCL command queue.
00335         gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpArryDevsID[0], 0, &clStatus );
00336 
00337         if ( clStatus != CL_SUCCESS )
00338             return 1;
00339     }
00340 
00341     clStatus = clGetCommandQueueInfo( gpuInfo->mpCmdQueue, CL_QUEUE_THREAD_HANDLE_AMD, 0, NULL, NULL );
00342     // Check device extensions for double type
00343     size_t aDevExtInfoSize = 0;
00344 
00345     clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS, 0, NULL, &aDevExtInfoSize );
00346     CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
00347 
00348     char *aExtInfo = new char[aDevExtInfoSize];
00349 
00350     clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS,
00351                    sizeof(char) * aDevExtInfoSize, aExtInfo, NULL);
00352     CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
00353 
00354     gpuInfo->mnKhrFp64Flag = 0;
00355     gpuInfo->mnAmdFp64Flag = 0;
00356 
00357     if ( strstr( aExtInfo, "cl_khr_fp64" ) )
00358     {
00359         gpuInfo->mnKhrFp64Flag = 1;
00360     }
00361     else
00362     {
00363         // Check if cl_amd_fp64 extension is supported
00364         if ( strstr( aExtInfo, "cl_amd_fp64" ) )
00365             gpuInfo->mnAmdFp64Flag = 1;
00366     }
00367     delete []aExtInfo;
00368 
00369     return 0;
00370 }
00371 
00372 void OpenclDevice::releaseMorphCLBuffers()
00373 {
00374     if (pixdCLIntermediate != NULL)
00375         clReleaseMemObject(pixdCLIntermediate);
00376     if (pixsCLBuffer != NULL)
00377         clReleaseMemObject(pixsCLBuffer);
00378     if (pixdCLBuffer != NULL)
00379         clReleaseMemObject(pixdCLBuffer);
00380     if (pixThBuffer != NULL)
00381         clReleaseMemObject(pixThBuffer);
00382 }
00383 
00384 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs)
00385 {
00386     SetKernelEnv( &rEnv );
00387     
00388     if (pixThBuffer != NULL)
00389     {
00390         pixsCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
00391         
00392         //Get the output from ThresholdToPix operation
00393         clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0, sizeof(l_uint32) * wpl*h, 0, NULL, NULL);
00394     }
00395     else
00396     {
00397         //Get data from the source image
00398         l_uint32* srcdata = (l_uint32*) malloc(wpl*h*sizeof(l_uint32));
00399         memcpy(srcdata, pixGetData(pixs), wpl*h*sizeof(l_uint32));
00400     
00401         pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus);
00402     }
00403     
00404     pixdCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
00405 
00406     pixdCLIntermediate = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
00407 
00408     return (int)clStatus;
00409 }
00410 
00411 int OpenclDevice::InitEnv()
00412 {
00413 //PERF_COUNT_START("OD::InitEnv")
00414 //    printf("[OD] OpenclDevice::InitEnv()\n");
00415 #ifdef SAL_WIN32
00416     while( 1 )
00417     {
00418         if( 1 == LoadOpencl() )
00419             break;
00420     }
00421 PERF_COUNT_SUB("LoadOpencl")
00422 #endif
00423     // sets up environment, compiles programs
00424     
00425 
00426 #if USE_DEVICE_SELECTION
00427     
00428     InitOpenclRunEnv_DeviceSelection( 0 );
00429 //PERF_COUNT_SUB("called InitOpenclRunEnv_DS")
00430 #else
00431     // init according to device
00432     InitOpenclRunEnv( 0 );
00433 #endif
00434 //PERF_COUNT_END
00435     return 1;
00436 }
00437 
00438 int OpenclDevice::ReleaseOpenclRunEnv()
00439 {
00440     ReleaseOpenclEnv( &gpuEnv );
00441 #ifdef SAL_WIN32
00442     FreeOpenclDll();
00443 #endif
00444     return 1;
00445 }
00446 inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
00447 {
00448     if ( kCount < 1 )
00449         fprintf(stderr,"Error: ( KCount < 1 ) AddKernelConfig\n" );
00450     strcpy( gpuEnv.mArrykernelNames[kCount-1], kName );
00451     gpuEnv.mnKernelCount++;
00452     return 0;
00453 }
00454 int OpenclDevice::RegistOpenclKernel()
00455 {
00456     if ( !gpuEnv.mnIsUserCreated )
00457         memset( &gpuEnv, 0, sizeof(gpuEnv) );
00458 
00459     gpuEnv.mnFileCount = 0; //argc;
00460     gpuEnv.mnKernelCount = 0UL;
00461 
00462     AddKernelConfig( 1, (const char*) "oclAverageSub1" );
00463     return 0;
00464 }
00465 int OpenclDevice::InitOpenclRunEnv( int argc )
00466 {
00467     int status = 0;
00468     if ( MAX_CLKERNEL_NUM <= 0 )
00469     {
00470         return 1;
00471     }
00472     if ( ( argc > MAX_CLFILE_NUM ) || ( argc < 0 ) )
00473         return 1;
00474 
00475     if ( !isInited )
00476     {
00477         RegistOpenclKernel();
00478         //initialize devices, context, comand_queue
00479         status = InitOpenclRunEnv( &gpuEnv );
00480         if ( status )
00481         {
00482             fprintf(stderr,"init_opencl_env failed.\n");
00483             return 1;
00484         }
00485         fprintf(stderr,"init_opencl_env successed.\n");
00486         //initialize program, kernelName, kernelCount
00487         if( getenv( "SC_FLOAT" ) )
00488         {
00489             gpuEnv.mnKhrFp64Flag = 0;
00490             gpuEnv.mnAmdFp64Flag = 0;
00491         }
00492         if( gpuEnv.mnKhrFp64Flag )
00493         {
00494             fprintf(stderr,"----use khr double type in kernel----\n");
00495             status = CompileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
00496         }
00497         else if( gpuEnv.mnAmdFp64Flag )
00498         {
00499             fprintf(stderr,"----use amd double type in kernel----\n");
00500             status = CompileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" );
00501         }
00502         else
00503         {
00504             fprintf(stderr,"----use float type in kernel----\n");
00505             status = CompileKernelFile( &gpuEnv, "-Dfp_t=float -Dfp_t4=float4 -Dfp_t16=float16" );
00506         }
00507         if ( status == 0 || gpuEnv.mnKernelCount == 0 )
00508         {
00509             fprintf(stderr,"CompileKernelFile failed.\n");
00510             return 1;
00511         }
00512         fprintf(stderr,"CompileKernelFile successed.\n");
00513         isInited = 1;
00514     }
00515     return 0;
00516 }
00517 
00518 int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
00519 //PERF_COUNT_START("InitOpenclRunEnv_DS")
00520 #if USE_DEVICE_SELECTION
00521     if (!isInited) {
00522         // after programs compiled, selects best device
00523         //printf("[DS] InitOpenclRunEnv_DS::Calling performDeviceSelection()\n");
00524         ds_device bestDevice_DS = getDeviceSelection( );
00525 //PERF_COUNT_SUB("called getDeviceSelection()")
00526         cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
00527         // overwrite global static GPUEnv with new device
00528         if (selectedDeviceIsOpenCL() ) {
00529             //printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() for selected device\n");
00530         populateGPUEnvFromDevice( &gpuEnv, bestDevice );
00531         gpuEnv.mnFileCount = 0; //argc;
00532         gpuEnv.mnKernelCount = 0UL;
00533 //PERF_COUNT_SUB("populate gpuEnv")
00534         CompileKernelFile(&gpuEnv, "");
00535 //PERF_COUNT_SUB("CompileKernelFile")
00536         } else {
00537             //printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() b/c native cpu selected\n");
00538         }
00539         isInited = 1;
00540     }
00541 #endif
00542 //PERF_COUNT_END
00543     return 0;
00544 }
00545 
00546 
00547 OpenclDevice::OpenclDevice()
00548 {
00549     //InitEnv();
00550 }
00551 
00552 OpenclDevice::~OpenclDevice()
00553 {
00554     //ReleaseOpenclRunEnv();
00555 }
00556 
00557 int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
00558 {
00559     int i = 0;
00560     int clStatus = 0;
00561 
00562     if ( !isInited )
00563     {
00564         return 1;
00565     }
00566 
00567     for ( i = 0; i < gpuEnv.mnFileCount; i++ )
00568     {
00569         if ( gpuEnv.mpArryPrograms[i] )
00570         {
00571             clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] );
00572             CHECK_OPENCL( clStatus, "clReleaseProgram" );
00573             gpuEnv.mpArryPrograms[i] = NULL;
00574         }
00575     }
00576     if ( gpuEnv.mpCmdQueue )
00577     {
00578         clReleaseCommandQueue( gpuEnv.mpCmdQueue );
00579         gpuEnv.mpCmdQueue = NULL;
00580     }
00581     if ( gpuEnv.mpContext )
00582     {
00583         clReleaseContext( gpuEnv.mpContext );
00584         gpuEnv.mpContext = NULL;
00585     }
00586     isInited = 0;
00587     gpuInfo->mnIsUserCreated = 0;
00588     free( gpuInfo->mpArryDevsID );
00589     return 1;
00590 }
00591 int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
00592 {
00593     unsigned int i = 0;
00594     cl_int clStatus;
00595     int status = 0;
00596     char *str = NULL;
00597     FILE *fd = NULL;
00598     cl_uint numDevices=0;
00599     if ( getenv("SC_OPENCLCPU") )
00600     {
00601         clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
00602                                   CL_DEVICE_TYPE_CPU,  // device_type for CPU device
00603                                   0,                   // num_entries
00604                                   NULL,                // devices ID
00605                                   &numDevices);
00606     }
00607     else
00608     {
00609         clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform
00610                                   CL_DEVICE_TYPE_GPU,  // device_type for GPU device
00611                                   0,                   // num_entries
00612                                   NULL,                // devices ID
00613                                   &numDevices);
00614     }
00615     CHECK_OPENCL( clStatus, "clGetDeviceIDs" );
00616     for ( i = 0; i < numDevices; i++ )
00617     {
00618         char fileName[256] = { 0 }, cl_name[128] = { 0 };
00619         if ( gpuEnv.mpArryDevsID[i] != 0 )
00620         {
00621             char deviceName[1024];
00622             clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL );
00623             CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
00624             str = (char*) strstr( clFileName, (char*) ".cl" );
00625             memcpy( cl_name, clFileName, str - clFileName );
00626             cl_name[str - clFileName] = '\0';
00627             sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
00628             legalizeFileName(fileName);
00629             fd = fopen( fileName, "rb" );
00630             status = ( fd != NULL ) ? 1 : 0;
00631         }
00632     }
00633     if ( fd != NULL )
00634     {
00635         *fhandle = fd;
00636     }
00637     return status;
00638 
00639 }
00640 int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
00641 {
00642     int i;
00643     for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
00644     {
00645         if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 )
00646         {
00647             if ( gpuEnvCached->mpArryPrograms[i] != NULL )
00648             {
00649                 return 1;
00650             }
00651         }
00652     }
00653 
00654     return 0;
00655 }
00656 int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
00657 {
00658     FILE *output = NULL;
00659     output = fopen( fileName, "wb" );
00660     if ( output == NULL )
00661     {
00662         return 0;
00663     }
00664 
00665     fwrite( birary, sizeof(char), numBytes, output );
00666     fclose( output );
00667 
00668     return 1;
00669 
00670 }
00671 int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName )
00672 {
00673     unsigned int i = 0;
00674     cl_int clStatus;
00675     size_t *binarySizes, numDevices;
00676     cl_device_id *mpArryDevsID;
00677     char **binaries, *str = NULL;
00678 
00679     clStatus = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES,
00680                    sizeof(numDevices), &numDevices, NULL );
00681     CHECK_OPENCL( clStatus, "clGetProgramInfo" );
00682 
00683     mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
00684     if ( mpArryDevsID == NULL )
00685     {
00686         return 0;
00687     }
00688     /* grab the handles to all of the devices in the program. */
00689     clStatus = clGetProgramInfo( program, CL_PROGRAM_DEVICES,
00690                    sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL );
00691     CHECK_OPENCL( clStatus, "clGetProgramInfo" );
00692 
00693     /* figure out the sizes of each of the binaries. */
00694     binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices );
00695 
00696     clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES,
00697                    sizeof(size_t) * numDevices, binarySizes, NULL );
00698     CHECK_OPENCL( clStatus, "clGetProgramInfo" );
00699 
00700     /* copy over all of the generated binaries. */
00701     binaries = (char**) malloc( sizeof(char *) * numDevices );
00702     if ( binaries == NULL )
00703     {
00704         return 0;
00705     }
00706 
00707     for ( i = 0; i < numDevices; i++ )
00708     {
00709         if ( binarySizes[i] != 0 )
00710         {
00711             binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
00712             if ( binaries[i] == NULL )
00713             {
00714                 return 0;
00715             }
00716         }
00717         else
00718         {
00719             binaries[i] = NULL;
00720         }
00721     }
00722 
00723     clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARIES,
00724                    sizeof(char *) * numDevices, binaries, NULL );
00725     CHECK_OPENCL(clStatus,"clGetProgramInfo");
00726 
00727     /* dump out each binary into its own separate file. */
00728     for ( i = 0; i < numDevices; i++ )
00729     {
00730         char fileName[256] = { 0 }, cl_name[128] = { 0 };
00731 
00732         if ( binarySizes[i] != 0 )
00733         {
00734             char deviceName[1024];
00735             clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
00736                            sizeof(deviceName), deviceName, NULL);
00737             CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
00738 
00739             str = (char*) strstr( clFileName, (char*) ".cl" );
00740             memcpy( cl_name, clFileName, str - clFileName );
00741             cl_name[str - clFileName] = '\0';
00742             sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
00743             legalizeFileName(fileName);
00744             if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
00745             {
00746                 printf("[OD] write binary[%s] failed\n", fileName);
00747                 return 0;
00748             } //else
00749             printf("[OD] write binary[%s] succesfully\n", fileName);
00750         }
00751     }
00752 
00753     // Release all resouces and memory
00754     for ( i = 0; i < numDevices; i++ )
00755     {
00756         if ( binaries[i] != NULL )
00757         {
00758             free( binaries[i] );
00759             binaries[i] = NULL;
00760         }
00761     }
00762 
00763     if ( binaries != NULL )
00764     {
00765         free( binaries );
00766         binaries = NULL;
00767     }
00768 
00769     if ( binarySizes != NULL )
00770     {
00771         free( binarySizes );
00772         binarySizes = NULL;
00773     }
00774 
00775     if ( mpArryDevsID != NULL )
00776     {
00777         free( mpArryDevsID );
00778         mpArryDevsID = NULL;
00779     }
00780     return 1;
00781 }
00782 
00783 void copyIntBuffer( KernelEnv rEnv, cl_mem xValues, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus )
00784 {
00785     l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
00786         nElements * sizeof(l_int32), 0, NULL, NULL, NULL );
00787     clFinish( rEnv.mpkCmdQueue );
00788     if (_pValues != NULL)
00789     {
00790         for ( int i = 0; i < (int)nElements; i++ )
00791             pValues[i] = (l_int32)_pValues[i];
00792     }
00793 
00794     clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL);
00795     //clFinish( rEnv.mpkCmdQueue );
00796     return;
00797 }
00798 
00799 int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
00800 {
00801 //PERF_COUNT_START("CompileKernelFile")
00802     cl_int clStatus = 0;
00803     size_t length;
00804     char *buildLog = NULL, *binary;
00805     const char *source;
00806     size_t source_size[1];
00807     int b_error, binary_status, binaryExisted, idx;
00808     size_t numDevices;
00809     cl_device_id *mpArryDevsID;
00810     FILE *fd, *fd1;
00811     const char* filename = "kernel.cl";
00812     //fprintf(stderr, "[OD] CompileKernelFile ... \n");
00813     if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
00814     {
00815         return 1;
00816     }
00817 
00818     idx = gpuInfo->mnFileCount;
00819 
00820     source = kernel_src;
00821 
00822     source_size[0] = strlen( source );
00823     binaryExisted = 0;
00824         binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark
00825 //PERF_COUNT_SUB("BinaryGenerated")
00826     if ( binaryExisted == 1 )
00827     {
00828         clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
00829                        sizeof(numDevices), &numDevices, NULL );
00830         CHECK_OPENCL( clStatus, "clGetContextInfo" );
00831 
00832         mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
00833         if ( mpArryDevsID == NULL )
00834         {
00835             return 0;
00836         }
00837 //PERF_COUNT_SUB("get numDevices")
00838         b_error = 0;
00839         length = 0;
00840         b_error |= fseek( fd, 0, SEEK_END ) < 0;
00841         b_error |= ( length = ftell(fd) ) <= 0;
00842         b_error |= fseek( fd, 0, SEEK_SET ) < 0;
00843         if ( b_error )
00844         {
00845             return 0;
00846         }
00847 
00848         binary = (char*) malloc( length + 2 );
00849         if ( !binary )
00850         {
00851             return 0;
00852         }
00853 
00854         memset( binary, 0, length + 2 );
00855         b_error |= fread( binary, 1, length, fd ) != length;
00856 
00857 
00858         fclose( fd );
00859 //PERF_COUNT_SUB("read file")
00860         fd = NULL;
00861         // grab the handles to all of the devices in the context.
00862         clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES,
00863                        sizeof( cl_device_id ) * numDevices, mpArryDevsID, NULL );
00864         CHECK_OPENCL( clStatus, "clGetContextInfo" );
00865 //PERF_COUNT_SUB("get devices")
00866         //fprintf(stderr, "[OD] Create kernel from binary\n");
00867         gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
00868                                            mpArryDevsID, &length, (const unsigned char**) &binary,
00869                                            &binary_status, &clStatus );
00870         CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" );
00871 //PERF_COUNT_SUB("clCreateProgramWithBinary")
00872         free( binary );
00873         free( mpArryDevsID );
00874         mpArryDevsID = NULL;
00875 //PERF_COUNT_SUB("binaryExisted")
00876     }
00877     else
00878     {
00879         // create a CL program using the kernel source
00880         //fprintf(stderr, "[OD] Create kernel from source\n");
00881         gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source,
00882                                          source_size, &clStatus);
00883         CHECK_OPENCL( clStatus, "clCreateProgramWithSource" );
00884 //PERF_COUNT_SUB("!binaryExisted")
00885     }
00886 
00887     if ( gpuInfo->mpArryPrograms[idx] == (cl_program) NULL )
00888     {
00889         return 0;
00890     }
00891 
00892     //char options[512];
00893     // create a cl program executable for all the devices specified
00894     //printf("[OD] BuildProgram.\n");
00895 PERF_COUNT_START("OD::CompileKernel::clBuildProgram")
00896     if (!gpuInfo->mnIsUserCreated)
00897     {
00898         clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
00899                        buildOption, NULL, NULL);
00900 //PERF_COUNT_SUB("clBuildProgram notUserCreated")
00901     }
00902     else
00903     {
00904         clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
00905                        buildOption, NULL, NULL);
00906 //PERF_COUNT_SUB("clBuildProgram isUserCreated")
00907     }
00908 PERF_COUNT_END
00909     if ( clStatus != CL_SUCCESS )
00910     {
00911         printf ("BuildProgram error!\n");
00912         if ( !gpuInfo->mnIsUserCreated )
00913         {
00914             clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
00915                            CL_PROGRAM_BUILD_LOG, 0, NULL, &length );
00916         }
00917         else
00918         {
00919             clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
00920                            CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
00921         }
00922         if ( clStatus != CL_SUCCESS )
00923         {
00924             printf("opencl create build log fail\n");
00925             return 0;
00926         }
00927         buildLog = (char*) malloc( length );
00928         if ( buildLog == (char*) NULL )
00929         {
00930             return 0;
00931         }
00932         if ( !gpuInfo->mnIsUserCreated )
00933         {
00934             clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
00935                            CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
00936         }
00937         else
00938         {
00939             clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
00940                            CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
00941         }
00942         if ( clStatus != CL_SUCCESS )
00943         {
00944             printf("opencl program build info fail\n");
00945             return 0;
00946         }
00947 
00948         fd1 = fopen( "kernel-build.log", "w+" );
00949         if ( fd1 != NULL )
00950         {
00951             fwrite( buildLog, sizeof(char), length, fd1 );
00952             fclose( fd1 );
00953         }
00954 
00955         free( buildLog );
00956 //PERF_COUNT_SUB("build error log")
00957         return 0;
00958     }
00959 
00960     strcpy( gpuInfo->mArryKnelSrcFile[idx], filename );
00961 //PERF_COUNT_SUB("strcpy")
00962     if ( binaryExisted == 0 ) {
00963         GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx], filename );
00964         PERF_COUNT_SUB("GenerateBinFromKernelSource")
00965     }
00966 
00967     gpuInfo->mnFileCount += 1;
00968 //PERF_COUNT_END
00969     return 1;
00970 }
00971 
00972 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line)
00973 {
00974 PERF_COUNT_START("pixReadFromTiffKernel")
00975     cl_int clStatus;
00976     KernelEnv rEnv;
00977     size_t globalThreads[2];
00978     size_t localThreads[2];
00979     int gsize;
00980     cl_mem valuesCl; 
00981     cl_mem outputCl;
00982 
00983     //global and local work dimensions for Horizontal pass
00984     gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
00985     globalThreads[0] = gsize;
00986     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
00987     globalThreads[1] = gsize;
00988     localThreads[0] = GROUPSIZE_X;
00989     localThreads[1] = GROUPSIZE_Y;
00990 
00991     SetKernelEnv( &rEnv );
00992     
00993     l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32));
00994     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus );
00995     CHECK_OPENCL( clStatus, "clCreateKernel");
00996     
00997     //Allocate input and output OCL buffers
00998     valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
00999     outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
01000 
01001     //Kernel arguments
01002     clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&valuesCl );
01003     CHECK_OPENCL( clStatus, "clSetKernelArg");
01004     clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(w), (void *)&w );
01005     CHECK_OPENCL( clStatus, "clSetKernelArg" );
01006     clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(h), (void *)&h );
01007     CHECK_OPENCL( clStatus, "clSetKernelArg" );
01008     clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(wpl), (void *)&wpl );
01009     CHECK_OPENCL( clStatus, "clSetKernelArg" );
01010     clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&outputCl );
01011     CHECK_OPENCL( clStatus, "clSetKernelArg");
01012     
01013     //Kernel enqueue
01014 PERF_COUNT_SUB("before")
01015     clStatus = clEnqueueNDRangeKernel( rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL );
01016     CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" );
01017     
01018      /* map results back from gpu */
01019     void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0, w*h * sizeof(l_uint32), 0, NULL, NULL, &clStatus);
01020     CHECK_OPENCL( clStatus, "clEnqueueMapBuffer outputCl");
01021     clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, NULL, NULL);
01022 
01023     //Sync
01024     clFinish( rEnv.mpkCmdQueue );
01025 PERF_COUNT_SUB("kernel & map")
01026 PERF_COUNT_END
01027     return pResult;
01028 }
01029 
01030 
01031 PIX * OpenclDevice::pixReadTiffCl ( const char *filename, l_int32 n )
01032 {
01033 PERF_COUNT_START("pixReadTiffCL")
01034     FILE  *fp;
01035 PIX   *pix;
01036 
01037     //printf("pixReadTiffCl file");
01038     PROCNAME("pixReadTiff");
01039 
01040     if (!filename)
01041         return (PIX *)ERROR_PTR("filename not defined", procName, NULL);
01042 
01043     if ((fp = fopenReadStream(filename)) == NULL)
01044         return (PIX *)ERROR_PTR("image file not found", procName, NULL);
01045     if ((pix = pixReadStreamTiffCl(fp, n)) == NULL) {
01046         fclose(fp);
01047         return (PIX *)ERROR_PTR("pix not read", procName, NULL);
01048     }
01049     fclose(fp);
01050 PERF_COUNT_END
01051     return pix;
01052     
01053 }
01054 TIFF *
01055 OpenclDevice::fopenTiffCl(FILE        *fp,
01056           const char  *modestring)
01057 {
01058 l_int32  fd;
01059 
01060     PROCNAME("fopenTiff");
01061 
01062     if (!fp)
01063         return (TIFF *)ERROR_PTR("stream not opened", procName, NULL);
01064     if (!modestring)
01065         return (TIFF *)ERROR_PTR("modestring not defined", procName, NULL);
01066 
01067     if ((fd = fileno(fp)) < 0)
01068         return (TIFF *)ERROR_PTR("invalid file descriptor", procName, NULL);
01069     lseek(fd, 0, SEEK_SET);
01070 
01071     return TIFFFdOpen(fd, "TIFFstream", modestring);
01072 }
01073 l_int32 OpenclDevice::getTiffStreamResolutionCl(TIFF     *tif,
01074                         l_int32  *pxres,
01075                         l_int32  *pyres)
01076 {
01077 l_uint16   resunit;
01078 l_int32    foundxres, foundyres;
01079 l_float32  fxres, fyres;
01080 
01081     PROCNAME("getTiffStreamResolution");
01082 
01083     if (!tif)
01084         return ERROR_INT("tif not opened", procName, 1);
01085     if (!pxres || !pyres)
01086         return ERROR_INT("&xres and &yres not both defined", procName, 1);
01087     *pxres = *pyres = 0;
01088 
01089     TIFFGetFieldDefaulted(tif, TIFFTAG_RESOLUTIONUNIT, &resunit);
01090     foundxres = TIFFGetField(tif, TIFFTAG_XRESOLUTION, &fxres);
01091     foundyres = TIFFGetField(tif, TIFFTAG_YRESOLUTION, &fyres);
01092     if (!foundxres && !foundyres) return 1;
01093     if (!foundxres && foundyres)
01094         fxres = fyres;
01095     else if (foundxres && !foundyres)
01096         fyres = fxres;
01097 
01098     if (resunit == RESUNIT_CENTIMETER) {  /* convert to ppi */
01099         *pxres = (l_int32)(2.54 * fxres + 0.5);
01100         *pyres = (l_int32)(2.54 * fyres + 0.5);
01101     }
01102     else {
01103         *pxres = (l_int32)fxres;
01104         *pyres = (l_int32)fyres;
01105     }
01106 
01107     return 0;
01108 }
01109 PIX *
01110 OpenclDevice::pixReadStreamTiffCl(FILE    *fp,
01111                   l_int32  n)
01112 {
01113 l_int32  i, pagefound;
01114 PIX     *pix;
01115 TIFF    *tif;
01116 
01117     PROCNAME("pixReadStreamTiff");
01118 
01119     if (!fp)
01120         return (PIX *)ERROR_PTR("stream not defined", procName, NULL);
01121 
01122     if ((tif = fopenTiffCl(fp, "rb")) == NULL)
01123         return (PIX *)ERROR_PTR("tif not opened", procName, NULL);
01124 
01125     pagefound = FALSE;
01126     pix = NULL;
01127     for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) {
01128         if (i == n) {
01129             pagefound = TRUE;
01130             if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) {
01131                 TIFFCleanup(tif);
01132                 return (PIX *)ERROR_PTR("pix not read", procName, NULL);
01133             }
01134             break;
01135         }
01136         if (TIFFReadDirectory(tif) == 0)
01137             break;
01138     }
01139 
01140     if (pagefound == FALSE) {
01141         L_WARNING("tiff page %d not found", procName, n);
01142         TIFFCleanup(tif);
01143         return NULL;
01144     }
01145 
01146     TIFFCleanup(tif);
01147     return pix;
01148 }
01149 
01150 static l_int32
01151 getTiffCompressedFormat(l_uint16  tiffcomp)
01152 {
01153 l_int32  comptype;
01154 
01155     switch (tiffcomp)
01156     {
01157     case COMPRESSION_CCITTFAX4:
01158         comptype = IFF_TIFF_G4;
01159         break;
01160     case COMPRESSION_CCITTFAX3:
01161         comptype = IFF_TIFF_G3;
01162         break;
01163     case COMPRESSION_CCITTRLE:
01164         comptype = IFF_TIFF_RLE;
01165         break;
01166     case COMPRESSION_PACKBITS:
01167         comptype = IFF_TIFF_PACKBITS;
01168         break;
01169     case COMPRESSION_LZW:
01170         comptype = IFF_TIFF_LZW;
01171         break;
01172     case COMPRESSION_ADOBE_DEFLATE:
01173         comptype = IFF_TIFF_ZIP;
01174         break;
01175     default:
01176         comptype = IFF_TIFF;
01177         break;
01178     }
01179     return comptype;
01180 }
01181 
01182 void compare(l_uint32  *cpu, l_uint32  *gpu,int size)
01183 {
01184     for(int i=0;i<size;i++)
01185     {
01186         if(cpu[i]!=gpu[i])
01187         {
01188             printf("\ndoesnot match\n");
01189             return;
01190         }
01191     }
01192     printf("\nit matches\n");
01193     
01194 }
01195 
01196 //OpenCL implementation of pixReadFromTiffStream.
01197 //Similar to the CPU implentation of pixReadFromTiffStream
01198 PIX *
01199 OpenclDevice::pixReadFromTiffStreamCl(TIFF  *tif)
01200 {
01201 l_uint8   *linebuf, *data;
01202 l_uint16   spp, bps, bpp, tiffbpl, photometry, tiffcomp, orientation;
01203 l_uint16  *redmap, *greenmap, *bluemap;
01204 l_int32    d, wpl, bpl, comptype, i, ncolors;
01205 l_int32    xres, yres;
01206 l_uint32   w, h;
01207 l_uint32  *line, *tiffdata;
01208 PIX       *pix;
01209 PIXCMAP   *cmap;
01210 
01211     PROCNAME("pixReadFromTiffStream");
01212 
01213     if (!tif)
01214         return (PIX *)ERROR_PTR("tif not defined", procName, NULL);
01215 
01216     
01217     TIFFGetFieldDefaulted(tif, TIFFTAG_BITSPERSAMPLE, &bps);
01218     TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &spp);
01219     bpp = bps * spp;
01220     if (bpp > 32)
01221         return (PIX *)ERROR_PTR("can't handle bpp > 32", procName, NULL);
01222     if (spp == 1)
01223         d = bps;
01224     else if (spp == 3 || spp == 4)
01225         d = 32;
01226     else
01227         return (PIX *)ERROR_PTR("spp not in set {1,3,4}", procName, NULL);
01228 
01229     TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &w);
01230     TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &h);
01231     tiffbpl = TIFFScanlineSize(tif);
01232 
01233     if ((pix = pixCreate(w, h, d)) == NULL)
01234         return (PIX *)ERROR_PTR("pix not made", procName, NULL);
01235     data = (l_uint8 *)pixGetData(pix);
01236     wpl = pixGetWpl(pix);
01237     bpl = 4 * wpl;
01238 
01239    
01240     if (spp == 1) {
01241         if ((linebuf = (l_uint8 *)CALLOC(tiffbpl + 1, sizeof(l_uint8))) == NULL)
01242             return (PIX *)ERROR_PTR("calloc fail for linebuf", procName, NULL);
01243         
01244         for (i = 0 ; i < h ; i++) {
01245             if (TIFFReadScanline(tif, linebuf, i, 0) < 0) {
01246                 FREE(linebuf);
01247                 pixDestroy(&pix);
01248                 return (PIX *)ERROR_PTR("line read fail", procName, NULL);
01249             }
01250             memcpy((char *)data, (char *)linebuf, tiffbpl);
01251             data += bpl;
01252         }
01253         if (bps <= 8)
01254             pixEndianByteSwap(pix);
01255         else   
01256             pixEndianTwoByteSwap(pix);
01257         FREE(linebuf);
01258     }
01259     else {  
01260         if ((tiffdata = (l_uint32 *)CALLOC(w * h, sizeof(l_uint32))) == NULL) {
01261             pixDestroy(&pix);
01262             return (PIX *)ERROR_PTR("calloc fail for tiffdata", procName, NULL);
01263         }
01264         if (!TIFFReadRGBAImageOriented(tif, w, h, (uint32 *)tiffdata,
01265                                        ORIENTATION_TOPLEFT, 0)) {
01266             FREE(tiffdata);
01267             pixDestroy(&pix);
01268             return (PIX *)ERROR_PTR("failed to read tiffdata", procName, NULL);
01269         }
01270         line = pixGetData(pix);
01271 
01272         //Invoke the OpenCL kernel for pixReadFromTiff
01273         l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
01274         pixSetData(pix, output_gpu);
01275         
01276         FREE(tiffdata);
01277     }
01278 
01279     if (getTiffStreamResolutionCl(tif, &xres, &yres) == 0) {
01280         pixSetXRes(pix, xres);
01281         pixSetYRes(pix, yres);
01282     }
01283 
01284 
01285     TIFFGetFieldDefaulted(tif, TIFFTAG_COMPRESSION, &tiffcomp);
01286     comptype = getTiffCompressedFormat(tiffcomp);
01287     pixSetInputFormat(pix, comptype);
01288 
01289     if (TIFFGetField(tif, TIFFTAG_COLORMAP, &redmap, &greenmap, &bluemap)) {
01290            
01291         if ((cmap = pixcmapCreate(bps)) == NULL) {
01292             pixDestroy(&pix);
01293             return (PIX *)ERROR_PTR("cmap not made", procName, NULL);
01294         }
01295         ncolors = 1 << bps;
01296         for (i = 0; i < ncolors; i++)
01297             pixcmapAddColor(cmap, redmap[i] >> 8, greenmap[i] >> 8,
01298                             bluemap[i] >> 8);
01299         pixSetColormap(pix, cmap);
01300     }
01301     else {  
01302         if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometry)) {
01303        
01304             if (tiffcomp == COMPRESSION_CCITTFAX3 ||
01305                 tiffcomp == COMPRESSION_CCITTFAX4 ||
01306                 tiffcomp == COMPRESSION_CCITTRLE ||
01307                 tiffcomp == COMPRESSION_CCITTRLEW) {
01308                 photometry = PHOTOMETRIC_MINISWHITE;
01309             }
01310             else
01311                 photometry = PHOTOMETRIC_MINISBLACK;
01312         }
01313         if ((d == 1 && photometry == PHOTOMETRIC_MINISBLACK) ||
01314             (d == 8 && photometry == PHOTOMETRIC_MINISWHITE))
01315             pixInvert(pix, pix);
01316     }
01317 
01318     if (TIFFGetField(tif, TIFFTAG_ORIENTATION, &orientation)) {
01319         if (orientation >= 1 && orientation <= 8) {
01320             struct tiff_transform *transform =
01321               &tiff_orientation_transforms[orientation - 1];
01322             if (transform->vflip) pixFlipTB(pix, pix);
01323             if (transform->hflip) pixFlipLR(pix, pix);
01324             if (transform->rotate) {
01325                 PIX *oldpix = pix;
01326                 pix = pixRotate90(oldpix, transform->rotate);
01327                 pixDestroy(&oldpix);
01328            }
01329         }
01330     }
01331 
01332     return pix;
01333 }
01334 
01335 //Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
01336 cl_int
01337 pixDilateCL_55(l_int32  wpl, l_int32  h)
01338 {
01339     size_t globalThreads[2];
01340     cl_mem pixtemp;
01341     cl_int status;
01342     int gsize;
01343     size_t localThreads[2];
01344 
01345     //Horizontal pass
01346     gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
01347     globalThreads[0] = gsize;
01348     globalThreads[1] = GROUPSIZE_HMORY;
01349     localThreads[0] = GROUPSIZE_HMORX;
01350     localThreads[1] = GROUPSIZE_HMORY;
01351 
01352     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_5x5", &status );
01353     
01354     status = clSetKernelArg(rEnv.mpkKernel,
01355         0,
01356         sizeof(cl_mem),
01357         &pixsCLBuffer);
01358     status = clSetKernelArg(rEnv.mpkKernel,
01359         1,
01360         sizeof(cl_mem),
01361         &pixdCLBuffer);
01362     status = clSetKernelArg(rEnv.mpkKernel,
01363         2,
01364         sizeof(wpl),
01365         (const void *)&wpl);
01366     status = clSetKernelArg(rEnv.mpkKernel,
01367         3,
01368         sizeof(h),
01369         (const void *)&h);
01370 
01371     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01372                             rEnv.mpkKernel,
01373                             2,
01374                             NULL,
01375                             globalThreads,
01376                             localThreads,
01377                             0,
01378                             NULL,
01379                             NULL);
01380     
01381     //Swap source and dest buffers
01382     pixtemp = pixsCLBuffer;
01383     pixsCLBuffer = pixdCLBuffer;
01384     pixdCLBuffer = pixtemp;
01385 
01386     //Vertical
01387     gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
01388     globalThreads[0] = gsize;
01389     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
01390     globalThreads[1] = gsize;
01391     localThreads[0] = GROUPSIZE_X;
01392     localThreads[1] = GROUPSIZE_Y;
01393 
01394     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer_5x5", &status );
01395     
01396     status = clSetKernelArg(rEnv.mpkKernel,
01397         0,
01398         sizeof(cl_mem),
01399         &pixsCLBuffer);
01400     status = clSetKernelArg(rEnv.mpkKernel,
01401         1,
01402         sizeof(cl_mem),
01403         &pixdCLBuffer);
01404     status = clSetKernelArg(rEnv.mpkKernel,
01405         2,
01406         sizeof(wpl),
01407         (const void *)&wpl);
01408     status = clSetKernelArg(rEnv.mpkKernel,
01409         3,
01410         sizeof(h),
01411         (const void *)&h);
01412     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01413                             rEnv.mpkKernel,
01414                             2,
01415                             NULL,
01416                             globalThreads,
01417                             localThreads,
01418                             0,
01419                             NULL,
01420                             NULL);
01421 
01422     return status;
01423 }
01424 
01425 //Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
01426 cl_int
01427 pixErodeCL_55(l_int32  wpl, l_int32  h)
01428 {
01429     size_t globalThreads[2];
01430     cl_mem pixtemp;
01431     cl_int status;
01432     int gsize;
01433     l_uint32 fwmask, lwmask;
01434     size_t localThreads[2];
01435 
01436     lwmask = lmask32[32 - 2];
01437     fwmask = rmask32[32 - 2];
01438 
01439     //Horizontal pass
01440     gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
01441     globalThreads[0] = gsize;
01442     globalThreads[1] = GROUPSIZE_HMORY;
01443     localThreads[0] = GROUPSIZE_HMORX;
01444     localThreads[1] = GROUPSIZE_HMORY;
01445 
01446     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_5x5", &status );
01447     
01448     status = clSetKernelArg(rEnv.mpkKernel,
01449         0,
01450         sizeof(cl_mem),
01451         &pixsCLBuffer);
01452     status = clSetKernelArg(rEnv.mpkKernel,
01453         1,
01454         sizeof(cl_mem),
01455         &pixdCLBuffer);
01456     status = clSetKernelArg(rEnv.mpkKernel,
01457         2,
01458         sizeof(wpl),
01459         (const void *)&wpl);
01460     status = clSetKernelArg(rEnv.mpkKernel,
01461         3,
01462         sizeof(h),
01463         (const void *)&h);
01464 
01465     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01466                             rEnv.mpkKernel,
01467                             2,
01468                             NULL,
01469                             globalThreads,
01470                             localThreads,
01471                             0,
01472                             NULL,
01473                             NULL);
01474     
01475     //Swap source and dest buffers
01476     pixtemp = pixsCLBuffer;
01477     pixsCLBuffer = pixdCLBuffer;
01478     pixdCLBuffer = pixtemp;
01479 
01480     //Vertical
01481     gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
01482     globalThreads[0] = gsize;
01483     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
01484     globalThreads[1] = gsize;
01485     localThreads[0] = GROUPSIZE_X;
01486     localThreads[1] = GROUPSIZE_Y;
01487 
01488     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer_5x5", &status );
01489     
01490     status = clSetKernelArg(rEnv.mpkKernel,
01491         0,
01492         sizeof(cl_mem),
01493         &pixsCLBuffer);
01494     status = clSetKernelArg(rEnv.mpkKernel,
01495         1,
01496         sizeof(cl_mem),
01497         &pixdCLBuffer);
01498     status = clSetKernelArg(rEnv.mpkKernel,
01499         2,
01500         sizeof(wpl),
01501         (const void *)&wpl);
01502     status = clSetKernelArg(rEnv.mpkKernel,
01503         3,
01504         sizeof(h),
01505         (const void *)&h);
01506     status = clSetKernelArg(rEnv.mpkKernel,
01507         4,
01508         sizeof(fwmask),
01509         (const void *)&fwmask);
01510     status = clSetKernelArg(rEnv.mpkKernel,
01511         5,
01512         sizeof(lwmask),
01513         (const void *)&lwmask);
01514     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01515                             rEnv.mpkKernel,
01516                             2,
01517                             NULL,
01518                             globalThreads,
01519                             localThreads,
01520                             0,
01521                             NULL,
01522                             NULL);
01523 
01524     return status;
01525 }
01526 
01527 //Morphology Dilate operation. Invokes the relevant OpenCL kernels
01528 cl_int
01529 pixDilateCL(l_int32  hsize, l_int32  vsize, l_int32  wpl, l_int32  h)
01530 {
01531     l_int32  xp, yp, xn, yn;
01532     SEL* sel;
01533     size_t globalThreads[2];
01534     cl_mem pixtemp;
01535     cl_int status;
01536     int gsize;
01537     size_t localThreads[2];
01538     char isEven;
01539 
01540     OpenclDevice::SetKernelEnv( &rEnv );
01541     
01542     if (hsize == 5 && vsize == 5)
01543     {
01544         //Specific case for 5x5
01545         status = pixDilateCL_55(wpl, h);
01546         return status;
01547     }
01548     
01549     sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
01550     
01551     selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
01552 
01553     //global and local work dimensions for Horizontal pass
01554     gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
01555     globalThreads[0] = gsize;
01556     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
01557     globalThreads[1] = gsize;
01558     localThreads[0] = GROUPSIZE_X;
01559     localThreads[1] = GROUPSIZE_Y;
01560 
01561     if (xp > 31 || xn > 31)
01562     {
01563         //Generic case. 
01564         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor", &status );
01565     
01566         status = clSetKernelArg(rEnv.mpkKernel,
01567             0,
01568             sizeof(cl_mem),
01569             &pixsCLBuffer);
01570         status = clSetKernelArg(rEnv.mpkKernel,
01571             1,
01572             sizeof(cl_mem),
01573             &pixdCLBuffer);
01574         status = clSetKernelArg(rEnv.mpkKernel,
01575                 2,
01576                 sizeof(xp),
01577                 (const void *)&xp);
01578         status = clSetKernelArg(rEnv.mpkKernel,
01579                 3,
01580                 sizeof(xn),
01581                 (const void *)&xn);
01582         status = clSetKernelArg(rEnv.mpkKernel,
01583                 4,
01584                 sizeof(wpl),
01585                 (const void *)&wpl);
01586         status = clSetKernelArg(rEnv.mpkKernel,
01587                 5,
01588                 sizeof(h),
01589                 (const void *)&h);
01590         status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01591                                 rEnv.mpkKernel,
01592                                 2,
01593                                 NULL,
01594                                 globalThreads,
01595                                 localThreads,
01596                                 0,
01597                                 NULL,
01598                                 NULL);
01599 
01600         if (yp > 0 || yn > 0)
01601         {
01602             pixtemp = pixsCLBuffer;
01603             pixsCLBuffer = pixdCLBuffer;
01604             pixdCLBuffer = pixtemp;
01605         }
01606     }
01607     else if (xp > 0 || xn > 0 )
01608     {
01609         //Specfic Horizontal pass kernel for half width < 32
01610         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_32word", &status );
01611         isEven = (xp != xn);
01612         
01613         status = clSetKernelArg(rEnv.mpkKernel,
01614             0,
01615             sizeof(cl_mem),
01616             &pixsCLBuffer);
01617         status = clSetKernelArg(rEnv.mpkKernel,
01618             1,
01619             sizeof(cl_mem),
01620             &pixdCLBuffer);
01621         status = clSetKernelArg(rEnv.mpkKernel,
01622                 2,
01623                 sizeof(xp),
01624                 (const void *)&xp);
01625         status = clSetKernelArg(rEnv.mpkKernel,
01626                 3,
01627                 sizeof(wpl),
01628                 (const void *)&wpl);
01629         status = clSetKernelArg(rEnv.mpkKernel,
01630                 4,
01631                 sizeof(h),
01632                 (const void *)&h);
01633         status = clSetKernelArg(rEnv.mpkKernel,
01634                 5,
01635                 sizeof(isEven),
01636                 (const void *)&isEven);
01637         status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01638                                 rEnv.mpkKernel,
01639                                 2,
01640                                 NULL,
01641                                 globalThreads,
01642                                 localThreads,
01643                                 0,
01644                                 NULL,
01645                                 NULL);
01646 
01647         if (yp > 0 || yn > 0)
01648         {
01649             pixtemp = pixsCLBuffer;
01650             pixsCLBuffer = pixdCLBuffer;
01651             pixdCLBuffer = pixtemp;
01652         }
01653     } 
01654 
01655     if (yp > 0 || yn > 0)
01656     {
01657         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer", &status );
01658         
01659         status = clSetKernelArg(rEnv.mpkKernel,
01660             0,
01661             sizeof(cl_mem),
01662             &pixsCLBuffer);
01663         status = clSetKernelArg(rEnv.mpkKernel,
01664             1,
01665             sizeof(cl_mem),
01666             &pixdCLBuffer);
01667         status = clSetKernelArg(rEnv.mpkKernel,
01668                 2,
01669                 sizeof(yp),
01670                 (const void *)&yp);
01671         status = clSetKernelArg(rEnv.mpkKernel,
01672                 3,
01673                 sizeof(wpl),
01674                 (const void *)&wpl);
01675         status = clSetKernelArg(rEnv.mpkKernel,
01676                 4,
01677                 sizeof(h),
01678                 (const void *)&h);
01679         status = clSetKernelArg(rEnv.mpkKernel,
01680                 5,
01681                 sizeof(yn),
01682                 (const void *)&yn);
01683         status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01684                                 rEnv.mpkKernel,
01685                                 2,
01686                                 NULL,
01687                                 globalThreads,
01688                                 localThreads,
01689                                 0,
01690                                 NULL,
01691                                 NULL);
01692     }
01693     
01694 
01695     return status;
01696 }
01697 
01698 //Morphology Erode operation. Invokes the relevant OpenCL kernels
01699 cl_int 
01700 pixErodeCL(l_int32  hsize, l_int32  vsize, l_uint32 wpl, l_uint32 h)
01701 {
01702 
01703     l_int32  xp, yp, xn, yn;
01704     SEL* sel;
01705     size_t globalThreads[2];
01706     size_t localThreads[2];
01707     cl_mem pixtemp;
01708     cl_int status;
01709     int gsize;
01710     char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
01711     l_uint32 rwmask, lwmask;
01712     char isEven;
01713 
01714     sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
01715     
01716     selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
01717     
01718     OpenclDevice::SetKernelEnv( &rEnv );
01719 
01720     if (hsize == 5 && vsize == 5 && isAsymmetric)
01721     {
01722         //Specific kernel for 5x5
01723         status = pixErodeCL_55(wpl, h);
01724         return status;
01725     }
01726 
01727     rwmask = rmask32[32 - (xp & 31)];
01728     lwmask = lmask32[32 - (xn & 31)];
01729 
01730     //global and local work dimensions for Horizontal pass
01731     gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
01732     globalThreads[0] = gsize;
01733     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
01734     globalThreads[1] = gsize;
01735     localThreads[0] = GROUPSIZE_X;
01736     localThreads[1] = GROUPSIZE_Y;
01737     
01738     //Horizontal Pass
01739     if (xp > 31 || xn > 31 )
01740     {
01741         //Generic case. 
01742         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor", &status );
01743         
01744         status = clSetKernelArg(rEnv.mpkKernel,
01745             0,
01746             sizeof(cl_mem),
01747             &pixsCLBuffer);
01748         status = clSetKernelArg(rEnv.mpkKernel,
01749             1,
01750             sizeof(cl_mem),
01751             &pixdCLBuffer);
01752         status = clSetKernelArg(rEnv.mpkKernel,
01753                 2,
01754                 sizeof(xp),
01755                 (const void *)&xp);
01756         status = clSetKernelArg(rEnv.mpkKernel,
01757                 3,
01758                 sizeof(xn),
01759                 (const void *)&xn);
01760         status = clSetKernelArg(rEnv.mpkKernel,
01761                 4,
01762                 sizeof(wpl),
01763                 (const void *)&wpl);
01764         status = clSetKernelArg(rEnv.mpkKernel,
01765                 5,
01766                 sizeof(h),
01767                 (const void *)&h);
01768         status = clSetKernelArg(rEnv.mpkKernel,
01769                 6,
01770                 sizeof(isAsymmetric),
01771                 (const void *)&isAsymmetric);
01772         status = clSetKernelArg(rEnv.mpkKernel,
01773                 7,
01774                 sizeof(rwmask),
01775                 (const void *)&rwmask);
01776         status = clSetKernelArg(rEnv.mpkKernel,
01777                 8,
01778                 sizeof(lwmask),
01779                 (const void *)&lwmask);
01780         status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01781                                 rEnv.mpkKernel,
01782                                 2,
01783                                 NULL,
01784                                 globalThreads,
01785                                 localThreads,
01786                                 0,
01787                                 NULL,
01788                                 NULL);
01789 
01790         if (yp > 0 || yn > 0)
01791         {
01792             pixtemp = pixsCLBuffer;
01793             pixsCLBuffer = pixdCLBuffer;
01794             pixdCLBuffer = pixtemp;
01795         }
01796     }
01797     else if (xp > 0 || xn > 0)
01798     {
01799         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_32word", &status );
01800         isEven = (xp != xn);
01801 
01802         status = clSetKernelArg(rEnv.mpkKernel,
01803             0,
01804             sizeof(cl_mem),
01805             &pixsCLBuffer);
01806         status = clSetKernelArg(rEnv.mpkKernel,
01807             1,
01808             sizeof(cl_mem),
01809             &pixdCLBuffer);
01810         status = clSetKernelArg(rEnv.mpkKernel,
01811                 2,
01812                 sizeof(xp),
01813                 (const void *)&xp);
01814         status = clSetKernelArg(rEnv.mpkKernel,
01815                 3,
01816                 sizeof(wpl),
01817                 (const void *)&wpl);
01818         status = clSetKernelArg(rEnv.mpkKernel,
01819                 4,
01820                 sizeof(h),
01821                 (const void *)&h);
01822         status = clSetKernelArg(rEnv.mpkKernel,
01823                 5,
01824                 sizeof(isAsymmetric),
01825                 (const void *)&isAsymmetric);
01826         status = clSetKernelArg(rEnv.mpkKernel,
01827                 6,
01828                 sizeof(rwmask),
01829                 (const void *)&rwmask);
01830         status = clSetKernelArg(rEnv.mpkKernel,
01831                 7,
01832                 sizeof(lwmask),
01833                 (const void *)&lwmask);
01834         status = clSetKernelArg(rEnv.mpkKernel,
01835                 8,
01836                 sizeof(isEven),
01837                 (const void *)&isEven);
01838         status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01839                                 rEnv.mpkKernel,
01840                                 2,
01841                                 NULL,
01842                                 globalThreads,
01843                                 localThreads,
01844                                 0,
01845                                 NULL,
01846                                 NULL);
01847     
01848         if (yp > 0 || yn > 0)
01849         {
01850             pixtemp = pixsCLBuffer;
01851             pixsCLBuffer = pixdCLBuffer;
01852             pixdCLBuffer = pixtemp;
01853         }
01854     }
01855 
01856     //Vertical Pass
01857     if (yp > 0 || yn > 0)
01858     {
01859         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer", &status );
01860         
01861         status = clSetKernelArg(rEnv.mpkKernel,
01862             0,
01863             sizeof(cl_mem),
01864             &pixsCLBuffer);
01865         status = clSetKernelArg(rEnv.mpkKernel,
01866             1,
01867             sizeof(cl_mem),
01868             &pixdCLBuffer);
01869         status = clSetKernelArg(rEnv.mpkKernel,
01870                 2,
01871                 sizeof(yp),
01872                 (const void *)&yp);
01873         status = clSetKernelArg(rEnv.mpkKernel,
01874                 3,
01875                 sizeof(wpl),
01876                 (const void *)&wpl);
01877         status = clSetKernelArg(rEnv.mpkKernel,
01878                 4,
01879                 sizeof(h),
01880                 (const void *)&h);
01881         status = clSetKernelArg(rEnv.mpkKernel,
01882                 5,
01883                 sizeof(isAsymmetric),
01884                 (const void *)&isAsymmetric);
01885         status = clSetKernelArg(rEnv.mpkKernel,
01886                 6,
01887                 sizeof(yn),
01888                 (const void *)&yn);
01889         status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
01890                                 rEnv.mpkKernel,
01891                                 2,
01892                                 NULL,
01893                                 globalThreads,
01894                                 localThreads,
01895                                 0,
01896                                 NULL,
01897                                 NULL);
01898     }
01899 
01900     return status;
01901 }
01902 
01903 // OpenCL implementation of Morphology Dilate
01904 //Note: Assumes the source and dest opencl buffer are initialized. No check done
01905 PIX* 
01906 OpenclDevice::pixDilateBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy = false)
01907 {
01908     l_uint32 wpl, h;
01909 
01910     wpl = pixGetWpl(pixs);
01911     h = pixGetHeight(pixs);
01912     
01913     clStatus = pixDilateCL(hsize, vsize, wpl, h);
01914 
01915     if (reqDataCopy)
01916     {
01917         pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ, false);
01918     }
01919 
01920     return pixd;
01921 }
01922 
01923 // OpenCL implementation of Morphology Erode
01924 //Note: Assumes the source and dest opencl buffer are initialized. No check done
01925 PIX* 
01926 OpenclDevice::pixErodeBrickCL(PIX  *pixd, PIX  *pixs, l_int32  hsize, l_int32  vsize, bool reqDataCopy = false)
01927 {
01928     l_uint32 wpl, h;
01929     
01930     wpl = pixGetWpl(pixs);
01931     h = pixGetHeight(pixs);
01932 
01933     clStatus = pixErodeCL(hsize, vsize, wpl, h);
01934     
01935     if (reqDataCopy)
01936     {
01937         pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
01938     }
01939 
01940     return pixd;
01941 }
01942 
01943 //Morphology Open operation. Invokes the relevant OpenCL kernels
01944 cl_int
01945 pixOpenCL(l_int32  hsize, l_int32  vsize, l_int32  wpl, l_int32  h)
01946 {
01947     cl_int status;
01948     cl_mem pixtemp;
01949     
01950     //Erode followed by Dilate
01951     status = pixErodeCL(hsize, vsize, wpl, h);
01952     
01953     pixtemp = pixsCLBuffer;
01954     pixsCLBuffer = pixdCLBuffer;
01955     pixdCLBuffer = pixtemp;
01956 
01957     status = pixDilateCL(hsize, vsize, wpl, h);
01958 
01959     return status;
01960 }
01961 
01962 //Morphology Close operation. Invokes the relevant OpenCL kernels
01963 cl_int
01964 pixCloseCL(l_int32  hsize, l_int32  vsize, l_int32  wpl, l_int32  h)
01965 {
01966     cl_int status;
01967     cl_mem pixtemp;
01968     
01969     //Dilate followed by Erode
01970     status = pixDilateCL(hsize, vsize, wpl, h);
01971     
01972     pixtemp = pixsCLBuffer;
01973     pixsCLBuffer = pixdCLBuffer;
01974     pixdCLBuffer = pixtemp;
01975 
01976     status = pixErodeCL(hsize, vsize, wpl, h);
01977 
01978     return status;
01979 }
01980 
01981 // OpenCL implementation of Morphology Close
01982 //Note: Assumes the source and dest opencl buffer are initialized. No check done
01983 PIX* 
01984 OpenclDevice::pixCloseBrickCL(PIX  *pixd, 
01985                               PIX  *pixs, 
01986                               l_int32  hsize, 
01987                               l_int32  vsize, 
01988                               bool reqDataCopy = false)
01989 {
01990     l_uint32 wpl, h;
01991     
01992     wpl = pixGetWpl(pixs);
01993     h = pixGetHeight(pixs);
01994 
01995     clStatus = pixCloseCL(hsize, vsize, wpl, h);
01996 
01997     if (reqDataCopy)
01998     {
01999         pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
02000     }
02001 
02002     return pixd;
02003 }
02004 
02005 // OpenCL implementation of Morphology Open
02006 //Note: Assumes the source and dest opencl buffer are initialized. No check done
02007 PIX* 
02008 OpenclDevice::pixOpenBrickCL(PIX  *pixd, 
02009                               PIX  *pixs, 
02010                               l_int32  hsize, 
02011                               l_int32  vsize, 
02012                               bool reqDataCopy = false)
02013 {
02014     l_uint32 wpl, h;
02015     
02016     wpl = pixGetWpl(pixs);
02017     h = pixGetHeight(pixs);
02018 
02019     clStatus = pixOpenCL(hsize, vsize, wpl, h);
02020 
02021     if (reqDataCopy)
02022     {
02023         pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
02024     }
02025 
02026     return pixd;
02027 }
02028 
02029 //pix OR operation: outbuffer = buffer1 | buffer2
02030 cl_int
02031 pixORCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer)
02032 {
02033     cl_int status;
02034     size_t globalThreads[2];
02035     int gsize;
02036     size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
02037 
02038     gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
02039     globalThreads[0] = gsize;
02040     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
02041     globalThreads[1] = gsize;
02042 
02043     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixOR", &status );
02044 
02045     status = clSetKernelArg(rEnv.mpkKernel,
02046         0,
02047         sizeof(cl_mem),
02048         &buffer1);
02049     status = clSetKernelArg(rEnv.mpkKernel,
02050         1,
02051         sizeof(cl_mem),
02052         &buffer2);
02053     status = clSetKernelArg(rEnv.mpkKernel,
02054         2,
02055         sizeof(cl_mem),
02056         &outbuffer);
02057     status = clSetKernelArg(rEnv.mpkKernel,
02058             3,
02059             sizeof(wpl),
02060             (const void *)&wpl);
02061     status = clSetKernelArg(rEnv.mpkKernel,
02062             4,
02063             sizeof(h),
02064             (const void *)&h);
02065     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
02066                             rEnv.mpkKernel,
02067                             2,
02068                             NULL,
02069                             globalThreads,
02070                             localThreads,
02071                             0,
02072                             NULL,
02073                             NULL);
02074 
02075     return status;
02076 }
02077 
02078 //pix AND operation: outbuffer = buffer1 & buffer2
02079 cl_int
02080 pixANDCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer)
02081 {
02082     cl_int status;
02083     size_t globalThreads[2];
02084     int gsize;
02085     size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
02086 
02087     gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
02088     globalThreads[0] = gsize;
02089     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
02090     globalThreads[1] = gsize;
02091 
02092     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixAND", &status );
02093                 
02094     // Enqueue a kernel run call.
02095     status = clSetKernelArg(rEnv.mpkKernel,
02096         0,
02097         sizeof(cl_mem),
02098         &buffer1);
02099     status = clSetKernelArg(rEnv.mpkKernel,
02100         1,
02101         sizeof(cl_mem),
02102         &buffer2);
02103     status = clSetKernelArg(rEnv.mpkKernel,
02104         2,
02105         sizeof(cl_mem),
02106         &outbuffer);
02107     status = clSetKernelArg(rEnv.mpkKernel,
02108             3,
02109             sizeof(wpl),
02110             (const void *)&wpl);
02111     status = clSetKernelArg(rEnv.mpkKernel,
02112             4,
02113             sizeof(h),
02114             (const void *)&h);
02115     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
02116                             rEnv.mpkKernel,
02117                             2,
02118                             NULL,
02119                             globalThreads,
02120                             localThreads,
02121                             0,
02122                             NULL,
02123                             NULL);
02124 
02125     return status;
02126 }
02127 
02128 //output = buffer1 & ~(buffer2)
02129 cl_int
02130 pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outBuffer = NULL)
02131 {
02132     cl_int status;
02133     size_t globalThreads[2];
02134     int gsize;
02135     size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
02136 
02137     gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
02138     globalThreads[0] = gsize;
02139     gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
02140     globalThreads[1] = gsize;
02141 
02142     if (outBuffer != NULL)
02143     {
02144         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract", &status );
02145     }
02146     else
02147     {
02148         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract_inplace", &status );
02149     }
02150 
02151     // Enqueue a kernel run call.
02152     status = clSetKernelArg(rEnv.mpkKernel,
02153         0,
02154         sizeof(cl_mem),
02155         &buffer1);
02156     status = clSetKernelArg(rEnv.mpkKernel,
02157         1,
02158         sizeof(cl_mem),
02159         &buffer2);
02160     status = clSetKernelArg(rEnv.mpkKernel,
02161             2,
02162             sizeof(wpl),
02163             (const void *)&wpl);
02164     status = clSetKernelArg(rEnv.mpkKernel,
02165             3,
02166             sizeof(h),
02167             (const void *)&h);
02168     if (outBuffer != NULL)
02169     {
02170         status = clSetKernelArg(rEnv.mpkKernel,
02171             4,
02172             sizeof(cl_mem),
02173             (const void *)&outBuffer);
02174     }
02175     status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue,
02176                             rEnv.mpkKernel,
02177                             2,
02178                             NULL,
02179                             globalThreads,
02180                             localThreads,
02181                             0,
02182                             NULL,
02183                             NULL);
02184 
02185     return status;
02186 }
02187 
02188 // OpenCL implementation of Subtract pix
02189 //Note: Assumes the source and dest opencl buffer are initialized. No check done
02190 PIX* 
02191 OpenclDevice::pixSubtractCL(PIX  *pixd, PIX  *pixs1, PIX  *pixs2, bool reqDataCopy = false)
02192 {
02193     l_uint32 wpl, h;
02194     
02195     PROCNAME("pixSubtractCL");
02196 
02197     if (!pixs1)
02198         return (PIX *)ERROR_PTR("pixs1 not defined", procName, pixd);
02199     if (!pixs2)
02200         return (PIX *)ERROR_PTR("pixs2 not defined", procName, pixd);
02201     if (pixGetDepth(pixs1) != pixGetDepth(pixs2))
02202         return (PIX *)ERROR_PTR("depths of pixs* unequal", procName, pixd);
02203 
02204 #if  EQUAL_SIZE_WARNING
02205     if (!pixSizesEqual(pixs1, pixs2))
02206         L_WARNING("pixs1 and pixs2 not equal sizes", procName);
02207 #endif  /* EQUAL_SIZE_WARNING */
02208 
02209     wpl = pixGetWpl(pixs1);
02210     h = pixGetHeight(pixs1);
02211 
02212     clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
02213 
02214     if (reqDataCopy)
02215     {
02216         //Read back output data from OCL buffer to cpu
02217         pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs1, wpl*h, CL_MAP_READ);
02218     }
02219 
02220     return pixd;
02221 }
02222 
02223 // OpenCL implementation of Hollow pix
02224 //Note: Assumes the source and dest opencl buffer are initialized. No check done
02225 PIX* 
02226 OpenclDevice::pixHollowCL(PIX  *pixd, 
02227                         PIX  *pixs, 
02228                         l_int32  close_hsize, 
02229                         l_int32  close_vsize, 
02230                         l_int32  open_hsize, 
02231                         l_int32  open_vsize,
02232                         bool reqDataCopy = false)
02233 {
02234     l_uint32 wpl, h;
02235     cl_mem pixtemp;
02236 
02237     wpl = pixGetWpl(pixs);
02238     h = pixGetHeight(pixs);
02239 
02240     //First step : Close Morph operation: Dilate followed by Erode
02241     clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
02242 
02243     //Store the output of close operation in an intermediate buffer
02244     //this will be later used for pixsubtract
02245     clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
02246 
02247     //Second step: Open Operation - Erode followed by Dilate
02248     pixtemp = pixsCLBuffer;
02249     pixsCLBuffer = pixdCLBuffer;
02250     pixdCLBuffer = pixtemp;
02251 
02252     clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
02253 
02254     //Third step: Subtract : (Close - Open)
02255     pixtemp = pixsCLBuffer;
02256     pixsCLBuffer = pixdCLBuffer;
02257     pixdCLBuffer = pixdCLIntermediate;
02258     pixdCLIntermediate = pixtemp;
02259 
02260     clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
02261 
02262     if (reqDataCopy)
02263     {
02264         //Read back output data from OCL buffer to cpu
02265         pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ);
02266     }
02267     return pixd;
02268 }
02269 
02270 // OpenCL implementation of Get Lines from pix function
02271 //Note: Assumes the source and dest opencl buffer are initialized. No check done
02272 void 
02273 OpenclDevice::pixGetLinesCL(PIX  *pixd, 
02274                             PIX  *pixs, 
02275                             PIX** pix_vline, 
02276                             PIX** pix_hline, 
02277                             PIX** pixClosed,
02278                             bool  getpixClosed,
02279                             l_int32  close_hsize, l_int32  close_vsize, 
02280                             l_int32  open_hsize, l_int32  open_vsize,
02281                             l_int32  line_hsize, l_int32  line_vsize)
02282 {
02283     l_uint32 wpl, h;
02284     cl_mem pixtemp;
02285 
02286     wpl = pixGetWpl(pixs);
02287     h = pixGetHeight(pixs);
02288 
02289     //First step : Close Morph operation: Dilate followed by Erode
02290     clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
02291 
02292     //Copy the Close output to CPU buffer
02293     if (getpixClosed)
02294     {
02295         *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl*h, CL_MAP_READ, true, false);
02296     }
02297 
02298     //Store the output of close operation in an intermediate buffer
02299     //this will be later used for pixsubtract
02300     clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
02301     
02302     //Second step: Open Operation - Erode followed by Dilate
02303     pixtemp = pixsCLBuffer;
02304     pixsCLBuffer = pixdCLBuffer;
02305     pixdCLBuffer = pixtemp;
02306 
02307     clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
02308 
02309     //Third step: Subtract : (Close - Open)
02310     pixtemp = pixsCLBuffer;
02311     pixsCLBuffer = pixdCLBuffer;
02312     pixdCLBuffer = pixdCLIntermediate;
02313     pixdCLIntermediate = pixtemp;
02314 
02315     clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
02316 
02317     //Store the output of Hollow operation in an intermediate buffer
02318     //this will be later used 
02319     clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL);
02320 
02321     pixtemp = pixsCLBuffer;
02322     pixsCLBuffer = pixdCLBuffer;
02323     pixdCLBuffer = pixtemp;
02324 
02325     //Fourth step: Get vertical line
02326     //pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
02327     clStatus = pixOpenCL(1, line_vsize, wpl, h);
02328 
02329     //Copy the vertical line output to CPU buffer
02330     *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl*h, CL_MAP_READ, true, false);
02331     
02332     pixtemp = pixsCLBuffer;
02333     pixsCLBuffer = pixdCLIntermediate;
02334     pixdCLIntermediate = pixtemp;
02335 
02336     //Fifth step: Get horizontal line
02337     //pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
02338     clStatus = pixOpenCL(line_hsize, 1, wpl, h);
02339         
02340     //Copy the horizontal line output to CPU buffer
02341     *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl*h, CL_MAP_READ, true, true);
02342 
02343     return;
02344 }
02345 
02346 
02347 /*************************************************************************
02348  *  HistogramRect
02349  *  Otsu Thresholding Operations
02350  *  histogramAllChannels is layed out as all channel 0, then all channel 1...
02351  *  only supports 1 or 4 channels (bytes_per_pixel)
02352  ************************************************************************/
02353 void OpenclDevice::HistogramRectOCL(
02354     const unsigned char* imageData,
02355     int bytes_per_pixel,
02356     int bytes_per_line,
02357     int left, // always 0
02358     int top, // always 0
02359     int width,
02360     int height,
02361     int kHistogramSize,
02362     int* histogramAllChannels)
02363 {
02364 PERF_COUNT_START("HistogramRectOCL")
02365     cl_int clStatus;
02366     KernelEnv histKern;
02367     SetKernelEnv( &histKern );
02368     KernelEnv histRedKern;
02369     SetKernelEnv( &histRedKern );
02370     /* map imagedata to device as read only */
02371     // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need.
02372     // faster option would be to allocate initial image buffer
02373     // using a garlic bus memory type
02374     cl_mem imageBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus );
02375     CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
02376 
02377     /* setup work group size parameters */
02378     int block_size = 256;
02379     cl_uint numCUs;
02380     clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL);
02381     CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
02382 
02383     int requestedOccupancy = 10;
02384     int numWorkGroups = numCUs * requestedOccupancy;
02385     int numThreads = block_size*numWorkGroups;
02386     size_t local_work_size[] = {block_size};
02387     size_t global_work_size[] = {numThreads};
02388     size_t red_global_work_size[] = {block_size*kHistogramSize*bytes_per_pixel}; 
02389 
02390     /* map histogramAllChannels as write only */
02391     int numBins = kHistogramSize*bytes_per_pixel*numWorkGroups;
02392     
02393     cl_mem histogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, kHistogramSize*bytes_per_pixel*sizeof(int), (void *)histogramAllChannels, &clStatus );
02394     CHECK_OPENCL( clStatus, "clCreateBuffer histogramBuffer");
02395 
02396     /* intermediate histogram buffer */
02397     int histRed = 256;
02398     int tmpHistogramBins =  kHistogramSize*bytes_per_pixel*histRed; 
02399 
02400     cl_mem tmpHistogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins*sizeof(cl_uint), NULL, &clStatus );
02401     CHECK_OPENCL( clStatus, "clCreateBuffer tmpHistogramBuffer");
02402 
02403     /* atomic sync buffer */
02404     int *zeroBuffer = new int[1];
02405     zeroBuffer[0] = 0;
02406     cl_mem atomicSyncBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), (void *)zeroBuffer, &clStatus );
02407     CHECK_OPENCL( clStatus, "clCreateBuffer atomicSyncBuffer");
02408 
02409     //Create kernel objects based on bytes_per_pixel
02410     if (bytes_per_pixel == 1)
02411     {
02412         histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus );
02413         CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
02414 
02415         histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectOneChannelReduction", &clStatus );
02416         CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannelReduction");
02417     } else {
02418     histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus );
02419     CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
02420 
02421     histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus );
02422     CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
02423     }
02424 
02425     void *ptr;
02426     
02427     //Initialize tmpHistogramBuffer buffer
02428     ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0, tmpHistogramBins*sizeof(cl_uint), 0, NULL, NULL, &clStatus);
02429     CHECK_OPENCL( clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
02430     
02431     memset(ptr, 0, tmpHistogramBins*sizeof(cl_uint));
02432     clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, NULL, NULL);
02433 
02434     /* set kernel 1 arguments */
02435     clStatus = clSetKernelArg( histKern.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer );
02436     CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
02437     cl_uint numPixels = width*height;
02438     clStatus = clSetKernelArg( histKern.mpkKernel, 1, sizeof(cl_uint), (void *)&numPixels );
02439     CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" );
02440     clStatus = clSetKernelArg( histKern.mpkKernel, 2, sizeof(cl_mem), (void *)&tmpHistogramBuffer );
02441     CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
02442 
02443     /* set kernel 2 arguments */
02444     int n = numThreads/bytes_per_pixel;
02445     clStatus = clSetKernelArg( histRedKern.mpkKernel, 0, sizeof(cl_int), (void *)&n );
02446     CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
02447     clStatus = clSetKernelArg( histRedKern.mpkKernel, 1, sizeof(cl_mem), (void *)&tmpHistogramBuffer );
02448     CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
02449     clStatus = clSetKernelArg( histRedKern.mpkKernel, 2, sizeof(cl_mem), (void *)&histogramBuffer );
02450     CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer");
02451 
02452     /* launch histogram */
02453 PERF_COUNT_SUB("before")
02454     clStatus = clEnqueueNDRangeKernel(
02455         histKern.mpkCmdQueue,
02456         histKern.mpkKernel,
02457         1, NULL, global_work_size, local_work_size,
02458         0, NULL, NULL );
02459     CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels" );
02460     clFinish( histKern.mpkCmdQueue );
02461 
02462     /* launch histogram */
02463     clStatus = clEnqueueNDRangeKernel(
02464         histRedKern.mpkCmdQueue,
02465         histRedKern.mpkKernel,
02466         1, NULL, red_global_work_size, local_work_size,
02467         0, NULL, NULL );
02468     CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
02469     clFinish( histRedKern.mpkCmdQueue );
02470 
02471 PERF_COUNT_SUB("redKernel")
02472 
02473     /* map results back from gpu */
02474     ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0, kHistogramSize*bytes_per_pixel*sizeof(int), 0, NULL, NULL, &clStatus);
02475     CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
02476     
02477     clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, NULL, NULL);
02478    
02479     clReleaseMemObject(histogramBuffer);
02480     clReleaseMemObject(imageBuffer);
02481 PERF_COUNT_SUB("after")
02482 PERF_COUNT_END
02483 
02484 }
02485 
02486 /*************************************************************************
02487  * Threshold the rectangle, taking everything except the image buffer pointer
02488  * from the class, using thresholds/hi_values to the output IMAGE.
02489  * only supports 1 or 4 channels
02490  ************************************************************************/
02491 void OpenclDevice::ThresholdRectToPixOCL(
02492     const unsigned char* imageData,
02493     int bytes_per_pixel,
02494     int bytes_per_line,
02495     const int* thresholds,
02496     const int* hi_values,
02497     Pix** pix,
02498     int height,
02499     int width,
02500     int top,
02501     int left) {
02502 PERF_COUNT_START("ThresholdRectToPixOCL")
02503 
02504     /* create pix result buffer */                                 
02505     *pix = pixCreate(width, height, 1);
02506     uinT32* pixData = pixGetData(*pix);
02507     int wpl = pixGetWpl(*pix);
02508     int pixSize = wpl*height*sizeof(uinT32);
02509 
02510     cl_int clStatus;
02511     KernelEnv rEnv;
02512     SetKernelEnv( &rEnv );
02513 
02514     /* setup work group size parameters */
02515     int block_size = 256;
02516     cl_uint numCUs = 6;
02517      clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL);
02518     CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
02519 
02520     int requestedOccupancy = 10;
02521     int numWorkGroups = numCUs * requestedOccupancy;
02522     int numThreads = block_size*numWorkGroups;
02523     size_t local_work_size[] = {(size_t) block_size};
02524     size_t global_work_size[] = {(size_t) numThreads};
02525 
02526     /* map imagedata to device as read only */
02527     // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need.
02528     // faster option would be to allocate initial image buffer
02529     // using a garlic bus memory type
02530     cl_mem imageBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus );
02531     CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer");
02532 
02533     /* map pix as write only */
02534     pixThBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize, (void *)pixData, &clStatus );
02535     CHECK_OPENCL( clStatus, "clCreateBuffer pix");
02536 
02537     /* map thresholds and hi_values */
02538     cl_mem thresholdsBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)thresholds, &clStatus );
02539     CHECK_OPENCL( clStatus, "clCreateBuffer thresholdBuffer");
02540     cl_mem hiValuesBuffer   = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)hi_values, &clStatus );
02541     CHECK_OPENCL( clStatus, "clCreateBuffer hiValuesBuffer");
02542 
02543     /* compile kernel */
02544     if (bytes_per_pixel == 4) {
02545     rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus );
02546     CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix");
02547     } else {
02548         rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus );
02549         CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
02550     }
02551 
02552     /* set kernel arguments */
02553     clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer );
02554     CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
02555     cl_uint numPixels = width*height;
02556     clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(int), (void *)&height );
02557     CHECK_OPENCL( clStatus, "clSetKernelArg height" );
02558     clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(int), (void *)&width );
02559     CHECK_OPENCL( clStatus, "clSetKernelArg width" );
02560     clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(int), (void *)&wpl );
02561     CHECK_OPENCL( clStatus, "clSetKernelArg wpl" );
02562     clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&thresholdsBuffer );
02563     CHECK_OPENCL( clStatus, "clSetKernelArg thresholdsBuffer" );
02564     clStatus = clSetKernelArg( rEnv.mpkKernel, 5, sizeof(cl_mem), (void *)&hiValuesBuffer );
02565     CHECK_OPENCL( clStatus, "clSetKernelArg hiValuesBuffer" );
02566     clStatus = clSetKernelArg( rEnv.mpkKernel, 6, sizeof(cl_mem), (void *)&pixThBuffer );
02567     CHECK_OPENCL( clStatus, "clSetKernelArg pixThBuffer");
02568 
02569     /* launch kernel & wait */
02570 PERF_COUNT_SUB("before")
02571     clStatus = clEnqueueNDRangeKernel(
02572         rEnv.mpkCmdQueue,
02573         rEnv.mpkKernel,
02574         1, NULL, global_work_size, local_work_size,
02575         0, NULL, NULL );
02576     CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix" );
02577     clFinish( rEnv.mpkCmdQueue );
02578 PERF_COUNT_SUB("kernel")
02579     
02580     /* map results back from gpu */
02581     void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0, NULL, NULL, &clStatus);
02582     CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
02583     clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, NULL, NULL);
02584     
02585     clReleaseMemObject(imageBuffer);
02586     clReleaseMemObject(thresholdsBuffer);
02587     clReleaseMemObject(hiValuesBuffer);
02588 
02589 PERF_COUNT_SUB("after")
02590 PERF_COUNT_END
02591 }
02592 
02593 
02594 #if USE_DEVICE_SELECTION
02595 
02596 /******************************************************************************
02597  * Data Types for Device Selection
02598  *****************************************************************************/
02599 
02600 typedef struct _TessScoreEvaluationInputData {
02601     int height;
02602     int width;
02603     int numChannels;
02604     unsigned char *imageData;
02605     Pix *pix;
02606 } TessScoreEvaluationInputData;
02607 
02608 void populateTessScoreEvaluationInputData( TessScoreEvaluationInputData *input ) {
02609     srand(1);
02610     // 8.5x11 inches @ 300dpi rounded to clean multiples
02611     int height = 3328; // %256
02612     int width = 2560; // %512
02613     int numChannels = 4;
02614     input->height = height;
02615     input->width = width;
02616     input->numChannels = numChannels;
02617     unsigned char (*imageData4)[4] = (unsigned char (*)[4]) malloc(height*width*numChannels*sizeof(unsigned char)); // new unsigned char[4][height*width];
02618     input->imageData = (unsigned char *) &imageData4[0];
02619     
02620     // zero out image
02621     unsigned char pixelWhite[4] = {  0,   0,   0, 255};
02622     unsigned char pixelBlack[4] = {255, 255, 255, 255};
02623     for (int p = 0; p < height*width; p++) {
02624         //unsigned char tmp[4] = imageData4[0];
02625         imageData4[p][0] = pixelWhite[0];
02626         imageData4[p][1] = pixelWhite[1];
02627         imageData4[p][2] = pixelWhite[2];
02628         imageData4[p][3] = pixelWhite[3];
02629     }
02630     // random lines to be eliminated
02631     int maxLineWidth = 64; // pixels wide
02632     int numLines = 10;
02633     // vertical lines
02634     for (int i = 0; i < numLines; i++) {
02635         int lineWidth = rand()%maxLineWidth;
02636         int vertLinePos = lineWidth + rand()%(width-2*lineWidth);
02637         //printf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
02638         for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
02639             for (int col = 0; col < height; col++) {
02640                 //imageData4[row*width+col] = pixelBlack;
02641                 imageData4[row*width+col][0] = pixelBlack[0];
02642                 imageData4[row*width+col][1] = pixelBlack[1];
02643                 imageData4[row*width+col][2] = pixelBlack[2];
02644                 imageData4[row*width+col][3] = pixelBlack[3];
02645             }
02646         }
02647     }
02648     // horizontal lines
02649     for (int i = 0; i < numLines; i++) {
02650         int lineWidth = rand()%maxLineWidth;
02651         int horLinePos = lineWidth + rand()%(height-2*lineWidth);
02652         //printf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
02653         for (int row = 0; row < width; row++) {
02654             for (int col = horLinePos-lineWidth/2; col < horLinePos+lineWidth/2; col++) { // for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
02655                 //printf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
02656                 //imageData4[row*width+col] = pixelBlack;
02657                 imageData4[row*width+col][0] = pixelBlack[0];
02658                 imageData4[row*width+col][1] = pixelBlack[1];
02659                 imageData4[row*width+col][2] = pixelBlack[2];
02660                 imageData4[row*width+col][3] = pixelBlack[3];
02661             }
02662         }
02663     }
02664     // spots (noise, squares)
02665     float fractionBlack = 0.1; // how much of the image should be blackened
02666     int numSpots = (height*width)*fractionBlack/(maxLineWidth*maxLineWidth/2/2);
02667     for (int i = 0; i < numSpots; i++) {
02668         
02669         int lineWidth = rand()%maxLineWidth;
02670         int col = lineWidth + rand()%(width-2*lineWidth);
02671         int row = lineWidth + rand()%(height-2*lineWidth);
02672         //printf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
02673         for (int r = row-lineWidth/2; r < row+lineWidth/2; r++) {
02674             for (int c = col-lineWidth/2; c < col+lineWidth/2; c++) {
02675                 //printf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
02676                 //imageData4[row*width+col] = pixelBlack;
02677                 imageData4[r*width+c][0] = pixelBlack[0];
02678                 imageData4[r*width+c][1] = pixelBlack[1];
02679                 imageData4[r*width+c][2] = pixelBlack[2];
02680                 imageData4[r*width+c][3] = pixelBlack[3];
02681             }
02682         }
02683     }
02684 
02685     input->pix = pixCreate(input->width, input->height, 1);
02686 }
02687 
// Per-device benchmark result; stored behind ds_device::score and written
// to / read from the profile file as raw bytes.
typedef struct _TessDeviceScore {
    float time;     // weighted benchmark time; smaller means a faster device
    bool clError;   // were there any opencl errors
    bool valid;     // was the correct response generated
} TessDeviceScore;
02693 
02694 /******************************************************************************
02695  * Micro Benchmarks for Device Selection
02696  *****************************************************************************/
02697 
02698 double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
02699     
02700     double time = 0;
02701 #if ON_WINDOWS
02702     LARGE_INTEGER freq, time_funct_start, time_funct_end;
02703     QueryPerformanceFrequency(&freq);
02704 #else
02705     timespec time_funct_start, time_funct_end;
02706 #endif
02707     // input data
02708     l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload
02709 
02710     // function call
02711     if (type == DS_DEVICE_OPENCL_DEVICE) {
02712 #if ON_WINDOWS
02713         QueryPerformanceCounter(&time_funct_start);
02714 #else
02715         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
02716 #endif
02717 
02718         OpenclDevice::gpuEnv = *env;
02719         int wpl = pixGetWpl(input.pix);
02720         OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl, NULL);
02721 #if ON_WINDOWS
02722         QueryPerformanceCounter(&time_funct_end);
02723         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
02724 #else
02725         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
02726         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
02727 #endif
02728 
02729     } else {
02730 #if ON_WINDOWS
02731         QueryPerformanceCounter(&time_funct_start);
02732 #else
02733         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
02734 #endif
02735         Pix *pix = pixCreate(input.width, input.height, 32);
02736         l_uint32 *pixData = pixGetData(pix);
02737         int wpl = pixGetWpl(pix);
02738         //l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
02739         //pixSetData(pix, output_gpu);
02740         int i, j;
02741         int idx = 0;
02742         for (i = 0; i < input.height ; i++) {
02743             for (j = 0; j < input.width; j++) {
02744                 
02745                 l_uint32 tiffword = tiffdata[i * input.width + j];
02746                 l_int32 rval = ((tiffword) & 0xff);
02747                 l_int32 gval = (((tiffword) >> 8) & 0xff);
02748                 l_int32 bval = (((tiffword) >> 16) & 0xff);
02749                 l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
02750                 pixData[idx] = value;
02751                 idx++;
02752             }
02753         }
02754 #if ON_WINDOWS
02755         QueryPerformanceCounter(&time_funct_end);
02756         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
02757 #else
02758         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
02759         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
02760 #endif
02761         pixDestroy(&pix);
02762     }
02763 
02764 
02765     // cleanup
02766 
02767     return time;
02768 }
02769 
02770 double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
02771     
02772     double time;
02773 #if ON_WINDOWS
02774     LARGE_INTEGER freq, time_funct_start, time_funct_end;
02775     QueryPerformanceFrequency(&freq);
02776 #else
02777     timespec time_funct_start, time_funct_end;
02778 #endif
02779     
02780     unsigned char pixelHi = (unsigned char)255;
02781     
02782     int left = 0;
02783     int top = 0;
02784     int kHistogramSize = 256;
02785     int bytes_per_line = input.width*input.numChannels;
02786     int *histogramAllChannels = new int[kHistogramSize*input.numChannels];
02787 
02788     // function call
02789     if (type == DS_DEVICE_OPENCL_DEVICE) {
02790 #if ON_WINDOWS
02791         QueryPerformanceCounter(&time_funct_start);
02792 #else
02793         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
02794 #endif
02795 
02796         OpenclDevice::gpuEnv = *env;
02797         int wpl = pixGetWpl(input.pix);
02798         OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line, top, left, input.width, input.height, kHistogramSize, histogramAllChannels);
02799 
02800 #if ON_WINDOWS
02801         QueryPerformanceCounter(&time_funct_end);
02802         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
02803 #else
02804         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
02805         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
02806 #endif
02807     } else {
02808     
02809         int *histogram = new int[kHistogramSize];
02810 #if ON_WINDOWS
02811         QueryPerformanceCounter(&time_funct_start);
02812 #else
02813         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
02814 #endif
02815         for (int ch = 0; ch < input.numChannels; ++ch) { 
02816             tesseract::HistogramRect(input.pix, input.numChannels,
02817                   left, top, input.width, input.height, histogram);
02818         }
02819 #if ON_WINDOWS
02820         QueryPerformanceCounter(&time_funct_end);
02821         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
02822 #else
02823         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
02824         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
02825 #endif
02826         delete[] histogram;
02827     }
02828 
02829     // cleanup
02830     //delete[] imageData;
02831     delete[] histogramAllChannels;
02832     return time;
02833 }
02834 
02835 //Reproducing the ThresholdRectToPix native version
02836 void ThresholdRectToPix_Native(const unsigned char* imagedata,
02837                                           int bytes_per_pixel,
02838                                           int bytes_per_line,
02839                                           const int* thresholds,
02840                                           const int* hi_values,
02841                                           Pix** pix) {
02842     int top = 0;
02843     int left = 0;
02844     int width = pixGetWidth(*pix);
02845     int height = pixGetHeight(*pix);
02846 
02847   *pix = pixCreate(width, height, 1);
02848   uinT32* pixdata = pixGetData(*pix);
02849   int wpl = pixGetWpl(*pix);
02850   const unsigned char* srcdata = imagedata + top * bytes_per_line +
02851                                  left * bytes_per_pixel;
02852   for (int y = 0; y < height; ++y) {
02853     const uinT8* linedata = srcdata;
02854     uinT32* pixline = pixdata + y * wpl;
02855     for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
02856       bool white_result = true;
02857       for (int ch = 0; ch < bytes_per_pixel; ++ch) {
02858         if (hi_values[ch] >= 0 &&
02859             (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
02860           white_result = false;
02861           break;
02862         }
02863       }
02864       if (white_result)
02865         CLEAR_DATA_BIT(pixline, x);
02866       else
02867         SET_DATA_BIT(pixline, x);
02868     }
02869     srcdata += bytes_per_line;
02870   }
02871 }
02872 
02873 double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
02874     
02875     double time;
02876 #if ON_WINDOWS
02877     LARGE_INTEGER freq, time_funct_start, time_funct_end;
02878     QueryPerformanceFrequency(&freq);
02879 #else
02880     timespec time_funct_start, time_funct_end;
02881 #endif
02882     
02883     // input data
02884     unsigned char pixelHi = (unsigned char)255;
02885     int* thresholds = new int[4];
02886     thresholds[0] = pixelHi/2;
02887     thresholds[1] = pixelHi/2;
02888     thresholds[2] = pixelHi/2;
02889     thresholds[3] = pixelHi/2;
02890     int *hi_values = new int[4];
02891     thresholds[0] = pixelHi;
02892     thresholds[1] = pixelHi;
02893     thresholds[2] = pixelHi;
02894     thresholds[3] = pixelHi;
02895     //Pix* pix = pixCreate(width, height, 1);
02896     int top = 0;
02897     int left = 0;
02898     int bytes_per_line = input.width*input.numChannels;
02899 
02900     // function call
02901     if (type == DS_DEVICE_OPENCL_DEVICE) {
02902 #if ON_WINDOWS
02903         QueryPerformanceCounter(&time_funct_start);
02904 #else
02905         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
02906 #endif
02907 
02908         OpenclDevice::gpuEnv = *env;
02909         int wpl = pixGetWpl(input.pix);
02910         OpenclDevice::ThresholdRectToPixOCL(input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix, input.height, input.width, top, left);
02911 
02912 #if ON_WINDOWS
02913         QueryPerformanceCounter(&time_funct_end);
02914         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
02915 #else
02916         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
02917         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
02918 #endif
02919     } else {
02920 
02921 
02922         tesseract::ImageThresholder thresholder;
02923         thresholder.SetImage( input.pix );
02924 #if ON_WINDOWS
02925         QueryPerformanceCounter(&time_funct_start);
02926 #else
02927         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
02928 #endif
02929         ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line,
02930             thresholds, hi_values, &input.pix );
02931 
02932 #if ON_WINDOWS
02933         QueryPerformanceCounter(&time_funct_end);
02934         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
02935 #else
02936         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
02937         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
02938 #endif
02939     }
02940 
02941     // cleanup
02942     delete[] thresholds;
02943     delete[] hi_values;
02944     return time;
02945 }
02946 
02947 double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
02948 
02949     double time = 0;
02950 #if ON_WINDOWS
02951     LARGE_INTEGER freq, time_funct_start, time_funct_end;
02952     QueryPerformanceFrequency(&freq);
02953 #else
02954     timespec time_funct_start, time_funct_end;
02955 #endif
02956 
02957     // input data
02958     int resolution = 300;
02959     int wpl = pixGetWpl(input.pix);
02960     int kThinLineFraction = 20; // tess constant
02961     int kMinLineLengthFraction = 4; // tess constant
02962     int max_line_width = resolution / kThinLineFraction;
02963     int min_line_length = resolution / kMinLineLengthFraction;
02964     int closing_brick = max_line_width / 3;
02965    
02966     // function call
02967     if (type == DS_DEVICE_OPENCL_DEVICE) {
02968 #if ON_WINDOWS
02969         QueryPerformanceCounter(&time_funct_start);
02970 #else
02971         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
02972 #endif
02973         Pix *src_pix = input.pix;
02974         OpenclDevice::gpuEnv = *env;
02975         OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
02976         Pix *pix_vline = NULL, *pix_hline = NULL, *pix_closed = NULL;
02977         OpenclDevice::pixGetLinesCL(NULL, input.pix, &pix_vline, &pix_hline, &pix_closed, true, closing_brick, closing_brick, max_line_width, max_line_width, min_line_length, min_line_length);
02978 
02979         OpenclDevice::releaseMorphCLBuffers();
02980 
02981 #if ON_WINDOWS
02982         QueryPerformanceCounter(&time_funct_end);
02983         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
02984 #else
02985         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
02986         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
02987 #endif
02988     } else {
02989 #if ON_WINDOWS
02990         QueryPerformanceCounter(&time_funct_start);
02991 #else
02992         clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
02993 #endif
02994 
02995         // native serial code
02996         Pix *src_pix = input.pix;
02997         Pix *pix_closed = pixCloseBrick(NULL, src_pix, closing_brick, closing_brick);
02998         Pix *pix_solid  = pixOpenBrick(NULL, pix_closed, max_line_width, max_line_width);
02999         Pix *pix_hollow = pixSubtract(NULL, pix_closed, pix_solid);
03000         pixDestroy(&pix_solid);
03001         Pix *pix_vline = pixOpenBrick(NULL, pix_hollow, 1, min_line_length);
03002         Pix *pix_hline = pixOpenBrick(NULL, pix_hollow, min_line_length, 1);
03003         pixDestroy(&pix_hollow);
03004 
03005 #if ON_WINDOWS
03006         QueryPerformanceCounter(&time_funct_end);
03007         time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
03008 #else
03009         clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
03010         time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
03011 #endif
03012     }
03013 
03014     return time;
03015 }
03016 
03017 
03018 
03019 /******************************************************************************
03020  * Device Selection
03021  *****************************************************************************/
03022 
03023 #include "stdlib.h"
03024 
03025  
03026 // encode score object as byte string
03027 ds_status serializeScore( ds_device* device, void **serializedScore, unsigned int* serializedScoreSize ) {
03028     *serializedScoreSize = sizeof(TessDeviceScore);
03029     *serializedScore = (void *) new unsigned char[*serializedScoreSize];
03030     memcpy(*serializedScore, device->score, *serializedScoreSize);
03031     return DS_SUCCESS;
03032 }
03033 
03034 // parses byte string and stores in score object
03035 ds_status deserializeScore( ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize ) {
03036     // check that serializedScoreSize == sizeof(TessDeviceScore);
03037     device->score = new TessDeviceScore;
03038     memcpy(device->score, serializedScore, serializedScoreSize);
03039     return DS_SUCCESS;
03040 }
03041 
03042 
03043 
03044 // evaluate devices
03045 ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {
03046     
03047     // overwrite statuc gpuEnv w/ current device
03048     // so native opencl calls can be used; they use static gpuEnv
03049     printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
03050     GPUEnv *env = NULL;
03051     if (device->type == DS_DEVICE_OPENCL_DEVICE) {
03052         env = new GPUEnv;
03053         //printf("[DS] populating tmp GPUEnv from device\n");
03054         populateGPUEnvFromDevice( env, device->oclDeviceID);
03055         env->mnFileCount = 0; //argc;
03056         env->mnKernelCount = 0UL;
03057         //printf("[DS] compiling kernels for tmp GPUEnv\n");
03058         OpenclDevice::gpuEnv = *env;
03059         OpenclDevice::CompileKernelFile(env, "");
03060     }
03061     
03062 
03063     TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData;
03064     
03065     // pixReadTiff
03066     double composeRGBPixelTime = composeRGBPixelMicroBench( env, *input, device->type );
03067 
03068     // HistogramRect
03069     double histogramRectTime = histogramRectMicroBench( env, *input, device->type );
03070 
03071     // ThresholdRectToPix
03072     double thresholdRectToPixTime = thresholdRectToPixMicroBench( env, *input, device->type );
03073 
03074     // getLineMasks
03075     double getLineMasksMorphTime = getLineMasksMorphMicroBench( env, *input, device->type );
03076 
03077 
03078     // weigh times (% of cpu time)
03079     // these weights should be the % execution time that the native cpu code took
03080     float composeRGBPixelWeight     = 1.2f;
03081     float histogramRectWeight       = 2.4f;
03082     float thresholdRectToPixWeight  = 4.5f;
03083     float getLineMasksMorphWeight        = 5.0f;
03084     
03085     float weightedTime = 
03086         composeRGBPixelWeight       * composeRGBPixelTime +
03087         histogramRectWeight         * histogramRectTime +
03088         thresholdRectToPixWeight    * thresholdRectToPixTime +
03089         getLineMasksMorphWeight     * getLineMasksMorphTime
03090         ;
03091     device->score = (void *)new TessDeviceScore;
03092     ((TessDeviceScore *)device->score)->time = weightedTime;
03093     
03094     printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
03095     printf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight );
03096     printf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight );
03097     printf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime, thresholdRectToPixWeight );
03098     printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime, getLineMasksMorphWeight );
03099     printf("[DS]%25s: %f\n", "Score", ((TessDeviceScore *)device->score)->time );
03100     return DS_SUCCESS;
03101 }
03102 
03103 // initial call to select device
03104 ds_device OpenclDevice::getDeviceSelection( ) {
03105 //PERF_COUNT_START("getDeviceSelection")
03106     if (!deviceIsSelected) {
03107 PERF_COUNT_START("getDeviceSelection")
03108         // check if opencl is available at runtime
03109         if( 1 == LoadOpencl() ) {
03110             // opencl is available
03111 //PERF_COUNT_SUB("LoadOpencl")
03112     // setup devices
03113     ds_status status;
03114     ds_profile *profile;
03115     status = initDSProfile( &profile, "v0.1" );
03116 PERF_COUNT_SUB("initDSProfile")
03117     // try reading scores from file
03118     char *fileName = "tesseract_opencl_profile_devices.dat";
03119     status = readProfileFromFile( profile, deserializeScore, fileName);
03120     if (status != DS_SUCCESS) {
03121         // need to run evaluation
03122                 printf("[DS] Profile file not available (%s); performing profiling.\n", fileName);
03123 
03124         // create input data
03125         TessScoreEvaluationInputData input;
03126                 populateTessScoreEvaluationInputData( &input );
03127 //PERF_COUNT_SUB("populateTessScoreEvaluationInputData")
03128         // perform evaluations
03129         unsigned int numUpdates;
03130         status =  profileDevices( profile, DS_EVALUATE_ALL, evaluateScoreForDevice, (void *)&input, &numUpdates );
03131 PERF_COUNT_SUB("profileDevices")
03132         // write scores to file
03133         if ( status == DS_SUCCESS ) {
03134             status = writeProfileToFile( profile, serializeScore, fileName);
03135 PERF_COUNT_SUB("writeProfileToFile")
03136             if ( status == DS_SUCCESS ) {
03137                         printf("[DS] Scores written to file (%s).\n", fileName);
03138             } else {
03139                         printf("[DS] Error saving scores to file (%s); scores not written to file.\n", fileName);
03140             }
03141         } else {
03142                     printf("[DS] Unable to evaluate performance; scores not written to file.\n");
03143         }
03144 
03145     } else {
03146 
03147 PERF_COUNT_SUB("readProfileFromFile")
03148                 printf("[DS] Profile read from file (%s).\n", fileName);
03149     }
03150 
03151     // we now have device scores either from file or evaluation
03152     // select fastest using custom Tesseract selection algorithm
03153     float bestTime = FLT_MAX; // begin search with worst possible time
03154     int bestDeviceIdx = -1;
03155     for (int d = 0; d < profile->numDevices; d++) {
03156         //((TessDeviceScore *)device->score)->time
03157         ds_device device = profile->devices[d];
03158         TessDeviceScore score = *(TessDeviceScore *)device.score;
03159         
03160         float time = score.time;
03161                 printf("[DS] Device[%i] %i:%s score is %f\n", d+1, device.type, device.oclDeviceName, time);
03162         if (time < bestTime) {
03163                     bestTime = time;
03164             bestDeviceIdx = d;
03165         }
03166     }
03167             printf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
03168     // cleanup
03169             // TODO: call destructor for profile object?
03170 
03171             bool overrided = false;
03172             char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
03173             if (overrideDeviceStr != NULL) {
03174                 int overrideDeviceIdx = atoi(overrideDeviceStr);
03175                 if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices ) {
03176                     printf("[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, %i)\n", overrideDeviceStr, overrideDeviceIdx);
03177                     bestDeviceIdx = overrideDeviceIdx - 1;
03178                     overrided = true;
03179                 } else {
03180                     printf("[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are valid devices).\n", overrideDeviceStr, profile->numDevices);
03181                 }
03182 }
03183 
03184             if (overrided) {
03185                 printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native");
03186             }
03187             selectedDevice = profile->devices[bestDeviceIdx];
03188 
03189         } else {
03190             // opencl isn't available at runtime, select native cpu device
03191             printf("[DS] OpenCL runtime not available.\n");
03192             selectedDevice.type = DS_DEVICE_NATIVE_CPU;
03193             selectedDevice.oclDeviceName = "(null)";
03194             selectedDevice.score = NULL;
03195             selectedDevice.oclDeviceID = NULL;
03196             selectedDevice.oclDriverVersion = NULL;
03197         }
03198         deviceIsSelected = true;
03199 PERF_COUNT_SUB("select from Profile")
03200 PERF_COUNT_END
03201     }
03202 //PERF_COUNT_END
03203     return selectedDevice;
03204 }
03205 
03206 #endif
03207 
03208 bool OpenclDevice::selectedDeviceIsOpenCL() {
03209 #if USE_DEVICE_SELECTION
03210     ds_device device = getDeviceSelection();
03211     return (device.type == DS_DEVICE_OPENCL_DEVICE);
03212 #else
03213     return true;
03214 #endif
03215 }
03216 
03217 bool OpenclDevice::selectedDeviceIsNativeCPU() {
03218 #if USE_DEVICE_SELECTION
03219     ds_device device = getDeviceSelection();
03220     return (device.type == DS_DEVICE_NATIVE_CPU);
03221 #else
03222     return false;
03223 #endif
03224 }
03225 
03226 
03227 
03228 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines