tesseract
3.03
|
00001 #ifdef _WIN32 00002 #include <Windows.h> 00003 #include <io.h> 00004 #else 00005 #include <sys/types.h> 00006 #include <unistd.h> 00007 #endif 00008 #include <float.h> 00009 00010 #include "openclwrapper.h" 00011 #include "oclkernels.h" 00012 00013 // for micro-benchmark 00014 #include "otsuthr.h" 00015 #include "thresholder.h" 00016 00017 #ifdef USE_OPENCL 00018 00019 #include "opencl_device_selection.h" 00020 #ifdef _MSC_VER 00021 int LeptMsgSeverity = 3; // L_SEVERITY_INFO 00022 #endif // _MSC_VER 00023 GPUEnv OpenclDevice::gpuEnv; 00024 00025 #if USE_DEVICE_SELECTION 00026 bool OpenclDevice::deviceIsSelected = false; 00027 ds_device OpenclDevice::selectedDevice; 00028 #endif 00029 00030 int OpenclDevice::isInited =0; 00031 00032 struct tiff_transform { 00033 int vflip; /* if non-zero, image needs a vertical fip */ 00034 int hflip; /* if non-zero, image needs a horizontal flip */ 00035 int rotate; /* -1 -> counterclockwise 90-degree rotation, 00036 0 -> no rotation 00037 1 -> clockwise 90-degree rotation */ 00038 }; 00039 00040 static struct tiff_transform tiff_orientation_transforms[] = { 00041 {0, 0, 0}, 00042 {0, 1, 0}, 00043 {1, 1, 0}, 00044 {1, 0, 0}, 00045 {0, 1, -1}, 00046 {0, 0, 1}, 00047 {0, 1, 1}, 00048 {0, 0, -1} 00049 }; 00050 00051 static const l_int32 MAX_PAGES_IN_TIFF_FILE = 3000; 00052 00053 cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate; //Morph operations buffers 00054 cl_mem pixThBuffer; //output from thresholdtopix calculation 00055 cl_int clStatus; 00056 KernelEnv rEnv; 00057 00058 // substitute invalid characters in device name with _ 00059 void legalizeFileName( char *fileName) { 00060 //printf("fileName: %s\n", fileName); 00061 char *invalidChars = "/\?:*\"><| "; // space is valid but can cause headaches 00062 // for each invalid char 00063 for (int i = 0; i < strlen(invalidChars); i++) { 00064 char invalidStr[4]; 00065 invalidStr[0] = invalidChars[i]; 00066 invalidStr[1] = NULL; 00067 //printf("eliminating %s\n", invalidStr); 00068 //char *pos = strstr(fileName, invalidStr); 00069 // initial ./ is valid for present directory 00070 //if (*pos == '.') pos++; 00071 //if (*pos == '/') pos++; 00072 for ( char *pos = strstr(fileName, invalidStr); pos != NULL; pos = strstr(pos+1, invalidStr)) { 00073 //printf("\tfound: %s, ", pos); 00074 pos[0] = '_'; 00075 //printf("fileName: %s\n", fileName); 00076 } 00077 } 00078 } 00079 00080 void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) { 00081 //printf("[DS] populateGPUEnvFromDevice\n"); 00082 size_t size; 00083 gpuInfo->mnIsUserCreated = 1; 00084 // device 00085 gpuInfo->mpDevID = device; 00086 gpuInfo->mpArryDevsID = new cl_device_id[1]; 00087 gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID; 00088 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE , sizeof(cl_device_type), (void *) &gpuInfo->mDevType , &size); 00089 CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(TYPE)"); 00090 // platform 00091 clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM , sizeof(cl_platform_id), (void *) &gpuInfo->mpPlatformID , &size); 00092 CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)"); 00093 // context 00094 cl_context_properties props[3]; 00095 props[0] = CL_CONTEXT_PLATFORM; 00096 props[1] = (cl_context_properties) gpuInfo->mpPlatformID; 00097 props[2] = 0; 00098 gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, NULL, NULL, &clStatus); 00099 CHECK_OPENCL( clStatus, "populateGPUEnv::createContext"); 00100 // queue 00101 cl_command_queue_properties queueProperties = 0; 00102 gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus ); 00103 CHECK_OPENCL( clStatus, "populateGPUEnv::createCommandQueue"); 00104 00105 } 00106 00107 int OpenclDevice::LoadOpencl() 00108 { 00109 #ifdef WIN32 00110 HINSTANCE HOpenclDll = NULL; 00111 void * OpenclDll = NULL; 00112 //fprintf(stderr, " LoadOpenclDllxx... \n"); 00113 OpenclDll = static_cast<HINSTANCE>( HOpenclDll ); 00114 OpenclDll = LoadLibrary( "openCL.dll" ); 00115 if ( !static_cast<HINSTANCE>( OpenclDll ) ) 00116 { 00117 fprintf(stderr, "[OD] Load opencl.dll failed!\n"); 00118 FreeLibrary( static_cast<HINSTANCE>( OpenclDll ) ); 00119 return 0; 00120 00121 } 00122 fprintf(stderr, "[OD] Load opencl.dll successful!\n"); 00123 #endif 00124 return 1; 00125 } 00126 int OpenclDevice::SetKernelEnv( KernelEnv *envInfo ) 00127 { 00128 envInfo->mpkContext = gpuEnv.mpContext; 00129 envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue; 00130 envInfo->mpkProgram = gpuEnv.mpArryPrograms[0]; 00131 00132 return 1; 00133 } 00134 00135 cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer, size_t nElements, cl_mem_flags flags, cl_int *pStatus) 00136 { 00137 cl_mem membuffer = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (flags), 00138 nElements * sizeof(l_uint32), hostbuffer, pStatus); 00139 00140 return membuffer; 00141 } 00142 00143 PIX* mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX* pixd, PIX* pixs, int elements, cl_mem_flags flags, bool memcopy = false, bool sync = true) 00144 { 00145 PROCNAME("mapOutputCLBuffer"); 00146 if (!pixd) 00147 { 00148 if (memcopy) 00149 { 00150 if ((pixd = pixCreateTemplate(pixs)) == NULL) 00151 (PIX *)ERROR_PTR("pixd not made", procName, NULL); 00152 } 00153 else 00154 { 00155 if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) == NULL) 00156 (PIX *)ERROR_PTR("pixd not made", procName, NULL); 00157 } 00158 } 00159 l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0, 00160 elements * sizeof(l_uint32), 0, NULL, NULL, NULL ); 00161 00162 if (memcopy) 00163 { 00164 memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32)); 00165 } 00166 else 00167 { 00168 pixSetData(pixd, pValues); 00169 } 00170 00171 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,clbuffer,pValues,0,NULL,NULL); 00172 00173 if (sync) 00174 { 00175 clFinish( rEnv.mpkCmdQueue ); 00176 } 00177 00178 return pixd; 00179 } 00180 00181 cl_mem allocateIntBuffer( KernelEnv rEnv, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus , bool sync = false) 00182 { 00183 cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), 00184 nElements * sizeof(l_int32), NULL, pStatus); 00185 00186 if (_pValues != NULL) 00187 { 00188 l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0, 00189 nElements * sizeof(l_int32), 0, NULL, NULL, NULL ); 00190 00191 memcpy(pValues, _pValues, nElements * sizeof(l_int32)); 00192 00193 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL); 00194 00195 if (sync) 00196 clFinish( rEnv.mpkCmdQueue ); 00197 } 00198 00199 return xValues; 00200 } 00201 00202 int OpenclDevice::InitOpenclRunEnv( GPUEnv *gpuInfo ) 00203 { 00204 size_t length; 00205 cl_int clStatus; 00206 cl_uint numPlatforms, numDevices; 00207 cl_platform_id *platforms; 00208 cl_context_properties cps[3]; 00209 char platformName[256]; 00210 unsigned int i; 00211 00212 00213 // Have a look at the available platforms. 00214 00215 if ( !gpuInfo->mnIsUserCreated ) 00216 { 00217 clStatus = clGetPlatformIDs( 0, NULL, &numPlatforms ); 00218 if ( clStatus != CL_SUCCESS ) 00219 { 00220 return 1; 00221 } 00222 gpuInfo->mpPlatformID = NULL; 00223 00224 if ( 0 < numPlatforms ) 00225 { 00226 platforms = (cl_platform_id*) malloc( numPlatforms * sizeof( cl_platform_id ) ); 00227 if ( platforms == (cl_platform_id*) NULL ) 00228 { 00229 return 1; 00230 } 00231 clStatus = clGetPlatformIDs( numPlatforms, platforms, NULL ); 00232 00233 if ( clStatus != CL_SUCCESS ) 00234 { 00235 return 1; 00236 } 00237 00238 for ( i = 0; i < numPlatforms; i++ ) 00239 { 00240 clStatus = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, 00241 sizeof( platformName ), platformName, NULL ); 00242 00243 if ( clStatus != CL_SUCCESS ) 00244 { 00245 return 1; 00246 } 00247 gpuInfo->mpPlatformID = platforms[i]; 00248 00249 //if (!strcmp(platformName, "Intel(R) Coporation")) 00250 //if( !strcmp( platformName, "Advanced Micro Devices, Inc." )) 00251 { 00252 gpuInfo->mpPlatformID = platforms[i]; 00253 00254 if ( getenv("SC_OPENCLCPU") ) 00255 { 00256 clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform 00257 CL_DEVICE_TYPE_CPU, // device_type for CPU device 00258 0, // num_entries 00259 NULL, // devices 00260 &numDevices); 00261 printf("Selecting OpenCL device: CPU (a)\n"); 00262 } 00263 else 00264 { 00265 clStatus = clGetDeviceIDs(gpuInfo->mpPlatformID, // platform 00266 CL_DEVICE_TYPE_GPU, // device_type for GPU device 00267 0, // num_entries 00268 NULL, // devices 00269 &numDevices); 00270 printf("Selecting OpenCL device: GPU (a)\n"); 00271 } 00272 if ( clStatus != CL_SUCCESS ) 00273 continue; 00274 00275 if ( numDevices ) 00276 break; 00277 } 00278 } 00279 if ( clStatus != CL_SUCCESS ) 00280 return 1; 00281 free( platforms ); 00282 } 00283 if ( NULL == gpuInfo->mpPlatformID ) 00284 return 1; 00285 00286 // Use available platform. 00287 cps[0] = CL_CONTEXT_PLATFORM; 00288 cps[1] = (cl_context_properties) gpuInfo->mpPlatformID; 00289 cps[2] = 0; 00290 // Set device type for OpenCL 00291 00292 if ( getenv("SC_OPENCLCPU") ) 00293 { 00294 gpuInfo->mDevType = CL_DEVICE_TYPE_CPU; 00295 printf("Selecting OpenCL device: CPU (b)\n"); 00296 } 00297 else 00298 { 00299 gpuInfo->mDevType = CL_DEVICE_TYPE_GPU; 00300 printf("Selecting OpenCL device: GPU (b)\n"); 00301 } 00302 00303 gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus ); 00304 00305 if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) ) 00306 { 00307 gpuInfo->mDevType = CL_DEVICE_TYPE_CPU; 00308 gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus ); 00309 printf("Selecting OpenCL device: CPU (c)\n"); 00310 } 00311 if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) ) 00312 { 00313 gpuInfo->mDevType = CL_DEVICE_TYPE_DEFAULT; 00314 gpuInfo->mpContext = clCreateContextFromType( cps, gpuInfo->mDevType, NULL, NULL, &clStatus ); 00315 printf("Selecting OpenCL device: DEFAULT (c)\n"); 00316 } 00317 if ( ( gpuInfo->mpContext == (cl_context) NULL) || ( clStatus != CL_SUCCESS ) ) 00318 return 1; 00319 // Detect OpenCL devices. 00320 // First, get the size of device list data 00321 clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, 0, NULL, &length ); 00322 if ( ( clStatus != CL_SUCCESS ) || ( length == 0 ) ) 00323 return 1; 00324 // Now allocate memory for device list based on the size we got earlier 00325 gpuInfo->mpArryDevsID = (cl_device_id*) malloc( length ); 00326 if ( gpuInfo->mpArryDevsID == (cl_device_id*) NULL ) 00327 return 1; 00328 // Now, get the device list data 00329 clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, length, 00330 gpuInfo->mpArryDevsID, NULL ); 00331 if ( clStatus != CL_SUCCESS ) 00332 return 1; 00333 00334 // Create OpenCL command queue. 00335 gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpArryDevsID[0], 0, &clStatus ); 00336 00337 if ( clStatus != CL_SUCCESS ) 00338 return 1; 00339 } 00340 00341 clStatus = clGetCommandQueueInfo( gpuInfo->mpCmdQueue, CL_QUEUE_THREAD_HANDLE_AMD, 0, NULL, NULL ); 00342 // Check device extensions for double type 00343 size_t aDevExtInfoSize = 0; 00344 00345 clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS, 0, NULL, &aDevExtInfoSize ); 00346 CHECK_OPENCL( clStatus, "clGetDeviceInfo" ); 00347 00348 char *aExtInfo = new char[aDevExtInfoSize]; 00349 00350 clStatus = clGetDeviceInfo( gpuInfo->mpArryDevsID[0], CL_DEVICE_EXTENSIONS, 00351 sizeof(char) * aDevExtInfoSize, aExtInfo, NULL); 00352 CHECK_OPENCL( clStatus, "clGetDeviceInfo" ); 00353 00354 gpuInfo->mnKhrFp64Flag = 0; 00355 gpuInfo->mnAmdFp64Flag = 0; 00356 00357 if ( strstr( aExtInfo, "cl_khr_fp64" ) ) 00358 { 00359 gpuInfo->mnKhrFp64Flag = 1; 00360 } 00361 else 00362 { 00363 // Check if cl_amd_fp64 extension is supported 00364 if ( strstr( aExtInfo, "cl_amd_fp64" ) ) 00365 gpuInfo->mnAmdFp64Flag = 1; 00366 } 00367 delete []aExtInfo; 00368 00369 return 0; 00370 } 00371 00372 void OpenclDevice::releaseMorphCLBuffers() 00373 { 00374 if (pixdCLIntermediate != NULL) 00375 clReleaseMemObject(pixdCLIntermediate); 00376 if (pixsCLBuffer != NULL) 00377 clReleaseMemObject(pixsCLBuffer); 00378 if (pixdCLBuffer != NULL) 00379 clReleaseMemObject(pixdCLBuffer); 00380 if (pixThBuffer != NULL) 00381 clReleaseMemObject(pixThBuffer); 00382 } 00383 00384 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs) 00385 { 00386 SetKernelEnv( &rEnv ); 00387 00388 if (pixThBuffer != NULL) 00389 { 00390 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus); 00391 00392 //Get the output from ThresholdToPix operation 00393 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0, sizeof(l_uint32) * wpl*h, 0, NULL, NULL); 00394 } 00395 else 00396 { 00397 //Get data from the source image 00398 l_uint32* srcdata = (l_uint32*) malloc(wpl*h*sizeof(l_uint32)); 00399 memcpy(srcdata, pixGetData(pixs), wpl*h*sizeof(l_uint32)); 00400 00401 pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus); 00402 } 00403 00404 pixdCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus); 00405 00406 pixdCLIntermediate = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus); 00407 00408 return (int)clStatus; 00409 } 00410 00411 int OpenclDevice::InitEnv() 00412 { 00413 //PERF_COUNT_START("OD::InitEnv") 00414 // printf("[OD] OpenclDevice::InitEnv()\n"); 00415 #ifdef SAL_WIN32 00416 while( 1 ) 00417 { 00418 if( 1 == LoadOpencl() ) 00419 break; 00420 } 00421 PERF_COUNT_SUB("LoadOpencl") 00422 #endif 00423 // sets up environment, compiles programs 00424 00425 00426 #if USE_DEVICE_SELECTION 00427 00428 InitOpenclRunEnv_DeviceSelection( 0 ); 00429 //PERF_COUNT_SUB("called InitOpenclRunEnv_DS") 00430 #else 00431 // init according to device 00432 InitOpenclRunEnv( 0 ); 00433 #endif 00434 //PERF_COUNT_END 00435 return 1; 00436 } 00437 00438 int OpenclDevice::ReleaseOpenclRunEnv() 00439 { 00440 ReleaseOpenclEnv( &gpuEnv ); 00441 #ifdef SAL_WIN32 00442 FreeOpenclDll(); 00443 #endif 00444 return 1; 00445 } 00446 inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName ) 00447 { 00448 if ( kCount < 1 ) 00449 fprintf(stderr,"Error: ( KCount < 1 ) AddKernelConfig\n" ); 00450 strcpy( gpuEnv.mArrykernelNames[kCount-1], kName ); 00451 gpuEnv.mnKernelCount++; 00452 return 0; 00453 } 00454 int OpenclDevice::RegistOpenclKernel() 00455 { 00456 if ( !gpuEnv.mnIsUserCreated ) 00457 memset( &gpuEnv, 0, sizeof(gpuEnv) ); 00458 00459 gpuEnv.mnFileCount = 0; //argc; 00460 gpuEnv.mnKernelCount = 0UL; 00461 00462 AddKernelConfig( 1, (const char*) "oclAverageSub1" ); 00463 return 0; 00464 } 00465 int OpenclDevice::InitOpenclRunEnv( int argc ) 00466 { 00467 int status = 0; 00468 if ( MAX_CLKERNEL_NUM <= 0 ) 00469 { 00470 return 1; 00471 } 00472 if ( ( argc > MAX_CLFILE_NUM ) || ( argc < 0 ) ) 00473 return 1; 00474 00475 if ( !isInited ) 00476 { 00477 RegistOpenclKernel(); 00478 //initialize devices, context, comand_queue 00479 status = InitOpenclRunEnv( &gpuEnv ); 00480 if ( status ) 00481 { 00482 fprintf(stderr,"init_opencl_env failed.\n"); 00483 return 1; 00484 } 00485 fprintf(stderr,"init_opencl_env successed.\n"); 00486 //initialize program, kernelName, kernelCount 00487 if( getenv( "SC_FLOAT" ) ) 00488 { 00489 gpuEnv.mnKhrFp64Flag = 0; 00490 gpuEnv.mnAmdFp64Flag = 0; 00491 } 00492 if( gpuEnv.mnKhrFp64Flag ) 00493 { 00494 fprintf(stderr,"----use khr double type in kernel----\n"); 00495 status = CompileKernelFile( &gpuEnv, "-D KHR_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" ); 00496 } 00497 else if( gpuEnv.mnAmdFp64Flag ) 00498 { 00499 fprintf(stderr,"----use amd double type in kernel----\n"); 00500 status = CompileKernelFile( &gpuEnv, "-D AMD_DP_EXTENSION -Dfp_t=double -Dfp_t4=double4 -Dfp_t16=double16" ); 00501 } 00502 else 00503 { 00504 fprintf(stderr,"----use float type in kernel----\n"); 00505 status = CompileKernelFile( &gpuEnv, "-Dfp_t=float -Dfp_t4=float4 -Dfp_t16=float16" ); 00506 } 00507 if ( status == 0 || gpuEnv.mnKernelCount == 0 ) 00508 { 00509 fprintf(stderr,"CompileKernelFile failed.\n"); 00510 return 1; 00511 } 00512 fprintf(stderr,"CompileKernelFile successed.\n"); 00513 isInited = 1; 00514 } 00515 return 0; 00516 } 00517 00518 int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) { 00519 //PERF_COUNT_START("InitOpenclRunEnv_DS") 00520 #if USE_DEVICE_SELECTION 00521 if (!isInited) { 00522 // after programs compiled, selects best device 00523 //printf("[DS] InitOpenclRunEnv_DS::Calling performDeviceSelection()\n"); 00524 ds_device bestDevice_DS = getDeviceSelection( ); 00525 //PERF_COUNT_SUB("called getDeviceSelection()") 00526 cl_device_id bestDevice = bestDevice_DS.oclDeviceID; 00527 // overwrite global static GPUEnv with new device 00528 if (selectedDeviceIsOpenCL() ) { 00529 //printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() for selected device\n"); 00530 populateGPUEnvFromDevice( &gpuEnv, bestDevice ); 00531 gpuEnv.mnFileCount = 0; //argc; 00532 gpuEnv.mnKernelCount = 0UL; 00533 //PERF_COUNT_SUB("populate gpuEnv") 00534 CompileKernelFile(&gpuEnv, ""); 00535 //PERF_COUNT_SUB("CompileKernelFile") 00536 } else { 00537 //printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() b/c native cpu selected\n"); 00538 } 00539 isInited = 1; 00540 } 00541 #endif 00542 //PERF_COUNT_END 00543 return 0; 00544 } 00545 00546 00547 OpenclDevice::OpenclDevice() 00548 { 00549 //InitEnv(); 00550 } 00551 00552 OpenclDevice::~OpenclDevice() 00553 { 00554 //ReleaseOpenclRunEnv(); 00555 } 00556 00557 int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo ) 00558 { 00559 int i = 0; 00560 int clStatus = 0; 00561 00562 if ( !isInited ) 00563 { 00564 return 1; 00565 } 00566 00567 for ( i = 0; i < gpuEnv.mnFileCount; i++ ) 00568 { 00569 if ( gpuEnv.mpArryPrograms[i] ) 00570 { 00571 clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] ); 00572 CHECK_OPENCL( clStatus, "clReleaseProgram" ); 00573 gpuEnv.mpArryPrograms[i] = NULL; 00574 } 00575 } 00576 if ( gpuEnv.mpCmdQueue ) 00577 { 00578 clReleaseCommandQueue( gpuEnv.mpCmdQueue ); 00579 gpuEnv.mpCmdQueue = NULL; 00580 } 00581 if ( gpuEnv.mpContext ) 00582 { 00583 clReleaseContext( gpuEnv.mpContext ); 00584 gpuEnv.mpContext = NULL; 00585 } 00586 isInited = 0; 00587 gpuInfo->mnIsUserCreated = 0; 00588 free( gpuInfo->mpArryDevsID ); 00589 return 1; 00590 } 00591 int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle ) 00592 { 00593 unsigned int i = 0; 00594 cl_int clStatus; 00595 int status = 0; 00596 char *str = NULL; 00597 FILE *fd = NULL; 00598 cl_uint numDevices=0; 00599 if ( getenv("SC_OPENCLCPU") ) 00600 { 00601 clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform 00602 CL_DEVICE_TYPE_CPU, // device_type for CPU device 00603 0, // num_entries 00604 NULL, // devices ID 00605 &numDevices); 00606 } 00607 else 00608 { 00609 clStatus = clGetDeviceIDs(gpuEnv.mpPlatformID, // platform 00610 CL_DEVICE_TYPE_GPU, // device_type for GPU device 00611 0, // num_entries 00612 NULL, // devices ID 00613 &numDevices); 00614 } 00615 CHECK_OPENCL( clStatus, "clGetDeviceIDs" ); 00616 for ( i = 0; i < numDevices; i++ ) 00617 { 00618 char fileName[256] = { 0 }, cl_name[128] = { 0 }; 00619 if ( gpuEnv.mpArryDevsID[i] != 0 ) 00620 { 00621 char deviceName[1024]; 00622 clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL ); 00623 CHECK_OPENCL( clStatus, "clGetDeviceInfo" ); 00624 str = (char*) strstr( clFileName, (char*) ".cl" ); 00625 memcpy( cl_name, clFileName, str - clFileName ); 00626 cl_name[str - clFileName] = '\0'; 00627 sprintf( fileName, "%s-%s.bin", cl_name, deviceName ); 00628 legalizeFileName(fileName); 00629 fd = fopen( fileName, "rb" ); 00630 status = ( fd != NULL ) ? 1 : 0; 00631 } 00632 } 00633 if ( fd != NULL ) 00634 { 00635 *fhandle = fd; 00636 } 00637 return status; 00638 00639 } 00640 int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName ) 00641 { 00642 int i; 00643 for ( i = 0; i < gpuEnvCached->mnFileCount; i++ ) 00644 { 00645 if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 ) 00646 { 00647 if ( gpuEnvCached->mpArryPrograms[i] != NULL ) 00648 { 00649 return 1; 00650 } 00651 } 00652 } 00653 00654 return 0; 00655 } 00656 int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes ) 00657 { 00658 FILE *output = NULL; 00659 output = fopen( fileName, "wb" ); 00660 if ( output == NULL ) 00661 { 00662 return 0; 00663 } 00664 00665 fwrite( birary, sizeof(char), numBytes, output ); 00666 fclose( output ); 00667 00668 return 1; 00669 00670 } 00671 int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName ) 00672 { 00673 unsigned int i = 0; 00674 cl_int clStatus; 00675 size_t *binarySizes, numDevices; 00676 cl_device_id *mpArryDevsID; 00677 char **binaries, *str = NULL; 00678 00679 clStatus = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES, 00680 sizeof(numDevices), &numDevices, NULL ); 00681 CHECK_OPENCL( clStatus, "clGetProgramInfo" ); 00682 00683 mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices ); 00684 if ( mpArryDevsID == NULL ) 00685 { 00686 return 0; 00687 } 00688 /* grab the handles to all of the devices in the program. */ 00689 clStatus = clGetProgramInfo( program, CL_PROGRAM_DEVICES, 00690 sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL ); 00691 CHECK_OPENCL( clStatus, "clGetProgramInfo" ); 00692 00693 /* figure out the sizes of each of the binaries. */ 00694 binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices ); 00695 00696 clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, 00697 sizeof(size_t) * numDevices, binarySizes, NULL ); 00698 CHECK_OPENCL( clStatus, "clGetProgramInfo" ); 00699 00700 /* copy over all of the generated binaries. */ 00701 binaries = (char**) malloc( sizeof(char *) * numDevices ); 00702 if ( binaries == NULL ) 00703 { 00704 return 0; 00705 } 00706 00707 for ( i = 0; i < numDevices; i++ ) 00708 { 00709 if ( binarySizes[i] != 0 ) 00710 { 00711 binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] ); 00712 if ( binaries[i] == NULL ) 00713 { 00714 return 0; 00715 } 00716 } 00717 else 00718 { 00719 binaries[i] = NULL; 00720 } 00721 } 00722 00723 clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARIES, 00724 sizeof(char *) * numDevices, binaries, NULL ); 00725 CHECK_OPENCL(clStatus,"clGetProgramInfo"); 00726 00727 /* dump out each binary into its own separate file. */ 00728 for ( i = 0; i < numDevices; i++ ) 00729 { 00730 char fileName[256] = { 0 }, cl_name[128] = { 0 }; 00731 00732 if ( binarySizes[i] != 0 ) 00733 { 00734 char deviceName[1024]; 00735 clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME, 00736 sizeof(deviceName), deviceName, NULL); 00737 CHECK_OPENCL( clStatus, "clGetDeviceInfo" ); 00738 00739 str = (char*) strstr( clFileName, (char*) ".cl" ); 00740 memcpy( cl_name, clFileName, str - clFileName ); 00741 cl_name[str - clFileName] = '\0'; 00742 sprintf( fileName, "%s-%s.bin", cl_name, deviceName ); 00743 legalizeFileName(fileName); 00744 if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) ) 00745 { 00746 printf("[OD] write binary[%s] failed\n", fileName); 00747 return 0; 00748 } //else 00749 printf("[OD] write binary[%s] succesfully\n", fileName); 00750 } 00751 } 00752 00753 // Release all resouces and memory 00754 for ( i = 0; i < numDevices; i++ ) 00755 { 00756 if ( binaries[i] != NULL ) 00757 { 00758 free( binaries[i] ); 00759 binaries[i] = NULL; 00760 } 00761 } 00762 00763 if ( binaries != NULL ) 00764 { 00765 free( binaries ); 00766 binaries = NULL; 00767 } 00768 00769 if ( binarySizes != NULL ) 00770 { 00771 free( binarySizes ); 00772 binarySizes = NULL; 00773 } 00774 00775 if ( mpArryDevsID != NULL ) 00776 { 00777 free( mpArryDevsID ); 00778 mpArryDevsID = NULL; 00779 } 00780 return 1; 00781 } 00782 00783 void copyIntBuffer( KernelEnv rEnv, cl_mem xValues, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus ) 00784 { 00785 l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0, 00786 nElements * sizeof(l_int32), 0, NULL, NULL, NULL ); 00787 clFinish( rEnv.mpkCmdQueue ); 00788 if (_pValues != NULL) 00789 { 00790 for ( int i = 0; i < (int)nElements; i++ ) 00791 pValues[i] = (l_int32)_pValues[i]; 00792 } 00793 00794 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL); 00795 //clFinish( rEnv.mpkCmdQueue ); 00796 return; 00797 } 00798 00799 int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) 00800 { 00801 //PERF_COUNT_START("CompileKernelFile") 00802 cl_int clStatus = 0; 00803 size_t length; 00804 char *buildLog = NULL, *binary; 00805 const char *source; 00806 size_t source_size[1]; 00807 int b_error, binary_status, binaryExisted, idx; 00808 size_t numDevices; 00809 cl_device_id *mpArryDevsID; 00810 FILE *fd, *fd1; 00811 const char* filename = "kernel.cl"; 00812 //fprintf(stderr, "[OD] CompileKernelFile ... \n"); 00813 if ( CachedOfKernerPrg(gpuInfo, filename) == 1 ) 00814 { 00815 return 1; 00816 } 00817 00818 idx = gpuInfo->mnFileCount; 00819 00820 source = kernel_src; 00821 00822 source_size[0] = strlen( source ); 00823 binaryExisted = 0; 00824 binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark 00825 //PERF_COUNT_SUB("BinaryGenerated") 00826 if ( binaryExisted == 1 ) 00827 { 00828 clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES, 00829 sizeof(numDevices), &numDevices, NULL ); 00830 CHECK_OPENCL( clStatus, "clGetContextInfo" ); 00831 00832 mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices ); 00833 if ( mpArryDevsID == NULL ) 00834 { 00835 return 0; 00836 } 00837 //PERF_COUNT_SUB("get numDevices") 00838 b_error = 0; 00839 length = 0; 00840 b_error |= fseek( fd, 0, SEEK_END ) < 0; 00841 b_error |= ( length = ftell(fd) ) <= 0; 00842 b_error |= fseek( fd, 0, SEEK_SET ) < 0; 00843 if ( b_error ) 00844 { 00845 return 0; 00846 } 00847 00848 binary = (char*) malloc( length + 2 ); 00849 if ( !binary ) 00850 { 00851 return 0; 00852 } 00853 00854 memset( binary, 0, length + 2 ); 00855 b_error |= fread( binary, 1, length, fd ) != length; 00856 00857 00858 fclose( fd ); 00859 //PERF_COUNT_SUB("read file") 00860 fd = NULL; 00861 // grab the handles to all of the devices in the context. 00862 clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, 00863 sizeof( cl_device_id ) * numDevices, mpArryDevsID, NULL ); 00864 CHECK_OPENCL( clStatus, "clGetContextInfo" ); 00865 //PERF_COUNT_SUB("get devices") 00866 //fprintf(stderr, "[OD] Create kernel from binary\n"); 00867 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices, 00868 mpArryDevsID, &length, (const unsigned char**) &binary, 00869 &binary_status, &clStatus ); 00870 CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" ); 00871 //PERF_COUNT_SUB("clCreateProgramWithBinary") 00872 free( binary ); 00873 free( mpArryDevsID ); 00874 mpArryDevsID = NULL; 00875 //PERF_COUNT_SUB("binaryExisted") 00876 } 00877 else 00878 { 00879 // create a CL program using the kernel source 00880 //fprintf(stderr, "[OD] Create kernel from source\n"); 00881 gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source, 00882 source_size, &clStatus); 00883 CHECK_OPENCL( clStatus, "clCreateProgramWithSource" ); 00884 //PERF_COUNT_SUB("!binaryExisted") 00885 } 00886 00887 if ( gpuInfo->mpArryPrograms[idx] == (cl_program) NULL ) 00888 { 00889 return 0; 00890 } 00891 00892 //char options[512]; 00893 // create a cl program executable for all the devices specified 00894 //printf("[OD] BuildProgram.\n"); 00895 PERF_COUNT_START("OD::CompileKernel::clBuildProgram") 00896 if (!gpuInfo->mnIsUserCreated) 00897 { 00898 clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID, 00899 buildOption, NULL, NULL); 00900 //PERF_COUNT_SUB("clBuildProgram notUserCreated") 00901 } 00902 else 00903 { 00904 clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID), 00905 buildOption, NULL, NULL); 00906 //PERF_COUNT_SUB("clBuildProgram isUserCreated") 00907 } 00908 PERF_COUNT_END 00909 if ( clStatus != CL_SUCCESS ) 00910 { 00911 printf ("BuildProgram error!\n"); 00912 if ( !gpuInfo->mnIsUserCreated ) 00913 { 00914 clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0], 00915 CL_PROGRAM_BUILD_LOG, 0, NULL, &length ); 00916 } 00917 else 00918 { 00919 clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID, 00920 CL_PROGRAM_BUILD_LOG, 0, NULL, &length); 00921 } 00922 if ( clStatus != CL_SUCCESS ) 00923 { 00924 printf("opencl create build log fail\n"); 00925 return 0; 00926 } 00927 buildLog = (char*) malloc( length ); 00928 if ( buildLog == (char*) NULL ) 00929 { 00930 return 0; 00931 } 00932 if ( !gpuInfo->mnIsUserCreated ) 00933 { 00934 clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0], 00935 CL_PROGRAM_BUILD_LOG, length, buildLog, &length ); 00936 } 00937 else 00938 { 00939 clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID, 00940 CL_PROGRAM_BUILD_LOG, length, buildLog, &length ); 00941 } 00942 if ( clStatus != CL_SUCCESS ) 00943 { 00944 printf("opencl program build info fail\n"); 00945 return 0; 00946 } 00947 00948 fd1 = fopen( "kernel-build.log", "w+" ); 00949 if ( fd1 != NULL ) 00950 { 00951 fwrite( buildLog, sizeof(char), length, fd1 ); 00952 fclose( fd1 ); 00953 } 00954 00955 free( buildLog ); 00956 //PERF_COUNT_SUB("build error log") 00957 return 0; 00958 } 00959 00960 strcpy( gpuInfo->mArryKnelSrcFile[idx], filename ); 00961 //PERF_COUNT_SUB("strcpy") 00962 if ( binaryExisted == 0 ) { 00963 GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx], filename ); 00964 PERF_COUNT_SUB("GenerateBinFromKernelSource") 00965 } 00966 00967 gpuInfo->mnFileCount += 1; 00968 //PERF_COUNT_END 00969 return 1; 00970 } 00971 00972 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line) 00973 { 00974 PERF_COUNT_START("pixReadFromTiffKernel") 00975 cl_int clStatus; 00976 KernelEnv rEnv; 00977 size_t globalThreads[2]; 00978 size_t localThreads[2]; 00979 int gsize; 00980 cl_mem valuesCl; 00981 cl_mem outputCl; 00982 00983 //global and local work dimensions for Horizontal pass 00984 gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 00985 globalThreads[0] = gsize; 00986 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 00987 globalThreads[1] = gsize; 00988 localThreads[0] = GROUPSIZE_X; 00989 localThreads[1] = GROUPSIZE_Y; 00990 00991 SetKernelEnv( &rEnv ); 00992 00993 l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32)); 00994 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus ); 00995 CHECK_OPENCL( clStatus, "clCreateKernel"); 00996 00997 //Allocate input and output OCL buffers 00998 valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus); 00999 outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus); 01000 01001 //Kernel arguments 01002 clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&valuesCl ); 01003 CHECK_OPENCL( clStatus, "clSetKernelArg"); 01004 clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(w), (void *)&w ); 01005 CHECK_OPENCL( clStatus, "clSetKernelArg" ); 01006 clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(h), (void *)&h ); 01007 CHECK_OPENCL( clStatus, "clSetKernelArg" ); 01008 clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(wpl), (void *)&wpl ); 01009 CHECK_OPENCL( clStatus, "clSetKernelArg" ); 01010 clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&outputCl ); 01011 CHECK_OPENCL( clStatus, "clSetKernelArg"); 01012 01013 //Kernel enqueue 01014 PERF_COUNT_SUB("before") 01015 clStatus = clEnqueueNDRangeKernel( rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL ); 01016 CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); 01017 01018 /* map results back from gpu */ 01019 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0, w*h * sizeof(l_uint32), 0, NULL, NULL, &clStatus); 01020 CHECK_OPENCL( clStatus, "clEnqueueMapBuffer outputCl"); 01021 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, NULL, NULL); 01022 01023 //Sync 01024 clFinish( rEnv.mpkCmdQueue ); 01025 PERF_COUNT_SUB("kernel & map") 01026 PERF_COUNT_END 01027 return pResult; 01028 } 01029 01030 01031 PIX * OpenclDevice::pixReadTiffCl ( const char *filename, l_int32 n ) 01032 { 01033 PERF_COUNT_START("pixReadTiffCL") 01034 FILE *fp; 01035 PIX *pix; 01036 01037 //printf("pixReadTiffCl file"); 01038 PROCNAME("pixReadTiff"); 01039 01040 if (!filename) 01041 return (PIX *)ERROR_PTR("filename not defined", procName, NULL); 01042 01043 if ((fp = fopenReadStream(filename)) == NULL) 01044 return (PIX *)ERROR_PTR("image file not found", procName, NULL); 01045 if ((pix = pixReadStreamTiffCl(fp, n)) == NULL) { 01046 fclose(fp); 01047 return (PIX *)ERROR_PTR("pix not read", procName, NULL); 01048 } 01049 fclose(fp); 01050 PERF_COUNT_END 01051 return pix; 01052 01053 } 01054 TIFF * 01055 OpenclDevice::fopenTiffCl(FILE *fp, 01056 const char *modestring) 01057 { 01058 l_int32 fd; 01059 01060 PROCNAME("fopenTiff"); 01061 01062 if (!fp) 01063 return (TIFF *)ERROR_PTR("stream not opened", procName, NULL); 01064 if (!modestring) 01065 return (TIFF *)ERROR_PTR("modestring not defined", procName, NULL); 01066 01067 if ((fd = fileno(fp)) < 0) 01068 return (TIFF *)ERROR_PTR("invalid file descriptor", procName, NULL); 01069 lseek(fd, 0, SEEK_SET); 01070 01071 return TIFFFdOpen(fd, "TIFFstream", modestring); 01072 } 01073 l_int32 OpenclDevice::getTiffStreamResolutionCl(TIFF *tif, 01074 l_int32 *pxres, 01075 l_int32 *pyres) 01076 { 01077 l_uint16 resunit; 01078 l_int32 foundxres, foundyres; 01079 l_float32 fxres, fyres; 01080 01081 PROCNAME("getTiffStreamResolution"); 01082 01083 if (!tif) 01084 return ERROR_INT("tif not opened", procName, 1); 01085 if (!pxres || !pyres) 01086 return ERROR_INT("&xres and &yres not both defined", procName, 1); 01087 *pxres = *pyres = 0; 01088 01089 TIFFGetFieldDefaulted(tif, TIFFTAG_RESOLUTIONUNIT, &resunit); 01090 foundxres = TIFFGetField(tif, TIFFTAG_XRESOLUTION, &fxres); 01091 foundyres = TIFFGetField(tif, TIFFTAG_YRESOLUTION, &fyres); 01092 if (!foundxres && !foundyres) return 1; 01093 if (!foundxres && foundyres) 01094 fxres = fyres; 01095 else if (foundxres && !foundyres) 01096 fyres = fxres; 01097 01098 if (resunit == RESUNIT_CENTIMETER) { /* convert to ppi */ 01099 *pxres = (l_int32)(2.54 * fxres + 0.5); 01100 *pyres = (l_int32)(2.54 * fyres + 0.5); 01101 } 01102 else { 01103 *pxres = (l_int32)fxres; 01104 *pyres = (l_int32)fyres; 01105 } 01106 01107 return 0; 01108 } 01109 PIX * 01110 OpenclDevice::pixReadStreamTiffCl(FILE *fp, 01111 l_int32 n) 01112 { 01113 l_int32 i, pagefound; 01114 PIX *pix; 01115 TIFF *tif; 01116 01117 PROCNAME("pixReadStreamTiff"); 01118 01119 if (!fp) 01120 return (PIX *)ERROR_PTR("stream not defined", procName, NULL); 01121 01122 if ((tif = fopenTiffCl(fp, "rb")) == NULL) 01123 return (PIX *)ERROR_PTR("tif not opened", procName, NULL); 01124 01125 pagefound = FALSE; 01126 pix = NULL; 01127 for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) { 01128 if (i == n) { 01129 pagefound = TRUE; 01130 if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) { 01131 TIFFCleanup(tif); 01132 return (PIX *)ERROR_PTR("pix not read", procName, NULL); 01133 } 01134 break; 01135 } 01136 if (TIFFReadDirectory(tif) == 0) 01137 break; 01138 } 01139 01140 if (pagefound == FALSE) { 01141 L_WARNING("tiff page %d not found", procName, n); 01142 TIFFCleanup(tif); 01143 return NULL; 01144 } 01145 01146 TIFFCleanup(tif); 01147 return pix; 01148 } 01149 01150 static l_int32 01151 getTiffCompressedFormat(l_uint16 tiffcomp) 01152 { 01153 l_int32 comptype; 01154 01155 switch (tiffcomp) 01156 { 01157 case COMPRESSION_CCITTFAX4: 01158 comptype = IFF_TIFF_G4; 01159 break; 01160 case COMPRESSION_CCITTFAX3: 01161 comptype = IFF_TIFF_G3; 01162 break; 01163 case COMPRESSION_CCITTRLE: 01164 comptype = IFF_TIFF_RLE; 01165 break; 01166 case COMPRESSION_PACKBITS: 01167 comptype = IFF_TIFF_PACKBITS; 01168 break; 01169 case COMPRESSION_LZW: 01170 comptype = IFF_TIFF_LZW; 01171 break; 01172 case COMPRESSION_ADOBE_DEFLATE: 01173 comptype = IFF_TIFF_ZIP; 01174 break; 01175 default: 01176 comptype = IFF_TIFF; 01177 break; 01178 } 01179 return comptype; 01180 } 01181 01182 void compare(l_uint32 *cpu, l_uint32 *gpu,int size) 01183 { 01184 for(int i=0;i<size;i++) 01185 { 01186 if(cpu[i]!=gpu[i]) 01187 { 01188 printf("\ndoesnot match\n"); 01189 return; 01190 } 01191 } 01192 printf("\nit matches\n"); 01193 01194 } 01195 01196 //OpenCL implementation of pixReadFromTiffStream. 01197 //Similar to the CPU implentation of pixReadFromTiffStream 01198 PIX * 01199 OpenclDevice::pixReadFromTiffStreamCl(TIFF *tif) 01200 { 01201 l_uint8 *linebuf, *data; 01202 l_uint16 spp, bps, bpp, tiffbpl, photometry, tiffcomp, orientation; 01203 l_uint16 *redmap, *greenmap, *bluemap; 01204 l_int32 d, wpl, bpl, comptype, i, ncolors; 01205 l_int32 xres, yres; 01206 l_uint32 w, h; 01207 l_uint32 *line, *tiffdata; 01208 PIX *pix; 01209 PIXCMAP *cmap; 01210 01211 PROCNAME("pixReadFromTiffStream"); 01212 01213 if (!tif) 01214 return (PIX *)ERROR_PTR("tif not defined", procName, NULL); 01215 01216 01217 TIFFGetFieldDefaulted(tif, TIFFTAG_BITSPERSAMPLE, &bps); 01218 TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &spp); 01219 bpp = bps * spp; 01220 if (bpp > 32) 01221 return (PIX *)ERROR_PTR("can't handle bpp > 32", procName, NULL); 01222 if (spp == 1) 01223 d = bps; 01224 else if (spp == 3 || spp == 4) 01225 d = 32; 01226 else 01227 return (PIX *)ERROR_PTR("spp not in set {1,3,4}", procName, NULL); 01228 01229 TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &w); 01230 TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &h); 01231 tiffbpl = TIFFScanlineSize(tif); 01232 01233 if ((pix = pixCreate(w, h, d)) == NULL) 01234 return (PIX *)ERROR_PTR("pix not made", procName, NULL); 01235 data = (l_uint8 *)pixGetData(pix); 01236 wpl = pixGetWpl(pix); 01237 bpl = 4 * wpl; 01238 01239 01240 if (spp == 1) { 01241 if ((linebuf = (l_uint8 *)CALLOC(tiffbpl + 1, sizeof(l_uint8))) == NULL) 01242 return (PIX *)ERROR_PTR("calloc fail for linebuf", procName, NULL); 01243 01244 for (i = 0 ; i < h ; i++) { 01245 if (TIFFReadScanline(tif, linebuf, i, 0) < 0) { 01246 FREE(linebuf); 01247 pixDestroy(&pix); 01248 return (PIX *)ERROR_PTR("line read fail", procName, NULL); 01249 } 01250 memcpy((char *)data, (char *)linebuf, tiffbpl); 01251 data += bpl; 01252 } 01253 if (bps <= 8) 01254 pixEndianByteSwap(pix); 01255 else 01256 pixEndianTwoByteSwap(pix); 01257 FREE(linebuf); 01258 } 01259 else { 01260 if ((tiffdata = (l_uint32 *)CALLOC(w * h, sizeof(l_uint32))) == NULL) { 01261 pixDestroy(&pix); 01262 return (PIX *)ERROR_PTR("calloc fail for tiffdata", procName, NULL); 01263 } 01264 if (!TIFFReadRGBAImageOriented(tif, w, h, (uint32 *)tiffdata, 01265 ORIENTATION_TOPLEFT, 0)) { 01266 FREE(tiffdata); 01267 pixDestroy(&pix); 01268 return (PIX *)ERROR_PTR("failed to read tiffdata", procName, NULL); 01269 } 01270 line = pixGetData(pix); 01271 01272 //Invoke the OpenCL kernel for pixReadFromTiff 01273 l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line); 01274 pixSetData(pix, output_gpu); 01275 01276 FREE(tiffdata); 01277 } 01278 01279 if (getTiffStreamResolutionCl(tif, &xres, &yres) == 0) { 01280 pixSetXRes(pix, xres); 01281 pixSetYRes(pix, yres); 01282 } 01283 01284 01285 TIFFGetFieldDefaulted(tif, TIFFTAG_COMPRESSION, &tiffcomp); 01286 comptype = getTiffCompressedFormat(tiffcomp); 01287 pixSetInputFormat(pix, comptype); 01288 01289 if (TIFFGetField(tif, TIFFTAG_COLORMAP, &redmap, &greenmap, &bluemap)) { 01290 01291 if ((cmap = pixcmapCreate(bps)) == NULL) { 01292 pixDestroy(&pix); 01293 return (PIX *)ERROR_PTR("cmap not made", procName, NULL); 01294 } 01295 ncolors = 1 << bps; 01296 for (i = 0; i < ncolors; i++) 01297 pixcmapAddColor(cmap, redmap[i] >> 8, greenmap[i] >> 8, 01298 bluemap[i] >> 8); 01299 pixSetColormap(pix, cmap); 01300 } 01301 else { 01302 if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometry)) { 01303 01304 if (tiffcomp == COMPRESSION_CCITTFAX3 || 01305 tiffcomp == COMPRESSION_CCITTFAX4 || 01306 tiffcomp == COMPRESSION_CCITTRLE || 01307 tiffcomp == COMPRESSION_CCITTRLEW) { 01308 photometry = PHOTOMETRIC_MINISWHITE; 01309 } 01310 else 01311 photometry = PHOTOMETRIC_MINISBLACK; 01312 } 01313 if ((d == 1 && photometry == PHOTOMETRIC_MINISBLACK) || 01314 (d == 8 && photometry == PHOTOMETRIC_MINISWHITE)) 01315 pixInvert(pix, pix); 01316 } 01317 01318 if (TIFFGetField(tif, TIFFTAG_ORIENTATION, &orientation)) { 01319 if (orientation >= 1 && orientation <= 8) { 01320 struct tiff_transform *transform = 01321 &tiff_orientation_transforms[orientation - 1]; 01322 if (transform->vflip) pixFlipTB(pix, pix); 01323 if (transform->hflip) pixFlipLR(pix, pix); 01324 if (transform->rotate) { 01325 PIX *oldpix = pix; 01326 pix = pixRotate90(oldpix, transform->rotate); 01327 pixDestroy(&oldpix); 01328 } 01329 } 01330 } 01331 01332 return pix; 01333 } 01334 01335 //Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels 01336 cl_int 01337 pixDilateCL_55(l_int32 wpl, l_int32 h) 01338 { 01339 size_t globalThreads[2]; 01340 cl_mem pixtemp; 01341 cl_int status; 01342 int gsize; 01343 size_t localThreads[2]; 01344 01345 //Horizontal pass 01346 gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX; 01347 globalThreads[0] = gsize; 01348 globalThreads[1] = GROUPSIZE_HMORY; 01349 localThreads[0] = GROUPSIZE_HMORX; 01350 localThreads[1] = GROUPSIZE_HMORY; 01351 01352 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_5x5", &status ); 01353 01354 status = clSetKernelArg(rEnv.mpkKernel, 01355 0, 01356 sizeof(cl_mem), 01357 &pixsCLBuffer); 01358 status = clSetKernelArg(rEnv.mpkKernel, 01359 1, 01360 sizeof(cl_mem), 01361 &pixdCLBuffer); 01362 status = clSetKernelArg(rEnv.mpkKernel, 01363 2, 01364 sizeof(wpl), 01365 (const void *)&wpl); 01366 status = clSetKernelArg(rEnv.mpkKernel, 01367 3, 01368 sizeof(h), 01369 (const void *)&h); 01370 01371 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01372 rEnv.mpkKernel, 01373 2, 01374 NULL, 01375 globalThreads, 01376 localThreads, 01377 0, 01378 NULL, 01379 NULL); 01380 01381 //Swap source and dest buffers 01382 pixtemp = pixsCLBuffer; 01383 pixsCLBuffer = pixdCLBuffer; 01384 pixdCLBuffer = pixtemp; 01385 01386 //Vertical 01387 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 01388 globalThreads[0] = gsize; 01389 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 01390 globalThreads[1] = gsize; 01391 localThreads[0] = GROUPSIZE_X; 01392 localThreads[1] = GROUPSIZE_Y; 01393 01394 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer_5x5", &status ); 01395 01396 status = clSetKernelArg(rEnv.mpkKernel, 01397 0, 01398 sizeof(cl_mem), 01399 &pixsCLBuffer); 01400 status = clSetKernelArg(rEnv.mpkKernel, 01401 1, 01402 sizeof(cl_mem), 01403 &pixdCLBuffer); 01404 status = clSetKernelArg(rEnv.mpkKernel, 01405 2, 01406 sizeof(wpl), 01407 (const void *)&wpl); 01408 status = clSetKernelArg(rEnv.mpkKernel, 01409 3, 01410 sizeof(h), 01411 (const void *)&h); 01412 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01413 rEnv.mpkKernel, 01414 2, 01415 NULL, 01416 globalThreads, 01417 localThreads, 01418 0, 01419 NULL, 01420 NULL); 01421 01422 return status; 01423 } 01424 01425 //Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels 01426 cl_int 01427 pixErodeCL_55(l_int32 wpl, l_int32 h) 01428 { 01429 size_t globalThreads[2]; 01430 cl_mem pixtemp; 01431 cl_int status; 01432 int gsize; 01433 l_uint32 fwmask, lwmask; 01434 size_t localThreads[2]; 01435 01436 lwmask = lmask32[32 - 2]; 01437 fwmask = rmask32[32 - 2]; 01438 01439 //Horizontal pass 01440 gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX; 01441 globalThreads[0] = gsize; 01442 globalThreads[1] = GROUPSIZE_HMORY; 01443 localThreads[0] = GROUPSIZE_HMORX; 01444 localThreads[1] = GROUPSIZE_HMORY; 01445 01446 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_5x5", &status ); 01447 01448 status = clSetKernelArg(rEnv.mpkKernel, 01449 0, 01450 sizeof(cl_mem), 01451 &pixsCLBuffer); 01452 status = clSetKernelArg(rEnv.mpkKernel, 01453 1, 01454 sizeof(cl_mem), 01455 &pixdCLBuffer); 01456 status = clSetKernelArg(rEnv.mpkKernel, 01457 2, 01458 sizeof(wpl), 01459 (const void *)&wpl); 01460 status = clSetKernelArg(rEnv.mpkKernel, 01461 3, 01462 sizeof(h), 01463 (const void *)&h); 01464 01465 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01466 rEnv.mpkKernel, 01467 2, 01468 NULL, 01469 globalThreads, 01470 localThreads, 01471 0, 01472 NULL, 01473 NULL); 01474 01475 //Swap source and dest buffers 01476 pixtemp = pixsCLBuffer; 01477 pixsCLBuffer = pixdCLBuffer; 01478 pixdCLBuffer = pixtemp; 01479 01480 //Vertical 01481 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 01482 globalThreads[0] = gsize; 01483 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 01484 globalThreads[1] = gsize; 01485 localThreads[0] = GROUPSIZE_X; 01486 localThreads[1] = GROUPSIZE_Y; 01487 01488 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer_5x5", &status ); 01489 01490 status = clSetKernelArg(rEnv.mpkKernel, 01491 0, 01492 sizeof(cl_mem), 01493 &pixsCLBuffer); 01494 status = clSetKernelArg(rEnv.mpkKernel, 01495 1, 01496 sizeof(cl_mem), 01497 &pixdCLBuffer); 01498 status = clSetKernelArg(rEnv.mpkKernel, 01499 2, 01500 sizeof(wpl), 01501 (const void *)&wpl); 01502 status = clSetKernelArg(rEnv.mpkKernel, 01503 3, 01504 sizeof(h), 01505 (const void *)&h); 01506 status = clSetKernelArg(rEnv.mpkKernel, 01507 4, 01508 sizeof(fwmask), 01509 (const void *)&fwmask); 01510 status = clSetKernelArg(rEnv.mpkKernel, 01511 5, 01512 sizeof(lwmask), 01513 (const void *)&lwmask); 01514 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01515 rEnv.mpkKernel, 01516 2, 01517 NULL, 01518 globalThreads, 01519 localThreads, 01520 0, 01521 NULL, 01522 NULL); 01523 01524 return status; 01525 } 01526 01527 //Morphology Dilate operation. Invokes the relevant OpenCL kernels 01528 cl_int 01529 pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) 01530 { 01531 l_int32 xp, yp, xn, yn; 01532 SEL* sel; 01533 size_t globalThreads[2]; 01534 cl_mem pixtemp; 01535 cl_int status; 01536 int gsize; 01537 size_t localThreads[2]; 01538 char isEven; 01539 01540 OpenclDevice::SetKernelEnv( &rEnv ); 01541 01542 if (hsize == 5 && vsize == 5) 01543 { 01544 //Specific case for 5x5 01545 status = pixDilateCL_55(wpl, h); 01546 return status; 01547 } 01548 01549 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT); 01550 01551 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn); 01552 01553 //global and local work dimensions for Horizontal pass 01554 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 01555 globalThreads[0] = gsize; 01556 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 01557 globalThreads[1] = gsize; 01558 localThreads[0] = GROUPSIZE_X; 01559 localThreads[1] = GROUPSIZE_Y; 01560 01561 if (xp > 31 || xn > 31) 01562 { 01563 //Generic case. 01564 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor", &status ); 01565 01566 status = clSetKernelArg(rEnv.mpkKernel, 01567 0, 01568 sizeof(cl_mem), 01569 &pixsCLBuffer); 01570 status = clSetKernelArg(rEnv.mpkKernel, 01571 1, 01572 sizeof(cl_mem), 01573 &pixdCLBuffer); 01574 status = clSetKernelArg(rEnv.mpkKernel, 01575 2, 01576 sizeof(xp), 01577 (const void *)&xp); 01578 status = clSetKernelArg(rEnv.mpkKernel, 01579 3, 01580 sizeof(xn), 01581 (const void *)&xn); 01582 status = clSetKernelArg(rEnv.mpkKernel, 01583 4, 01584 sizeof(wpl), 01585 (const void *)&wpl); 01586 status = clSetKernelArg(rEnv.mpkKernel, 01587 5, 01588 sizeof(h), 01589 (const void *)&h); 01590 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01591 rEnv.mpkKernel, 01592 2, 01593 NULL, 01594 globalThreads, 01595 localThreads, 01596 0, 01597 NULL, 01598 NULL); 01599 01600 if (yp > 0 || yn > 0) 01601 { 01602 pixtemp = pixsCLBuffer; 01603 pixsCLBuffer = pixdCLBuffer; 01604 pixdCLBuffer = pixtemp; 01605 } 01606 } 01607 else if (xp > 0 || xn > 0 ) 01608 { 01609 //Specfic Horizontal pass kernel for half width < 32 01610 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_32word", &status ); 01611 isEven = (xp != xn); 01612 01613 status = clSetKernelArg(rEnv.mpkKernel, 01614 0, 01615 sizeof(cl_mem), 01616 &pixsCLBuffer); 01617 status = clSetKernelArg(rEnv.mpkKernel, 01618 1, 01619 sizeof(cl_mem), 01620 &pixdCLBuffer); 01621 status = clSetKernelArg(rEnv.mpkKernel, 01622 2, 01623 sizeof(xp), 01624 (const void *)&xp); 01625 status = clSetKernelArg(rEnv.mpkKernel, 01626 3, 01627 sizeof(wpl), 01628 (const void *)&wpl); 01629 status = clSetKernelArg(rEnv.mpkKernel, 01630 4, 01631 sizeof(h), 01632 (const void *)&h); 01633 status = clSetKernelArg(rEnv.mpkKernel, 01634 5, 01635 sizeof(isEven), 01636 (const void *)&isEven); 01637 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01638 rEnv.mpkKernel, 01639 2, 01640 NULL, 01641 globalThreads, 01642 localThreads, 01643 0, 01644 NULL, 01645 NULL); 01646 01647 if (yp > 0 || yn > 0) 01648 { 01649 pixtemp = pixsCLBuffer; 01650 pixsCLBuffer = pixdCLBuffer; 01651 pixdCLBuffer = pixtemp; 01652 } 01653 } 01654 01655 if (yp > 0 || yn > 0) 01656 { 01657 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer", &status ); 01658 01659 status = clSetKernelArg(rEnv.mpkKernel, 01660 0, 01661 sizeof(cl_mem), 01662 &pixsCLBuffer); 01663 status = clSetKernelArg(rEnv.mpkKernel, 01664 1, 01665 sizeof(cl_mem), 01666 &pixdCLBuffer); 01667 status = clSetKernelArg(rEnv.mpkKernel, 01668 2, 01669 sizeof(yp), 01670 (const void *)&yp); 01671 status = clSetKernelArg(rEnv.mpkKernel, 01672 3, 01673 sizeof(wpl), 01674 (const void *)&wpl); 01675 status = clSetKernelArg(rEnv.mpkKernel, 01676 4, 01677 sizeof(h), 01678 (const void *)&h); 01679 status = clSetKernelArg(rEnv.mpkKernel, 01680 5, 01681 sizeof(yn), 01682 (const void *)&yn); 01683 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01684 rEnv.mpkKernel, 01685 2, 01686 NULL, 01687 globalThreads, 01688 localThreads, 01689 0, 01690 NULL, 01691 NULL); 01692 } 01693 01694 01695 return status; 01696 } 01697 01698 //Morphology Erode operation. Invokes the relevant OpenCL kernels 01699 cl_int 01700 pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) 01701 { 01702 01703 l_int32 xp, yp, xn, yn; 01704 SEL* sel; 01705 size_t globalThreads[2]; 01706 size_t localThreads[2]; 01707 cl_mem pixtemp; 01708 cl_int status; 01709 int gsize; 01710 char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC); 01711 l_uint32 rwmask, lwmask; 01712 char isEven; 01713 01714 sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT); 01715 01716 selFindMaxTranslations(sel, &xp, &yp, &xn, &yn); 01717 01718 OpenclDevice::SetKernelEnv( &rEnv ); 01719 01720 if (hsize == 5 && vsize == 5 && isAsymmetric) 01721 { 01722 //Specific kernel for 5x5 01723 status = pixErodeCL_55(wpl, h); 01724 return status; 01725 } 01726 01727 rwmask = rmask32[32 - (xp & 31)]; 01728 lwmask = lmask32[32 - (xn & 31)]; 01729 01730 //global and local work dimensions for Horizontal pass 01731 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 01732 globalThreads[0] = gsize; 01733 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 01734 globalThreads[1] = gsize; 01735 localThreads[0] = GROUPSIZE_X; 01736 localThreads[1] = GROUPSIZE_Y; 01737 01738 //Horizontal Pass 01739 if (xp > 31 || xn > 31 ) 01740 { 01741 //Generic case. 01742 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor", &status ); 01743 01744 status = clSetKernelArg(rEnv.mpkKernel, 01745 0, 01746 sizeof(cl_mem), 01747 &pixsCLBuffer); 01748 status = clSetKernelArg(rEnv.mpkKernel, 01749 1, 01750 sizeof(cl_mem), 01751 &pixdCLBuffer); 01752 status = clSetKernelArg(rEnv.mpkKernel, 01753 2, 01754 sizeof(xp), 01755 (const void *)&xp); 01756 status = clSetKernelArg(rEnv.mpkKernel, 01757 3, 01758 sizeof(xn), 01759 (const void *)&xn); 01760 status = clSetKernelArg(rEnv.mpkKernel, 01761 4, 01762 sizeof(wpl), 01763 (const void *)&wpl); 01764 status = clSetKernelArg(rEnv.mpkKernel, 01765 5, 01766 sizeof(h), 01767 (const void *)&h); 01768 status = clSetKernelArg(rEnv.mpkKernel, 01769 6, 01770 sizeof(isAsymmetric), 01771 (const void *)&isAsymmetric); 01772 status = clSetKernelArg(rEnv.mpkKernel, 01773 7, 01774 sizeof(rwmask), 01775 (const void *)&rwmask); 01776 status = clSetKernelArg(rEnv.mpkKernel, 01777 8, 01778 sizeof(lwmask), 01779 (const void *)&lwmask); 01780 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01781 rEnv.mpkKernel, 01782 2, 01783 NULL, 01784 globalThreads, 01785 localThreads, 01786 0, 01787 NULL, 01788 NULL); 01789 01790 if (yp > 0 || yn > 0) 01791 { 01792 pixtemp = pixsCLBuffer; 01793 pixsCLBuffer = pixdCLBuffer; 01794 pixdCLBuffer = pixtemp; 01795 } 01796 } 01797 else if (xp > 0 || xn > 0) 01798 { 01799 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_32word", &status ); 01800 isEven = (xp != xn); 01801 01802 status = clSetKernelArg(rEnv.mpkKernel, 01803 0, 01804 sizeof(cl_mem), 01805 &pixsCLBuffer); 01806 status = clSetKernelArg(rEnv.mpkKernel, 01807 1, 01808 sizeof(cl_mem), 01809 &pixdCLBuffer); 01810 status = clSetKernelArg(rEnv.mpkKernel, 01811 2, 01812 sizeof(xp), 01813 (const void *)&xp); 01814 status = clSetKernelArg(rEnv.mpkKernel, 01815 3, 01816 sizeof(wpl), 01817 (const void *)&wpl); 01818 status = clSetKernelArg(rEnv.mpkKernel, 01819 4, 01820 sizeof(h), 01821 (const void *)&h); 01822 status = clSetKernelArg(rEnv.mpkKernel, 01823 5, 01824 sizeof(isAsymmetric), 01825 (const void *)&isAsymmetric); 01826 status = clSetKernelArg(rEnv.mpkKernel, 01827 6, 01828 sizeof(rwmask), 01829 (const void *)&rwmask); 01830 status = clSetKernelArg(rEnv.mpkKernel, 01831 7, 01832 sizeof(lwmask), 01833 (const void *)&lwmask); 01834 status = clSetKernelArg(rEnv.mpkKernel, 01835 8, 01836 sizeof(isEven), 01837 (const void *)&isEven); 01838 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01839 rEnv.mpkKernel, 01840 2, 01841 NULL, 01842 globalThreads, 01843 localThreads, 01844 0, 01845 NULL, 01846 NULL); 01847 01848 if (yp > 0 || yn > 0) 01849 { 01850 pixtemp = pixsCLBuffer; 01851 pixsCLBuffer = pixdCLBuffer; 01852 pixdCLBuffer = pixtemp; 01853 } 01854 } 01855 01856 //Vertical Pass 01857 if (yp > 0 || yn > 0) 01858 { 01859 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer", &status ); 01860 01861 status = clSetKernelArg(rEnv.mpkKernel, 01862 0, 01863 sizeof(cl_mem), 01864 &pixsCLBuffer); 01865 status = clSetKernelArg(rEnv.mpkKernel, 01866 1, 01867 sizeof(cl_mem), 01868 &pixdCLBuffer); 01869 status = clSetKernelArg(rEnv.mpkKernel, 01870 2, 01871 sizeof(yp), 01872 (const void *)&yp); 01873 status = clSetKernelArg(rEnv.mpkKernel, 01874 3, 01875 sizeof(wpl), 01876 (const void *)&wpl); 01877 status = clSetKernelArg(rEnv.mpkKernel, 01878 4, 01879 sizeof(h), 01880 (const void *)&h); 01881 status = clSetKernelArg(rEnv.mpkKernel, 01882 5, 01883 sizeof(isAsymmetric), 01884 (const void *)&isAsymmetric); 01885 status = clSetKernelArg(rEnv.mpkKernel, 01886 6, 01887 sizeof(yn), 01888 (const void *)&yn); 01889 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 01890 rEnv.mpkKernel, 01891 2, 01892 NULL, 01893 globalThreads, 01894 localThreads, 01895 0, 01896 NULL, 01897 NULL); 01898 } 01899 01900 return status; 01901 } 01902 01903 // OpenCL implementation of Morphology Dilate 01904 //Note: Assumes the source and dest opencl buffer are initialized. No check done 01905 PIX* 01906 OpenclDevice::pixDilateBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy = false) 01907 { 01908 l_uint32 wpl, h; 01909 01910 wpl = pixGetWpl(pixs); 01911 h = pixGetHeight(pixs); 01912 01913 clStatus = pixDilateCL(hsize, vsize, wpl, h); 01914 01915 if (reqDataCopy) 01916 { 01917 pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ, false); 01918 } 01919 01920 return pixd; 01921 } 01922 01923 // OpenCL implementation of Morphology Erode 01924 //Note: Assumes the source and dest opencl buffer are initialized. No check done 01925 PIX* 01926 OpenclDevice::pixErodeBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy = false) 01927 { 01928 l_uint32 wpl, h; 01929 01930 wpl = pixGetWpl(pixs); 01931 h = pixGetHeight(pixs); 01932 01933 clStatus = pixErodeCL(hsize, vsize, wpl, h); 01934 01935 if (reqDataCopy) 01936 { 01937 pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); 01938 } 01939 01940 return pixd; 01941 } 01942 01943 //Morphology Open operation. Invokes the relevant OpenCL kernels 01944 cl_int 01945 pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) 01946 { 01947 cl_int status; 01948 cl_mem pixtemp; 01949 01950 //Erode followed by Dilate 01951 status = pixErodeCL(hsize, vsize, wpl, h); 01952 01953 pixtemp = pixsCLBuffer; 01954 pixsCLBuffer = pixdCLBuffer; 01955 pixdCLBuffer = pixtemp; 01956 01957 status = pixDilateCL(hsize, vsize, wpl, h); 01958 01959 return status; 01960 } 01961 01962 //Morphology Close operation. Invokes the relevant OpenCL kernels 01963 cl_int 01964 pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) 01965 { 01966 cl_int status; 01967 cl_mem pixtemp; 01968 01969 //Dilate followed by Erode 01970 status = pixDilateCL(hsize, vsize, wpl, h); 01971 01972 pixtemp = pixsCLBuffer; 01973 pixsCLBuffer = pixdCLBuffer; 01974 pixdCLBuffer = pixtemp; 01975 01976 status = pixErodeCL(hsize, vsize, wpl, h); 01977 01978 return status; 01979 } 01980 01981 // OpenCL implementation of Morphology Close 01982 //Note: Assumes the source and dest opencl buffer are initialized. No check done 01983 PIX* 01984 OpenclDevice::pixCloseBrickCL(PIX *pixd, 01985 PIX *pixs, 01986 l_int32 hsize, 01987 l_int32 vsize, 01988 bool reqDataCopy = false) 01989 { 01990 l_uint32 wpl, h; 01991 01992 wpl = pixGetWpl(pixs); 01993 h = pixGetHeight(pixs); 01994 01995 clStatus = pixCloseCL(hsize, vsize, wpl, h); 01996 01997 if (reqDataCopy) 01998 { 01999 pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); 02000 } 02001 02002 return pixd; 02003 } 02004 02005 // OpenCL implementation of Morphology Open 02006 //Note: Assumes the source and dest opencl buffer are initialized. No check done 02007 PIX* 02008 OpenclDevice::pixOpenBrickCL(PIX *pixd, 02009 PIX *pixs, 02010 l_int32 hsize, 02011 l_int32 vsize, 02012 bool reqDataCopy = false) 02013 { 02014 l_uint32 wpl, h; 02015 02016 wpl = pixGetWpl(pixs); 02017 h = pixGetHeight(pixs); 02018 02019 clStatus = pixOpenCL(hsize, vsize, wpl, h); 02020 02021 if (reqDataCopy) 02022 { 02023 pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); 02024 } 02025 02026 return pixd; 02027 } 02028 02029 //pix OR operation: outbuffer = buffer1 | buffer2 02030 cl_int 02031 pixORCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer) 02032 { 02033 cl_int status; 02034 size_t globalThreads[2]; 02035 int gsize; 02036 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y}; 02037 02038 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 02039 globalThreads[0] = gsize; 02040 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 02041 globalThreads[1] = gsize; 02042 02043 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixOR", &status ); 02044 02045 status = clSetKernelArg(rEnv.mpkKernel, 02046 0, 02047 sizeof(cl_mem), 02048 &buffer1); 02049 status = clSetKernelArg(rEnv.mpkKernel, 02050 1, 02051 sizeof(cl_mem), 02052 &buffer2); 02053 status = clSetKernelArg(rEnv.mpkKernel, 02054 2, 02055 sizeof(cl_mem), 02056 &outbuffer); 02057 status = clSetKernelArg(rEnv.mpkKernel, 02058 3, 02059 sizeof(wpl), 02060 (const void *)&wpl); 02061 status = clSetKernelArg(rEnv.mpkKernel, 02062 4, 02063 sizeof(h), 02064 (const void *)&h); 02065 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 02066 rEnv.mpkKernel, 02067 2, 02068 NULL, 02069 globalThreads, 02070 localThreads, 02071 0, 02072 NULL, 02073 NULL); 02074 02075 return status; 02076 } 02077 02078 //pix AND operation: outbuffer = buffer1 & buffer2 02079 cl_int 02080 pixANDCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer) 02081 { 02082 cl_int status; 02083 size_t globalThreads[2]; 02084 int gsize; 02085 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y}; 02086 02087 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 02088 globalThreads[0] = gsize; 02089 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 02090 globalThreads[1] = gsize; 02091 02092 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixAND", &status ); 02093 02094 // Enqueue a kernel run call. 02095 status = clSetKernelArg(rEnv.mpkKernel, 02096 0, 02097 sizeof(cl_mem), 02098 &buffer1); 02099 status = clSetKernelArg(rEnv.mpkKernel, 02100 1, 02101 sizeof(cl_mem), 02102 &buffer2); 02103 status = clSetKernelArg(rEnv.mpkKernel, 02104 2, 02105 sizeof(cl_mem), 02106 &outbuffer); 02107 status = clSetKernelArg(rEnv.mpkKernel, 02108 3, 02109 sizeof(wpl), 02110 (const void *)&wpl); 02111 status = clSetKernelArg(rEnv.mpkKernel, 02112 4, 02113 sizeof(h), 02114 (const void *)&h); 02115 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 02116 rEnv.mpkKernel, 02117 2, 02118 NULL, 02119 globalThreads, 02120 localThreads, 02121 0, 02122 NULL, 02123 NULL); 02124 02125 return status; 02126 } 02127 02128 //output = buffer1 & ~(buffer2) 02129 cl_int 02130 pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outBuffer = NULL) 02131 { 02132 cl_int status; 02133 size_t globalThreads[2]; 02134 int gsize; 02135 size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y}; 02136 02137 gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; 02138 globalThreads[0] = gsize; 02139 gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; 02140 globalThreads[1] = gsize; 02141 02142 if (outBuffer != NULL) 02143 { 02144 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract", &status ); 02145 } 02146 else 02147 { 02148 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract_inplace", &status ); 02149 } 02150 02151 // Enqueue a kernel run call. 02152 status = clSetKernelArg(rEnv.mpkKernel, 02153 0, 02154 sizeof(cl_mem), 02155 &buffer1); 02156 status = clSetKernelArg(rEnv.mpkKernel, 02157 1, 02158 sizeof(cl_mem), 02159 &buffer2); 02160 status = clSetKernelArg(rEnv.mpkKernel, 02161 2, 02162 sizeof(wpl), 02163 (const void *)&wpl); 02164 status = clSetKernelArg(rEnv.mpkKernel, 02165 3, 02166 sizeof(h), 02167 (const void *)&h); 02168 if (outBuffer != NULL) 02169 { 02170 status = clSetKernelArg(rEnv.mpkKernel, 02171 4, 02172 sizeof(cl_mem), 02173 (const void *)&outBuffer); 02174 } 02175 status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, 02176 rEnv.mpkKernel, 02177 2, 02178 NULL, 02179 globalThreads, 02180 localThreads, 02181 0, 02182 NULL, 02183 NULL); 02184 02185 return status; 02186 } 02187 02188 // OpenCL implementation of Subtract pix 02189 //Note: Assumes the source and dest opencl buffer are initialized. No check done 02190 PIX* 02191 OpenclDevice::pixSubtractCL(PIX *pixd, PIX *pixs1, PIX *pixs2, bool reqDataCopy = false) 02192 { 02193 l_uint32 wpl, h; 02194 02195 PROCNAME("pixSubtractCL"); 02196 02197 if (!pixs1) 02198 return (PIX *)ERROR_PTR("pixs1 not defined", procName, pixd); 02199 if (!pixs2) 02200 return (PIX *)ERROR_PTR("pixs2 not defined", procName, pixd); 02201 if (pixGetDepth(pixs1) != pixGetDepth(pixs2)) 02202 return (PIX *)ERROR_PTR("depths of pixs* unequal", procName, pixd); 02203 02204 #if EQUAL_SIZE_WARNING 02205 if (!pixSizesEqual(pixs1, pixs2)) 02206 L_WARNING("pixs1 and pixs2 not equal sizes", procName); 02207 #endif /* EQUAL_SIZE_WARNING */ 02208 02209 wpl = pixGetWpl(pixs1); 02210 h = pixGetHeight(pixs1); 02211 02212 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer); 02213 02214 if (reqDataCopy) 02215 { 02216 //Read back output data from OCL buffer to cpu 02217 pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs1, wpl*h, CL_MAP_READ); 02218 } 02219 02220 return pixd; 02221 } 02222 02223 // OpenCL implementation of Hollow pix 02224 //Note: Assumes the source and dest opencl buffer are initialized. No check done 02225 PIX* 02226 OpenclDevice::pixHollowCL(PIX *pixd, 02227 PIX *pixs, 02228 l_int32 close_hsize, 02229 l_int32 close_vsize, 02230 l_int32 open_hsize, 02231 l_int32 open_vsize, 02232 bool reqDataCopy = false) 02233 { 02234 l_uint32 wpl, h; 02235 cl_mem pixtemp; 02236 02237 wpl = pixGetWpl(pixs); 02238 h = pixGetHeight(pixs); 02239 02240 //First step : Close Morph operation: Dilate followed by Erode 02241 clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h); 02242 02243 //Store the output of close operation in an intermediate buffer 02244 //this will be later used for pixsubtract 02245 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL); 02246 02247 //Second step: Open Operation - Erode followed by Dilate 02248 pixtemp = pixsCLBuffer; 02249 pixsCLBuffer = pixdCLBuffer; 02250 pixdCLBuffer = pixtemp; 02251 02252 clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h); 02253 02254 //Third step: Subtract : (Close - Open) 02255 pixtemp = pixsCLBuffer; 02256 pixsCLBuffer = pixdCLBuffer; 02257 pixdCLBuffer = pixdCLIntermediate; 02258 pixdCLIntermediate = pixtemp; 02259 02260 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer); 02261 02262 if (reqDataCopy) 02263 { 02264 //Read back output data from OCL buffer to cpu 02265 pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); 02266 } 02267 return pixd; 02268 } 02269 02270 // OpenCL implementation of Get Lines from pix function 02271 //Note: Assumes the source and dest opencl buffer are initialized. No check done 02272 void 02273 OpenclDevice::pixGetLinesCL(PIX *pixd, 02274 PIX *pixs, 02275 PIX** pix_vline, 02276 PIX** pix_hline, 02277 PIX** pixClosed, 02278 bool getpixClosed, 02279 l_int32 close_hsize, l_int32 close_vsize, 02280 l_int32 open_hsize, l_int32 open_vsize, 02281 l_int32 line_hsize, l_int32 line_vsize) 02282 { 02283 l_uint32 wpl, h; 02284 cl_mem pixtemp; 02285 02286 wpl = pixGetWpl(pixs); 02287 h = pixGetHeight(pixs); 02288 02289 //First step : Close Morph operation: Dilate followed by Erode 02290 clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h); 02291 02292 //Copy the Close output to CPU buffer 02293 if (getpixClosed) 02294 { 02295 *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl*h, CL_MAP_READ, true, false); 02296 } 02297 02298 //Store the output of close operation in an intermediate buffer 02299 //this will be later used for pixsubtract 02300 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL); 02301 02302 //Second step: Open Operation - Erode followed by Dilate 02303 pixtemp = pixsCLBuffer; 02304 pixsCLBuffer = pixdCLBuffer; 02305 pixdCLBuffer = pixtemp; 02306 02307 clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h); 02308 02309 //Third step: Subtract : (Close - Open) 02310 pixtemp = pixsCLBuffer; 02311 pixsCLBuffer = pixdCLBuffer; 02312 pixdCLBuffer = pixdCLIntermediate; 02313 pixdCLIntermediate = pixtemp; 02314 02315 clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer); 02316 02317 //Store the output of Hollow operation in an intermediate buffer 02318 //this will be later used 02319 clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL); 02320 02321 pixtemp = pixsCLBuffer; 02322 pixsCLBuffer = pixdCLBuffer; 02323 pixdCLBuffer = pixtemp; 02324 02325 //Fourth step: Get vertical line 02326 //pixOpenBrick(NULL, pix_hollow, 1, min_line_length); 02327 clStatus = pixOpenCL(1, line_vsize, wpl, h); 02328 02329 //Copy the vertical line output to CPU buffer 02330 *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl*h, CL_MAP_READ, true, false); 02331 02332 pixtemp = pixsCLBuffer; 02333 pixsCLBuffer = pixdCLIntermediate; 02334 pixdCLIntermediate = pixtemp; 02335 02336 //Fifth step: Get horizontal line 02337 //pixOpenBrick(NULL, pix_hollow, min_line_length, 1); 02338 clStatus = pixOpenCL(line_hsize, 1, wpl, h); 02339 02340 //Copy the horizontal line output to CPU buffer 02341 *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl*h, CL_MAP_READ, true, true); 02342 02343 return; 02344 } 02345 02346 02347 /************************************************************************* 02348 * HistogramRect 02349 * Otsu Thresholding Operations 02350 * histogramAllChannels is layed out as all channel 0, then all channel 1... 02351 * only supports 1 or 4 channels (bytes_per_pixel) 02352 ************************************************************************/ 02353 void OpenclDevice::HistogramRectOCL( 02354 const unsigned char* imageData, 02355 int bytes_per_pixel, 02356 int bytes_per_line, 02357 int left, // always 0 02358 int top, // always 0 02359 int width, 02360 int height, 02361 int kHistogramSize, 02362 int* histogramAllChannels) 02363 { 02364 PERF_COUNT_START("HistogramRectOCL") 02365 cl_int clStatus; 02366 KernelEnv histKern; 02367 SetKernelEnv( &histKern ); 02368 KernelEnv histRedKern; 02369 SetKernelEnv( &histRedKern ); 02370 /* map imagedata to device as read only */ 02371 // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need. 02372 // faster option would be to allocate initial image buffer 02373 // using a garlic bus memory type 02374 cl_mem imageBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus ); 02375 CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); 02376 02377 /* setup work group size parameters */ 02378 int block_size = 256; 02379 cl_uint numCUs; 02380 clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL); 02381 CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); 02382 02383 int requestedOccupancy = 10; 02384 int numWorkGroups = numCUs * requestedOccupancy; 02385 int numThreads = block_size*numWorkGroups; 02386 size_t local_work_size[] = {block_size}; 02387 size_t global_work_size[] = {numThreads}; 02388 size_t red_global_work_size[] = {block_size*kHistogramSize*bytes_per_pixel}; 02389 02390 /* map histogramAllChannels as write only */ 02391 int numBins = kHistogramSize*bytes_per_pixel*numWorkGroups; 02392 02393 cl_mem histogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, kHistogramSize*bytes_per_pixel*sizeof(int), (void *)histogramAllChannels, &clStatus ); 02394 CHECK_OPENCL( clStatus, "clCreateBuffer histogramBuffer"); 02395 02396 /* intermediate histogram buffer */ 02397 int histRed = 256; 02398 int tmpHistogramBins = kHistogramSize*bytes_per_pixel*histRed; 02399 02400 cl_mem tmpHistogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins*sizeof(cl_uint), NULL, &clStatus ); 02401 CHECK_OPENCL( clStatus, "clCreateBuffer tmpHistogramBuffer"); 02402 02403 /* atomic sync buffer */ 02404 int *zeroBuffer = new int[1]; 02405 zeroBuffer[0] = 0; 02406 cl_mem atomicSyncBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), (void *)zeroBuffer, &clStatus ); 02407 CHECK_OPENCL( clStatus, "clCreateBuffer atomicSyncBuffer"); 02408 02409 //Create kernel objects based on bytes_per_pixel 02410 if (bytes_per_pixel == 1) 02411 { 02412 histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus ); 02413 CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannel"); 02414 02415 histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectOneChannelReduction", &clStatus ); 02416 CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannelReduction"); 02417 } else { 02418 histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus ); 02419 CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels"); 02420 02421 histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus ); 02422 CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction"); 02423 } 02424 02425 void *ptr; 02426 02427 //Initialize tmpHistogramBuffer buffer 02428 ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0, tmpHistogramBins*sizeof(cl_uint), 0, NULL, NULL, &clStatus); 02429 CHECK_OPENCL( clStatus, "clEnqueueMapBuffer tmpHistogramBuffer"); 02430 02431 memset(ptr, 0, tmpHistogramBins*sizeof(cl_uint)); 02432 clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, NULL, NULL); 02433 02434 /* set kernel 1 arguments */ 02435 clStatus = clSetKernelArg( histKern.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer ); 02436 CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); 02437 cl_uint numPixels = width*height; 02438 clStatus = clSetKernelArg( histKern.mpkKernel, 1, sizeof(cl_uint), (void *)&numPixels ); 02439 CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" ); 02440 clStatus = clSetKernelArg( histKern.mpkKernel, 2, sizeof(cl_mem), (void *)&tmpHistogramBuffer ); 02441 CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer"); 02442 02443 /* set kernel 2 arguments */ 02444 int n = numThreads/bytes_per_pixel; 02445 clStatus = clSetKernelArg( histRedKern.mpkKernel, 0, sizeof(cl_int), (void *)&n ); 02446 CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); 02447 clStatus = clSetKernelArg( histRedKern.mpkKernel, 1, sizeof(cl_mem), (void *)&tmpHistogramBuffer ); 02448 CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer"); 02449 clStatus = clSetKernelArg( histRedKern.mpkKernel, 2, sizeof(cl_mem), (void *)&histogramBuffer ); 02450 CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer"); 02451 02452 /* launch histogram */ 02453 PERF_COUNT_SUB("before") 02454 clStatus = clEnqueueNDRangeKernel( 02455 histKern.mpkCmdQueue, 02456 histKern.mpkKernel, 02457 1, NULL, global_work_size, local_work_size, 02458 0, NULL, NULL ); 02459 CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels" ); 02460 clFinish( histKern.mpkCmdQueue ); 02461 02462 /* launch histogram */ 02463 clStatus = clEnqueueNDRangeKernel( 02464 histRedKern.mpkCmdQueue, 02465 histRedKern.mpkKernel, 02466 1, NULL, red_global_work_size, local_work_size, 02467 0, NULL, NULL ); 02468 CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" ); 02469 clFinish( histRedKern.mpkCmdQueue ); 02470 02471 PERF_COUNT_SUB("redKernel") 02472 02473 /* map results back from gpu */ 02474 ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0, kHistogramSize*bytes_per_pixel*sizeof(int), 0, NULL, NULL, &clStatus); 02475 CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer"); 02476 02477 clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, NULL, NULL); 02478 02479 clReleaseMemObject(histogramBuffer); 02480 clReleaseMemObject(imageBuffer); 02481 PERF_COUNT_SUB("after") 02482 PERF_COUNT_END 02483 02484 } 02485 02486 /************************************************************************* 02487 * Threshold the rectangle, taking everything except the image buffer pointer 02488 * from the class, using thresholds/hi_values to the output IMAGE. 02489 * only supports 1 or 4 channels 02490 ************************************************************************/ 02491 void OpenclDevice::ThresholdRectToPixOCL( 02492 const unsigned char* imageData, 02493 int bytes_per_pixel, 02494 int bytes_per_line, 02495 const int* thresholds, 02496 const int* hi_values, 02497 Pix** pix, 02498 int height, 02499 int width, 02500 int top, 02501 int left) { 02502 PERF_COUNT_START("ThresholdRectToPixOCL") 02503 02504 /* create pix result buffer */ 02505 *pix = pixCreate(width, height, 1); 02506 uinT32* pixData = pixGetData(*pix); 02507 int wpl = pixGetWpl(*pix); 02508 int pixSize = wpl*height*sizeof(uinT32); 02509 02510 cl_int clStatus; 02511 KernelEnv rEnv; 02512 SetKernelEnv( &rEnv ); 02513 02514 /* setup work group size parameters */ 02515 int block_size = 256; 02516 cl_uint numCUs = 6; 02517 clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL); 02518 CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); 02519 02520 int requestedOccupancy = 10; 02521 int numWorkGroups = numCUs * requestedOccupancy; 02522 int numThreads = block_size*numWorkGroups; 02523 size_t local_work_size[] = {(size_t) block_size}; 02524 size_t global_work_size[] = {(size_t) numThreads}; 02525 02526 /* map imagedata to device as read only */ 02527 // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need. 02528 // faster option would be to allocate initial image buffer 02529 // using a garlic bus memory type 02530 cl_mem imageBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus ); 02531 CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); 02532 02533 /* map pix as write only */ 02534 pixThBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize, (void *)pixData, &clStatus ); 02535 CHECK_OPENCL( clStatus, "clCreateBuffer pix"); 02536 02537 /* map thresholds and hi_values */ 02538 cl_mem thresholdsBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)thresholds, &clStatus ); 02539 CHECK_OPENCL( clStatus, "clCreateBuffer thresholdBuffer"); 02540 cl_mem hiValuesBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)hi_values, &clStatus ); 02541 CHECK_OPENCL( clStatus, "clCreateBuffer hiValuesBuffer"); 02542 02543 /* compile kernel */ 02544 if (bytes_per_pixel == 4) { 02545 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus ); 02546 CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix"); 02547 } else { 02548 rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus ); 02549 CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan"); 02550 } 02551 02552 /* set kernel arguments */ 02553 clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer ); 02554 CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); 02555 cl_uint numPixels = width*height; 02556 clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(int), (void *)&height ); 02557 CHECK_OPENCL( clStatus, "clSetKernelArg height" ); 02558 clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(int), (void *)&width ); 02559 CHECK_OPENCL( clStatus, "clSetKernelArg width" ); 02560 clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(int), (void *)&wpl ); 02561 CHECK_OPENCL( clStatus, "clSetKernelArg wpl" ); 02562 clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&thresholdsBuffer ); 02563 CHECK_OPENCL( clStatus, "clSetKernelArg thresholdsBuffer" ); 02564 clStatus = clSetKernelArg( rEnv.mpkKernel, 5, sizeof(cl_mem), (void *)&hiValuesBuffer ); 02565 CHECK_OPENCL( clStatus, "clSetKernelArg hiValuesBuffer" ); 02566 clStatus = clSetKernelArg( rEnv.mpkKernel, 6, sizeof(cl_mem), (void *)&pixThBuffer ); 02567 CHECK_OPENCL( clStatus, "clSetKernelArg pixThBuffer"); 02568 02569 /* launch kernel & wait */ 02570 PERF_COUNT_SUB("before") 02571 clStatus = clEnqueueNDRangeKernel( 02572 rEnv.mpkCmdQueue, 02573 rEnv.mpkKernel, 02574 1, NULL, global_work_size, local_work_size, 02575 0, NULL, NULL ); 02576 CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix" ); 02577 clFinish( rEnv.mpkCmdQueue ); 02578 PERF_COUNT_SUB("kernel") 02579 02580 /* map results back from gpu */ 02581 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0, NULL, NULL, &clStatus); 02582 CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer"); 02583 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, NULL, NULL); 02584 02585 clReleaseMemObject(imageBuffer); 02586 clReleaseMemObject(thresholdsBuffer); 02587 clReleaseMemObject(hiValuesBuffer); 02588 02589 PERF_COUNT_SUB("after") 02590 PERF_COUNT_END 02591 } 02592 02593 02594 #if USE_DEVICE_SELECTION 02595 02596 /****************************************************************************** 02597 * Data Types for Device Selection 02598 *****************************************************************************/ 02599 02600 typedef struct _TessScoreEvaluationInputData { 02601 int height; 02602 int width; 02603 int numChannels; 02604 unsigned char *imageData; 02605 Pix *pix; 02606 } TessScoreEvaluationInputData; 02607 02608 void populateTessScoreEvaluationInputData( TessScoreEvaluationInputData *input ) { 02609 srand(1); 02610 // 8.5x11 inches @ 300dpi rounded to clean multiples 02611 int height = 3328; // %256 02612 int width = 2560; // %512 02613 int numChannels = 4; 02614 input->height = height; 02615 input->width = width; 02616 input->numChannels = numChannels; 02617 unsigned char (*imageData4)[4] = (unsigned char (*)[4]) malloc(height*width*numChannels*sizeof(unsigned char)); // new unsigned char[4][height*width]; 02618 input->imageData = (unsigned char *) &imageData4[0]; 02619 02620 // zero out image 02621 unsigned char pixelWhite[4] = { 0, 0, 0, 255}; 02622 unsigned char pixelBlack[4] = {255, 255, 255, 255}; 02623 for (int p = 0; p < height*width; p++) { 02624 //unsigned char tmp[4] = imageData4[0]; 02625 imageData4[p][0] = pixelWhite[0]; 02626 imageData4[p][1] = pixelWhite[1]; 02627 imageData4[p][2] = pixelWhite[2]; 02628 imageData4[p][3] = pixelWhite[3]; 02629 } 02630 // random lines to be eliminated 02631 int maxLineWidth = 64; // pixels wide 02632 int numLines = 10; 02633 // vertical lines 02634 for (int i = 0; i < numLines; i++) { 02635 int lineWidth = rand()%maxLineWidth; 02636 int vertLinePos = lineWidth + rand()%(width-2*lineWidth); 02637 //printf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth); 02638 for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) { 02639 for (int col = 0; col < height; col++) { 02640 //imageData4[row*width+col] = pixelBlack; 02641 imageData4[row*width+col][0] = pixelBlack[0]; 02642 imageData4[row*width+col][1] = pixelBlack[1]; 02643 imageData4[row*width+col][2] = pixelBlack[2]; 02644 imageData4[row*width+col][3] = pixelBlack[3]; 02645 } 02646 } 02647 } 02648 // horizontal lines 02649 for (int i = 0; i < numLines; i++) { 02650 int lineWidth = rand()%maxLineWidth; 02651 int horLinePos = lineWidth + rand()%(height-2*lineWidth); 02652 //printf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth); 02653 for (int row = 0; row < width; row++) { 02654 for (int col = horLinePos-lineWidth/2; col < horLinePos+lineWidth/2; col++) { // for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) { 02655 //printf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col); 02656 //imageData4[row*width+col] = pixelBlack; 02657 imageData4[row*width+col][0] = pixelBlack[0]; 02658 imageData4[row*width+col][1] = pixelBlack[1]; 02659 imageData4[row*width+col][2] = pixelBlack[2]; 02660 imageData4[row*width+col][3] = pixelBlack[3]; 02661 } 02662 } 02663 } 02664 // spots (noise, squares) 02665 float fractionBlack = 0.1; // how much of the image should be blackened 02666 int numSpots = (height*width)*fractionBlack/(maxLineWidth*maxLineWidth/2/2); 02667 for (int i = 0; i < numSpots; i++) { 02668 02669 int lineWidth = rand()%maxLineWidth; 02670 int col = lineWidth + rand()%(width-2*lineWidth); 02671 int row = lineWidth + rand()%(height-2*lineWidth); 02672 //printf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col ); 02673 for (int r = row-lineWidth/2; r < row+lineWidth/2; r++) { 02674 for (int c = col-lineWidth/2; c < col+lineWidth/2; c++) { 02675 //printf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c ); 02676 //imageData4[row*width+col] = pixelBlack; 02677 imageData4[r*width+c][0] = pixelBlack[0]; 02678 imageData4[r*width+c][1] = pixelBlack[1]; 02679 imageData4[r*width+c][2] = pixelBlack[2]; 02680 imageData4[r*width+c][3] = pixelBlack[3]; 02681 } 02682 } 02683 } 02684 02685 input->pix = pixCreate(input->width, input->height, 1); 02686 } 02687 02688 typedef struct _TessDeviceScore { 02689 float time; // small time means faster device 02690 bool clError; // were there any opencl errors 02691 bool valid; // was the correct response generated 02692 } TessDeviceScore; 02693 02694 /****************************************************************************** 02695 * Micro Benchmarks for Device Selection 02696 *****************************************************************************/ 02697 02698 double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) { 02699 02700 double time = 0; 02701 #if ON_WINDOWS 02702 LARGE_INTEGER freq, time_funct_start, time_funct_end; 02703 QueryPerformanceFrequency(&freq); 02704 #else 02705 timespec time_funct_start, time_funct_end; 02706 #endif 02707 // input data 02708 l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload 02709 02710 // function call 02711 if (type == DS_DEVICE_OPENCL_DEVICE) { 02712 #if ON_WINDOWS 02713 QueryPerformanceCounter(&time_funct_start); 02714 #else 02715 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 02716 #endif 02717 02718 OpenclDevice::gpuEnv = *env; 02719 int wpl = pixGetWpl(input.pix); 02720 OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl, NULL); 02721 #if ON_WINDOWS 02722 QueryPerformanceCounter(&time_funct_end); 02723 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 02724 #else 02725 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 02726 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 02727 #endif 02728 02729 } else { 02730 #if ON_WINDOWS 02731 QueryPerformanceCounter(&time_funct_start); 02732 #else 02733 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 02734 #endif 02735 Pix *pix = pixCreate(input.width, input.height, 32); 02736 l_uint32 *pixData = pixGetData(pix); 02737 int wpl = pixGetWpl(pix); 02738 //l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line); 02739 //pixSetData(pix, output_gpu); 02740 int i, j; 02741 int idx = 0; 02742 for (i = 0; i < input.height ; i++) { 02743 for (j = 0; j < input.width; j++) { 02744 02745 l_uint32 tiffword = tiffdata[i * input.width + j]; 02746 l_int32 rval = ((tiffword) & 0xff); 02747 l_int32 gval = (((tiffword) >> 8) & 0xff); 02748 l_int32 bval = (((tiffword) >> 16) & 0xff); 02749 l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8); 02750 pixData[idx] = value; 02751 idx++; 02752 } 02753 } 02754 #if ON_WINDOWS 02755 QueryPerformanceCounter(&time_funct_end); 02756 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 02757 #else 02758 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 02759 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 02760 #endif 02761 pixDestroy(&pix); 02762 } 02763 02764 02765 // cleanup 02766 02767 return time; 02768 } 02769 02770 double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) { 02771 02772 double time; 02773 #if ON_WINDOWS 02774 LARGE_INTEGER freq, time_funct_start, time_funct_end; 02775 QueryPerformanceFrequency(&freq); 02776 #else 02777 timespec time_funct_start, time_funct_end; 02778 #endif 02779 02780 unsigned char pixelHi = (unsigned char)255; 02781 02782 int left = 0; 02783 int top = 0; 02784 int kHistogramSize = 256; 02785 int bytes_per_line = input.width*input.numChannels; 02786 int *histogramAllChannels = new int[kHistogramSize*input.numChannels]; 02787 02788 // function call 02789 if (type == DS_DEVICE_OPENCL_DEVICE) { 02790 #if ON_WINDOWS 02791 QueryPerformanceCounter(&time_funct_start); 02792 #else 02793 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 02794 #endif 02795 02796 OpenclDevice::gpuEnv = *env; 02797 int wpl = pixGetWpl(input.pix); 02798 OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line, top, left, input.width, input.height, kHistogramSize, histogramAllChannels); 02799 02800 #if ON_WINDOWS 02801 QueryPerformanceCounter(&time_funct_end); 02802 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 02803 #else 02804 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 02805 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 02806 #endif 02807 } else { 02808 02809 int *histogram = new int[kHistogramSize]; 02810 #if ON_WINDOWS 02811 QueryPerformanceCounter(&time_funct_start); 02812 #else 02813 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 02814 #endif 02815 for (int ch = 0; ch < input.numChannels; ++ch) { 02816 tesseract::HistogramRect(input.pix, input.numChannels, 02817 left, top, input.width, input.height, histogram); 02818 } 02819 #if ON_WINDOWS 02820 QueryPerformanceCounter(&time_funct_end); 02821 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 02822 #else 02823 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 02824 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 02825 #endif 02826 delete[] histogram; 02827 } 02828 02829 // cleanup 02830 //delete[] imageData; 02831 delete[] histogramAllChannels; 02832 return time; 02833 } 02834 02835 //Reproducing the ThresholdRectToPix native version 02836 void ThresholdRectToPix_Native(const unsigned char* imagedata, 02837 int bytes_per_pixel, 02838 int bytes_per_line, 02839 const int* thresholds, 02840 const int* hi_values, 02841 Pix** pix) { 02842 int top = 0; 02843 int left = 0; 02844 int width = pixGetWidth(*pix); 02845 int height = pixGetHeight(*pix); 02846 02847 *pix = pixCreate(width, height, 1); 02848 uinT32* pixdata = pixGetData(*pix); 02849 int wpl = pixGetWpl(*pix); 02850 const unsigned char* srcdata = imagedata + top * bytes_per_line + 02851 left * bytes_per_pixel; 02852 for (int y = 0; y < height; ++y) { 02853 const uinT8* linedata = srcdata; 02854 uinT32* pixline = pixdata + y * wpl; 02855 for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) { 02856 bool white_result = true; 02857 for (int ch = 0; ch < bytes_per_pixel; ++ch) { 02858 if (hi_values[ch] >= 0 && 02859 (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) { 02860 white_result = false; 02861 break; 02862 } 02863 } 02864 if (white_result) 02865 CLEAR_DATA_BIT(pixline, x); 02866 else 02867 SET_DATA_BIT(pixline, x); 02868 } 02869 srcdata += bytes_per_line; 02870 } 02871 } 02872 02873 double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) { 02874 02875 double time; 02876 #if ON_WINDOWS 02877 LARGE_INTEGER freq, time_funct_start, time_funct_end; 02878 QueryPerformanceFrequency(&freq); 02879 #else 02880 timespec time_funct_start, time_funct_end; 02881 #endif 02882 02883 // input data 02884 unsigned char pixelHi = (unsigned char)255; 02885 int* thresholds = new int[4]; 02886 thresholds[0] = pixelHi/2; 02887 thresholds[1] = pixelHi/2; 02888 thresholds[2] = pixelHi/2; 02889 thresholds[3] = pixelHi/2; 02890 int *hi_values = new int[4]; 02891 thresholds[0] = pixelHi; 02892 thresholds[1] = pixelHi; 02893 thresholds[2] = pixelHi; 02894 thresholds[3] = pixelHi; 02895 //Pix* pix = pixCreate(width, height, 1); 02896 int top = 0; 02897 int left = 0; 02898 int bytes_per_line = input.width*input.numChannels; 02899 02900 // function call 02901 if (type == DS_DEVICE_OPENCL_DEVICE) { 02902 #if ON_WINDOWS 02903 QueryPerformanceCounter(&time_funct_start); 02904 #else 02905 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 02906 #endif 02907 02908 OpenclDevice::gpuEnv = *env; 02909 int wpl = pixGetWpl(input.pix); 02910 OpenclDevice::ThresholdRectToPixOCL(input.imageData, input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix, input.height, input.width, top, left); 02911 02912 #if ON_WINDOWS 02913 QueryPerformanceCounter(&time_funct_end); 02914 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 02915 #else 02916 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 02917 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 02918 #endif 02919 } else { 02920 02921 02922 tesseract::ImageThresholder thresholder; 02923 thresholder.SetImage( input.pix ); 02924 #if ON_WINDOWS 02925 QueryPerformanceCounter(&time_funct_start); 02926 #else 02927 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 02928 #endif 02929 ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line, 02930 thresholds, hi_values, &input.pix ); 02931 02932 #if ON_WINDOWS 02933 QueryPerformanceCounter(&time_funct_end); 02934 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 02935 #else 02936 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 02937 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 02938 #endif 02939 } 02940 02941 // cleanup 02942 delete[] thresholds; 02943 delete[] hi_values; 02944 return time; 02945 } 02946 02947 double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) { 02948 02949 double time = 0; 02950 #if ON_WINDOWS 02951 LARGE_INTEGER freq, time_funct_start, time_funct_end; 02952 QueryPerformanceFrequency(&freq); 02953 #else 02954 timespec time_funct_start, time_funct_end; 02955 #endif 02956 02957 // input data 02958 int resolution = 300; 02959 int wpl = pixGetWpl(input.pix); 02960 int kThinLineFraction = 20; // tess constant 02961 int kMinLineLengthFraction = 4; // tess constant 02962 int max_line_width = resolution / kThinLineFraction; 02963 int min_line_length = resolution / kMinLineLengthFraction; 02964 int closing_brick = max_line_width / 3; 02965 02966 // function call 02967 if (type == DS_DEVICE_OPENCL_DEVICE) { 02968 #if ON_WINDOWS 02969 QueryPerformanceCounter(&time_funct_start); 02970 #else 02971 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 02972 #endif 02973 Pix *src_pix = input.pix; 02974 OpenclDevice::gpuEnv = *env; 02975 OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix); 02976 Pix *pix_vline = NULL, *pix_hline = NULL, *pix_closed = NULL; 02977 OpenclDevice::pixGetLinesCL(NULL, input.pix, &pix_vline, &pix_hline, &pix_closed, true, closing_brick, closing_brick, max_line_width, max_line_width, min_line_length, min_line_length); 02978 02979 OpenclDevice::releaseMorphCLBuffers(); 02980 02981 #if ON_WINDOWS 02982 QueryPerformanceCounter(&time_funct_end); 02983 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 02984 #else 02985 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 02986 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 02987 #endif 02988 } else { 02989 #if ON_WINDOWS 02990 QueryPerformanceCounter(&time_funct_start); 02991 #else 02992 clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); 02993 #endif 02994 02995 // native serial code 02996 Pix *src_pix = input.pix; 02997 Pix *pix_closed = pixCloseBrick(NULL, src_pix, closing_brick, closing_brick); 02998 Pix *pix_solid = pixOpenBrick(NULL, pix_closed, max_line_width, max_line_width); 02999 Pix *pix_hollow = pixSubtract(NULL, pix_closed, pix_solid); 03000 pixDestroy(&pix_solid); 03001 Pix *pix_vline = pixOpenBrick(NULL, pix_hollow, 1, min_line_length); 03002 Pix *pix_hline = pixOpenBrick(NULL, pix_hollow, min_line_length, 1); 03003 pixDestroy(&pix_hollow); 03004 03005 #if ON_WINDOWS 03006 QueryPerformanceCounter(&time_funct_end); 03007 time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); 03008 #else 03009 clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); 03010 time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; 03011 #endif 03012 } 03013 03014 return time; 03015 } 03016 03017 03018 03019 /****************************************************************************** 03020 * Device Selection 03021 *****************************************************************************/ 03022 03023 #include "stdlib.h" 03024 03025 03026 // encode score object as byte string 03027 ds_status serializeScore( ds_device* device, void **serializedScore, unsigned int* serializedScoreSize ) { 03028 *serializedScoreSize = sizeof(TessDeviceScore); 03029 *serializedScore = (void *) new unsigned char[*serializedScoreSize]; 03030 memcpy(*serializedScore, device->score, *serializedScoreSize); 03031 return DS_SUCCESS; 03032 } 03033 03034 // parses byte string and stores in score object 03035 ds_status deserializeScore( ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize ) { 03036 // check that serializedScoreSize == sizeof(TessDeviceScore); 03037 device->score = new TessDeviceScore; 03038 memcpy(device->score, serializedScore, serializedScoreSize); 03039 return DS_SUCCESS; 03040 } 03041 03042 03043 03044 // evaluate devices 03045 ds_status evaluateScoreForDevice( ds_device *device, void *inputData) { 03046 03047 // overwrite statuc gpuEnv w/ current device 03048 // so native opencl calls can be used; they use static gpuEnv 03049 printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" ); 03050 GPUEnv *env = NULL; 03051 if (device->type == DS_DEVICE_OPENCL_DEVICE) { 03052 env = new GPUEnv; 03053 //printf("[DS] populating tmp GPUEnv from device\n"); 03054 populateGPUEnvFromDevice( env, device->oclDeviceID); 03055 env->mnFileCount = 0; //argc; 03056 env->mnKernelCount = 0UL; 03057 //printf("[DS] compiling kernels for tmp GPUEnv\n"); 03058 OpenclDevice::gpuEnv = *env; 03059 OpenclDevice::CompileKernelFile(env, ""); 03060 } 03061 03062 03063 TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData; 03064 03065 // pixReadTiff 03066 double composeRGBPixelTime = composeRGBPixelMicroBench( env, *input, device->type ); 03067 03068 // HistogramRect 03069 double histogramRectTime = histogramRectMicroBench( env, *input, device->type ); 03070 03071 // ThresholdRectToPix 03072 double thresholdRectToPixTime = thresholdRectToPixMicroBench( env, *input, device->type ); 03073 03074 // getLineMasks 03075 double getLineMasksMorphTime = getLineMasksMorphMicroBench( env, *input, device->type ); 03076 03077 03078 // weigh times (% of cpu time) 03079 // these weights should be the % execution time that the native cpu code took 03080 float composeRGBPixelWeight = 1.2f; 03081 float histogramRectWeight = 2.4f; 03082 float thresholdRectToPixWeight = 4.5f; 03083 float getLineMasksMorphWeight = 5.0f; 03084 03085 float weightedTime = 03086 composeRGBPixelWeight * composeRGBPixelTime + 03087 histogramRectWeight * histogramRectTime + 03088 thresholdRectToPixWeight * thresholdRectToPixTime + 03089 getLineMasksMorphWeight * getLineMasksMorphTime 03090 ; 03091 device->score = (void *)new TessDeviceScore; 03092 ((TessDeviceScore *)device->score)->time = weightedTime; 03093 03094 printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" ); 03095 printf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight ); 03096 printf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight ); 03097 printf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime, thresholdRectToPixWeight ); 03098 printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime, getLineMasksMorphWeight ); 03099 printf("[DS]%25s: %f\n", "Score", ((TessDeviceScore *)device->score)->time ); 03100 return DS_SUCCESS; 03101 } 03102 03103 // initial call to select device 03104 ds_device OpenclDevice::getDeviceSelection( ) { 03105 //PERF_COUNT_START("getDeviceSelection") 03106 if (!deviceIsSelected) { 03107 PERF_COUNT_START("getDeviceSelection") 03108 // check if opencl is available at runtime 03109 if( 1 == LoadOpencl() ) { 03110 // opencl is available 03111 //PERF_COUNT_SUB("LoadOpencl") 03112 // setup devices 03113 ds_status status; 03114 ds_profile *profile; 03115 status = initDSProfile( &profile, "v0.1" ); 03116 PERF_COUNT_SUB("initDSProfile") 03117 // try reading scores from file 03118 char *fileName = "tesseract_opencl_profile_devices.dat"; 03119 status = readProfileFromFile( profile, deserializeScore, fileName); 03120 if (status != DS_SUCCESS) { 03121 // need to run evaluation 03122 printf("[DS] Profile file not available (%s); performing profiling.\n", fileName); 03123 03124 // create input data 03125 TessScoreEvaluationInputData input; 03126 populateTessScoreEvaluationInputData( &input ); 03127 //PERF_COUNT_SUB("populateTessScoreEvaluationInputData") 03128 // perform evaluations 03129 unsigned int numUpdates; 03130 status = profileDevices( profile, DS_EVALUATE_ALL, evaluateScoreForDevice, (void *)&input, &numUpdates ); 03131 PERF_COUNT_SUB("profileDevices") 03132 // write scores to file 03133 if ( status == DS_SUCCESS ) { 03134 status = writeProfileToFile( profile, serializeScore, fileName); 03135 PERF_COUNT_SUB("writeProfileToFile") 03136 if ( status == DS_SUCCESS ) { 03137 printf("[DS] Scores written to file (%s).\n", fileName); 03138 } else { 03139 printf("[DS] Error saving scores to file (%s); scores not written to file.\n", fileName); 03140 } 03141 } else { 03142 printf("[DS] Unable to evaluate performance; scores not written to file.\n"); 03143 } 03144 03145 } else { 03146 03147 PERF_COUNT_SUB("readProfileFromFile") 03148 printf("[DS] Profile read from file (%s).\n", fileName); 03149 } 03150 03151 // we now have device scores either from file or evaluation 03152 // select fastest using custom Tesseract selection algorithm 03153 float bestTime = FLT_MAX; // begin search with worst possible time 03154 int bestDeviceIdx = -1; 03155 for (int d = 0; d < profile->numDevices; d++) { 03156 //((TessDeviceScore *)device->score)->time 03157 ds_device device = profile->devices[d]; 03158 TessDeviceScore score = *(TessDeviceScore *)device.score; 03159 03160 float time = score.time; 03161 printf("[DS] Device[%i] %i:%s score is %f\n", d+1, device.type, device.oclDeviceName, time); 03162 if (time < bestTime) { 03163 bestTime = time; 03164 bestDeviceIdx = d; 03165 } 03166 } 03167 printf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native"); 03168 // cleanup 03169 // TODO: call destructor for profile object? 03170 03171 bool overrided = false; 03172 char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE"); 03173 if (overrideDeviceStr != NULL) { 03174 int overrideDeviceIdx = atoi(overrideDeviceStr); 03175 if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices ) { 03176 printf("[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, %i)\n", overrideDeviceStr, overrideDeviceIdx); 03177 bestDeviceIdx = overrideDeviceIdx - 1; 03178 overrided = true; 03179 } else { 03180 printf("[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are valid devices).\n", overrideDeviceStr, profile->numDevices); 03181 } 03182 } 03183 03184 if (overrided) { 03185 printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native"); 03186 } 03187 selectedDevice = profile->devices[bestDeviceIdx]; 03188 03189 } else { 03190 // opencl isn't available at runtime, select native cpu device 03191 printf("[DS] OpenCL runtime not available.\n"); 03192 selectedDevice.type = DS_DEVICE_NATIVE_CPU; 03193 selectedDevice.oclDeviceName = "(null)"; 03194 selectedDevice.score = NULL; 03195 selectedDevice.oclDeviceID = NULL; 03196 selectedDevice.oclDriverVersion = NULL; 03197 } 03198 deviceIsSelected = true; 03199 PERF_COUNT_SUB("select from Profile") 03200 PERF_COUNT_END 03201 } 03202 //PERF_COUNT_END 03203 return selectedDevice; 03204 } 03205 03206 #endif 03207 03208 bool OpenclDevice::selectedDeviceIsOpenCL() { 03209 #if USE_DEVICE_SELECTION 03210 ds_device device = getDeviceSelection(); 03211 return (device.type == DS_DEVICE_OPENCL_DEVICE); 03212 #else 03213 return true; 03214 #endif 03215 } 03216 03217 bool OpenclDevice::selectedDeviceIsNativeCPU() { 03218 #if USE_DEVICE_SELECTION 03219 ds_device device = getDeviceSelection(); 03220 return (device.type == DS_DEVICE_NATIVE_CPU); 03221 #else 03222 return false; 03223 #endif 03224 } 03225 03226 03227 03228 #endif