fix profiling execute_multipass (#2239)

- fix clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES) by using the proper
size

- clamp localThreads[2] in the same way as localThreads[0] and localThreads[1]

- clamp all localThreads elements with respect to CL_DEVICE_MAX_WORK_GROUP_SIZE

- fix the size used to create/read the output buffer

Fix #2238
This commit is contained in:
Romaric Jodin
2025-02-19 05:49:12 +01:00
committed by GitHub
parent 8c298b1c3b
commit 84fd99da76

View File

@@ -15,6 +15,7 @@
// //
#include "harness/compat.h" #include "harness/compat.h"
#include <algorithm>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <time.h> #include <time.h>
@@ -97,6 +98,7 @@ static int run_kernel( cl_device_id device, cl_context context, cl_command_queue
cl_ulong queueStart, submitStart, writeStart, writeEnd; cl_ulong queueStart, submitStart, writeStart, writeEnd;
size_t threads[3]; size_t threads[3];
size_t localThreads[3]; size_t localThreads[3];
size_t maxWorkgroupSize;
int err = 0; int err = 0;
// set thread dimensions // set thread dimensions
@@ -104,16 +106,27 @@ static int run_kernel( cl_device_id device, cl_context context, cl_command_queue
threads[1] = h; threads[1] = h;
threads[2] = d; threads[2] = d;
err = clGetDeviceInfo( device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof( cl_uint ), (size_t*)localThreads, NULL ); err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES,
3 * sizeof(size_t), (size_t *)localThreads, NULL);
if (err) if (err)
{ {
localThreads[0] = 256; localThreads[1] = 1; localThreads[2] = 1; log_error("clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES) failed\n");
err = 0; return -1;
} }
if( localThreads[0] > threads[0] ) err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),
localThreads[0] = threads[0]; &maxWorkgroupSize, NULL);
if( localThreads[1] > threads[1] ) if (err)
localThreads[1] = threads[1]; {
log_error("clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed\n");
return -1;
}
localThreads[0] =
std::min({ localThreads[0], threads[0], maxWorkgroupSize });
localThreads[1] = std::min(
{ localThreads[1], threads[1], maxWorkgroupSize / localThreads[0] });
localThreads[2] =
std::min({ localThreads[2], threads[2],
maxWorkgroupSize / (localThreads[0] * localThreads[1]) });
cl_sampler sampler = clCreateSampler( context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &err ); cl_sampler sampler = clCreateSampler( context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &err );
if( err ){ if( err ){
@@ -131,9 +144,9 @@ static int run_kernel( cl_device_id device, cl_context context, cl_command_queue
} }
// allocate an array memory object to load the filter weights // allocate an array memory object to load the filter weights
size_t outptr_size = sizeof(cl_uchar) * w * h * d * nChannels;
memobjs[1] = memobjs[1] =
clCreateBuffer(context, CL_MEM_READ_WRITE, clCreateBuffer(context, CL_MEM_READ_WRITE, outptr_size, NULL, &err);
sizeof(cl_uchar) * w * h * d * nChannels, NULL, &err);
if( memobjs[1] == (cl_mem)0 ){ if( memobjs[1] == (cl_mem)0 ){
log_error( " unable to create array using clCreateBuffer\n" ); log_error( " unable to create array using clCreateBuffer\n" );
clReleaseMemObject( memobjs[0] ); clReleaseMemObject( memobjs[0] );
@@ -237,9 +250,8 @@ static int run_kernel( cl_device_id device, cl_context context, cl_command_queue
} }
// read output image // read output image
err = clEnqueueReadBuffer(queue, memobjs[1], CL_TRUE, 0, err = clEnqueueReadBuffer(queue, memobjs[1], CL_TRUE, 0, outptr_size,
sizeof(cl_uchar) * w * h * d * nChannels, outptr, outptr, 0, NULL, NULL);
0, NULL, NULL);
if( err != CL_SUCCESS ){ if( err != CL_SUCCESS ){
print_error( err, "clReadImage failed\n" ); print_error( err, "clReadImage failed\n" );
clReleaseKernel( kernel[0] ); clReleaseKernel( kernel[0] );