Use CL_KERNEL_WORK_GROUP_SIZE more often (#2435)

Drivers _may_ advertise values for
`CL_DEVICE_MAX_WORK_GROUP_SIZE` or `CL_DEVICE_MAX_WORK_ITEM_SIZES` that
are larger than what a kernel without a `reqd_work_group_size` can
actually be launched with.

The CTS should therefore make sure that the local_size passed to
`clEnqueueNDRangeKernel` does not exceed `CL_KERNEL_WORK_GROUP_SIZE`.

This fixes the two places where I've noticed this was not being done.
This commit is contained in:
Karol Herbst
2025-08-05 23:40:11 +02:00
committed by GitHub
parent 086a6c67fb
commit e2580bded2
2 changed files with 20 additions and 16 deletions

View File

@@ -108,7 +108,11 @@ REGISTER_TEST_VERSION(sub_group_dispatch, Version(2, 1))
nullptr); nullptr);
test_error(error, "clGetDeviceInfo failed"); test_error(error, "clGetDeviceInfo failed");
max_local = max_work_item_sizes[0]; error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
sizeof(max_local), &max_local, nullptr);
test_error(error, "clGetKernelWorkGroupInfo failed");
max_local = std::min(max_local, max_work_item_sizes[0]);
error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform),
(void *)&platform, NULL); (void *)&platform, NULL);

View File

@@ -107,21 +107,6 @@ static int run_kernel( cl_device_id device, cl_context context, cl_command_queue
threads[1] = h; threads[1] = h;
threads[2] = d; threads[2] = d;
err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES,
3 * sizeof(size_t), (size_t *)localThreads, NULL);
test_error(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES) failed");
err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),
&maxWorkgroupSize, NULL);
test_error(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed");
localThreads[0] =
std::min({ localThreads[0], threads[0], maxWorkgroupSize });
localThreads[1] = std::min(
{ localThreads[1], threads[1], maxWorkgroupSize / localThreads[0] });
localThreads[2] =
std::min({ localThreads[2], threads[2],
maxWorkgroupSize / (localThreads[0] * localThreads[1]) });
clSamplerWrapper sampler = clCreateSampler( clSamplerWrapper sampler = clCreateSampler(
context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &err); context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &err);
test_error(err, "clCreateSampler failed"); test_error(err, "clCreateSampler failed");
@@ -143,6 +128,21 @@ static int run_kernel( cl_device_id device, cl_context context, cl_command_queue
&read3d_kernel_code, "read3d"); &read3d_kernel_code, "read3d");
test_error(err, "create_single_kernel_helper failed"); test_error(err, "create_single_kernel_helper failed");
err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES,
3 * sizeof(size_t), (size_t *)localThreads, NULL);
test_error(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES) failed");
err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
sizeof(size_t), &maxWorkgroupSize, NULL);
test_error(err, "clGetDeviceInfo(CL_KERNEL_WORK_GROUP_SIZE) failed\n");
localThreads[0] =
std::min({ localThreads[0], threads[0], maxWorkgroupSize });
localThreads[1] = std::min(
{ localThreads[1], threads[1], maxWorkgroupSize / localThreads[0] });
localThreads[2] =
std::min({ localThreads[2], threads[2],
maxWorkgroupSize / (localThreads[0] * localThreads[1]) });
// create kernel args object and set arg values. // create kernel args object and set arg values.
// set the args values // set the args values
err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjs[0]); err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjs[0]);