From e2580bded220c6ac9786a1008b479aa96d32d359 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Tue, 5 Aug 2025 23:40:11 +0200 Subject: [PATCH] Use CL_KERNEL_WORK_GROUP_SIZE more often (#2435) Drivers _may_ choose to advertise values for `CL_DEVICE_MAX_WORK_GROUP_SIZE` or `CL_DEVICE_MAX_WORK_ITEM_SIZES` that kernels without a `reqd_work_group_size` are not able to be launched with. The CTS should therefore make sure that the local_size passed to `clEnqueueNDRangeKernel` does not exceed `CL_KERNEL_WORK_GROUP_SIZE`. This fixes it up in two places where I've noticed this not happening. --- .../api/test_sub_group_dispatch.cpp | 6 +++- .../profiling/execute_multipass.cpp | 30 +++++++++---------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/test_conformance/api/test_sub_group_dispatch.cpp b/test_conformance/api/test_sub_group_dispatch.cpp index 70a78f4a..c1b88be5 100644 --- a/test_conformance/api/test_sub_group_dispatch.cpp +++ b/test_conformance/api/test_sub_group_dispatch.cpp @@ -108,7 +108,11 @@ REGISTER_TEST_VERSION(sub_group_dispatch, Version(2, 1)) nullptr); test_error(error, "clGetDeviceInfo failed"); - max_local = max_work_item_sizes[0]; + error = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, + sizeof(max_local), &max_local, nullptr); + test_error(error, "clGetKernelWorkGroupInfo failed"); + + max_local = std::min(max_local, max_work_item_sizes[0]); error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), (void *)&platform, NULL); diff --git a/test_conformance/profiling/execute_multipass.cpp b/test_conformance/profiling/execute_multipass.cpp index 7d654ca5..7a711e6f 100644 --- a/test_conformance/profiling/execute_multipass.cpp +++ b/test_conformance/profiling/execute_multipass.cpp @@ -107,21 +107,6 @@ static int run_kernel( cl_device_id device, cl_context context, cl_command_queue threads[1] = h; threads[2] = d; - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, - 3 * sizeof(size_t), (size_t 
*)localThreads, NULL); - test_error(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES) failed"); - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), - &maxWorkgroupSize, NULL); - test_error(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed"); - - localThreads[0] = - std::min({ localThreads[0], threads[0], maxWorkgroupSize }); - localThreads[1] = std::min( - { localThreads[1], threads[1], maxWorkgroupSize / localThreads[0] }); - localThreads[2] = - std::min({ localThreads[2], threads[2], - maxWorkgroupSize / (localThreads[0] * localThreads[1]) }); - clSamplerWrapper sampler = clCreateSampler( context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &err); test_error(err, "clCreateSampler failed"); @@ -143,6 +128,21 @@ static int run_kernel( cl_device_id device, cl_context context, cl_command_queue &read3d_kernel_code, "read3d"); test_error(err, "create_single_kernel_helper failed"); + err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, + 3 * sizeof(size_t), (size_t *)localThreads, NULL); + test_error(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES) failed"); + err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, + sizeof(size_t), &maxWorkgroupSize, NULL); + test_error(err, "clGetDeviceInfo(CL_KERNEL_WORK_GROUP_SIZE) failed\n"); + + localThreads[0] = + std::min({ localThreads[0], threads[0], maxWorkgroupSize }); + localThreads[1] = std::min( + { localThreads[1], threads[1], maxWorkgroupSize / localThreads[0] }); + localThreads[2] = + std::min({ localThreads[2], threads[2], + maxWorkgroupSize / (localThreads[0] * localThreads[1]) }); + // create kernel args object and set arg values. // set the args values err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjs[0]);