c11_atomics: Fix to iteratively reduce workgroup size (#939) (#941)

The generator function for atomic_fence uses the current workgroup
size to determine the number of non-atomic variables per thread
in the kernel. The kernel should hence be regenerated when the
workgroup size changes.

However, regenerating the kernel can itself change the workgroup
size. This change introduces an iterative loop that halves the
workgroup size each time, re-generating the kernel until
we find one that works (or exit at groupsize == 1).

Change-Id: Ic32fe967e32de6643e01c6775f4bddbcad0a299a
This commit is contained in:
Sreelakshmi Haridas Maruthur
2020-09-22 10:15:09 -06:00
committed by GitHub
parent 9178524d02
commit 60b731ce15

View File

@@ -962,51 +962,76 @@ int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(cl_device_id dev
if(deviceThreadCount > 0)
{
cl_ulong usedLocalMemory;
cl_ulong totalLocalMemory;
cl_uint maxWorkGroupSize;
// This loop iteratively halves the workgroup size and then
// re-generates the kernel with the reduced
// workgroup size until we find a size which is admissible for the kernel
// being run, or until the workgroup size is reduced
// to the trivial case of 1 (which was separately verified to be accurate
// for the kernel being run)
// Set up the kernel code
programSource = PragmaHeader(deviceID)+ProgramHeader(numDestItems)+FunctionCode()+KernelCode(numDestItems);
programLine = programSource.c_str();
if (create_single_kernel_helper_with_build_options(
context, &program, &kernel, 1, &programLine, "test_atomic_kernel",
gOldAPI ? "" : nullptr))
{
return -1;
}
if(gDebug)
{
log_info("Program source:\n");
log_info("%s\n", programLine);
}
// tune up work sizes based on kernel info
error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL);
test_error(error, "Unable to obtain max work group size for device and kernel combo");
while ((CurrentGroupSize() > 1))
{
// Re-generate the kernel code with the current group size
if (kernel) clReleaseKernel(kernel);
if (program) clReleaseProgram(program);
programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems)
+ FunctionCode() + KernelCode(numDestItems);
programLine = programSource.c_str();
if (create_single_kernel_helper_with_build_options(
context, &program, &kernel, 1, &programLine,
"test_atomic_kernel", gOldAPI ? "" : nullptr))
{
return -1;
}
// Get work group size for the new kernel
error = clGetKernelWorkGroupInfo(kernel, deviceID,
CL_KERNEL_WORK_GROUP_SIZE,
sizeof(groupSize), &groupSize, NULL);
test_error(error,
"Unable to obtain max work group size for device and "
"kernel combo");
if(LocalMemory())
{
error = clGetKernelWorkGroupInfo (kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(usedLocalMemory), &usedLocalMemory, NULL);
test_error(error, "clGetKernelWorkGroupInfo failed");
if (LocalMemory())
{
cl_ulong usedLocalMemory;
cl_ulong totalLocalMemory;
cl_uint maxWorkGroupSize;
error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(totalLocalMemory), &totalLocalMemory, NULL);
test_error(error, "clGetDeviceInfo failed");
error = clGetKernelWorkGroupInfo(
kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
sizeof(usedLocalMemory), &usedLocalMemory, NULL);
test_error(error, "clGetKernelWorkGroupInfo failed");
// We know that each work-group is going to use typeSize * deviceThreadCount bytes of local memory
// so pick the maximum value for deviceThreadCount that uses all the local memory.
maxWorkGroupSize = ((totalLocalMemory - usedLocalMemory) / typeSize);
error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
sizeof(totalLocalMemory),
&totalLocalMemory, NULL);
test_error(error, "clGetDeviceInfo failed");
if(maxWorkGroupSize < groupSize)
groupSize = maxWorkGroupSize;
}
// We know that each work-group is going to use typeSize *
// deviceThreadCount bytes of local memory
// so pick the maximum value for deviceThreadCount that uses all
// the local memory.
maxWorkGroupSize =
((totalLocalMemory - usedLocalMemory) / typeSize);
CurrentGroupSize((cl_uint)groupSize);
if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize;
}
if (CurrentGroupSize() <= groupSize)
break;
else
CurrentGroupSize(CurrentGroupSize() / 2);
}
if(CurrentGroupSize() > deviceThreadCount)
CurrentGroupSize(deviceThreadCount);
if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
threadCount = deviceThreadCount+hostThreadCount;
}
if (gDebug)
{
log_info("Program source:\n");
log_info("%s\n", programLine);
}
if(deviceThreadCount > 0)
log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, CurrentGroupSize());
if(hostThreadCount > 0)