c11_atomics: Fix to iteratively reduce workgroup size (#939) (#941)

The generator function for atomic_fence uses the current workgroup size to determine the number of non-atomic variables per thread in the kernel, so the kernel must be regenerated whenever the workgroup size changes. However, regenerating the kernel can itself change the maximum workgroup size the kernel supports. This change introduces a loop that halves the workgroup size on each iteration, regenerating the kernel each time, until it finds a size that works (or exits when the group size reaches 1).

Change-Id: Ic32fe967e32de6643e01c6775f4bddbcad0a299a
2026-03-19 06:09:01 +00:00 · 2020-09-22 10:15:09 -06:00
parent 9178524d02
commit 60b731ce15
1 changed files with 58 additions and 33 deletions
--- a/test_conformance/c11_atomics/common.h
+++ b/test_conformance/c11_atomics/common.h
@@ -962,51 +962,76 @@ int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(cl_device_id dev

  if(deviceThreadCount > 0)
  {
-    cl_ulong usedLocalMemory;
-    cl_ulong totalLocalMemory;
-    cl_uint maxWorkGroupSize;
+      // This loop iteratively reduces the workgroup size by 2 and then
+      // re-generates the kernel with the reduced
+      // workgroup size until we find a size which is admissible for the kernel
+      // being run or reduce the wg size
+      // to the trivial case of 1 (which was separately verified to be accurate
+      // for the kernel being run)

-    // Set up the kernel code
-    programSource = PragmaHeader(deviceID)+ProgramHeader(numDestItems)+FunctionCode()+KernelCode(numDestItems);
-    programLine = programSource.c_str();
-    if (create_single_kernel_helper_with_build_options(
-            context, &program, &kernel, 1, &programLine, "test_atomic_kernel",
-            gOldAPI ? "" : nullptr))
-    {
-      return -1;
-    }
-    if(gDebug)
-    {
-      log_info("Program source:\n");
-      log_info("%s\n", programLine);
-    }
-    // tune up work sizes based on kernel info
-    error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL);
-    test_error(error, "Unable to obtain max work group size for device and kernel combo");
+      while ((CurrentGroupSize() > 1))
+      {
+          // Re-generate the kernel code with the current group size
+          if (kernel) clReleaseKernel(kernel);
+          if (program) clReleaseProgram(program);
+          programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems)
+              + FunctionCode() + KernelCode(numDestItems);
+          programLine = programSource.c_str();
+          if (create_single_kernel_helper_with_build_options(
+                  context, &program, &kernel, 1, &programLine,
+                  "test_atomic_kernel", gOldAPI ? "" : nullptr))
+          {
+              return -1;
+          }
+          // Get work group size for the new kernel
+          error = clGetKernelWorkGroupInfo(kernel, deviceID,
+                                           CL_KERNEL_WORK_GROUP_SIZE,
+                                           sizeof(groupSize), &groupSize, NULL);
+          test_error(error,
+                     "Unable to obtain max work group size for device and "
+                     "kernel combo");

-    if(LocalMemory())
-    {
-      error = clGetKernelWorkGroupInfo (kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(usedLocalMemory), &usedLocalMemory, NULL);
-      test_error(error, "clGetKernelWorkGroupInfo failed");
+          if (LocalMemory())
+          {
+              cl_ulong usedLocalMemory;
+              cl_ulong totalLocalMemory;
+              cl_uint maxWorkGroupSize;

-      error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(totalLocalMemory), &totalLocalMemory, NULL);
-      test_error(error, "clGetDeviceInfo failed");
+              error = clGetKernelWorkGroupInfo(
+                  kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
+                  sizeof(usedLocalMemory), &usedLocalMemory, NULL);
+              test_error(error, "clGetKernelWorkGroupInfo failed");

-      // We know that each work-group is going to use typeSize * deviceThreadCount bytes of local memory
-      // so pick the maximum value for deviceThreadCount that uses all the local memory.
-      maxWorkGroupSize = ((totalLocalMemory - usedLocalMemory) / typeSize);
+              error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
+                                      sizeof(totalLocalMemory),
+                                      &totalLocalMemory, NULL);
+              test_error(error, "clGetDeviceInfo failed");

-      if(maxWorkGroupSize < groupSize)
-        groupSize = maxWorkGroupSize;
-    }
+              // We know that each work-group is going to use typeSize *
+              // deviceThreadCount bytes of local memory
+              // so pick the maximum value for deviceThreadCount that uses all
+              // the local memory.
+              maxWorkGroupSize =
+                  ((totalLocalMemory - usedLocalMemory) / typeSize);

-    CurrentGroupSize((cl_uint)groupSize);
+              if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize;
+          }
+          if (CurrentGroupSize() <= groupSize)
+              break;
+          else
+              CurrentGroupSize(CurrentGroupSize() / 2);
+      }
    if(CurrentGroupSize() > deviceThreadCount)
      CurrentGroupSize(deviceThreadCount);
    if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
      deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
    threadCount = deviceThreadCount+hostThreadCount;
  }
+  if (gDebug)
+  {
+      log_info("Program source:\n");
+      log_info("%s\n", programLine);
+  }
  if(deviceThreadCount > 0)
    log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, CurrentGroupSize());
  if(hostThreadCount > 0)