From 60b731ce15f1fa806bb694fcf45d7a73af640e75 Mon Sep 17 00:00:00 2001
From: Sreelakshmi Haridas Maruthur <sharidas@quicinc.com>
Date: Tue, 22 Sep 2020 10:15:09 -0600
Subject: [PATCH] c11_atomics: Fix to iteratively reduce workgroup size  (#939)
 (#941)

The generator function for atomic_fence uses the current workgroup
size to determine the number of non-atomic variables per thread
in the kernel. The kernel should therefore be regenerated whenever
the workgroup size changes.

However, regenerating the kernel can itself change the workgroup
size. This change introduces an iterative loop that halves the
workgroup size on each pass, regenerating the kernel each time, until
we find a size that works (or exit once the group size reaches 1).

Change-Id: Ic32fe967e32de6643e01c6775f4bddbcad0a299a
---
 test_conformance/c11_atomics/common.h | 91 +++++++++++++++++----------
 1 file changed, 58 insertions(+), 33 deletions(-)

diff --git a/test_conformance/c11_atomics/common.h b/test_conformance/c11_atomics/common.h
index 360ab45e..887ca97b 100644
--- a/test_conformance/c11_atomics/common.h
+++ b/test_conformance/c11_atomics/common.h
@@ -962,51 +962,76 @@ int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(cl_device_id dev
 
   if(deviceThreadCount > 0)
   {
-    cl_ulong usedLocalMemory;
-    cl_ulong totalLocalMemory;
-    cl_uint maxWorkGroupSize;
+      // This loop iteratively halves the workgroup size, re-generating
+      // the kernel with the reduced workgroup size on each pass.
+      // It stops when it finds a size that is admissible for the kernel
+      // being run, or when the workgroup size has been reduced
+      // to the trivial case of 1 (which was separately verified to be
+      // accurate for the kernel being run).
 
-    // Set up the kernel code
-    programSource = PragmaHeader(deviceID)+ProgramHeader(numDestItems)+FunctionCode()+KernelCode(numDestItems);
-    programLine = programSource.c_str();
-    if (create_single_kernel_helper_with_build_options(
-            context, &program, &kernel, 1, &programLine, "test_atomic_kernel",
-            gOldAPI ? "" : nullptr))
-    {
-      return -1;
-    }
-    if(gDebug)
-    {
-      log_info("Program source:\n");
-      log_info("%s\n", programLine);
-    }
-    // tune up work sizes based on kernel info
-    error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL);
-    test_error(error, "Unable to obtain max work group size for device and kernel combo");
+      while ((CurrentGroupSize() > 1))
+      {
+          // Re-generate the kernel code with the current group size
+          if (kernel) clReleaseKernel(kernel);
+          if (program) clReleaseProgram(program);
+          programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems)
+              + FunctionCode() + KernelCode(numDestItems);
+          programLine = programSource.c_str();
+          if (create_single_kernel_helper_with_build_options(
+                  context, &program, &kernel, 1, &programLine,
+                  "test_atomic_kernel", gOldAPI ? "" : nullptr))
+          {
+              return -1;
+          }
+          // Get work group size for the new kernel
+          error = clGetKernelWorkGroupInfo(kernel, deviceID,
+                                           CL_KERNEL_WORK_GROUP_SIZE,
+                                           sizeof(groupSize), &groupSize, NULL);
+          test_error(error,
+                     "Unable to obtain max work group size for device and "
+                     "kernel combo");
 
-    if(LocalMemory())
-    {
-      error = clGetKernelWorkGroupInfo (kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(usedLocalMemory), &usedLocalMemory, NULL);
-      test_error(error, "clGetKernelWorkGroupInfo failed");
+          if (LocalMemory())
+          {
+              cl_ulong usedLocalMemory;
+              cl_ulong totalLocalMemory;
+              cl_uint maxWorkGroupSize;
 
-      error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(totalLocalMemory), &totalLocalMemory, NULL);
-      test_error(error, "clGetDeviceInfo failed");
+              error = clGetKernelWorkGroupInfo(
+                  kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
+                  sizeof(usedLocalMemory), &usedLocalMemory, NULL);
+              test_error(error, "clGetKernelWorkGroupInfo failed");
 
-      // We know that each work-group is going to use typeSize * deviceThreadCount bytes of local memory
-      // so pick the maximum value for deviceThreadCount that uses all the local memory.
-      maxWorkGroupSize = ((totalLocalMemory - usedLocalMemory) / typeSize);
+              error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
+                                      sizeof(totalLocalMemory),
+                                      &totalLocalMemory, NULL);
+              test_error(error, "clGetDeviceInfo failed");
 
-      if(maxWorkGroupSize < groupSize)
-        groupSize = maxWorkGroupSize;
-    }
+              // We know that each work-group is going to use typeSize *
+              // deviceThreadCount bytes of local memory
+              // so pick the maximum value for deviceThreadCount that uses all
+              // the local memory.
+              maxWorkGroupSize =
+                  ((totalLocalMemory - usedLocalMemory) / typeSize);
 
-    CurrentGroupSize((cl_uint)groupSize);
+              if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize;
+          }
+          if (CurrentGroupSize() <= groupSize)
+              break;
+          else
+              CurrentGroupSize(CurrentGroupSize() / 2);
+      }
     if(CurrentGroupSize() > deviceThreadCount)
       CurrentGroupSize(deviceThreadCount);
     if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
       deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
     threadCount = deviceThreadCount+hostThreadCount;
   }
+  if (gDebug)
+  {
+      log_info("Program source:\n");
+      log_info("%s\n", programLine);
+  }
   if(deviceThreadCount > 0)
     log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, CurrentGroupSize());
   if(hostThreadCount > 0)