From 6da9c6b68f9643a077f7281451b59f444a77a991 Mon Sep 17 00:00:00 2001
From: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Date: Wed, 11 Aug 2021 19:06:10 +0200
Subject: [PATCH] Fix double free in c11_atomics tests for SVM allocations
 (#1286)

* Only Clang format changes

* Fix double free object for SVM allocations

* Fix double free - review fixes
---
 test_conformance/c11_atomics/common.h | 2510 +++++++++++++------------
 1 file changed, 1358 insertions(+), 1152 deletions(-)
diff --git a/test_conformance/c11_atomics/common.h b/test_conformance/c11_atomics/common.h
index bbcc68c6..d30259f0 100644
--- a/test_conformance/c11_atomics/common.h
+++ b/test_conformance/c11_atomics/common.h
@@ -28,10 +28,9 @@
 #define MAX_DEVICE_THREADS (gHost ? 0U : gMaxDeviceThreads)
 #define MAX_HOST_THREADS GetThreadCount()
 
-#define EXECUTE_TEST(error, test)\
-  error |= test;\
-  if(error && !gContinueOnError)\
-  return error;
+#define EXECUTE_TEST(error, test)                                              \
+    error |= test;                                                             \
+    if (error && !gContinueOnError) return error;
 
 enum TExplicitAtomicType
 {
@@ -57,764 +56,918 @@ enum TExplicitMemoryScopeType
     MEMORY_SCOPE_ALL_SVM_DEVICES
 };
 
-extern bool gHost; // temporary flag for testing native host threads (test verification)
+extern bool
+    gHost; // temporary flag for testing native host threads (test verification)
 extern bool gOldAPI; // temporary flag for testing with old API (OpenCL 1.2)
 extern bool gContinueOnError; // execute all cases even when errors detected
-extern bool gNoGlobalVariables; // disable cases with global atomics in program scope
+extern bool
+    gNoGlobalVariables; // disable cases with global atomics in program scope
 extern bool gNoGenericAddressSpace; // disable cases with generic address space
 extern bool gUseHostPtr; // use malloc/free instead of clSVMAlloc/clSVMFree
 extern bool gDebug; // print OpenCL kernel code
-extern int gInternalIterations; // internal test iterations for atomic operation, sufficient to verify atomicity
-extern int gMaxDeviceThreads; // maximum number of threads executed on OCL device
+extern int gInternalIterations; // internal test iterations for atomic
+                                // operation, sufficient to verify atomicity
+extern int
+    gMaxDeviceThreads; // maximum number of threads executed on OCL device
 extern cl_device_atomic_capabilities gAtomicMemCap,
     gAtomicFenceCap; // atomic memory and fence capabilities for this device
 
-extern const char *get_memory_order_type_name(TExplicitMemoryOrderType orderType);
-extern const char *get_memory_scope_type_name(TExplicitMemoryScopeType scopeType);
+extern const char *
+get_memory_order_type_name(TExplicitMemoryOrderType orderType);
+extern const char *
+get_memory_scope_type_name(TExplicitMemoryScopeType scopeType);
 
 extern cl_int getSupportedMemoryOrdersAndScopes(
     cl_device_id device, std::vector<TExplicitMemoryOrderType> &memoryOrders,
     std::vector<TExplicitMemoryScopeType> &memoryScopes);
 
-class AtomicTypeInfo
-{
+class AtomicTypeInfo {
 public:
-  TExplicitAtomicType _type;
-  AtomicTypeInfo(TExplicitAtomicType type): _type(type) {}
-  cl_uint Size(cl_device_id device);
-  const char* AtomicTypeName();
-  const char* RegularTypeName();
-  const char* AddSubOperandTypeName();
-  int IsSupported(cl_device_id device);
+    TExplicitAtomicType _type;
+    AtomicTypeInfo(TExplicitAtomicType type): _type(type) {}
+    cl_uint Size(cl_device_id device);
+    const char *AtomicTypeName();
+    const char *RegularTypeName();
+    const char *AddSubOperandTypeName();
+    int IsSupported(cl_device_id device);
 };
 
-template<typename HostDataType>
-class AtomicTypeExtendedInfo : public AtomicTypeInfo
-{
+template <typename HostDataType>
+class AtomicTypeExtendedInfo : public AtomicTypeInfo {
 public:
-  AtomicTypeExtendedInfo(TExplicitAtomicType type) : AtomicTypeInfo(type) {}
-  HostDataType MinValue();
-  HostDataType MaxValue();
-  HostDataType SpecialValue(cl_uchar x)
-  {
-    HostDataType tmp;
-    cl_uchar *ptr = (cl_uchar*)&tmp;
-    for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_uchar); i++)
-      ptr[i] = x;
-    return tmp;
-  }
-  HostDataType SpecialValue(cl_ushort x)
-  {
-    HostDataType tmp;
-    cl_ushort *ptr = (cl_ushort*)&tmp;
-    for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_ushort); i++)
-      ptr[i] = x;
-    return tmp;
-  }
+    AtomicTypeExtendedInfo(TExplicitAtomicType type): AtomicTypeInfo(type) {}
+    HostDataType MinValue();
+    HostDataType MaxValue();
+    HostDataType SpecialValue(cl_uchar x)
+    {
+        HostDataType tmp;
+        cl_uchar *ptr = (cl_uchar *)&tmp;
+        for (cl_uint i = 0; i < sizeof(HostDataType) / sizeof(cl_uchar); i++)
+            ptr[i] = x;
+        return tmp;
+    }
+    HostDataType SpecialValue(cl_ushort x)
+    {
+        HostDataType tmp;
+        cl_ushort *ptr = (cl_ushort *)&tmp;
+        for (cl_uint i = 0; i < sizeof(HostDataType) / sizeof(cl_ushort); i++)
+            ptr[i] = x;
+        return tmp;
+    }
 };
 
-class CTest  {
+class CTest {
 public:
-  virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) = 0;
+    virtual int Execute(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements) = 0;
 };
 
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTest : CTest
-{
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTest : CTest {
 public:
-  typedef struct {
-    CBasicTest *test;
-    cl_uint tid;
-    cl_uint threadCount;
-    volatile HostAtomicType *destMemory;
-    HostDataType *oldValues;
-  } THostThreadContext;
-  static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id, void *userInfo)
-  {
-    THostThreadContext *threadContext = ((THostThreadContext*)userInfo)+job_id;
-    threadContext->test->HostFunction(threadContext->tid, threadContext->threadCount, threadContext->destMemory, threadContext->oldValues);
-    return 0;
-  }
-  CBasicTest(TExplicitAtomicType dataType, bool useSVM) : CTest(),
-    _maxDeviceThreads(MAX_DEVICE_THREADS),
-    _dataType(dataType), _useSVM(useSVM), _startValue(255),
-    _localMemory(false), _declaredInProgram(false),
-    _usedInFunction(false), _genericAddrSpace(false),
-    _oldValueCheck(true), _localRefValues(false),
-    _maxGroupSize(0), _passCount(0), _iterations(gInternalIterations)
-  {
-  }
-  virtual ~CBasicTest()
-  {
-    if(_passCount)
-      log_info("  %u tests executed successfully for %s\n", _passCount, DataType().AtomicTypeName());
-  }
-  virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
-  {
-    return 1;
-  }
-  virtual cl_uint NumNonAtomicVariablesPerThread()
-  {
-    return 1;
-  }
-  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
-  {
-    return false;
-  }
-  virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d)
-  {
-    return false;
-  }
-  virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
-  {
-    return false;
-  }
-  virtual std::string PragmaHeader(cl_device_id deviceID);
-  virtual std::string ProgramHeader(cl_uint maxNumDestItems);
-  virtual std::string FunctionCode();
-  virtual std::string KernelCode(cl_uint maxNumDestItems);
-  virtual std::string ProgramCore() = 0;
-  virtual std::string SingleTestName()
-  {
-    std::string testName = LocalMemory() ? "local" : "global";
-    testName += " ";
-    testName += DataType().AtomicTypeName();
-    if(DeclaredInProgram())
+    typedef struct
     {
-      testName += " declared in program";
-    }
-    if(DeclaredInProgram() && UsedInFunction())
-      testName += ",";
-    if(UsedInFunction())
+        CBasicTest *test;
+        cl_uint tid;
+        cl_uint threadCount;
+        volatile HostAtomicType *destMemory;
+        HostDataType *oldValues;
+    } THostThreadContext;
+    static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id,
+                                     void *userInfo)
     {
-      testName += " used in ";
-      if(GenericAddrSpace())
-        testName += "generic ";
-      testName += "function";
-    }
-    return testName;
-  }
-  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue);
-  int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    int error = 0;
-    UsedInFunction(false);
-    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
-    UsedInFunction(true);
-    GenericAddrSpace(false);
-    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
-    GenericAddrSpace(true);
-    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
-    GenericAddrSpace(false);
-    return error;
-  }
-  int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    int error = 0;
-    DeclaredInProgram(false);
-    EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
-    if(!UseSVM())
-    {
-      DeclaredInProgram(true);
-      EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
-    }
-    return error;
-  }
-  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    int error = 0;
-    if(_maxDeviceThreads > 0 && !UseSVM())
-    {
-      LocalMemory(true);
-      EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
-    }
-    if(_maxDeviceThreads+MaxHostThreads() > 0)
-    {
-      LocalMemory(false);
-      EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
-    }
-    return error;
-  }
-  virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
-  {
-    if(sizeof(HostAtomicType) != DataType().Size(deviceID))
-    {
-      log_info("Invalid test: Host atomic type size (%u) is different than OpenCL type size (%u)\n", (cl_uint)sizeof(HostAtomicType), DataType().Size(deviceID));
-      return -1;
-    }
-    if(sizeof(HostAtomicType) != sizeof(HostDataType))
-    {
-      log_info("Invalid test: Host atomic type size (%u) is different than corresponding type size (%u)\n", (cl_uint)sizeof(HostAtomicType), (cl_uint)sizeof(HostDataType));
-      return -1;
-    }
-    // Verify we can run first
-    if(UseSVM() && !gUseHostPtr)
-    {
-      cl_device_svm_capabilities caps;
-      cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, sizeof(caps), &caps, 0);
-      test_error(error, "clGetDeviceInfo failed");
-      if((caps & CL_DEVICE_SVM_ATOMICS) == 0)
-      {
-        log_info("\t%s - SVM_ATOMICS not supported\n", DataType().AtomicTypeName());
-        // implicit pass
+        THostThreadContext *threadContext =
+            ((THostThreadContext *)userInfo) + job_id;
+        threadContext->test->HostFunction(
+            threadContext->tid, threadContext->threadCount,
+            threadContext->destMemory, threadContext->oldValues);
         return 0;
-      }
     }
-    if(!DataType().IsSupported(deviceID))
+    CBasicTest(TExplicitAtomicType dataType, bool useSVM)
+        : CTest(), _maxDeviceThreads(MAX_DEVICE_THREADS), _dataType(dataType),
+          _useSVM(useSVM), _startValue(255), _localMemory(false),
+          _declaredInProgram(false), _usedInFunction(false),
+          _genericAddrSpace(false), _oldValueCheck(true),
+          _localRefValues(false), _maxGroupSize(0), _passCount(0),
+          _iterations(gInternalIterations)
+    {}
+    virtual ~CBasicTest()
     {
-      log_info("\t%s not supported\n", DataType().AtomicTypeName());
-      // implicit pass or host test (debug feature)
-      if(UseSVM())
-        return 0;
-      _maxDeviceThreads = 0;
+        if (_passCount)
+            log_info("  %u tests executed successfully for %s\n", _passCount,
+                     DataType().AtomicTypeName());
     }
-    if(_maxDeviceThreads+MaxHostThreads() == 0)
-      return 0;
-    return ExecuteForEachParameterSet(deviceID, context, queue);
-  }
-  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
-  {
-    log_info("Empty thread function %u\n", (cl_uint)tid);
-  }
-  AtomicTypeExtendedInfo<HostDataType> DataType() const
-  {
-    return AtomicTypeExtendedInfo<HostDataType>(_dataType);
-  }
-  cl_uint _maxDeviceThreads;
-  virtual cl_uint MaxHostThreads()
-  {
-    if(UseSVM() || gHost)
-      return MAX_HOST_THREADS;
-    else
-      return 0;
-  }
-
-  int CheckCapabilities(TExplicitMemoryScopeType memoryScope,
-                        TExplicitMemoryOrderType memoryOrder)
-  {
-      /*
-          Differentiation between atomic fence and other atomic operations
-          does not need to occur here.
-
-          The initialisation of this test checks that the minimum required
-          capabilities are supported by this device.
-
-          The following switches allow the test to skip if optional capabilites
-          are not supported by the device.
-        */
-      switch (memoryScope)
-      {
-          case MEMORY_SCOPE_EMPTY: {
-              break;
-          }
-          case MEMORY_SCOPE_WORK_GROUP: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          case MEMORY_SCOPE_DEVICE: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          case MEMORY_SCOPE_ALL_DEVICES: // fallthough
-          case MEMORY_SCOPE_ALL_SVM_DEVICES: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          default: {
-              log_info("Invalid memory scope\n");
-              break;
-          }
-      }
-
-      switch (memoryOrder)
-      {
-          case MEMORY_ORDER_EMPTY: {
-              break;
-          }
-          case MEMORY_ORDER_RELAXED: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          case MEMORY_ORDER_ACQUIRE:
-          case MEMORY_ORDER_RELEASE:
-          case MEMORY_ORDER_ACQ_REL: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          case MEMORY_ORDER_SEQ_CST: {
-              if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0)
-              {
-                  return TEST_SKIPPED_ITSELF;
-              }
-              break;
-          }
-          default: {
-              log_info("Invalid memory order\n");
-              break;
-          }
-      }
-
-      return 0;
-  }
-  virtual bool SVMDataBufferAllSVMConsistent() {return false;}
-  bool UseSVM() {return _useSVM;}
-  void StartValue(HostDataType startValue) {_startValue = startValue;}
-  HostDataType StartValue() {return _startValue;}
-  void LocalMemory(bool local) {_localMemory = local;}
-  bool LocalMemory() {return _localMemory;}
-  void DeclaredInProgram(bool declaredInProgram) {_declaredInProgram = declaredInProgram;}
-  bool DeclaredInProgram() {return _declaredInProgram;}
-  void UsedInFunction(bool local) {_usedInFunction = local;}
-  bool UsedInFunction() {return _usedInFunction;}
-  void GenericAddrSpace(bool genericAddrSpace) {_genericAddrSpace = genericAddrSpace;}
-  bool GenericAddrSpace() {return _genericAddrSpace;}
-  void OldValueCheck(bool check) {_oldValueCheck = check;}
-  bool OldValueCheck() {return _oldValueCheck;}
-  void LocalRefValues(bool localRefValues) {_localRefValues = localRefValues;}
-  bool LocalRefValues() {return _localRefValues;}
-  void MaxGroupSize(cl_uint maxGroupSize) {_maxGroupSize = maxGroupSize;}
-  cl_uint MaxGroupSize() {return _maxGroupSize;}
-  void CurrentGroupSize(cl_uint currentGroupSize)
-  {
-    if(MaxGroupSize() && MaxGroupSize() < currentGroupSize)
-      _currentGroupSize = MaxGroupSize();
-    else
-      _currentGroupSize = currentGroupSize;
-  }
-  cl_uint CurrentGroupSize() {return _currentGroupSize;}
-  virtual cl_uint CurrentGroupNum(cl_uint threadCount)
-  {
-    if(threadCount == 0)
-      return 0;
-    if(LocalMemory())
-      return 1;
-    return threadCount/CurrentGroupSize();
-  }
-  cl_int Iterations() {return _iterations;}
-  std::string IterationsStr() {std::stringstream ss; ss << _iterations; return ss.str();}
-private:
-  const TExplicitAtomicType _dataType;
-  const bool _useSVM;
-  HostDataType	_startValue;
-  bool _localMemory;
-  bool _declaredInProgram;
-  bool _usedInFunction;
-  bool _genericAddrSpace;
-  bool _oldValueCheck;
-  bool _localRefValues;
-  cl_uint _maxGroupSize;
-  cl_uint _currentGroupSize;
-  cl_uint _passCount;
-  const cl_int _iterations;
-};
-
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestMemOrderScope : public CBasicTest<HostAtomicType, HostDataType>
-{
-public:
-  using CBasicTest<HostAtomicType, HostDataType>::LocalMemory;
-  using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize;
-  using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
-  CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-  }
-  virtual std::string ProgramHeader(cl_uint maxNumDestItems)
-  {
-    std::string header;
-    if(gOldAPI)
+    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
     {
-      std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s";
-      header +=
-        "#define atomic_store_explicit(x,y,o"+s+")                     atomic_store(x,y)\n"
-        "#define atomic_load_explicit(x,o"+s+")                        atomic_load(x)\n"
-        "#define atomic_exchange_explicit(x,y,o"+s+")                  atomic_exchange(x,y)\n"
-        "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_strong(x,y,z)\n"
-        "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"+s+")   atomic_compare_exchange_weak(x,y,z)\n"
-        "#define atomic_fetch_add_explicit(x,y,o"+s+")                 atomic_fetch_add(x,y)\n"
-        "#define atomic_fetch_sub_explicit(x,y,o"+s+")                 atomic_fetch_sub(x,y)\n"
-        "#define atomic_fetch_or_explicit(x,y,o"+s+")                  atomic_fetch_or(x,y)\n"
-        "#define atomic_fetch_xor_explicit(x,y,o"+s+")                 atomic_fetch_xor(x,y)\n"
-        "#define atomic_fetch_and_explicit(x,y,o"+s+")                 atomic_fetch_and(x,y)\n"
-        "#define atomic_fetch_min_explicit(x,y,o"+s+")                 atomic_fetch_min(x,y)\n"
-        "#define atomic_fetch_max_explicit(x,y,o"+s+")                 atomic_fetch_max(x,y)\n"
-        "#define atomic_flag_test_and_set_explicit(x,o"+s+")           atomic_flag_test_and_set(x)\n"
-        "#define atomic_flag_clear_explicit(x,o"+s+")                  atomic_flag_clear(x)\n";
+        return 1;
     }
-    return header+CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(maxNumDestItems);
-  }
-  virtual std::string SingleTestName()
-  {
-    std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
-    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
+    virtual cl_uint NumNonAtomicVariablesPerThread() { return 1; }
+    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
+                               HostDataType *startRefValues,
+                               cl_uint whichDestValue)
     {
-      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
+        return false;
     }
-    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
+    virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
+                              MTdata d)
     {
-      testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
+        return false;
     }
-    return testName;
-  }
-  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    if(LocalMemory() &&
-      MemoryScope() != MEMORY_SCOPE_EMPTY &&
-      MemoryScope() != MEMORY_SCOPE_WORK_GROUP) //memory scope should only be used for global memory
-      return 0;
-    if(MemoryScope() == MEMORY_SCOPE_DEVICE)
-      MaxGroupSize(16); // increase number of groups by forcing smaller group size
-    else
-      MaxGroupSize(0); // group size limited by device capabilities
-
-    if (CheckCapabilities(MemoryScope(), MemoryOrder()) == TEST_SKIPPED_ITSELF)
-        return 0; // skip test - not applicable
-
-    return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue);
-  }
-  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    // repeat test for each reasonable memory order/scope combination
-    std::vector<TExplicitMemoryOrderType> memoryOrder;
-    std::vector<TExplicitMemoryScopeType> memoryScope;
-    int error = 0;
-
-    // For OpenCL-3.0 and later some orderings and scopes are optional, so here
-    // we query for the supported ones.
-    test_error_ret(
-        getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope),
-        "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);
-
-    for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
+    virtual bool VerifyRefs(bool &correct, cl_uint threadCount,
+                            HostDataType *refValues,
+                            HostAtomicType *finalValues)
     {
-      for(unsigned si = 0; si < memoryScope.size(); si++)
-      {
-        if(memoryOrder[oi] == MEMORY_ORDER_EMPTY && memoryScope[si] != MEMORY_SCOPE_EMPTY)
-          continue;
-        MemoryOrder(memoryOrder[oi]);
-        MemoryScope(memoryScope[si]);
-        EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
-      }
+        return false;
     }
-    return error;
-  }
-  void MemoryOrder(TExplicitMemoryOrderType memoryOrder) {_memoryOrder = memoryOrder;}
-  TExplicitMemoryOrderType MemoryOrder() {return _memoryOrder;}
-  std::string MemoryOrderStr()
-  {
-    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
-      return std::string(", ")+get_memory_order_type_name(MemoryOrder());
-    return "";
-  }
-  void MemoryScope(TExplicitMemoryScopeType memoryScope) {_memoryScope = memoryScope;}
-  TExplicitMemoryScopeType MemoryScope() {return _memoryScope;}
-  std::string MemoryScopeStr()
-  {
-    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
-      return std::string(", ")+get_memory_scope_type_name(MemoryScope());
-    return "";
-  }
-  std::string MemoryOrderScopeStr()
-  {
-    return MemoryOrderStr()+MemoryScopeStr();
-  }
-  virtual cl_uint CurrentGroupNum(cl_uint threadCount)
-  {
-    if(MemoryScope() == MEMORY_SCOPE_WORK_GROUP)
-      return 1;
-    return CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum(threadCount);
-  }
-  virtual cl_uint MaxHostThreads()
-  {
-      // block host threads execution for memory scope different than
-      // memory_scope_all_svm_devices
-      if (MemoryScope() == MEMORY_SCOPE_ALL_DEVICES
-          || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost)
-      {
-          return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads();
-      }
-      else
-      {
-          return 0;
-      }
-  }
-private:
-  TExplicitMemoryOrderType _memoryOrder;
-  TExplicitMemoryScopeType _memoryScope;
-};
-
-template<typename HostAtomicType, typename HostDataType>
-class CBasicTestMemOrder2Scope : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
-{
-public:
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr;
-  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
-  using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
-
-  CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
-  {
-  }
-  virtual std::string SingleTestName()
-  {
-    std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
-    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
-      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
-    if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
-      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder2())).substr(sizeof("memory"));
-    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
-      testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
-    return testName;
-  }
-  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
-  {
-    // repeat test for each reasonable memory order/scope combination
-    std::vector<TExplicitMemoryOrderType> memoryOrder;
-    std::vector<TExplicitMemoryScopeType> memoryScope;
-    int error = 0;
-
-    // For OpenCL-3.0 and later some orderings and scopes are optional, so here
-    // we query for the supported ones.
-    test_error_ret(
-        getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope),
-        "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);
-
-    for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
+    virtual std::string PragmaHeader(cl_device_id deviceID);
+    virtual std::string ProgramHeader(cl_uint maxNumDestItems);
+    virtual std::string FunctionCode();
+    virtual std::string KernelCode(cl_uint maxNumDestItems);
+    virtual std::string ProgramCore() = 0;
+    virtual std::string SingleTestName()
     {
-      for(unsigned o2i = 0; o2i < memoryOrder.size(); o2i++)
-      {
-        for(unsigned si = 0; si < memoryScope.size(); si++)
+        std::string testName = LocalMemory() ? "local" : "global";
+        testName += " ";
+        testName += DataType().AtomicTypeName();
+        if (DeclaredInProgram())
         {
-          if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
-            && memoryOrder[oi] != memoryOrder[o2i])
-            continue; // both memory order arguments must be set (or none)
-          if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
-            && memoryScope[si] != MEMORY_SCOPE_EMPTY)
-            continue; // memory scope without memory order is not allowed
-          MemoryOrder(memoryOrder[oi]);
-          MemoryOrder2(memoryOrder[o2i]);
-          MemoryScope(memoryScope[si]);
-
-          if (CheckCapabilities(MemoryScope(), MemoryOrder())
-              == TEST_SKIPPED_ITSELF)
-              continue; // skip test - not applicable
-
-          if (CheckCapabilities(MemoryScope(), MemoryOrder2())
-              == TEST_SKIPPED_ITSELF)
-              continue; // skip test - not applicable
-
-          EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
+            testName += " declared in program";
         }
-      }
+        if (DeclaredInProgram() && UsedInFunction()) testName += ",";
+        if (UsedInFunction())
+        {
+            testName += " used in ";
+            if (GenericAddrSpace()) testName += "generic ";
+            testName += "function";
+        }
+        return testName;
     }
-    return error;
-  }
-  void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail) {_memoryOrder2 = memoryOrderFail;}
-  TExplicitMemoryOrderType MemoryOrder2() {return _memoryOrder2;}
-  std::string MemoryOrderFailStr()
-  {
-    if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
-      return std::string(", ")+get_memory_order_type_name(MemoryOrder2());
-    return "";
-  }
-  std::string MemoryOrderScope()
-  {
-    return MemoryOrderStr()+MemoryOrderFailStr()+MemoryScopeStr();
-  }
+    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue);
+    int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue)
+    {
+        int error = 0;
+        UsedInFunction(false);
+        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
+        UsedInFunction(true);
+        GenericAddrSpace(false);
+        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
+        GenericAddrSpace(true);
+        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
+        GenericAddrSpace(false);
+        return error;
+    }
+    int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context,
+                                      cl_command_queue queue)
+    {
+        int error = 0;
+        DeclaredInProgram(false);
+        EXECUTE_TEST(error,
+                     ExecuteForEachPointerType(deviceID, context, queue));
+        if (!UseSVM())
+        {
+            DeclaredInProgram(true);
+            EXECUTE_TEST(error,
+                         ExecuteForEachPointerType(deviceID, context, queue));
+        }
+        return error;
+    }
+    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
+                                           cl_context context,
+                                           cl_command_queue queue)
+    {
+        int error = 0;
+        if (_maxDeviceThreads > 0 && !UseSVM())
+        {
+            LocalMemory(true);
+            EXECUTE_TEST(
+                error, ExecuteForEachDeclarationType(deviceID, context, queue));
+        }
+        if (_maxDeviceThreads + MaxHostThreads() > 0)
+        {
+            LocalMemory(false);
+            EXECUTE_TEST(
+                error, ExecuteForEachDeclarationType(deviceID, context, queue));
+        }
+        return error;
+    }
+    virtual int Execute(cl_device_id deviceID, cl_context context,
+                        cl_command_queue queue, int num_elements)
+    {
+        if (sizeof(HostAtomicType) != DataType().Size(deviceID))
+        {
+            log_info("Invalid test: Host atomic type size (%u) is different "
+                     "than OpenCL type size (%u)\n",
+                     (cl_uint)sizeof(HostAtomicType),
+                     DataType().Size(deviceID));
+            return -1;
+        }
+        if (sizeof(HostAtomicType) != sizeof(HostDataType))
+        {
+            log_info("Invalid test: Host atomic type size (%u) is different "
+                     "than corresponding type size (%u)\n",
+                     (cl_uint)sizeof(HostAtomicType),
+                     (cl_uint)sizeof(HostDataType));
+            return -1;
+        }
+        // Verify we can run first
+        if (UseSVM() && !gUseHostPtr)
+        {
+            cl_device_svm_capabilities caps;
+            cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES,
+                                           sizeof(caps), &caps, 0);
+            test_error(error, "clGetDeviceInfo failed");
+            if ((caps & CL_DEVICE_SVM_ATOMICS) == 0)
+            {
+                log_info("\t%s - SVM_ATOMICS not supported\n",
+                         DataType().AtomicTypeName());
+                // implicit pass
+                return 0;
+            }
+        }
+        if (!DataType().IsSupported(deviceID))
+        {
+            log_info("\t%s not supported\n", DataType().AtomicTypeName());
+            // implicit pass or host test (debug feature)
+            if (UseSVM()) return 0;
+            _maxDeviceThreads = 0;
+        }
+        if (_maxDeviceThreads + MaxHostThreads() == 0) return 0;
+        return ExecuteForEachParameterSet(deviceID, context, queue);
+    }
+    virtual void HostFunction(cl_uint tid, cl_uint threadCount,
+                              volatile HostAtomicType *destMemory,
+                              HostDataType *oldValues)
+    {
+        log_info("Empty thread function %u\n", (cl_uint)tid);
+    }
+    AtomicTypeExtendedInfo<HostDataType> DataType() const
+    {
+        return AtomicTypeExtendedInfo<HostDataType>(_dataType);
+    }
+    cl_uint _maxDeviceThreads;
+    virtual cl_uint MaxHostThreads()
+    {
+        if (UseSVM() || gHost)
+            return MAX_HOST_THREADS;
+        else
+            return 0;
+    }
+
+    int CheckCapabilities(TExplicitMemoryScopeType memoryScope,
+                          TExplicitMemoryOrderType memoryOrder)
+    {
+        /*
+            Differentiation between atomic fence and other atomic operations
+            does not need to occur here.
+
+            The initialisation of this test checks that the minimum required
+            capabilities are supported by this device.
+
+            The following switches allow the test to skip if optional
+           capabilites are not supported by the device.
+          */
+        switch (memoryScope)
+        {
+            case MEMORY_SCOPE_EMPTY: {
+                break;
+            }
+            case MEMORY_SCOPE_WORK_GROUP: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            case MEMORY_SCOPE_DEVICE: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            case MEMORY_SCOPE_ALL_DEVICES: // fallthough
+            case MEMORY_SCOPE_ALL_SVM_DEVICES: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            default: {
+                log_info("Invalid memory scope\n");
+                break;
+            }
+        }
+
+        switch (memoryOrder)
+        {
+            case MEMORY_ORDER_EMPTY: {
+                break;
+            }
+            case MEMORY_ORDER_RELAXED: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            case MEMORY_ORDER_ACQUIRE:
+            case MEMORY_ORDER_RELEASE:
+            case MEMORY_ORDER_ACQ_REL: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            case MEMORY_ORDER_SEQ_CST: {
+                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0)
+                {
+                    return TEST_SKIPPED_ITSELF;
+                }
+                break;
+            }
+            default: {
+                log_info("Invalid memory order\n");
+                break;
+            }
+        }
+
+        return 0;
+    }
+    virtual bool SVMDataBufferAllSVMConsistent() { return false; }
+    bool UseSVM() { return _useSVM; }
+    void StartValue(HostDataType startValue) { _startValue = startValue; }
+    HostDataType StartValue() { return _startValue; }
+    void LocalMemory(bool local) { _localMemory = local; }
+    bool LocalMemory() { return _localMemory; }
+    void DeclaredInProgram(bool declaredInProgram)
+    {
+        _declaredInProgram = declaredInProgram;
+    }
+    bool DeclaredInProgram() { return _declaredInProgram; }
+    void UsedInFunction(bool local) { _usedInFunction = local; }
+    bool UsedInFunction() { return _usedInFunction; }
+    void GenericAddrSpace(bool genericAddrSpace)
+    {
+        _genericAddrSpace = genericAddrSpace;
+    }
+    bool GenericAddrSpace() { return _genericAddrSpace; }
+    void OldValueCheck(bool check) { _oldValueCheck = check; }
+    bool OldValueCheck() { return _oldValueCheck; }
+    void LocalRefValues(bool localRefValues)
+    {
+        _localRefValues = localRefValues;
+    }
+    bool LocalRefValues() { return _localRefValues; }
+    void MaxGroupSize(cl_uint maxGroupSize) { _maxGroupSize = maxGroupSize; }
+    cl_uint MaxGroupSize() { return _maxGroupSize; }
+    void CurrentGroupSize(cl_uint currentGroupSize)
+    {
+        if (MaxGroupSize() && MaxGroupSize() < currentGroupSize)
+            _currentGroupSize = MaxGroupSize();
+        else
+            _currentGroupSize = currentGroupSize;
+    }
+    cl_uint CurrentGroupSize() { return _currentGroupSize; }
+    virtual cl_uint CurrentGroupNum(cl_uint threadCount)
+    {
+        if (threadCount == 0) return 0;
+        if (LocalMemory()) return 1;
+        return threadCount / CurrentGroupSize();
+    }
+    cl_int Iterations() { return _iterations; }
+    std::string IterationsStr()
+    {
+        std::stringstream ss;
+        ss << _iterations;
+        return ss.str();
+    }
+
 private:
-  TExplicitMemoryOrderType _memoryOrder2;
+    const TExplicitAtomicType _dataType;
+    const bool _useSVM;
+    HostDataType _startValue;
+    bool _localMemory;
+    bool _declaredInProgram;
+    bool _usedInFunction;
+    bool _genericAddrSpace;
+    bool _oldValueCheck;
+    bool _localRefValues;
+    cl_uint _maxGroupSize;
+    cl_uint _currentGroupSize;
+    cl_uint _passCount;
+    const cl_int _iterations;
 };
 
-template<typename HostAtomicType, typename HostDataType>
-std::string CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID)
-{
-  std::string pragma;
-
-  if(gOldAPI)
-  {
-    pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n";
-    pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n";
-    pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n";
-    pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n";
-  }
-  // Create the pragma lines for this kernel
-  if(DataType().Size(deviceID) == 8)
-  {
-    pragma += "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
-    pragma += "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
-  }
-  if(_dataType == TYPE_ATOMIC_DOUBLE)
-    pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
-  return pragma;
-}
-
-template<typename HostAtomicType, typename HostDataType>
-std::string CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
-{
-  // Create the program header
-  std::string header;
-  std::string aTypeName = DataType().AtomicTypeName();
-  std::string cTypeName = DataType().RegularTypeName();
-  std::string argListForKernel;
-  std::string argListForFunction;
-  std::string argListNoTypes;
-  std::string functionPrototype;
-  std::string addressSpace = LocalMemory() ? "__local " : "__global ";
-
-  if(gOldAPI)
-  {
-    header += std::string("#define ")+aTypeName+" "+cTypeName+"\n"
-      "#define atomic_store(x,y)                                (*(x) = y)\n"
-      "#define atomic_load(x)                                   (*(x))\n"
-      "#define ATOMIC_VAR_INIT(x)                               (x)\n"
-      "#define ATOMIC_FLAG_INIT                                 0\n"
-      "#define atomic_init(x,y)                                 atomic_store(x,y)\n";
-    if(aTypeName == "atomic_float")
-      header += "#define atomic_exchange(x,y)                             atomic_xchg(x,y)\n";
-    else if(aTypeName == "atomic_double")
-      header += "double atomic_exchange(volatile "+addressSpace+"atomic_double *x, double y)\n"
-        "{\n"
-        "  long tmp = *(long*)&y, res;\n"
-        "  volatile "+addressSpace+"long *tmpA = (volatile "+addressSpace+"long)x;\n"
-        "  res = atom_xchg(tmpA,tmp);\n"
-        "  return *(double*)&res;\n"
-        "}\n";
-    else
-      header += "#define atomic_exchange(x,y)                             atom_xchg(x,y)\n";
-    if(aTypeName != "atomic_float" && aTypeName != "atomic_double")
-      header +=
-      "bool atomic_compare_exchange_strong(volatile "+addressSpace+" "+aTypeName+" *a, "+cTypeName+" *expected, "+cTypeName+" desired)\n"
-      "{\n"
-      "  "+cTypeName+" old = atom_cmpxchg(a, *expected, desired);\n"
-      "  if(old == *expected)\n"
-      "    return true;\n"
-      "  *expected = old;\n"
-      "  return false;\n"
-      "}\n"
-      "#define atomic_compare_exchange_weak                     atomic_compare_exchange_strong\n";
-    header +=
-      "#define atomic_fetch_add(x,y)                            atom_add(x,y)\n"
-      "#define atomic_fetch_sub(x,y)                            atom_sub(x,y)\n"
-      "#define atomic_fetch_or(x,y)                             atom_or(x,y)\n"
-      "#define atomic_fetch_xor(x,y)                            atom_xor(x,y)\n"
-      "#define atomic_fetch_and(x,y)                            atom_and(x,y)\n"
-      "#define atomic_fetch_min(x,y)                            atom_min(x,y)\n"
-      "#define atomic_fetch_max(x,y)                            atom_max(x,y)\n"
-      "#define atomic_flag_test_and_set(x)                      atomic_exchange(x,1)\n"
-      "#define atomic_flag_clear(x)                             atomic_store(x,0)\n"
-      "\n";
-  }
-  if(!LocalMemory() && DeclaredInProgram())
-  {
-    // additional atomic variable for results copying (last thread will do this)
-    header += "__global volatile atomic_uint finishedThreads = ATOMIC_VAR_INIT(0);\n";
-    // atomic variables declared in program scope - test data
-    std::stringstream ss;
-    ss << maxNumDestItems;
-    header += std::string("__global volatile ")+aTypeName+" destMemory["+ss.str()+"] = {\n";
-    ss.str("");
-    ss << _startValue;
-    for(cl_uint i = 0; i < maxNumDestItems; i++)
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestMemOrderScope
+    : public CBasicTest<HostAtomicType, HostDataType> {
+public:
+    using CBasicTest<HostAtomicType, HostDataType>::LocalMemory;
+    using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize;
+    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
+    CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false)
+        : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
+    {}
+    virtual std::string ProgramHeader(cl_uint maxNumDestItems)
     {
-      if(aTypeName == "atomic_flag")
-        header +=  "  ATOMIC_FLAG_INIT";
-      else
-        header +=  "  ATOMIC_VAR_INIT("+ss.str()+")";
-      if(i+1 < maxNumDestItems)
-        header += ",";
-      header += "\n";
+        std::string header;
+        if (gOldAPI)
+        {
+            std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s";
+            header += "#define atomic_store_explicit(x,y,o" + s
+                + ")                     atomic_store(x,y)\n"
+                  "#define atomic_load_explicit(x,o"
+                + s
+                + ")                        atomic_load(x)\n"
+                  "#define atomic_exchange_explicit(x,y,o"
+                + s
+                + ")                  atomic_exchange(x,y)\n"
+                  "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"
+                + s
+                + ") atomic_compare_exchange_strong(x,y,z)\n"
+                  "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"
+                + s
+                + ")   atomic_compare_exchange_weak(x,y,z)\n"
+                  "#define atomic_fetch_add_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_add(x,y)\n"
+                  "#define atomic_fetch_sub_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_sub(x,y)\n"
+                  "#define atomic_fetch_or_explicit(x,y,o"
+                + s
+                + ")                  atomic_fetch_or(x,y)\n"
+                  "#define atomic_fetch_xor_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_xor(x,y)\n"
+                  "#define atomic_fetch_and_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_and(x,y)\n"
+                  "#define atomic_fetch_min_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_min(x,y)\n"
+                  "#define atomic_fetch_max_explicit(x,y,o"
+                + s
+                + ")                 atomic_fetch_max(x,y)\n"
+                  "#define atomic_flag_test_and_set_explicit(x,o"
+                + s
+                + ")           atomic_flag_test_and_set(x)\n"
+                  "#define atomic_flag_clear_explicit(x,o"
+                + s + ")                  atomic_flag_clear(x)\n";
+        }
+        return header
+            + CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(
+                   maxNumDestItems);
     }
-    header+=
-      "};\n"
-      "\n";
-  }
-  return header;
+    virtual std::string SingleTestName()
+    {
+        std::string testName =
+            CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
+        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
+        {
+            testName += std::string(", ")
+                + std::string(get_memory_order_type_name(MemoryOrder()))
+                      .substr(sizeof("memory"));
+        }
+        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
+        {
+            testName += std::string(", ")
+                + std::string(get_memory_scope_type_name(MemoryScope()))
+                      .substr(sizeof("memory"));
+        }
+        return testName;
+    }
+    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
+                                  cl_command_queue queue)
+    {
+        if (LocalMemory() && MemoryScope() != MEMORY_SCOPE_EMPTY
+            && MemoryScope()
+                != MEMORY_SCOPE_WORK_GROUP) // memory scope should only be used
+                                            // for global memory
+            return 0;
+        if (MemoryScope() == MEMORY_SCOPE_DEVICE)
+            MaxGroupSize(
+                16); // increase number of groups by forcing smaller group size
+        else
+            MaxGroupSize(0); // group size limited by device capabilities
+
+        if (CheckCapabilities(MemoryScope(), MemoryOrder())
+            == TEST_SKIPPED_ITSELF)
+            return 0; // skip test - not applicable
+
+        return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
+            deviceID, context, queue);
+    }
+    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
+                                           cl_context context,
+                                           cl_command_queue queue)
+    {
+        // repeat test for each reasonable memory order/scope combination
+        std::vector<TExplicitMemoryOrderType> memoryOrder;
+        std::vector<TExplicitMemoryScopeType> memoryScope;
+        int error = 0;
+
+        // For OpenCL-3.0 and later some orderings and scopes are optional, so
+        // here we query for the supported ones.
+        test_error_ret(getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder,
+                                                         memoryScope),
+                       "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);
+
+        for (unsigned oi = 0; oi < memoryOrder.size(); oi++)
+        {
+            for (unsigned si = 0; si < memoryScope.size(); si++)
+            {
+                if (memoryOrder[oi] == MEMORY_ORDER_EMPTY
+                    && memoryScope[si] != MEMORY_SCOPE_EMPTY)
+                    continue;
+                MemoryOrder(memoryOrder[oi]);
+                MemoryScope(memoryScope[si]);
+                EXECUTE_TEST(
+                    error,
+                    (CBasicTest<HostAtomicType, HostDataType>::
+                         ExecuteForEachParameterSet(deviceID, context, queue)));
+            }
+        }
+        return error;
+    }
+    void MemoryOrder(TExplicitMemoryOrderType memoryOrder)
+    {
+        _memoryOrder = memoryOrder;
+    }
+    TExplicitMemoryOrderType MemoryOrder() { return _memoryOrder; }
+    std::string MemoryOrderStr()
+    {
+        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
+            return std::string(", ")
+                + get_memory_order_type_name(MemoryOrder());
+        return "";
+    }
+    void MemoryScope(TExplicitMemoryScopeType memoryScope)
+    {
+        _memoryScope = memoryScope;
+    }
+    TExplicitMemoryScopeType MemoryScope() { return _memoryScope; }
+    std::string MemoryScopeStr()
+    {
+        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
+            return std::string(", ")
+                + get_memory_scope_type_name(MemoryScope());
+        return "";
+    }
+    std::string MemoryOrderScopeStr()
+    {
+        return MemoryOrderStr() + MemoryScopeStr();
+    }
+    virtual cl_uint CurrentGroupNum(cl_uint threadCount)
+    {
+        if (MemoryScope() == MEMORY_SCOPE_WORK_GROUP) return 1;
+        return CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum(
+            threadCount);
+    }
+    virtual cl_uint MaxHostThreads()
+    {
+        // block host threads execution for memory scope different than
+        // memory_scope_all_svm_devices
+        if (MemoryScope() == MEMORY_SCOPE_ALL_DEVICES
+            || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost)
+        {
+            return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads();
+        }
+        else
+        {
+            return 0;
+        }
+    }
+
+private:
+    TExplicitMemoryOrderType _memoryOrder;
+    TExplicitMemoryScopeType _memoryScope;
+};
+
+template <typename HostAtomicType, typename HostDataType>
+class CBasicTestMemOrder2Scope
+    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
+public:
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr;
+    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
+    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
+
+    CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false)
+        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
+                                                                useSVM)
+    {}
+    virtual std::string SingleTestName()
+    {
+        std::string testName =
+            CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
+        if (MemoryOrder() != MEMORY_ORDER_EMPTY)
+            testName += std::string(", ")
+                + std::string(get_memory_order_type_name(MemoryOrder()))
+                      .substr(sizeof("memory"));
+        if (MemoryOrder2() != MEMORY_ORDER_EMPTY)
+            testName += std::string(", ")
+                + std::string(get_memory_order_type_name(MemoryOrder2()))
+                      .substr(sizeof("memory"));
+        if (MemoryScope() != MEMORY_SCOPE_EMPTY)
+            testName += std::string(", ")
+                + std::string(get_memory_scope_type_name(MemoryScope()))
+                      .substr(sizeof("memory"));
+        return testName;
+    }
+    virtual int ExecuteForEachParameterSet(cl_device_id deviceID,
+                                           cl_context context,
+                                           cl_command_queue queue)
+    {
+        // repeat test for each reasonable memory order/scope combination
+        std::vector<TExplicitMemoryOrderType> memoryOrder;
+        std::vector<TExplicitMemoryScopeType> memoryScope;
+        int error = 0;
+
+        // For OpenCL-3.0 and later some orderings and scopes are optional, so
+        // here we query for the supported ones.
+        test_error_ret(getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder,
+                                                         memoryScope),
+                       "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);
+
+        for (unsigned oi = 0; oi < memoryOrder.size(); oi++)
+        {
+            for (unsigned o2i = 0; o2i < memoryOrder.size(); o2i++)
+            {
+                for (unsigned si = 0; si < memoryScope.size(); si++)
+                {
+                    if ((memoryOrder[oi] == MEMORY_ORDER_EMPTY
+                         || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
+                        && memoryOrder[oi] != memoryOrder[o2i])
+                        continue; // both memory order arguments must be set (or
+                                  // none)
+                    if ((memoryOrder[oi] == MEMORY_ORDER_EMPTY
+                         || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
+                        && memoryScope[si] != MEMORY_SCOPE_EMPTY)
+                        continue; // memory scope without memory order is not
+                                  // allowed
+                    MemoryOrder(memoryOrder[oi]);
+                    MemoryOrder2(memoryOrder[o2i]);
+                    MemoryScope(memoryScope[si]);
+
+                    if (CheckCapabilities(MemoryScope(), MemoryOrder())
+                        == TEST_SKIPPED_ITSELF)
+                        continue; // skip test - not applicable
+
+                    if (CheckCapabilities(MemoryScope(), MemoryOrder2())
+                        == TEST_SKIPPED_ITSELF)
+                        continue; // skip test - not applicable
+
+                    EXECUTE_TEST(error,
+                                 (CBasicTest<HostAtomicType, HostDataType>::
+                                      ExecuteForEachParameterSet(
+                                          deviceID, context, queue)));
+                }
+            }
+        }
+        return error;
+    }
+    void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail)
+    {
+        _memoryOrder2 = memoryOrderFail;
+    }
+    TExplicitMemoryOrderType MemoryOrder2() { return _memoryOrder2; }
+    std::string MemoryOrderFailStr()
+    {
+        if (MemoryOrder2() != MEMORY_ORDER_EMPTY)
+            return std::string(", ")
+                + get_memory_order_type_name(MemoryOrder2());
+        return "";
+    }
+    std::string MemoryOrderScope()
+    {
+        return MemoryOrderStr() + MemoryOrderFailStr() + MemoryScopeStr();
+    }
+
+private:
+    TExplicitMemoryOrderType _memoryOrder2;
+};
+
+template <typename HostAtomicType, typename HostDataType>
+std::string
+CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID)
+{
+    std::string pragma;
+
+    if (gOldAPI)
+    {
+        pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : "
+                  "enable\n";
+        pragma += "#pragma OPENCL EXTENSION "
+                  "cl_khr_local_int32_extended_atomics : enable\n";
+        pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : "
+                  "enable\n";
+        pragma += "#pragma OPENCL EXTENSION "
+                  "cl_khr_global_int32_extended_atomics : enable\n";
+    }
+    // Create the pragma lines for this kernel
+    if (DataType().Size(deviceID) == 8)
+    {
+        pragma +=
+            "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
+        pragma +=
+            "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
+    }
+    if (_dataType == TYPE_ATOMIC_DOUBLE)
+        pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+    return pragma;
 }
 
-template<typename HostAtomicType, typename HostDataType>
+template <typename HostAtomicType, typename HostDataType>
+std::string
+CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
+{
+    // Create the program header
+    std::string header;
+    std::string aTypeName = DataType().AtomicTypeName();
+    std::string cTypeName = DataType().RegularTypeName();
+    std::string argListForKernel;
+    std::string argListForFunction;
+    std::string argListNoTypes;
+    std::string functionPrototype;
+    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
+
+    if (gOldAPI)
+    {
+        header += std::string("#define ") + aTypeName + " " + cTypeName
+            + "\n"
+              "#define atomic_store(x,y)                                (*(x) "
+              "= y)\n"
+              "#define atomic_load(x)                                   "
+              "(*(x))\n"
+              "#define ATOMIC_VAR_INIT(x)                               (x)\n"
+              "#define ATOMIC_FLAG_INIT                                 0\n"
+              "#define atomic_init(x,y)                                 "
+              "atomic_store(x,y)\n";
+        if (aTypeName == "atomic_float")
+            header += "#define atomic_exchange(x,y)                            "
+                      " atomic_xchg(x,y)\n";
+        else if (aTypeName == "atomic_double")
+            header += "double atomic_exchange(volatile " + addressSpace
+                + "atomic_double *x, double y)\n"
+                  "{\n"
+                  "  long tmp = *(long*)&y, res;\n"
+                  "  volatile "
+                + addressSpace + "long *tmpA = (volatile " + addressSpace
+                + "long)x;\n"
+                  "  res = atom_xchg(tmpA,tmp);\n"
+                  "  return *(double*)&res;\n"
+                  "}\n";
+        else
+            header += "#define atomic_exchange(x,y)                            "
+                      " atom_xchg(x,y)\n";
+        if (aTypeName != "atomic_float" && aTypeName != "atomic_double")
+            header += "bool atomic_compare_exchange_strong(volatile "
+                + addressSpace + " " + aTypeName + " *a, " + cTypeName
+                + " *expected, " + cTypeName
+                + " desired)\n"
+                  "{\n"
+                  "  "
+                + cTypeName
+                + " old = atom_cmpxchg(a, *expected, desired);\n"
+                  "  if(old == *expected)\n"
+                  "    return true;\n"
+                  "  *expected = old;\n"
+                  "  return false;\n"
+                  "}\n"
+                  "#define atomic_compare_exchange_weak                     "
+                  "atomic_compare_exchange_strong\n";
+        header += "#define atomic_fetch_add(x,y)                            "
+                  "atom_add(x,y)\n"
+                  "#define atomic_fetch_sub(x,y)                            "
+                  "atom_sub(x,y)\n"
+                  "#define atomic_fetch_or(x,y)                             "
+                  "atom_or(x,y)\n"
+                  "#define atomic_fetch_xor(x,y)                            "
+                  "atom_xor(x,y)\n"
+                  "#define atomic_fetch_and(x,y)                            "
+                  "atom_and(x,y)\n"
+                  "#define atomic_fetch_min(x,y)                            "
+                  "atom_min(x,y)\n"
+                  "#define atomic_fetch_max(x,y)                            "
+                  "atom_max(x,y)\n"
+                  "#define atomic_flag_test_and_set(x)                      "
+                  "atomic_exchange(x,1)\n"
+                  "#define atomic_flag_clear(x)                             "
+                  "atomic_store(x,0)\n"
+                  "\n";
+    }
+    if (!LocalMemory() && DeclaredInProgram())
+    {
+        // additional atomic variable for results copying (last thread will do
+        // this)
+        header += "__global volatile atomic_uint finishedThreads = "
+                  "ATOMIC_VAR_INIT(0);\n";
+        // atomic variables declared in program scope - test data
+        std::stringstream ss;
+        ss << maxNumDestItems;
+        header += std::string("__global volatile ") + aTypeName + " destMemory["
+            + ss.str() + "] = {\n";
+        ss.str("");
+        ss << _startValue;
+        for (cl_uint i = 0; i < maxNumDestItems; i++)
+        {
+            if (aTypeName == "atomic_flag")
+                header += "  ATOMIC_FLAG_INIT";
+            else
+                header += "  ATOMIC_VAR_INIT(" + ss.str() + ")";
+            if (i + 1 < maxNumDestItems) header += ",";
+            header += "\n";
+        }
+        header += "};\n"
+                  "\n";
+    }
+    return header;
+}
+
+template <typename HostAtomicType, typename HostDataType>
 std::string CBasicTest<HostAtomicType, HostDataType>::FunctionCode()
 {
-  if(!UsedInFunction())
-    return "";
-  std::string addressSpace = LocalMemory() ? "__local " : "__global ";
-  std::string code = "void test_atomic_function(uint tid, uint threadCount, uint numDestItems, volatile ";
-  if(!GenericAddrSpace())
-    code += addressSpace;
-  code += std::string(DataType().AtomicTypeName())+" *destMemory, __global "+DataType().RegularTypeName()+
-    " *oldValues";
-  if(LocalRefValues())
-    code += std::string(", __local ")+DataType().RegularTypeName()+" *localValues";
-  code += ")\n"
-    "{\n";
-  code += ProgramCore();
-  code += "}\n"
-    "\n";
-  return code;
+    if (!UsedInFunction()) return "";
+    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
+    std::string code = "void test_atomic_function(uint tid, uint threadCount, "
+                       "uint numDestItems, volatile ";
+    if (!GenericAddrSpace()) code += addressSpace;
+    code += std::string(DataType().AtomicTypeName()) + " *destMemory, __global "
+        + DataType().RegularTypeName() + " *oldValues";
+    if (LocalRefValues())
+        code += std::string(", __local ") + DataType().RegularTypeName()
+            + " *localValues";
+    code += ")\n"
+            "{\n";
+    code += ProgramCore();
+    code += "}\n"
+            "\n";
+    return code;
 }
 
-template<typename HostAtomicType, typename HostDataType>
-std::string CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
+template <typename HostAtomicType, typename HostDataType>
+std::string
+CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
 {
-  std::string aTypeName = DataType().AtomicTypeName();
-  std::string cTypeName = DataType().RegularTypeName();
-  std::string addressSpace = LocalMemory() ? "__local " : "__global ";
-  std::string code = "__kernel void test_atomic_kernel(uint threadCount, uint numDestItems, ";
+    std::string aTypeName = DataType().AtomicTypeName();
+    std::string cTypeName = DataType().RegularTypeName();
+    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
+    std::string code = "__kernel void test_atomic_kernel(uint threadCount, "
+                       "uint numDestItems, ";
 
-  // prepare list of arguments for kernel
-  if(LocalMemory())
-  {
-    code += std::string("__global ")+cTypeName+" *finalDest, __global "+cTypeName+" *oldValues,"
-      " volatile "+addressSpace+aTypeName+" *"+(DeclaredInProgram() ? "notUsed" : "")+"destMemory";
-  }
-  else
-  {
-    code += "volatile "+addressSpace+(DeclaredInProgram() ? (cTypeName+" *finalDest") : (aTypeName+" *destMemory"))+
-      ", __global "+cTypeName+" *oldValues";
-  }
-  if(LocalRefValues())
-    code += std::string(", __local ")+cTypeName+" *localValues";
-  code += ")\n"
-    "{\n";
-  if(LocalMemory() && DeclaredInProgram())
-  {
-    // local atomics declared in kernel scope
-    std::stringstream ss;
-    ss << maxNumDestItems;
-    code += std::string("  __local volatile ")+aTypeName+" destMemory["+ss.str()+"];\n";
-  }
-  code += "  uint  tid = get_global_id(0);\n"
-    "\n";
-  if(LocalMemory())
-  {
-      // memory_order_relaxed is sufficient for these initialization operations
-      // as the barrier below will act as a fence, providing an order to the
-      // operations. memory_scope_work_group is sufficient as local memory is
-      // only visible within the work-group.
-      code += R"(
+    // prepare list of arguments for kernel
+    if (LocalMemory())
+    {
+        code += std::string("__global ") + cTypeName + " *finalDest, __global "
+            + cTypeName
+            + " *oldValues,"
+              " volatile "
+            + addressSpace + aTypeName + " *"
+            + (DeclaredInProgram() ? "notUsed" : "") + "destMemory";
+    }
+    else
+    {
+        code += "volatile " + addressSpace
+            + (DeclaredInProgram() ? (cTypeName + " *finalDest")
+                                   : (aTypeName + " *destMemory"))
+            + ", __global " + cTypeName + " *oldValues";
+    }
+    if (LocalRefValues())
+        code += std::string(", __local ") + cTypeName + " *localValues";
+    code += ")\n"
+            "{\n";
+    if (LocalMemory() && DeclaredInProgram())
+    {
+        // local atomics declared in kernel scope
+        std::stringstream ss;
+        ss << maxNumDestItems;
+        code += std::string("  __local volatile ") + aTypeName + " destMemory["
+            + ss.str() + "];\n";
+    }
+    code += "  uint  tid = get_global_id(0);\n"
+            "\n";
+    if (LocalMemory())
+    {
+        // memory_order_relaxed is sufficient for these initialization
+        // operations as the barrier below will act as a fence, providing an
+        // order to the operations. memory_scope_work_group is sufficient as
+        // local memory is only visible within the work-group.
+        code += R"(
               // initialize atomics not reachable from host (first thread
               // is doing this, other threads are waiting on barrier)
               if(get_local_id(0) == 0)
                 for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)
                 {)";
-      if (aTypeName == "atomic_flag")
-      {
-          code += R"(
+        if (aTypeName == "atomic_flag")
+        {
+            code += R"(
                   if(finalDest[dstItemIdx])
                     atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                                       memory_order_relaxed,
@@ -823,512 +976,565 @@ std::string CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumD
                     atomic_flag_clear_explicit(destMemory+dstItemIdx,
                                                memory_order_relaxed,
                                                memory_scope_work_group);)";
-      }
-    else
-    {
-        code += R"(
+        }
+        else
+        {
+            code += R"(
                 atomic_store_explicit(destMemory+dstItemIdx,
                                       finalDest[dstItemIdx],
                                       memory_order_relaxed,
                                       memory_scope_work_group);)";
+        }
+        code += "    }\n"
+                "  barrier(CLK_LOCAL_MEM_FENCE);\n"
+                "\n";
     }
-    code +=
-      "    }\n"
-      "  barrier(CLK_LOCAL_MEM_FENCE);\n"
-      "\n";
-  }
-  if (LocalRefValues())
-  {
-    code +=
-      "  // Copy input reference values into local memory\n";
-    if (NumNonAtomicVariablesPerThread() == 1)
-      code += "  localValues[get_local_id(0)] = oldValues[tid];\n";
-    else
+    if (LocalRefValues())
     {
-      std::stringstream ss;
-      ss << NumNonAtomicVariablesPerThread();
-      code +=
-        "  for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
-        "    localValues[get_local_id(0)*" + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n";
+        code += "  // Copy input reference values into local memory\n";
+        if (NumNonAtomicVariablesPerThread() == 1)
+            code += "  localValues[get_local_id(0)] = oldValues[tid];\n";
+        else
+        {
+            std::stringstream ss;
+            ss << NumNonAtomicVariablesPerThread();
+            code += "  for(uint rfId = 0; rfId < " + ss.str()
+                + "; rfId++)\n"
+                  "    localValues[get_local_id(0)*"
+                + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n";
+        }
+        code += "  barrier(CLK_LOCAL_MEM_FENCE);\n"
+                "\n";
     }
-    code +=
-      "  barrier(CLK_LOCAL_MEM_FENCE);\n"
-      "\n";
-  }
-  if (UsedInFunction())
-    code += std::string("  test_atomic_function(tid, threadCount, numDestItems, destMemory, oldValues")+
-    (LocalRefValues() ? ", localValues" : "")+");\n";
-  else
-    code += ProgramCore();
-  code += "\n";
-  if (LocalRefValues())
-  {
-    code +=
-      "  // Copy local reference values into output array\n"
-      "  barrier(CLK_LOCAL_MEM_FENCE);\n";
-    if (NumNonAtomicVariablesPerThread() == 1)
-      code += "  oldValues[tid] = localValues[get_local_id(0)];\n";
+    if (UsedInFunction())
+        code += std::string("  test_atomic_function(tid, threadCount, "
+                            "numDestItems, destMemory, oldValues")
+            + (LocalRefValues() ? ", localValues" : "") + ");\n";
     else
-    {
-      std::stringstream ss;
-      ss << NumNonAtomicVariablesPerThread();
-      code +=
-        "  for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
-        "    oldValues[tid*" + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str() + "+rfId];\n";
-    }
+        code += ProgramCore();
     code += "\n";
-  }
-  if(LocalMemory() || DeclaredInProgram())
-  {
-    code += "  // Copy final values to host reachable buffer\n";
-    if(LocalMemory())
-      code +=
-        "  barrier(CLK_LOCAL_MEM_FENCE);\n"
-        "  if(get_local_id(0) == 0) // first thread in workgroup\n";
-    else
-      // global atomics declared in program scope
-      code += R"(
+    if (LocalRefValues())
+    {
+        code += "  // Copy local reference values into output array\n"
+                "  barrier(CLK_LOCAL_MEM_FENCE);\n";
+        if (NumNonAtomicVariablesPerThread() == 1)
+            code += "  oldValues[tid] = localValues[get_local_id(0)];\n";
+        else
+        {
+            std::stringstream ss;
+            ss << NumNonAtomicVariablesPerThread();
+            code += "  for(uint rfId = 0; rfId < " + ss.str()
+                + "; rfId++)\n"
+                  "    oldValues[tid*"
+                + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str()
+                + "+rfId];\n";
+        }
+        code += "\n";
+    }
+    if (LocalMemory() || DeclaredInProgram())
+    {
+        code += "  // Copy final values to host reachable buffer\n";
+        if (LocalMemory())
+            code += "  barrier(CLK_LOCAL_MEM_FENCE);\n"
+                    "  if(get_local_id(0) == 0) // first thread in workgroup\n";
+        else
+            // global atomics declared in program scope
+            code += R"(
                 if(atomic_fetch_add_explicit(&finishedThreads, 1u,
                                            memory_order_relaxed,
                                            memory_scope_work_group)
                    == get_global_size(0)-1) // last finished thread
                    )";
-    code +=
-        "    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)\n";
-    if(aTypeName == "atomic_flag")
-    {
-        code += R"(
+        code += "    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; "
+                "dstItemIdx++)\n";
+        if (aTypeName == "atomic_flag")
+        {
+            code += R"(
                 finalDest[dstItemIdx] =
                     atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                                       memory_order_relaxed,
                                                       memory_scope_work_group);)";
-    }
-    else
-    {
-        code += R"(
+        }
+        else
+        {
+            code += R"(
                 finalDest[dstItemIdx] =
                     atomic_load_explicit(destMemory+dstItemIdx,
                                          memory_order_relaxed,
                                          memory_scope_work_group);)";
+        }
     }
-  }
-  code += "}\n"
-    "\n";
-  return code;
+    code += "}\n"
+            "\n";
+    return code;
 }
 
 template <typename HostAtomicType, typename HostDataType>
-int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
+int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
+    cl_device_id deviceID, cl_context context, cl_command_queue queue)
 {
-  int error;
-  clProgramWrapper program;
-  clKernelWrapper kernel;
-  size_t threadNum[1];
-  clMemWrapper streams[2];
-  std::vector<HostAtomicType> destItems;
-  HostAtomicType *svmAtomicBuffer = 0;
-  std::vector<HostDataType> refValues, startRefValues;
-  HostDataType *svmDataBuffer = 0;
-  cl_uint deviceThreadCount, hostThreadCount, threadCount;
-  size_t groupSize = 0;
-  std::string programSource;
-  const char *programLine;
-  MTdata d;
-  size_t typeSize = DataType().Size(deviceID);
+    int error;
+    clProgramWrapper program;
+    clKernelWrapper kernel;
+    size_t threadNum[1];
+    clMemWrapper streams[2];
+    std::vector<HostAtomicType> destItems;
+    HostAtomicType *svmAtomicBuffer = 0;
+    std::vector<HostDataType> refValues, startRefValues;
+    HostDataType *svmDataBuffer = 0;
+    cl_uint deviceThreadCount, hostThreadCount, threadCount;
+    size_t groupSize = 0;
+    std::string programSource;
+    const char *programLine;
+    MTdata d;
+    size_t typeSize = DataType().Size(deviceID);
 
-  deviceThreadCount = _maxDeviceThreads;
-  hostThreadCount = MaxHostThreads();
-  threadCount = deviceThreadCount+hostThreadCount;
+    deviceThreadCount = _maxDeviceThreads;
+    hostThreadCount = MaxHostThreads();
+    threadCount = deviceThreadCount + hostThreadCount;
 
-  //log_info("\t%s %s%s...\n", local ? "local" : "global", DataType().AtomicTypeName(), memoryOrderScope.c_str());
-  log_info("\t%s...\n", SingleTestName().c_str());
+    // log_info("\t%s %s%s...\n", local ? "local" : "global",
+    // DataType().AtomicTypeName(), memoryOrderScope.c_str());
+    log_info("\t%s...\n", SingleTestName().c_str());
 
-  if(!LocalMemory() && DeclaredInProgram() && gNoGlobalVariables) // no support for program scope global variables
-  {
-    log_info("\t\tTest disabled\n");
-    return 0;
-  }
-  if(UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace)
-  {
-    log_info("\t\tTest disabled\n");
-    return 0;
-  }
+    if (!LocalMemory() && DeclaredInProgram()
+        && gNoGlobalVariables) // no support for program scope global variables
+    {
+        log_info("\t\tTest disabled\n");
+        return 0;
+    }
+    if (UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace)
+    {
+        log_info("\t\tTest disabled\n");
+        return 0;
+    }
 
-  // set up work sizes based on device capabilities and test configuration
-  error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL);
-  test_error(error, "Unable to obtain max work group size for device");
-  CurrentGroupSize((cl_uint)groupSize);
-  if(CurrentGroupSize() > deviceThreadCount)
-    CurrentGroupSize(deviceThreadCount);
-  if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
-    deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
-  threadCount = deviceThreadCount+hostThreadCount;
+    // set up work sizes based on device capabilities and test configuration
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                            sizeof(groupSize), &groupSize, NULL);
+    test_error(error, "Unable to obtain max work group size for device");
+    CurrentGroupSize((cl_uint)groupSize);
+    if (CurrentGroupSize() > deviceThreadCount)
+        CurrentGroupSize(deviceThreadCount);
+    if (CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
+        deviceThreadCount =
+            CurrentGroupSize() * CurrentGroupNum(deviceThreadCount);
+    threadCount = deviceThreadCount + hostThreadCount;
 
-  // If we're given a num_results function, we need to determine how many result objects we need.
-  // This is the first assessment for current maximum number of threads (exact thread count is not known here)
-  // - needed for program source code generation (arrays of atomics declared in program)
-  cl_uint numDestItems = NumResults(threadCount, deviceID);
+    // If we're given a num_results function, we need to determine how many
+    // result objects we need. This is the first assessment for current maximum
+    // number of threads (exact thread count is not known here)
+    // - needed for program source code generation (arrays of atomics declared
+    // in program)
+    cl_uint numDestItems = NumResults(threadCount, deviceID);
 
-  if(deviceThreadCount > 0)
-  {
-      // This loop iteratively reduces the workgroup size by 2 and then
-      // re-generates the kernel with the reduced
-      // workgroup size until we find a size which is admissible for the kernel
-      // being run or reduce the wg size
-      // to the trivial case of 1 (which was separately verified to be accurate
-      // for the kernel being run)
+    if (deviceThreadCount > 0)
+    {
+        // This loop iteratively reduces the workgroup size by 2 and then
+        // re-generates the kernel with the reduced
+        // workgroup size until we find a size which is admissible for the
+        // kernel being run or reduce the wg size to the trivial case of 1
+        // (which was separately verified to be accurate for the kernel being
+        // run)
 
-      while ((CurrentGroupSize() > 1))
-      {
-          // Re-generate the kernel code with the current group size
-          if (kernel) clReleaseKernel(kernel);
-          if (program) clReleaseProgram(program);
-          programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems)
-              + FunctionCode() + KernelCode(numDestItems);
-          programLine = programSource.c_str();
-          if (create_single_kernel_helper_with_build_options(
-                  context, &program, &kernel, 1, &programLine,
-                  "test_atomic_kernel", gOldAPI ? "" : nullptr))
-          {
-              return -1;
-          }
-          // Get work group size for the new kernel
-          error = clGetKernelWorkGroupInfo(kernel, deviceID,
-                                           CL_KERNEL_WORK_GROUP_SIZE,
-                                           sizeof(groupSize), &groupSize, NULL);
-          test_error(error,
-                     "Unable to obtain max work group size for device and "
-                     "kernel combo");
+        while ((CurrentGroupSize() > 1))
+        {
+            // Re-generate the kernel code with the current group size
+            if (kernel) clReleaseKernel(kernel);
+            if (program) clReleaseProgram(program);
+            programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems)
+                + FunctionCode() + KernelCode(numDestItems);
+            programLine = programSource.c_str();
+            if (create_single_kernel_helper_with_build_options(
+                    context, &program, &kernel, 1, &programLine,
+                    "test_atomic_kernel", gOldAPI ? "" : nullptr))
+            {
+                return -1;
+            }
+            // Get work group size for the new kernel
+            error = clGetKernelWorkGroupInfo(
+                kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(groupSize),
+                &groupSize, NULL);
+            test_error(error,
+                       "Unable to obtain max work group size for device and "
+                       "kernel combo");
 
-          if (LocalMemory())
-          {
-              cl_ulong usedLocalMemory;
-              cl_ulong totalLocalMemory;
-              cl_uint maxWorkGroupSize;
+            if (LocalMemory())
+            {
+                cl_ulong usedLocalMemory;
+                cl_ulong totalLocalMemory;
+                cl_uint maxWorkGroupSize;
 
-              error = clGetKernelWorkGroupInfo(
-                  kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
-                  sizeof(usedLocalMemory), &usedLocalMemory, NULL);
-              test_error(error, "clGetKernelWorkGroupInfo failed");
+                error = clGetKernelWorkGroupInfo(
+                    kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
+                    sizeof(usedLocalMemory), &usedLocalMemory, NULL);
+                test_error(error, "clGetKernelWorkGroupInfo failed");
 
-              error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
-                                      sizeof(totalLocalMemory),
-                                      &totalLocalMemory, NULL);
-              test_error(error, "clGetDeviceInfo failed");
+                error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
+                                        sizeof(totalLocalMemory),
+                                        &totalLocalMemory, NULL);
+                test_error(error, "clGetDeviceInfo failed");
 
-              // We know that each work-group is going to use typeSize *
-              // deviceThreadCount bytes of local memory
-              // so pick the maximum value for deviceThreadCount that uses all
-              // the local memory.
-              maxWorkGroupSize =
-                  ((totalLocalMemory - usedLocalMemory) / typeSize);
+                // We know that each work-group is going to use typeSize *
+                // deviceThreadCount bytes of local memory
+                // so pick the maximum value for deviceThreadCount that uses all
+                // the local memory.
+                maxWorkGroupSize =
+                    ((totalLocalMemory - usedLocalMemory) / typeSize);
 
-              if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize;
-          }
-          if (CurrentGroupSize() <= groupSize)
-              break;
-          else
-              CurrentGroupSize(CurrentGroupSize() / 2);
-      }
-    if(CurrentGroupSize() > deviceThreadCount)
-      CurrentGroupSize(deviceThreadCount);
-    if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
-      deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
-    threadCount = deviceThreadCount+hostThreadCount;
-  }
-  if (gDebug)
-  {
-      log_info("Program source:\n");
-      log_info("%s\n", programLine);
-  }
-  if(deviceThreadCount > 0)
-    log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, CurrentGroupSize());
-  if(hostThreadCount > 0)
-    log_info("\t\t(host threads %u)\n", hostThreadCount);
+                if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize;
+            }
+            if (CurrentGroupSize() <= groupSize)
+                break;
+            else
+                CurrentGroupSize(CurrentGroupSize() / 2);
+        }
+        if (CurrentGroupSize() > deviceThreadCount)
+            CurrentGroupSize(deviceThreadCount);
+        if (CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
+            deviceThreadCount =
+                CurrentGroupSize() * CurrentGroupNum(deviceThreadCount);
+        threadCount = deviceThreadCount + hostThreadCount;
+    }
+    if (gDebug)
+    {
+        log_info("Program source:\n");
+        log_info("%s\n", programLine);
+    }
+    if (deviceThreadCount > 0)
+        log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount,
+                 CurrentGroupSize());
+    if (hostThreadCount > 0)
+        log_info("\t\t(host threads %u)\n", hostThreadCount);
 
-  refValues.resize(threadCount*NumNonAtomicVariablesPerThread());
+    refValues.resize(threadCount * NumNonAtomicVariablesPerThread());
 
-  // Generate ref data if we have a ref generator provided
-  d = init_genrand(gRandomSeed);
-  startRefValues.resize(threadCount*NumNonAtomicVariablesPerThread());
-  if(GenerateRefs(threadCount, &startRefValues[0], d))
-  {
-    //copy ref values for host threads
-    memcpy(&refValues[0], &startRefValues[0], sizeof(HostDataType)*threadCount*NumNonAtomicVariablesPerThread());
-  }
-  else
-  {
-    startRefValues.resize(0);
-  }
-  free_mtdata(d);
-  d = NULL;
-
-  // If we're given a num_results function, we need to determine how many result objects we need. If
-  // we don't have it, we assume it's just 1
-  // This is final value (exact thread count is known in this place)
-  numDestItems = NumResults(threadCount, deviceID);
-
-  destItems.resize(numDestItems);
-  for(cl_uint i = 0; i < numDestItems; i++)
-    destItems[i] = _startValue;
-
-  // Create main buffer with atomic variables (array size dependent on particular test)
-  if(UseSVM())
-  {
-    if(gUseHostPtr)
-      svmAtomicBuffer = (HostAtomicType*)malloc(typeSize * numDestItems);
+    // Generate ref data if we have a ref generator provided
+    d = init_genrand(gRandomSeed);
+    startRefValues.resize(threadCount * NumNonAtomicVariablesPerThread());
+    if (GenerateRefs(threadCount, &startRefValues[0], d))
+    {
+        // copy ref values for host threads
+        memcpy(&refValues[0], &startRefValues[0],
+               sizeof(HostDataType) * threadCount
+                   * NumNonAtomicVariablesPerThread());
+    }
     else
-      svmAtomicBuffer = (HostAtomicType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, typeSize * numDestItems, 0);
-    if(!svmAtomicBuffer)
     {
-      log_error("ERROR: clSVMAlloc failed!\n");
-      return -1;
+        startRefValues.resize(0);
     }
-    memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems);
-    streams[0] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
-                                typeSize * numDestItems, svmAtomicBuffer, NULL);
-  }
-  else
-  {
-      streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
-                                  typeSize * numDestItems, &destItems[0], NULL);
-  }
-  if (!streams[0])
-  {
-    log_error("ERROR: Creating output array failed!\n");
-    return -1;
-  }
-  // Create buffer for per-thread input/output data
-  if(UseSVM())
-  {
-    if(gUseHostPtr)
-      svmDataBuffer = (HostDataType*)malloc(typeSize*threadCount*NumNonAtomicVariablesPerThread());
-    else
-      svmDataBuffer = (HostDataType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS : 0), typeSize*threadCount*NumNonAtomicVariablesPerThread(), 0);
-    if(!svmDataBuffer)
-    {
-      log_error("ERROR: clSVMAlloc failed!\n");
-      return -1;
-    }
-    if(startRefValues.size())
-      memcpy(svmDataBuffer, &startRefValues[0], typeSize*threadCount*NumNonAtomicVariablesPerThread());
-    streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
-                                typeSize * threadCount
-                                    * NumNonAtomicVariablesPerThread(),
-                                svmDataBuffer, NULL);
-  }
-  else
-  {
-      streams[1] = clCreateBuffer(
-          context,
-          ((startRefValues.size() ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)),
-          typeSize * threadCount * NumNonAtomicVariablesPerThread(),
-          startRefValues.size() ? &startRefValues[0] : 0, NULL);
-  }
-  if (!streams[1])
-  {
-    log_error("ERROR: Creating reference array failed!\n");
-    return -1;
-  }
-  if(deviceThreadCount > 0)
-  {
-    cl_uint argInd = 0;
-    /* Set the arguments */
-    error = clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount);
-    test_error(error, "Unable to set kernel argument");
-    error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems), &numDestItems);
-    test_error(error, "Unable to set indexed kernel argument");
-    error = clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]);
-    test_error(error, "Unable to set indexed kernel arguments");
-    error = clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]);
-    test_error(error, "Unable to set indexed kernel arguments");
-    if(LocalMemory())
-    {
-      error = clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL);
-      test_error(error, "Unable to set indexed local kernel argument");
-    }
-    if(LocalRefValues())
-    {
-      error = clSetKernelArg(kernel, argInd++, LocalRefValues() ? typeSize*CurrentGroupSize()*NumNonAtomicVariablesPerThread() : 1, NULL);
-      test_error(error, "Unable to set indexed kernel argument");
-    }
-  }
-  /* Configure host threads */
-  std::vector<THostThreadContext> hostThreadContexts(hostThreadCount);
-  for(unsigned int t = 0; t < hostThreadCount; t++)
-  {
-    hostThreadContexts[t].test = this;
-    hostThreadContexts[t].tid = deviceThreadCount+t;
-    hostThreadContexts[t].threadCount = threadCount;
-    hostThreadContexts[t].destMemory = UseSVM() ? svmAtomicBuffer : &destItems[0];
-    hostThreadContexts[t].oldValues = UseSVM() ? svmDataBuffer : &refValues[0];
-  }
+    free_mtdata(d);
+    d = NULL;
 
-  if(deviceThreadCount > 0)
-  {
-    /* Run the kernel */
-    threadNum[0] = deviceThreadCount;
-    groupSize = CurrentGroupSize();
-    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, &groupSize, 0, NULL, NULL);
-    test_error(error, "Unable to execute test kernel");
-    /* start device threads */
-    error = clFlush(queue);
-    test_error(error, "clFlush failed");
-  }
+    // If we're given a num_results function, we need to determine how many
+    // result objects we need. If we don't have it, we assume it's just 1 This
+    // is final value (exact thread count is known in this place)
+    numDestItems = NumResults(threadCount, deviceID);
 
-  /* Start host threads and wait for finish */
-  if(hostThreadCount > 0)
-    ThreadPool_Do(HostThreadFunction, hostThreadCount, &hostThreadContexts[0]);
+    destItems.resize(numDestItems);
+    for (cl_uint i = 0; i < numDestItems; i++) destItems[i] = _startValue;
 
-  if(UseSVM())
-  {
-    error = clFinish(queue);
-    test_error(error, "clFinish failed");
-    memcpy(&destItems[0], svmAtomicBuffer, typeSize*numDestItems);
-    memcpy(&refValues[0], svmDataBuffer, typeSize*threadCount*NumNonAtomicVariablesPerThread());
-  }
-  else
-  {
-    if(deviceThreadCount > 0)
+    // Create main buffer with atomic variables (array size dependent on
+    // particular test)
+    if (UseSVM())
     {
-      error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
-      test_error(error, "Unable to read result value!");
-      error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize * deviceThreadCount*NumNonAtomicVariablesPerThread(), &refValues[0], 0, NULL, NULL);
-      test_error(error, "Unable to read reference values!");
-    }
-  }
-  bool dataVerified = false;
-  // If we have an expectedFn, then we need to generate a final value to compare against. If we don't
-  // have one, it's because we're comparing ref values only
-  for(cl_uint i = 0; i < numDestItems; i++)
-  {
-    HostDataType expected;
-
-    if(!ExpectedValue(expected, threadCount, startRefValues.size() ? &startRefValues[0] : 0, i))
-      break; // no expected value function provided
-
-    if(expected != destItems[i])
-    {
-      std::stringstream logLine;
-      logLine << "ERROR: Result " << i << " from kernel does not validate! (should be " << expected << ", was " << destItems[i] << ")\n";
-      log_error("%s", logLine.str().c_str());
-      for(i = 0; i < threadCount; i++)
-      {
-        logLine.str("");
-        logLine << " --- " << i << " - ";
-        if(startRefValues.size())
-          logLine << startRefValues[i] << " -> " << refValues[i];
+        if (gUseHostPtr)
+            svmAtomicBuffer = (HostAtomicType *)malloc(typeSize * numDestItems);
         else
-          logLine << refValues[i];
-        logLine << " --- ";
-        if(i < numDestItems)
-          logLine << destItems[i];
-        logLine << "\n";
-        log_info("%s", logLine.str().c_str());
-      }
-      if(!gDebug)
-      {
-        log_info("Program source:\n");
-        log_info("%s\n", programLine);
-      }
-      return -1;
-    }
-    dataVerified = true;
-  }
-
-  bool dataCorrect = false;
-  /* Use the verify function (if provided) to also check the results */
-  if(VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0]))
-  {
-    if(!dataCorrect)
-    {
-      log_error("ERROR: Reference values did not validate!\n");
-      std::stringstream logLine;
-      for(cl_uint i = 0; i < threadCount; i++)
-      for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++)
-      {
-        logLine.str("");
-        logLine << " --- " << i << " - " << refValues[i*NumNonAtomicVariablesPerThread()+j] << " --- ";
-        if(j == 0 && i < numDestItems)
-          logLine << destItems[i];
-        logLine << "\n";
-        log_info("%s", logLine.str().c_str());
-      }
-      if(!gDebug)
-      {
-        log_info("Program source:\n");
-        log_info("%s\n", programLine);
-      }
-      return -1;
-    }
-  }
-  else if(!dataVerified)
-  {
-    log_error("ERROR: Test doesn't check total or refs; no values are verified!\n");
-    return -1;
-  }
-
-  if(OldValueCheck() &&
-    !(DeclaredInProgram() && !LocalMemory())) // don't test for programs scope global atomics
-                                             // 'old' value has been overwritten by previous clEnqueueNDRangeKernel
-  {
-    /* Re-write the starting value */
-    for(size_t i = 0; i < numDestItems; i++)
-      destItems[i] = _startValue;
-    refValues[0] = 0;
-    if(deviceThreadCount > 0)
-    {
-      error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
-      test_error(error, "Unable to write starting values!");
-
-      /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */
-      threadNum[0] = 1;
-      error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, threadNum, 0, NULL, NULL);
-      test_error(error, "Unable to execute test kernel");
-
-      error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize, &refValues[0], 0, NULL, NULL);
-      test_error(error, "Unable to read reference values!");
+            svmAtomicBuffer = (HostAtomicType *)clSVMAlloc(
+                context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS,
+                typeSize * numDestItems, 0);
+        if (!svmAtomicBuffer)
+        {
+            log_error("ERROR: clSVMAlloc failed!\n");
+            return -1;
+        }
+        memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems);
+        streams[0] =
+            clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
+                           typeSize * numDestItems, svmAtomicBuffer, NULL);
     }
     else
     {
-      /* Start host thread */
-      HostFunction(0, 1, &destItems[0], &refValues[0]);
+        streams[0] =
+            clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                           typeSize * numDestItems, &destItems[0], NULL);
+    }
+    if (!streams[0])
+    {
+        log_error("ERROR: Creating output array failed!\n");
+        return -1;
+    }
+    // Create buffer for per-thread input/output data
+    if (UseSVM())
+    {
+        if (gUseHostPtr)
+            svmDataBuffer = (HostDataType *)malloc(
+                typeSize * threadCount * NumNonAtomicVariablesPerThread());
+        else
+            svmDataBuffer = (HostDataType *)clSVMAlloc(
+                context,
+                CL_MEM_SVM_FINE_GRAIN_BUFFER
+                    | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS
+                                                       : 0),
+                typeSize * threadCount * NumNonAtomicVariablesPerThread(), 0);
+        if (!svmDataBuffer)
+        {
+            log_error("ERROR: clSVMAlloc failed!\n");
+            return -1;
+        }
+        if (startRefValues.size())
+            memcpy(svmDataBuffer, &startRefValues[0],
+                   typeSize * threadCount * NumNonAtomicVariablesPerThread());
+        streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
+                                    typeSize * threadCount
+                                        * NumNonAtomicVariablesPerThread(),
+                                    svmDataBuffer, NULL);
+    }
+    else
+    {
+        streams[1] = clCreateBuffer(
+            context,
+            ((startRefValues.size() ? CL_MEM_COPY_HOST_PTR
+                                    : CL_MEM_READ_WRITE)),
+            typeSize * threadCount * NumNonAtomicVariablesPerThread(),
+            startRefValues.size() ? &startRefValues[0] : 0, NULL);
+    }
+    if (!streams[1])
+    {
+        log_error("ERROR: Creating reference array failed!\n");
+        return -1;
+    }
+    if (deviceThreadCount > 0)
+    {
+        cl_uint argInd = 0;
+        /* Set the arguments */
+        error =
+            clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount);
+        test_error(error, "Unable to set kernel argument");
+        error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems),
+                               &numDestItems);
+        test_error(error, "Unable to set indexed kernel argument");
+        error =
+            clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]);
+        test_error(error, "Unable to set indexed kernel arguments");
+        error =
+            clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]);
+        test_error(error, "Unable to set indexed kernel arguments");
+        if (LocalMemory())
+        {
+            error =
+                clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL);
+            test_error(error, "Unable to set indexed local kernel argument");
+        }
+        if (LocalRefValues())
+        {
+            error =
+                clSetKernelArg(kernel, argInd++,
+                               LocalRefValues() ? typeSize * CurrentGroupSize()
+                                       * NumNonAtomicVariablesPerThread()
+                                                : 1,
+                               NULL);
+            test_error(error, "Unable to set indexed kernel argument");
+        }
+    }
+    /* Configure host threads */
+    std::vector<THostThreadContext> hostThreadContexts(hostThreadCount);
+    for (unsigned int t = 0; t < hostThreadCount; t++)
+    {
+        hostThreadContexts[t].test = this;
+        hostThreadContexts[t].tid = deviceThreadCount + t;
+        hostThreadContexts[t].threadCount = threadCount;
+        hostThreadContexts[t].destMemory =
+            UseSVM() ? svmAtomicBuffer : &destItems[0];
+        hostThreadContexts[t].oldValues =
+            UseSVM() ? svmDataBuffer : &refValues[0];
     }
 
-    if(refValues[0] != _startValue)//destItems[0])
+    if (deviceThreadCount > 0)
     {
-      std::stringstream logLine;
-      logLine << "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
-        " (should have been " << destItems[0] << ", returned " << refValues[0] << ")!\n";
-      log_error("%s", logLine.str().c_str());
-      if(!gDebug)
-      {
-        log_info("Program source:\n");
-        log_info("%s\n", programLine);
-      }
-      return -1;
+        /* Run the kernel */
+        threadNum[0] = deviceThreadCount;
+        groupSize = CurrentGroupSize();
+        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum,
+                                       &groupSize, 0, NULL, NULL);
+        test_error(error, "Unable to execute test kernel");
+        /* start device threads */
+        error = clFlush(queue);
+        test_error(error, "clFlush failed");
+    }
+
+    /* Start host threads and wait for finish */
+    if (hostThreadCount > 0)
+        ThreadPool_Do(HostThreadFunction, hostThreadCount,
+                      &hostThreadContexts[0]);
+
+    if (UseSVM())
+    {
+        error = clFinish(queue);
+        test_error(error, "clFinish failed");
+        memcpy(&destItems[0], svmAtomicBuffer, typeSize * numDestItems);
+        memcpy(&refValues[0], svmDataBuffer,
+               typeSize * threadCount * NumNonAtomicVariablesPerThread());
     }
-  }
-  if(UseSVM())
-  {
-    // the buffer object must first be released before the SVM buffer is freed
-    error = clReleaseMemObject(streams[0]);
-    streams[0] = 0;
-    test_error(error, "clReleaseMemObject failed");
-    if(gUseHostPtr)
-      free(svmAtomicBuffer);
     else
-      clSVMFree(context, svmAtomicBuffer);
-    error = clReleaseMemObject(streams[1]);
-    streams[1] = 0;
-    test_error(error, "clReleaseMemObject failed");
-    if(gUseHostPtr)
-      free(svmDataBuffer);
-    else
-      clSVMFree(context, svmDataBuffer);
-  }
-  _passCount++;
-  return 0;
+    {
+        if (deviceThreadCount > 0)
+        {
+            error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0,
+                                        typeSize * numDestItems, &destItems[0],
+                                        0, NULL, NULL);
+            test_error(error, "Unable to read result value!");
+            error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0,
+                                        typeSize * deviceThreadCount
+                                            * NumNonAtomicVariablesPerThread(),
+                                        &refValues[0], 0, NULL, NULL);
+            test_error(error, "Unable to read reference values!");
+        }
+    }
+    bool dataVerified = false;
+    // If we have an expectedFn, then we need to generate a final value to
+    // compare against. If we don't have one, it's because we're comparing ref
+    // values only
+    for (cl_uint i = 0; i < numDestItems; i++)
+    {
+        HostDataType expected;
+
+        if (!ExpectedValue(expected, threadCount,
+                           startRefValues.size() ? &startRefValues[0] : 0, i))
+            break; // no expected value function provided
+
+        if (expected != destItems[i])
+        {
+            std::stringstream logLine;
+            logLine << "ERROR: Result " << i
+                    << " from kernel does not validate! (should be " << expected
+                    << ", was " << destItems[i] << ")\n";
+            log_error("%s", logLine.str().c_str());
+            for (i = 0; i < threadCount; i++)
+            {
+                logLine.str("");
+                logLine << " --- " << i << " - ";
+                if (startRefValues.size())
+                    logLine << startRefValues[i] << " -> " << refValues[i];
+                else
+                    logLine << refValues[i];
+                logLine << " --- ";
+                if (i < numDestItems) logLine << destItems[i];
+                logLine << "\n";
+                log_info("%s", logLine.str().c_str());
+            }
+            if (!gDebug)
+            {
+                log_info("Program source:\n");
+                log_info("%s\n", programLine);
+            }
+            return -1;
+        }
+        dataVerified = true;
+    }
+
+    bool dataCorrect = false;
+    /* Use the verify function (if provided) to also check the results */
+    if (VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0]))
+    {
+        if (!dataCorrect)
+        {
+            log_error("ERROR: Reference values did not validate!\n");
+            std::stringstream logLine;
+            for (cl_uint i = 0; i < threadCount; i++)
+                for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++)
+                {
+                    logLine.str("");
+                    logLine
+                        << " --- " << i << " - "
+                        << refValues[i * NumNonAtomicVariablesPerThread() + j]
+                        << " --- ";
+                    if (j == 0 && i < numDestItems) logLine << destItems[i];
+                    logLine << "\n";
+                    log_info("%s", logLine.str().c_str());
+                }
+            if (!gDebug)
+            {
+                log_info("Program source:\n");
+                log_info("%s\n", programLine);
+            }
+            return -1;
+        }
+    }
+    else if (!dataVerified)
+    {
+        log_error("ERROR: Test doesn't check total or refs; no values are "
+                  "verified!\n");
+        return -1;
+    }
+
+    if (OldValueCheck()
+        && !(DeclaredInProgram()
+             && !LocalMemory())) // don't test for programs scope global atomics
+                                 // 'old' value has been overwritten by previous
+                                 // clEnqueueNDRangeKernel
+    {
+        /* Re-write the starting value */
+        for (size_t i = 0; i < numDestItems; i++) destItems[i] = _startValue;
+        refValues[0] = 0;
+        if (deviceThreadCount > 0)
+        {
+            error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0,
+                                         typeSize * numDestItems, &destItems[0],
+                                         0, NULL, NULL);
+            test_error(error, "Unable to write starting values!");
+
+            /* Run the kernel once for a single thread, so we can verify that
+             * the returned value is the original one */
+            threadNum[0] = 1;
+            error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum,
+                                           threadNum, 0, NULL, NULL);
+            test_error(error, "Unable to execute test kernel");
+
+            error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize,
+                                        &refValues[0], 0, NULL, NULL);
+            test_error(error, "Unable to read reference values!");
+        }
+        else
+        {
+            /* Start host thread */
+            HostFunction(0, 1, &destItems[0], &refValues[0]);
+        }
+
+        if (refValues[0] != _startValue) // destItems[0])
+        {
+            std::stringstream logLine;
+            logLine << "ERROR: atomic function operated correctly but did NOT "
+                       "return correct 'old' value "
+                       " (should have been "
+                    << destItems[0] << ", returned " << refValues[0] << ")!\n";
+            log_error("%s", logLine.str().c_str());
+            if (!gDebug)
+            {
+                log_info("Program source:\n");
+                log_info("%s\n", programLine);
+            }
+            return -1;
+        }
+    }
+    if (UseSVM())
+    {
+        // the buffer object must first be released before the SVM buffer is
+        // freed. The Wrapper Class method reset() will do that
+        streams[0].reset();
+        if (gUseHostPtr)
+            free(svmAtomicBuffer);
+        else
+            clSVMFree(context, svmAtomicBuffer);
+        streams[1].reset();
+        if (gUseHostPtr)
+            free(svmDataBuffer);
+        else
+            clSVMFree(context, svmDataBuffer);
+    }
+    _passCount++;
+    return 0;
 }
 
 #endif //_COMMON_H_