OpenCL-CTS/test_conformance/c11_atomics/common.h

//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef _COMMON_H_
#define _COMMON_H_

#include "harness/testHarness.h"
#include "harness/typeWrappers.h"
#include "harness/ThreadPool.h"

#include "host_atomics.h"

#include <vector>
#include <sstream>

#define MAX_DEVICE_THREADS (gHost ? 0U : gMaxDeviceThreads)
#define MAX_HOST_THREADS GetThreadCount()

#define EXECUTE_TEST(error, test)\
  error |= test;\
  if(error && !gContinueOnError)\
  return error;

enum TExplicitAtomicType
{
  TYPE_ATOMIC_INT,
  TYPE_ATOMIC_UINT,
  TYPE_ATOMIC_LONG,
  TYPE_ATOMIC_ULONG,
  TYPE_ATOMIC_FLOAT,
  TYPE_ATOMIC_DOUBLE,
  TYPE_ATOMIC_INTPTR_T,
  TYPE_ATOMIC_UINTPTR_T,
  TYPE_ATOMIC_SIZE_T,
  TYPE_ATOMIC_PTRDIFF_T,
  TYPE_ATOMIC_FLAG
};

enum TExplicitMemoryScopeType
{
  MEMORY_SCOPE_EMPTY,
  MEMORY_SCOPE_WORK_GROUP,
  MEMORY_SCOPE_DEVICE,
  MEMORY_SCOPE_ALL_SVM_DEVICES
};

extern bool gHost; // temporary flag for testing native host threads (test verification)
extern bool gOldAPI; // temporary flag for testing with old API (OpenCL 1.2)
extern bool gContinueOnError; // execute all cases even when errors detected
extern bool gNoGlobalVariables; // disable cases with global atomics in program scope
extern bool gNoGenericAddressSpace; // disable cases with generic address space
extern bool gUseHostPtr; // use malloc/free instead of clSVMAlloc/clSVMFree
extern bool gDebug; // print OpenCL kernel code
extern int gInternalIterations; // internal test iterations for atomic operation, sufficient to verify atomicity
extern int gMaxDeviceThreads; // maximum number of threads executed on OCL device

extern const char *get_memory_order_type_name(TExplicitMemoryOrderType orderType);
extern const char *get_memory_scope_type_name(TExplicitMemoryScopeType scopeType);

class AtomicTypeInfo
{
public:
  TExplicitAtomicType _type;
  AtomicTypeInfo(TExplicitAtomicType type): _type(type) {}
  cl_uint Size(cl_device_id device);
  const char* AtomicTypeName();
  const char* RegularTypeName();
  const char* AddSubOperandTypeName();
  int IsSupported(cl_device_id device);
};

template<typename HostDataType>
class AtomicTypeExtendedInfo : public AtomicTypeInfo
{
public:
  AtomicTypeExtendedInfo(TExplicitAtomicType type) : AtomicTypeInfo(type) {}
  HostDataType MinValue();
  HostDataType MaxValue();
  HostDataType SpecialValue(cl_uchar x)
  {
    HostDataType tmp;
    cl_uchar *ptr = (cl_uchar*)&tmp;
    for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_uchar); i++)
      ptr[i] = x;
    return tmp;
  }
  HostDataType SpecialValue(cl_ushort x)
  {
    HostDataType tmp;
    cl_ushort *ptr = (cl_ushort*)&tmp;
    for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_ushort); i++)
      ptr[i] = x;
    return tmp;
  }
};

class CTest  {
public:
  virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) = 0;
};

template<typename HostAtomicType, typename HostDataType>
class CBasicTest : CTest
{
public:
  typedef struct {
    CBasicTest *test;
    cl_uint tid;
    cl_uint threadCount;
    volatile HostAtomicType *destMemory;
    HostDataType *oldValues;
  } THostThreadContext;
  static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id, void *userInfo)
  {
    THostThreadContext *threadContext = ((THostThreadContext*)userInfo)+job_id;
    threadContext->test->HostFunction(threadContext->tid, threadContext->threadCount, threadContext->destMemory, threadContext->oldValues);
    return 0;
  }
  CBasicTest(TExplicitAtomicType dataType, bool useSVM) : CTest(),
    _maxDeviceThreads(MAX_DEVICE_THREADS),
    _dataType(dataType), _useSVM(useSVM), _startValue(255),
    _localMemory(false), _declaredInProgram(false),
    _usedInFunction(false), _genericAddrSpace(false),
    _oldValueCheck(true), _localRefValues(false),
    _maxGroupSize(0), _passCount(0), _iterations(gInternalIterations)
  {
  }
  virtual ~CBasicTest()
  {
    if(_passCount)
      log_info("  %u tests executed successfully for %s\n", _passCount, DataType().AtomicTypeName());
  }
  virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
  {
    return 1;
  }
  virtual cl_uint NumNonAtomicVariablesPerThread()
  {
    return 1;
  }
  virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
  {
    return false;
  }
  virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d)
  {
    return false;
  }
  virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
  {
    return false;
  }
  virtual std::string PragmaHeader(cl_device_id deviceID);
  virtual std::string ProgramHeader(cl_uint maxNumDestItems);
  virtual std::string FunctionCode();
  virtual std::string KernelCode(cl_uint maxNumDestItems);
  virtual std::string ProgramCore() = 0;
  virtual std::string SingleTestName()
  {
    std::string testName = LocalMemory() ? "local" : "global";
    testName += " ";
    testName += DataType().AtomicTypeName();
    if(DeclaredInProgram())
    {
      testName += " declared in program";
    }
    if(DeclaredInProgram() && UsedInFunction())
      testName += ",";
    if(UsedInFunction())
    {
      testName += " used in ";
      if(GenericAddrSpace())
        testName += "generic ";
      testName += "function";
    }
    return testName;
  }
  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue);
  int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
  {
    int error = 0;
    UsedInFunction(false);
    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
    UsedInFunction(true);
    GenericAddrSpace(false);
    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
    GenericAddrSpace(true);
    EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
    GenericAddrSpace(false);
    return error;
  }
  int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
  {
    int error = 0;
    DeclaredInProgram(false);
    EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
    if(!UseSVM())
    {
      DeclaredInProgram(true);
      EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
    }
    return error;
  }
  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
  {
    int error = 0;
    if(_maxDeviceThreads > 0 && !UseSVM())
    {
      LocalMemory(true);
      EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
    }
    if(_maxDeviceThreads+MaxHostThreads() > 0)
    {
      LocalMemory(false);
      EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
    }
    return error;
  }
  virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
  {
    if(sizeof(HostAtomicType) != DataType().Size(deviceID))
    {
      log_info("Invalid test: Host atomic type size (%u) is different than OpenCL type size (%u)\n", (cl_uint)sizeof(HostAtomicType), DataType().Size(deviceID));
      return -1;
    }
    if(sizeof(HostAtomicType) != sizeof(HostDataType))
    {
      log_info("Invalid test: Host atomic type size (%u) is different than corresponding type size (%u)\n", (cl_uint)sizeof(HostAtomicType), (cl_uint)sizeof(HostDataType));
      return -1;
    }
    // Verify we can run first
    if(UseSVM() && !gUseHostPtr)
    {
      cl_device_svm_capabilities caps;
      cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, sizeof(caps), &caps, 0);
      test_error(error, "clGetDeviceInfo failed");
      if((caps & CL_DEVICE_SVM_ATOMICS) == 0)
      {
        log_info("\t%s - SVM_ATOMICS not supported\n", DataType().AtomicTypeName());
        // implicit pass
        return 0;
      }
    }
    if(!DataType().IsSupported(deviceID))
    {
      log_info("\t%s not supported\n", DataType().AtomicTypeName());
      // implicit pass or host test (debug feature)
      if(UseSVM())
        return 0;
      _maxDeviceThreads = 0;
    }
    if(_maxDeviceThreads+MaxHostThreads() == 0)
      return 0;
    return ExecuteForEachParameterSet(deviceID, context, queue);
  }
  virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
  {
    log_info("Empty thread function %u\n", (cl_uint)tid);
  }
  AtomicTypeExtendedInfo<HostDataType> DataType() const
  {
    return AtomicTypeExtendedInfo<HostDataType>(_dataType);
  }
  cl_uint _maxDeviceThreads;
  virtual cl_uint MaxHostThreads()
  {
    if(UseSVM() || gHost)
      return MAX_HOST_THREADS;
    else
      return 0;
  }
  virtual bool SVMDataBufferAllSVMConsistent() {return false;}
  bool UseSVM() {return _useSVM;}
  void StartValue(HostDataType startValue) {_startValue = startValue;}
  HostDataType StartValue() {return _startValue;}
  void LocalMemory(bool local) {_localMemory = local;}
  bool LocalMemory() {return _localMemory;}
  void DeclaredInProgram(bool declaredInProgram) {_declaredInProgram = declaredInProgram;}
  bool DeclaredInProgram() {return _declaredInProgram;}
  void UsedInFunction(bool local) {_usedInFunction = local;}
  bool UsedInFunction() {return _usedInFunction;}
  void GenericAddrSpace(bool genericAddrSpace) {_genericAddrSpace = genericAddrSpace;}
  bool GenericAddrSpace() {return _genericAddrSpace;}
  void OldValueCheck(bool check) {_oldValueCheck = check;}
  bool OldValueCheck() {return _oldValueCheck;}
  void LocalRefValues(bool localRefValues) {_localRefValues = localRefValues;}
  bool LocalRefValues() {return _localRefValues;}
  void MaxGroupSize(cl_uint maxGroupSize) {_maxGroupSize = maxGroupSize;}
  cl_uint MaxGroupSize() {return _maxGroupSize;}
  void CurrentGroupSize(cl_uint currentGroupSize)
  {
    if(MaxGroupSize() && MaxGroupSize() < currentGroupSize)
      _currentGroupSize = MaxGroupSize();
    else
      _currentGroupSize = currentGroupSize;
  }
  cl_uint CurrentGroupSize() {return _currentGroupSize;}
  virtual cl_uint CurrentGroupNum(cl_uint threadCount)
  {
    if(threadCount == 0)
      return 0;
    if(LocalMemory())
      return 1;
    return threadCount/CurrentGroupSize();
  }
  cl_int Iterations() {return _iterations;}
  std::string IterationsStr() {std::stringstream ss; ss << _iterations; return ss.str();}
private:
  const TExplicitAtomicType _dataType;
  const bool _useSVM;
  HostDataType	_startValue;
  bool _localMemory;
  bool _declaredInProgram;
  bool _usedInFunction;
  bool _genericAddrSpace;
  bool _oldValueCheck;
  bool _localRefValues;
  cl_uint _maxGroupSize;
  cl_uint _currentGroupSize;
  cl_uint _passCount;
  const cl_int _iterations;
};

template<typename HostAtomicType, typename HostDataType>
class CBasicTestMemOrderScope : public CBasicTest<HostAtomicType, HostDataType>
{
public:
  using CBasicTest<HostAtomicType, HostDataType>::LocalMemory;
  using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize;
  CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
  {
  }
  virtual std::string ProgramHeader(cl_uint maxNumDestItems)
  {
    std::string header;
    if(gOldAPI)
    {
      std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s";
      header +=
        "#define atomic_store_explicit(x,y,o"+s+")                     atomic_store(x,y)\n"
        "#define atomic_load_explicit(x,o"+s+")                        atomic_load(x)\n"
        "#define atomic_exchange_explicit(x,y,o"+s+")                  atomic_exchange(x,y)\n"
        "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_strong(x,y,z)\n"
        "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"+s+")   atomic_compare_exchange_weak(x,y,z)\n"
        "#define atomic_fetch_add_explicit(x,y,o"+s+")                 atomic_fetch_add(x,y)\n"
        "#define atomic_fetch_sub_explicit(x,y,o"+s+")                 atomic_fetch_sub(x,y)\n"
        "#define atomic_fetch_or_explicit(x,y,o"+s+")                  atomic_fetch_or(x,y)\n"
        "#define atomic_fetch_xor_explicit(x,y,o"+s+")                 atomic_fetch_xor(x,y)\n"
        "#define atomic_fetch_and_explicit(x,y,o"+s+")                 atomic_fetch_and(x,y)\n"
        "#define atomic_fetch_min_explicit(x,y,o"+s+")                 atomic_fetch_min(x,y)\n"
        "#define atomic_fetch_max_explicit(x,y,o"+s+")                 atomic_fetch_max(x,y)\n"
        "#define atomic_flag_test_and_set_explicit(x,o"+s+")           atomic_flag_test_and_set(x)\n"
        "#define atomic_flag_clear_explicit(x,o"+s+")                  atomic_flag_clear(x)\n";
    }
    return header+CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(maxNumDestItems);
  }
  virtual std::string SingleTestName()
  {
    std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
    {
      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
    }
    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
    {
      testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
    }
    return testName;
  }
  virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
  {
    if(LocalMemory() &&
      MemoryScope() != MEMORY_SCOPE_EMPTY &&
      MemoryScope() != MEMORY_SCOPE_WORK_GROUP) //memory scope should only be used for global memory
      return 0;
    if(MemoryScope() == MEMORY_SCOPE_DEVICE)
      MaxGroupSize(16); // increase number of groups by forcing smaller group size
    else
      MaxGroupSize(0); // group size limited by device capabilities
    return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue);
  }
  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
  {
    // repeat test for each reasonable memory order/scope combination
    std::vector<TExplicitMemoryOrderType> memoryOrder;
    std::vector<TExplicitMemoryScopeType> memoryScope;
    int error = 0;

    memoryOrder.push_back(MEMORY_ORDER_EMPTY);
    memoryOrder.push_back(MEMORY_ORDER_RELAXED);
    memoryOrder.push_back(MEMORY_ORDER_ACQUIRE);
    memoryOrder.push_back(MEMORY_ORDER_RELEASE);
    memoryOrder.push_back(MEMORY_ORDER_ACQ_REL);
    memoryOrder.push_back(MEMORY_ORDER_SEQ_CST);
    memoryScope.push_back(MEMORY_SCOPE_EMPTY);
    memoryScope.push_back(MEMORY_SCOPE_WORK_GROUP);
    memoryScope.push_back(MEMORY_SCOPE_DEVICE);
    memoryScope.push_back(MEMORY_SCOPE_ALL_SVM_DEVICES);

    for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
    {
      for(unsigned si = 0; si < memoryScope.size(); si++)
      {
        if(memoryOrder[oi] == MEMORY_ORDER_EMPTY && memoryScope[si] != MEMORY_SCOPE_EMPTY)
          continue;
        MemoryOrder(memoryOrder[oi]);
        MemoryScope(memoryScope[si]);
        EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
      }
    }
    return error;
  }
  void MemoryOrder(TExplicitMemoryOrderType memoryOrder) {_memoryOrder = memoryOrder;}
  TExplicitMemoryOrderType MemoryOrder() {return _memoryOrder;}
  std::string MemoryOrderStr()
  {
    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
      return std::string(", ")+get_memory_order_type_name(MemoryOrder());
    return "";
  }
  void MemoryScope(TExplicitMemoryScopeType memoryScope) {_memoryScope = memoryScope;}
  TExplicitMemoryScopeType MemoryScope() {return _memoryScope;}
  std::string MemoryScopeStr()
  {
    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
      return std::string(", ")+get_memory_scope_type_name(MemoryScope());
    return "";
  }
  std::string MemoryOrderScopeStr()
  {
    return MemoryOrderStr()+MemoryScopeStr();
  }
  virtual cl_uint CurrentGroupNum(cl_uint threadCount)
  {
    if(MemoryScope() == MEMORY_SCOPE_WORK_GROUP)
      return 1;
    return CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum(threadCount);
  }
  virtual cl_uint MaxHostThreads()
  {
    // block host threads execution for memory scope different than memory_scope_all_svm_devices
    if(MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost)
      return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads();
    else
      return 0;
  }
private:
  TExplicitMemoryOrderType _memoryOrder;
  TExplicitMemoryScopeType _memoryScope;
};

template<typename HostAtomicType, typename HostDataType>
class CBasicTestMemOrder2Scope : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
{
public:
  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr;
  using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
  CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
  {
  }
  virtual std::string SingleTestName()
  {
    std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
    if(MemoryOrder() != MEMORY_ORDER_EMPTY)
      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
    if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
      testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder2())).substr(sizeof("memory"));
    if(MemoryScope() != MEMORY_SCOPE_EMPTY)
      testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
    return testName;
  }
  virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
  {
    // repeat test for each reasonable memory order/scope combination
    std::vector<TExplicitMemoryOrderType> memoryOrder;
    std::vector<TExplicitMemoryScopeType> memoryScope;
    int error = 0;

    memoryOrder.push_back(MEMORY_ORDER_EMPTY);
    memoryOrder.push_back(MEMORY_ORDER_RELAXED);
    memoryOrder.push_back(MEMORY_ORDER_ACQUIRE);
    memoryOrder.push_back(MEMORY_ORDER_RELEASE);
    memoryOrder.push_back(MEMORY_ORDER_ACQ_REL);
    memoryOrder.push_back(MEMORY_ORDER_SEQ_CST);
    memoryScope.push_back(MEMORY_SCOPE_EMPTY);
    memoryScope.push_back(MEMORY_SCOPE_WORK_GROUP);
    memoryScope.push_back(MEMORY_SCOPE_DEVICE);
    memoryScope.push_back(MEMORY_SCOPE_ALL_SVM_DEVICES);

    for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
    {
      for(unsigned o2i = 0; o2i < memoryOrder.size(); o2i++)
      {
        for(unsigned si = 0; si < memoryScope.size(); si++)
        {
          if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
            && memoryOrder[oi] != memoryOrder[o2i])
            continue; // both memory order arguments must be set (or none)
          if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
            && memoryScope[si] != MEMORY_SCOPE_EMPTY)
            continue; // memory scope without memory order is not allowed
          MemoryOrder(memoryOrder[oi]);
          MemoryOrder2(memoryOrder[o2i]);
          MemoryScope(memoryScope[si]);
          EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
        }
      }
    }
    return error;
  }
  void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail) {_memoryOrder2 = memoryOrderFail;}
  TExplicitMemoryOrderType MemoryOrder2() {return _memoryOrder2;}
  std::string MemoryOrderFailStr()
  {
    if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
      return std::string(", ")+get_memory_order_type_name(MemoryOrder2());
    return "";
  }
  std::string MemoryOrderScope()
  {
    return MemoryOrderStr()+MemoryOrderFailStr()+MemoryScopeStr();
  }
private:
  TExplicitMemoryOrderType _memoryOrder2;
};

template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID)
{
  std::string pragma;

  if(gOldAPI)
  {
    pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n";
    pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n";
    pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n";
    pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n";
  }
  // Create the pragma lines for this kernel
  if(DataType().Size(deviceID) == 8)
  {
    pragma += "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
    pragma += "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
  }
  if(_dataType == TYPE_ATOMIC_DOUBLE)
    pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
  return pragma;
}

template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
{
  // Create the program header
  std::string header;
  std::string aTypeName = DataType().AtomicTypeName();
  std::string cTypeName = DataType().RegularTypeName();
  std::string argListForKernel;
  std::string argListForFunction;
  std::string argListNoTypes;
  std::string functionPrototype;
  std::string addressSpace = LocalMemory() ? "__local " : "__global ";

  if(gOldAPI)
  {
    header += std::string("#define ")+aTypeName+" "+cTypeName+"\n"
      "#define atomic_store(x,y)                                (*(x) = y)\n"
      "#define atomic_load(x)                                   (*(x))\n"
      "#define ATOMIC_VAR_INIT(x)                               (x)\n"
      "#define ATOMIC_FLAG_INIT                                 0\n"
      "#define atomic_init(x,y)                                 atomic_store(x,y)\n";
    if(aTypeName == "atomic_float")
      header += "#define atomic_exchange(x,y)                             atomic_xchg(x,y)\n";
    else if(aTypeName == "atomic_double")
      header += "double atomic_exchange(volatile "+addressSpace+"atomic_double *x, double y)\n"
        "{\n"
        "  long tmp = *(long*)&y, res;\n"
        "  volatile "+addressSpace+"long *tmpA = (volatile "+addressSpace+"long)x;\n"
        "  res = atom_xchg(tmpA,tmp);\n"
        "  return *(double*)&res;\n"
        "}\n";
    else
      header += "#define atomic_exchange(x,y)                             atom_xchg(x,y)\n";
    if(aTypeName != "atomic_float" && aTypeName != "atomic_double")
      header +=
      "bool atomic_compare_exchange_strong(volatile "+addressSpace+" "+aTypeName+" *a, "+cTypeName+" *expected, "+cTypeName+" desired)\n"
      "{\n"
      "  "+cTypeName+" old = atom_cmpxchg(a, *expected, desired);\n"
      "  if(old == *expected)\n"
      "    return true;\n"
      "  *expected = old;\n"
      "  return false;\n"
      "}\n"
      "#define atomic_compare_exchange_weak                     atomic_compare_exchange_strong\n";
    header +=
      "#define atomic_fetch_add(x,y)                            atom_add(x,y)\n"
      "#define atomic_fetch_sub(x,y)                            atom_sub(x,y)\n"
      "#define atomic_fetch_or(x,y)                             atom_or(x,y)\n"
      "#define atomic_fetch_xor(x,y)                            atom_xor(x,y)\n"
      "#define atomic_fetch_and(x,y)                            atom_and(x,y)\n"
      "#define atomic_fetch_min(x,y)                            atom_min(x,y)\n"
      "#define atomic_fetch_max(x,y)                            atom_max(x,y)\n"
      "#define atomic_flag_test_and_set(x)                      atomic_exchange(x,1)\n"
      "#define atomic_flag_clear(x)                             atomic_store(x,0)\n"
      "\n";
  }
  if(!LocalMemory() && DeclaredInProgram())
  {
    // additional atomic variable for results copying (last thread will do this)
    header += "__global volatile atomic_uint finishedThreads = ATOMIC_VAR_INIT(0);\n";
    // atomic variables declared in program scope - test data
    std::stringstream ss;
    ss << maxNumDestItems;
    header += std::string("__global volatile ")+aTypeName+" destMemory["+ss.str()+"] = {\n";
    ss.str("");
    ss << _startValue;
    for(cl_uint i = 0; i < maxNumDestItems; i++)
    {
      if(aTypeName == "atomic_flag")
        header +=  "  ATOMIC_FLAG_INIT";
      else
        header +=  "  ATOMIC_VAR_INIT("+ss.str()+")";
      if(i+1 < maxNumDestItems)
        header += ",";
      header += "\n";
    }
    header+=
      "};\n"
      "\n";
  }
  return header;
}

template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::FunctionCode()
{
  if(!UsedInFunction())
    return "";
  std::string addressSpace = LocalMemory() ? "__local " : "__global ";
  std::string code = "void test_atomic_function(uint tid, uint threadCount, uint numDestItems, volatile ";
  if(!GenericAddrSpace())
    code += addressSpace;
  code += std::string(DataType().AtomicTypeName())+" *destMemory, __global "+DataType().RegularTypeName()+
    " *oldValues";
  if(LocalRefValues())
    code += std::string(", __local ")+DataType().RegularTypeName()+" *localValues";
  code += ")\n"
    "{\n";
  code += ProgramCore();
  code += "}\n"
    "\n";
  return code;
}

template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
{
  std::string aTypeName = DataType().AtomicTypeName();
  std::string cTypeName = DataType().RegularTypeName();
  std::string addressSpace = LocalMemory() ? "__local " : "__global ";
  std::string code = "__kernel void test_atomic_kernel(uint threadCount, uint numDestItems, ";

  // prepare list of arguments for kernel
  if(LocalMemory())
  {
    code += std::string("__global ")+cTypeName+" *finalDest, __global "+cTypeName+" *oldValues,"
      " volatile "+addressSpace+aTypeName+" *"+(DeclaredInProgram() ? "notUsed" : "")+"destMemory";
  }
  else
  {
    code += "volatile "+addressSpace+(DeclaredInProgram() ? (cTypeName+" *finalDest") : (aTypeName+" *destMemory"))+
      ", __global "+cTypeName+" *oldValues";
  }
  if(LocalRefValues())
    code += std::string(", __local ")+cTypeName+" *localValues";
  code += ")\n"
    "{\n";
  if(LocalMemory() && DeclaredInProgram())
  {
    // local atomics declared in kernel scope
    std::stringstream ss;
    ss << maxNumDestItems;
    code += std::string("  __local volatile ")+aTypeName+" destMemory["+ss.str()+"];\n";
  }
  code += "  uint  tid = get_global_id(0);\n"
    "\n";
  if(LocalMemory())
  {
    code +=
      "  // initialize atomics not reachable from host (first thread is doing this, other threads are waiting on barrier)\n"
      "  if(get_local_id(0) == 0)\n"
      "    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)\n"
      "    {\n";
    if(aTypeName == "atomic_flag")
    {
      code +=
        "      if(finalDest[dstItemIdx])\n"
        "        atomic_flag_test_and_set(destMemory+dstItemIdx);\n"
        "      else\n"
        "        atomic_flag_clear(destMemory+dstItemIdx);\n";
    }
    else
    {
      code +=
        "      atomic_store(destMemory+dstItemIdx, finalDest[dstItemIdx]);\n";
    }
    code +=
      "    }\n"
      "  barrier(CLK_LOCAL_MEM_FENCE);\n"
      "\n";
  }
  if (LocalRefValues())
  {
    code +=
      "  // Copy input reference values into local memory\n";
    if (NumNonAtomicVariablesPerThread() == 1)
      code += "  localValues[get_local_id(0)] = oldValues[tid];\n";
    else
    {
      std::stringstream ss;
      ss << NumNonAtomicVariablesPerThread();
      code +=
        "  for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
        "    localValues[get_local_id(0)*" + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n";
    }
    code +=
      "  barrier(CLK_LOCAL_MEM_FENCE);\n"
      "\n";
  }
  if (UsedInFunction())
    code += std::string("  test_atomic_function(tid, threadCount, numDestItems, destMemory, oldValues")+
    (LocalRefValues() ? ", localValues" : "")+");\n";
  else
    code += ProgramCore();
  code += "\n";
  if (LocalRefValues())
  {
    code +=
      "  // Copy local reference values into output array\n"
      "  barrier(CLK_LOCAL_MEM_FENCE);\n";
    if (NumNonAtomicVariablesPerThread() == 1)
      code += "  oldValues[tid] = localValues[get_local_id(0)];\n";
    else
    {
      std::stringstream ss;
      ss << NumNonAtomicVariablesPerThread();
      code +=
        "  for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
        "    oldValues[tid*" + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str() + "+rfId];\n";
    }
    code += "\n";
  }
  if(LocalMemory() || DeclaredInProgram())
  {
    code += "  // Copy final values to host reachable buffer\n";
    if(LocalMemory())
      code +=
        "  barrier(CLK_LOCAL_MEM_FENCE);\n"
        "  if(get_local_id(0) == 0) // first thread in workgroup\n";
    else
      // global atomics declared in program scope
      code +=
      "  if(atomic_fetch_add(&finishedThreads, 1) == get_global_size(0)-1)\n"
      "    // last finished thread\n";
    code +=
        "    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)\n";
    if(aTypeName == "atomic_flag")
    {
      code +=
        "      finalDest[dstItemIdx] = atomic_flag_test_and_set(destMemory+dstItemIdx);\n";
    }
    else
    {
      code +=
        "      finalDest[dstItemIdx] = atomic_load(destMemory+dstItemIdx);\n";
    }
  }
  code += "}\n"
    "\n";
  return code;
}

template <typename HostAtomicType, typename HostDataType>
int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
{
  int error;
  clProgramWrapper program;
  clKernelWrapper kernel;
  size_t threadNum[1];
  clMemWrapper streams[2];
  std::vector<HostAtomicType> destItems;
  HostAtomicType *svmAtomicBuffer = 0;
  std::vector<HostDataType> refValues, startRefValues;
  HostDataType *svmDataBuffer = 0;
  cl_uint deviceThreadCount, hostThreadCount, threadCount;
  size_t groupSize = 0;
  std::string programSource;
  const char *programLine;
  MTdata d;
  size_t typeSize = DataType().Size(deviceID);

  deviceThreadCount = _maxDeviceThreads;
  hostThreadCount = MaxHostThreads();
  threadCount = deviceThreadCount+hostThreadCount;

  //log_info("\t%s %s%s...\n", local ? "local" : "global", DataType().AtomicTypeName(), memoryOrderScope.c_str());
  log_info("\t%s...\n", SingleTestName().c_str());

  if(!LocalMemory() && DeclaredInProgram() && gNoGlobalVariables) // no support for program scope global variables
  {
    log_info("\t\tTest disabled\n");
    return 0;
  }
  if(UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace)
  {
    log_info("\t\tTest disabled\n");
    return 0;
  }

  // set up work sizes based on device capabilities and test configuration
  error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL);
  test_error(error, "Unable to obtain max work group size for device");
  CurrentGroupSize((cl_uint)groupSize);
  if(CurrentGroupSize() > deviceThreadCount)
    CurrentGroupSize(deviceThreadCount);
  if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
    deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
  threadCount = deviceThreadCount+hostThreadCount;

  // If we're given a num_results function, we need to determine how many result objects we need.
  // This is the first assessment for current maximum number of threads (exact thread count is not known here)
  // - needed for program source code generation (arrays of atomics declared in program)
  cl_uint numDestItems = NumResults(threadCount, deviceID);

  if(deviceThreadCount > 0)
  {
    cl_ulong usedLocalMemory;
    cl_ulong totalLocalMemory;
    cl_uint maxWorkGroupSize;

    // Set up the kernel code
    programSource = PragmaHeader(deviceID)+ProgramHeader(numDestItems)+FunctionCode()+KernelCode(numDestItems);
    programLine = programSource.c_str();
    if(create_single_kernel_helper_with_build_options(context, &program, &kernel, 1, &programLine, "test_atomic_kernel",
      gOldAPI ? "" : "-cl-std=CL2.0"))
    {
      return -1;
    }
    if(gDebug)
    {
      log_info("Program source:\n");
      log_info("%s\n", programLine);
    }
    // tune up work sizes based on kernel info
    error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL);
    test_error(error, "Unable to obtain max work group size for device and kernel combo");

    if(LocalMemory())
    {
      error = clGetKernelWorkGroupInfo (kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(usedLocalMemory), &usedLocalMemory, NULL);
      test_error(error, "clGetKernelWorkGroupInfo failed");

      error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(totalLocalMemory), &totalLocalMemory, NULL);
      test_error(error, "clGetDeviceInfo failed");

      // We know that each work-group is going to use typeSize * deviceThreadCount bytes of local memory
      // so pick the maximum value for deviceThreadCount that uses all the local memory.
      maxWorkGroupSize = ((totalLocalMemory - usedLocalMemory) / typeSize);

      if(maxWorkGroupSize < groupSize)
        groupSize = maxWorkGroupSize;
    }

    CurrentGroupSize((cl_uint)groupSize);
    if(CurrentGroupSize() > deviceThreadCount)
      CurrentGroupSize(deviceThreadCount);
    if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
      deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
    threadCount = deviceThreadCount+hostThreadCount;
  }
  if(deviceThreadCount > 0)
    log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, CurrentGroupSize());
  if(hostThreadCount > 0)
    log_info("\t\t(host threads %u)\n", hostThreadCount);

  refValues.resize(threadCount*NumNonAtomicVariablesPerThread());

  // Generate ref data if we have a ref generator provided
  d = init_genrand(gRandomSeed);
  startRefValues.resize(threadCount*NumNonAtomicVariablesPerThread());
  if(GenerateRefs(threadCount, &startRefValues[0], d))
  {
    //copy ref values for host threads
    memcpy(&refValues[0], &startRefValues[0], sizeof(HostDataType)*threadCount*NumNonAtomicVariablesPerThread());
  }
  else
  {
    startRefValues.resize(0);
  }
  free_mtdata(d);
  d = NULL;

  // If we're given a num_results function, we need to determine how many result objects we need. If
  // we don't have it, we assume it's just 1
  // This is final value (exact thread count is known in this place)
  numDestItems = NumResults(threadCount, deviceID);

  destItems.resize(numDestItems);
  for(cl_uint i = 0; i < numDestItems; i++)
    destItems[i] = _startValue;

  // Create main buffer with atomic variables (array size dependent on particular test)
  if(UseSVM())
  {
    if(gUseHostPtr)
      svmAtomicBuffer = (HostAtomicType*)malloc(typeSize * numDestItems);
    else
      svmAtomicBuffer = (HostAtomicType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, typeSize * numDestItems, 0);
    if(!svmAtomicBuffer)
    {
      log_error("ERROR: clSVMAlloc failed!\n");
      return -1;
    }
    memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems);
    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_USE_HOST_PTR), typeSize * numDestItems, svmAtomicBuffer, NULL);
  }
  else
  {
    streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), typeSize * numDestItems, &destItems[0], NULL);
  }
  if (!streams[0])
  {
    log_error("ERROR: Creating output array failed!\n");
    return -1;
  }
  // Create buffer for per-thread input/output data
  if(UseSVM())
  {
    if(gUseHostPtr)
      svmDataBuffer = (HostDataType*)malloc(typeSize*threadCount*NumNonAtomicVariablesPerThread());
    else
      svmDataBuffer = (HostDataType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS : 0), typeSize*threadCount*NumNonAtomicVariablesPerThread(), 0);
    if(!svmDataBuffer)
    {
      log_error("ERROR: clSVMAlloc failed!\n");
      return -1;
    }
    if(startRefValues.size())
      memcpy(svmDataBuffer, &startRefValues[0], typeSize*threadCount*NumNonAtomicVariablesPerThread());
    streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_USE_HOST_PTR), typeSize*threadCount*NumNonAtomicVariablesPerThread(), svmDataBuffer, NULL);
  }
  else
  {
    streams[1] = clCreateBuffer(context, (cl_mem_flags)((startRefValues.size() ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)),
      typeSize * threadCount*NumNonAtomicVariablesPerThread(), startRefValues.size() ? &startRefValues[0] : 0, NULL);
  }
  if (!streams[1])
  {
    log_error("ERROR: Creating reference array failed!\n");
    return -1;
  }
  if(deviceThreadCount > 0)
  {
    cl_uint argInd = 0;
    /* Set the arguments */
    error = clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount);
    test_error(error, "Unable to set kernel argument");
    error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems), &numDestItems);
    test_error(error, "Unable to set indexed kernel argument");
    error = clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]);
    test_error(error, "Unable to set indexed kernel arguments");
    error = clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]);
    test_error(error, "Unable to set indexed kernel arguments");
    if(LocalMemory())
    {
      error = clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL);
      test_error(error, "Unable to set indexed local kernel argument");
    }
    if(LocalRefValues())
    {
      error = clSetKernelArg(kernel, argInd++, LocalRefValues() ? typeSize*CurrentGroupSize()*NumNonAtomicVariablesPerThread() : 1, NULL);
      test_error(error, "Unable to set indexed kernel argument");
    }
  }
  /* Configure host threads */
  std::vector<THostThreadContext> hostThreadContexts(hostThreadCount);
  for(unsigned int t = 0; t < hostThreadCount; t++)
  {
    hostThreadContexts[t].test = this;
    hostThreadContexts[t].tid = deviceThreadCount+t;
    hostThreadContexts[t].threadCount = threadCount;
    hostThreadContexts[t].destMemory = UseSVM() ? svmAtomicBuffer : &destItems[0];
    hostThreadContexts[t].oldValues = UseSVM() ? svmDataBuffer : &refValues[0];
  }

  if(deviceThreadCount > 0)
  {
    /* Run the kernel */
    threadNum[0] = deviceThreadCount;
    groupSize = CurrentGroupSize();
    error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, &groupSize, 0, NULL, NULL);
    test_error(error, "Unable to execute test kernel");
    /* start device threads */
    error = clFlush(queue);
    test_error(error, "clFlush failed");
  }

  /* Start host threads and wait for finish */
  if(hostThreadCount > 0)
    ThreadPool_Do(HostThreadFunction, hostThreadCount, &hostThreadContexts[0]);

  if(UseSVM())
  {
    error = clFinish(queue);
    test_error(error, "clFinish failed");
    memcpy(&destItems[0], svmAtomicBuffer, typeSize*numDestItems);
    memcpy(&refValues[0], svmDataBuffer, typeSize*threadCount*NumNonAtomicVariablesPerThread());
  }
  else
  {
    if(deviceThreadCount > 0)
    {
      error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
      test_error(error, "Unable to read result value!");
      error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize * deviceThreadCount*NumNonAtomicVariablesPerThread(), &refValues[0], 0, NULL, NULL);
      test_error(error, "Unable to read reference values!");
    }
  }
  bool dataVerified = false;
  // If we have an expectedFn, then we need to generate a final value to compare against. If we don't
  // have one, it's because we're comparing ref values only
  for(cl_uint i = 0; i < numDestItems; i++)
  {
    HostDataType expected;

    if(!ExpectedValue(expected, threadCount, startRefValues.size() ? &startRefValues[0] : 0, i))
      break; // no expected value function provided

    if(expected != destItems[i])
    {
      std::stringstream logLine;
      logLine << "ERROR: Result " << i << " from kernel does not validate! (should be " << expected << ", was " << destItems[i] << ")\n";
      log_error("%s", logLine.str().c_str());
      for(i = 0; i < threadCount; i++)
      {
        logLine.str("");
        logLine << " --- " << i << " - ";
        if(startRefValues.size())
          logLine << startRefValues[i] << " -> " << refValues[i];
        else
          logLine << refValues[i];
        logLine << " --- ";
        if(i < numDestItems)
          logLine << destItems[i];
        logLine << "\n";
        log_info("%s", logLine.str().c_str());
      }
      if(!gDebug)
      {
        log_info("Program source:\n");
        log_info("%s\n", programLine);
      }
      return -1;
    }
    dataVerified = true;
  }

  bool dataCorrect = false;
  /* Use the verify function (if provided) to also check the results */
  if(VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0]))
  {
    if(!dataCorrect)
    {
      log_error("ERROR: Reference values did not validate!\n");
      std::stringstream logLine;
      for(cl_uint i = 0; i < threadCount; i++)
      for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++)
      {
        logLine.str("");
        logLine << " --- " << i << " - " << refValues[i*NumNonAtomicVariablesPerThread()+j] << " --- ";
        if(j == 0 && i < numDestItems)
          logLine << destItems[i];
        logLine << "\n";
        log_info("%s", logLine.str().c_str());
      }
      if(!gDebug)
      {
        log_info("Program source:\n");
        log_info("%s\n", programLine);
      }
      return -1;
    }
  }
  else if(!dataVerified)
  {
    log_error("ERROR: Test doesn't check total or refs; no values are verified!\n");
    return -1;
  }

  if(OldValueCheck() &&
    !(DeclaredInProgram() && !LocalMemory())) // don't test for programs scope global atomics
                                             // 'old' value has been overwritten by previous clEnqueueNDRangeKernel
  {
    /* Re-write the starting value */
    for(size_t i = 0; i < numDestItems; i++)
      destItems[i] = _startValue;
    refValues[0] = 0;
    if(deviceThreadCount > 0)
    {
      error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
      test_error(error, "Unable to write starting values!");

      /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */
      threadNum[0] = 1;
      error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, threadNum, 0, NULL, NULL);
      test_error(error, "Unable to execute test kernel");

      error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize, &refValues[0], 0, NULL, NULL);
      test_error(error, "Unable to read reference values!");
    }
    else
    {
      /* Start host thread */
      HostFunction(0, 1, &destItems[0], &refValues[0]);
    }

    if(refValues[0] != _startValue)//destItems[0])
    {
      std::stringstream logLine;
      logLine << "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
        " (should have been " << destItems[0] << ", returned " << refValues[0] << ")!\n";
      log_error("%s", logLine.str().c_str());
      if(!gDebug)
      {
        log_info("Program source:\n");
        log_info("%s\n", programLine);
      }
      return -1;
    }
  }
  if(UseSVM())
  {
    // the buffer object must first be released before the SVM buffer is freed
    error = clReleaseMemObject(streams[0]);
    streams[0] = 0;
    test_error(error, "clReleaseMemObject failed");
    if(gUseHostPtr)
      free(svmAtomicBuffer);
    else
      clSVMFree(context, svmAtomicBuffer);
    error = clReleaseMemObject(streams[1]);
    streams[1] = 0;
    test_error(error, "clReleaseMemObject failed");
    if(gUseHostPtr)
      free(svmDataBuffer);
    else
      clSVMFree(context, svmDataBuffer);
  }
  _passCount++;
  return 0;
}

#endif //_COMMON_H_