Files
OpenCL-CTS/test_conformance/c11_atomics/common.h
Ankit Goyal 4fbcd96e7f Remove "C" linkages (#781)
* Remove extern C linkages

* Update crc32 to cpp and remove extern C linkage
2020-05-20 14:16:19 +01:00

1178 lines
44 KiB
C++

//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef _COMMON_H_
#define _COMMON_H_
#include "harness/testHarness.h"
#include "harness/typeWrappers.h"
#include "harness/ThreadPool.h"
#include "host_atomics.h"
#include <vector>
#include <sstream>
#define MAX_DEVICE_THREADS (gHost ? 0U : gMaxDeviceThreads)
#define MAX_HOST_THREADS GetThreadCount()
#define EXECUTE_TEST(error, test)\
error |= test;\
if(error && !gContinueOnError)\
return error;
enum TExplicitAtomicType
{
TYPE_ATOMIC_INT,
TYPE_ATOMIC_UINT,
TYPE_ATOMIC_LONG,
TYPE_ATOMIC_ULONG,
TYPE_ATOMIC_FLOAT,
TYPE_ATOMIC_DOUBLE,
TYPE_ATOMIC_INTPTR_T,
TYPE_ATOMIC_UINTPTR_T,
TYPE_ATOMIC_SIZE_T,
TYPE_ATOMIC_PTRDIFF_T,
TYPE_ATOMIC_FLAG
};
enum TExplicitMemoryScopeType
{
MEMORY_SCOPE_EMPTY,
MEMORY_SCOPE_WORK_GROUP,
MEMORY_SCOPE_DEVICE,
MEMORY_SCOPE_ALL_SVM_DEVICES
};
extern bool gHost; // temporary flag for testing native host threads (test verification)
extern bool gOldAPI; // temporary flag for testing with old API (OpenCL 1.2)
extern bool gContinueOnError; // execute all cases even when errors detected
extern bool gNoGlobalVariables; // disable cases with global atomics in program scope
extern bool gNoGenericAddressSpace; // disable cases with generic address space
extern bool gUseHostPtr; // use malloc/free instead of clSVMAlloc/clSVMFree
extern bool gDebug; // print OpenCL kernel code
extern int gInternalIterations; // internal test iterations for atomic operation, sufficient to verify atomicity
extern int gMaxDeviceThreads; // maximum number of threads executed on OCL device
extern const char *get_memory_order_type_name(TExplicitMemoryOrderType orderType);
extern const char *get_memory_scope_type_name(TExplicitMemoryScopeType scopeType);
class AtomicTypeInfo
{
public:
TExplicitAtomicType _type;
AtomicTypeInfo(TExplicitAtomicType type): _type(type) {}
cl_uint Size(cl_device_id device);
const char* AtomicTypeName();
const char* RegularTypeName();
const char* AddSubOperandTypeName();
int IsSupported(cl_device_id device);
};
template<typename HostDataType>
class AtomicTypeExtendedInfo : public AtomicTypeInfo
{
public:
AtomicTypeExtendedInfo(TExplicitAtomicType type) : AtomicTypeInfo(type) {}
HostDataType MinValue();
HostDataType MaxValue();
HostDataType SpecialValue(cl_uchar x)
{
HostDataType tmp;
cl_uchar *ptr = (cl_uchar*)&tmp;
for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_uchar); i++)
ptr[i] = x;
return tmp;
}
HostDataType SpecialValue(cl_ushort x)
{
HostDataType tmp;
cl_ushort *ptr = (cl_ushort*)&tmp;
for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_ushort); i++)
ptr[i] = x;
return tmp;
}
};
class CTest {
public:
virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) = 0;
};
template<typename HostAtomicType, typename HostDataType>
class CBasicTest : CTest
{
public:
typedef struct {
CBasicTest *test;
cl_uint tid;
cl_uint threadCount;
volatile HostAtomicType *destMemory;
HostDataType *oldValues;
} THostThreadContext;
static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id, void *userInfo)
{
THostThreadContext *threadContext = ((THostThreadContext*)userInfo)+job_id;
threadContext->test->HostFunction(threadContext->tid, threadContext->threadCount, threadContext->destMemory, threadContext->oldValues);
return 0;
}
CBasicTest(TExplicitAtomicType dataType, bool useSVM) : CTest(),
_maxDeviceThreads(MAX_DEVICE_THREADS),
_dataType(dataType), _useSVM(useSVM), _startValue(255),
_localMemory(false), _declaredInProgram(false),
_usedInFunction(false), _genericAddrSpace(false),
_oldValueCheck(true), _localRefValues(false),
_maxGroupSize(0), _passCount(0), _iterations(gInternalIterations)
{
}
virtual ~CBasicTest()
{
if(_passCount)
log_info(" %u tests executed successfully for %s\n", _passCount, DataType().AtomicTypeName());
}
virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
{
return 1;
}
virtual cl_uint NumNonAtomicVariablesPerThread()
{
return 1;
}
virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
{
return false;
}
virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d)
{
return false;
}
virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
{
return false;
}
virtual std::string PragmaHeader(cl_device_id deviceID);
virtual std::string ProgramHeader(cl_uint maxNumDestItems);
virtual std::string FunctionCode();
virtual std::string KernelCode(cl_uint maxNumDestItems);
virtual std::string ProgramCore() = 0;
virtual std::string SingleTestName()
{
std::string testName = LocalMemory() ? "local" : "global";
testName += " ";
testName += DataType().AtomicTypeName();
if(DeclaredInProgram())
{
testName += " declared in program";
}
if(DeclaredInProgram() && UsedInFunction())
testName += ",";
if(UsedInFunction())
{
testName += " used in ";
if(GenericAddrSpace())
testName += "generic ";
testName += "function";
}
return testName;
}
virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue);
int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
{
int error = 0;
UsedInFunction(false);
EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
UsedInFunction(true);
GenericAddrSpace(false);
EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
GenericAddrSpace(true);
EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
GenericAddrSpace(false);
return error;
}
int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
{
int error = 0;
DeclaredInProgram(false);
EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
if(!UseSVM())
{
DeclaredInProgram(true);
EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
}
return error;
}
virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
{
int error = 0;
if(_maxDeviceThreads > 0 && !UseSVM())
{
LocalMemory(true);
EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
}
if(_maxDeviceThreads+MaxHostThreads() > 0)
{
LocalMemory(false);
EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
}
return error;
}
virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
if(sizeof(HostAtomicType) != DataType().Size(deviceID))
{
log_info("Invalid test: Host atomic type size (%u) is different than OpenCL type size (%u)\n", (cl_uint)sizeof(HostAtomicType), DataType().Size(deviceID));
return -1;
}
if(sizeof(HostAtomicType) != sizeof(HostDataType))
{
log_info("Invalid test: Host atomic type size (%u) is different than corresponding type size (%u)\n", (cl_uint)sizeof(HostAtomicType), (cl_uint)sizeof(HostDataType));
return -1;
}
// Verify we can run first
if(UseSVM() && !gUseHostPtr)
{
cl_device_svm_capabilities caps;
cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, sizeof(caps), &caps, 0);
test_error(error, "clGetDeviceInfo failed");
if((caps & CL_DEVICE_SVM_ATOMICS) == 0)
{
log_info("\t%s - SVM_ATOMICS not supported\n", DataType().AtomicTypeName());
// implicit pass
return 0;
}
}
if(!DataType().IsSupported(deviceID))
{
log_info("\t%s not supported\n", DataType().AtomicTypeName());
// implicit pass or host test (debug feature)
if(UseSVM())
return 0;
_maxDeviceThreads = 0;
}
if(_maxDeviceThreads+MaxHostThreads() == 0)
return 0;
return ExecuteForEachParameterSet(deviceID, context, queue);
}
virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
{
log_info("Empty thread function %u\n", (cl_uint)tid);
}
AtomicTypeExtendedInfo<HostDataType> DataType() const
{
return AtomicTypeExtendedInfo<HostDataType>(_dataType);
}
cl_uint _maxDeviceThreads;
virtual cl_uint MaxHostThreads()
{
if(UseSVM() || gHost)
return MAX_HOST_THREADS;
else
return 0;
}
virtual bool SVMDataBufferAllSVMConsistent() {return false;}
bool UseSVM() {return _useSVM;}
void StartValue(HostDataType startValue) {_startValue = startValue;}
HostDataType StartValue() {return _startValue;}
void LocalMemory(bool local) {_localMemory = local;}
bool LocalMemory() {return _localMemory;}
void DeclaredInProgram(bool declaredInProgram) {_declaredInProgram = declaredInProgram;}
bool DeclaredInProgram() {return _declaredInProgram;}
void UsedInFunction(bool local) {_usedInFunction = local;}
bool UsedInFunction() {return _usedInFunction;}
void GenericAddrSpace(bool genericAddrSpace) {_genericAddrSpace = genericAddrSpace;}
bool GenericAddrSpace() {return _genericAddrSpace;}
void OldValueCheck(bool check) {_oldValueCheck = check;}
bool OldValueCheck() {return _oldValueCheck;}
void LocalRefValues(bool localRefValues) {_localRefValues = localRefValues;}
bool LocalRefValues() {return _localRefValues;}
void MaxGroupSize(cl_uint maxGroupSize) {_maxGroupSize = maxGroupSize;}
cl_uint MaxGroupSize() {return _maxGroupSize;}
void CurrentGroupSize(cl_uint currentGroupSize)
{
if(MaxGroupSize() && MaxGroupSize() < currentGroupSize)
_currentGroupSize = MaxGroupSize();
else
_currentGroupSize = currentGroupSize;
}
cl_uint CurrentGroupSize() {return _currentGroupSize;}
virtual cl_uint CurrentGroupNum(cl_uint threadCount)
{
if(threadCount == 0)
return 0;
if(LocalMemory())
return 1;
return threadCount/CurrentGroupSize();
}
cl_int Iterations() {return _iterations;}
std::string IterationsStr() {std::stringstream ss; ss << _iterations; return ss.str();}
private:
const TExplicitAtomicType _dataType;
const bool _useSVM;
HostDataType _startValue;
bool _localMemory;
bool _declaredInProgram;
bool _usedInFunction;
bool _genericAddrSpace;
bool _oldValueCheck;
bool _localRefValues;
cl_uint _maxGroupSize;
cl_uint _currentGroupSize;
cl_uint _passCount;
const cl_int _iterations;
};
template<typename HostAtomicType, typename HostDataType>
class CBasicTestMemOrderScope : public CBasicTest<HostAtomicType, HostDataType>
{
public:
using CBasicTest<HostAtomicType, HostDataType>::LocalMemory;
using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize;
CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
{
}
virtual std::string ProgramHeader(cl_uint maxNumDestItems)
{
std::string header;
if(gOldAPI)
{
std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s";
header +=
"#define atomic_store_explicit(x,y,o"+s+") atomic_store(x,y)\n"
"#define atomic_load_explicit(x,o"+s+") atomic_load(x)\n"
"#define atomic_exchange_explicit(x,y,o"+s+") atomic_exchange(x,y)\n"
"#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_strong(x,y,z)\n"
"#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_weak(x,y,z)\n"
"#define atomic_fetch_add_explicit(x,y,o"+s+") atomic_fetch_add(x,y)\n"
"#define atomic_fetch_sub_explicit(x,y,o"+s+") atomic_fetch_sub(x,y)\n"
"#define atomic_fetch_or_explicit(x,y,o"+s+") atomic_fetch_or(x,y)\n"
"#define atomic_fetch_xor_explicit(x,y,o"+s+") atomic_fetch_xor(x,y)\n"
"#define atomic_fetch_and_explicit(x,y,o"+s+") atomic_fetch_and(x,y)\n"
"#define atomic_fetch_min_explicit(x,y,o"+s+") atomic_fetch_min(x,y)\n"
"#define atomic_fetch_max_explicit(x,y,o"+s+") atomic_fetch_max(x,y)\n"
"#define atomic_flag_test_and_set_explicit(x,o"+s+") atomic_flag_test_and_set(x)\n"
"#define atomic_flag_clear_explicit(x,o"+s+") atomic_flag_clear(x)\n";
}
return header+CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(maxNumDestItems);
}
virtual std::string SingleTestName()
{
std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
if(MemoryOrder() != MEMORY_ORDER_EMPTY)
{
testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
}
if(MemoryScope() != MEMORY_SCOPE_EMPTY)
{
testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
}
return testName;
}
virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
{
if(LocalMemory() &&
MemoryScope() != MEMORY_SCOPE_EMPTY &&
MemoryScope() != MEMORY_SCOPE_WORK_GROUP) //memory scope should only be used for global memory
return 0;
if(MemoryScope() == MEMORY_SCOPE_DEVICE)
MaxGroupSize(16); // increase number of groups by forcing smaller group size
else
MaxGroupSize(0); // group size limited by device capabilities
return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue);
}
virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
{
// repeat test for each reasonable memory order/scope combination
std::vector<TExplicitMemoryOrderType> memoryOrder;
std::vector<TExplicitMemoryScopeType> memoryScope;
int error = 0;
memoryOrder.push_back(MEMORY_ORDER_EMPTY);
memoryOrder.push_back(MEMORY_ORDER_RELAXED);
memoryOrder.push_back(MEMORY_ORDER_ACQUIRE);
memoryOrder.push_back(MEMORY_ORDER_RELEASE);
memoryOrder.push_back(MEMORY_ORDER_ACQ_REL);
memoryOrder.push_back(MEMORY_ORDER_SEQ_CST);
memoryScope.push_back(MEMORY_SCOPE_EMPTY);
memoryScope.push_back(MEMORY_SCOPE_WORK_GROUP);
memoryScope.push_back(MEMORY_SCOPE_DEVICE);
memoryScope.push_back(MEMORY_SCOPE_ALL_SVM_DEVICES);
for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
{
for(unsigned si = 0; si < memoryScope.size(); si++)
{
if(memoryOrder[oi] == MEMORY_ORDER_EMPTY && memoryScope[si] != MEMORY_SCOPE_EMPTY)
continue;
MemoryOrder(memoryOrder[oi]);
MemoryScope(memoryScope[si]);
EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
}
}
return error;
}
void MemoryOrder(TExplicitMemoryOrderType memoryOrder) {_memoryOrder = memoryOrder;}
TExplicitMemoryOrderType MemoryOrder() {return _memoryOrder;}
std::string MemoryOrderStr()
{
if(MemoryOrder() != MEMORY_ORDER_EMPTY)
return std::string(", ")+get_memory_order_type_name(MemoryOrder());
return "";
}
void MemoryScope(TExplicitMemoryScopeType memoryScope) {_memoryScope = memoryScope;}
TExplicitMemoryScopeType MemoryScope() {return _memoryScope;}
std::string MemoryScopeStr()
{
if(MemoryScope() != MEMORY_SCOPE_EMPTY)
return std::string(", ")+get_memory_scope_type_name(MemoryScope());
return "";
}
std::string MemoryOrderScopeStr()
{
return MemoryOrderStr()+MemoryScopeStr();
}
virtual cl_uint CurrentGroupNum(cl_uint threadCount)
{
if(MemoryScope() == MEMORY_SCOPE_WORK_GROUP)
return 1;
return CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum(threadCount);
}
virtual cl_uint MaxHostThreads()
{
// block host threads execution for memory scope different than memory_scope_all_svm_devices
if(MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost)
return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads();
else
return 0;
}
private:
TExplicitMemoryOrderType _memoryOrder;
TExplicitMemoryScopeType _memoryScope;
};
template<typename HostAtomicType, typename HostDataType>
class CBasicTestMemOrder2Scope : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
{
public:
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr;
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
{
}
virtual std::string SingleTestName()
{
std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
if(MemoryOrder() != MEMORY_ORDER_EMPTY)
testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder2())).substr(sizeof("memory"));
if(MemoryScope() != MEMORY_SCOPE_EMPTY)
testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
return testName;
}
virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
{
// repeat test for each reasonable memory order/scope combination
std::vector<TExplicitMemoryOrderType> memoryOrder;
std::vector<TExplicitMemoryScopeType> memoryScope;
int error = 0;
memoryOrder.push_back(MEMORY_ORDER_EMPTY);
memoryOrder.push_back(MEMORY_ORDER_RELAXED);
memoryOrder.push_back(MEMORY_ORDER_ACQUIRE);
memoryOrder.push_back(MEMORY_ORDER_RELEASE);
memoryOrder.push_back(MEMORY_ORDER_ACQ_REL);
memoryOrder.push_back(MEMORY_ORDER_SEQ_CST);
memoryScope.push_back(MEMORY_SCOPE_EMPTY);
memoryScope.push_back(MEMORY_SCOPE_WORK_GROUP);
memoryScope.push_back(MEMORY_SCOPE_DEVICE);
memoryScope.push_back(MEMORY_SCOPE_ALL_SVM_DEVICES);
for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
{
for(unsigned o2i = 0; o2i < memoryOrder.size(); o2i++)
{
for(unsigned si = 0; si < memoryScope.size(); si++)
{
if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
&& memoryOrder[oi] != memoryOrder[o2i])
continue; // both memory order arguments must be set (or none)
if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
&& memoryScope[si] != MEMORY_SCOPE_EMPTY)
continue; // memory scope without memory order is not allowed
MemoryOrder(memoryOrder[oi]);
MemoryOrder2(memoryOrder[o2i]);
MemoryScope(memoryScope[si]);
EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
}
}
}
return error;
}
void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail) {_memoryOrder2 = memoryOrderFail;}
TExplicitMemoryOrderType MemoryOrder2() {return _memoryOrder2;}
std::string MemoryOrderFailStr()
{
if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
return std::string(", ")+get_memory_order_type_name(MemoryOrder2());
return "";
}
std::string MemoryOrderScope()
{
return MemoryOrderStr()+MemoryOrderFailStr()+MemoryScopeStr();
}
private:
TExplicitMemoryOrderType _memoryOrder2;
};
template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID)
{
std::string pragma;
if(gOldAPI)
{
pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n";
pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n";
pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n";
pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n";
}
// Create the pragma lines for this kernel
if(DataType().Size(deviceID) == 8)
{
pragma += "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
pragma += "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
}
if(_dataType == TYPE_ATOMIC_DOUBLE)
pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
return pragma;
}
template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
{
// Create the program header
std::string header;
std::string aTypeName = DataType().AtomicTypeName();
std::string cTypeName = DataType().RegularTypeName();
std::string argListForKernel;
std::string argListForFunction;
std::string argListNoTypes;
std::string functionPrototype;
std::string addressSpace = LocalMemory() ? "__local " : "__global ";
if(gOldAPI)
{
header += std::string("#define ")+aTypeName+" "+cTypeName+"\n"
"#define atomic_store(x,y) (*(x) = y)\n"
"#define atomic_load(x) (*(x))\n"
"#define ATOMIC_VAR_INIT(x) (x)\n"
"#define ATOMIC_FLAG_INIT 0\n"
"#define atomic_init(x,y) atomic_store(x,y)\n";
if(aTypeName == "atomic_float")
header += "#define atomic_exchange(x,y) atomic_xchg(x,y)\n";
else if(aTypeName == "atomic_double")
header += "double atomic_exchange(volatile "+addressSpace+"atomic_double *x, double y)\n"
"{\n"
" long tmp = *(long*)&y, res;\n"
" volatile "+addressSpace+"long *tmpA = (volatile "+addressSpace+"long)x;\n"
" res = atom_xchg(tmpA,tmp);\n"
" return *(double*)&res;\n"
"}\n";
else
header += "#define atomic_exchange(x,y) atom_xchg(x,y)\n";
if(aTypeName != "atomic_float" && aTypeName != "atomic_double")
header +=
"bool atomic_compare_exchange_strong(volatile "+addressSpace+" "+aTypeName+" *a, "+cTypeName+" *expected, "+cTypeName+" desired)\n"
"{\n"
" "+cTypeName+" old = atom_cmpxchg(a, *expected, desired);\n"
" if(old == *expected)\n"
" return true;\n"
" *expected = old;\n"
" return false;\n"
"}\n"
"#define atomic_compare_exchange_weak atomic_compare_exchange_strong\n";
header +=
"#define atomic_fetch_add(x,y) atom_add(x,y)\n"
"#define atomic_fetch_sub(x,y) atom_sub(x,y)\n"
"#define atomic_fetch_or(x,y) atom_or(x,y)\n"
"#define atomic_fetch_xor(x,y) atom_xor(x,y)\n"
"#define atomic_fetch_and(x,y) atom_and(x,y)\n"
"#define atomic_fetch_min(x,y) atom_min(x,y)\n"
"#define atomic_fetch_max(x,y) atom_max(x,y)\n"
"#define atomic_flag_test_and_set(x) atomic_exchange(x,1)\n"
"#define atomic_flag_clear(x) atomic_store(x,0)\n"
"\n";
}
if(!LocalMemory() && DeclaredInProgram())
{
// additional atomic variable for results copying (last thread will do this)
header += "__global volatile atomic_uint finishedThreads = ATOMIC_VAR_INIT(0);\n";
// atomic variables declared in program scope - test data
std::stringstream ss;
ss << maxNumDestItems;
header += std::string("__global volatile ")+aTypeName+" destMemory["+ss.str()+"] = {\n";
ss.str("");
ss << _startValue;
for(cl_uint i = 0; i < maxNumDestItems; i++)
{
if(aTypeName == "atomic_flag")
header += " ATOMIC_FLAG_INIT";
else
header += " ATOMIC_VAR_INIT("+ss.str()+")";
if(i+1 < maxNumDestItems)
header += ",";
header += "\n";
}
header+=
"};\n"
"\n";
}
return header;
}
template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::FunctionCode()
{
if(!UsedInFunction())
return "";
std::string addressSpace = LocalMemory() ? "__local " : "__global ";
std::string code = "void test_atomic_function(uint tid, uint threadCount, uint numDestItems, volatile ";
if(!GenericAddrSpace())
code += addressSpace;
code += std::string(DataType().AtomicTypeName())+" *destMemory, __global "+DataType().RegularTypeName()+
" *oldValues";
if(LocalRefValues())
code += std::string(", __local ")+DataType().RegularTypeName()+" *localValues";
code += ")\n"
"{\n";
code += ProgramCore();
code += "}\n"
"\n";
return code;
}
template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
{
std::string aTypeName = DataType().AtomicTypeName();
std::string cTypeName = DataType().RegularTypeName();
std::string addressSpace = LocalMemory() ? "__local " : "__global ";
std::string code = "__kernel void test_atomic_kernel(uint threadCount, uint numDestItems, ";
// prepare list of arguments for kernel
if(LocalMemory())
{
code += std::string("__global ")+cTypeName+" *finalDest, __global "+cTypeName+" *oldValues,"
" volatile "+addressSpace+aTypeName+" *"+(DeclaredInProgram() ? "notUsed" : "")+"destMemory";
}
else
{
code += "volatile "+addressSpace+(DeclaredInProgram() ? (cTypeName+" *finalDest") : (aTypeName+" *destMemory"))+
", __global "+cTypeName+" *oldValues";
}
if(LocalRefValues())
code += std::string(", __local ")+cTypeName+" *localValues";
code += ")\n"
"{\n";
if(LocalMemory() && DeclaredInProgram())
{
// local atomics declared in kernel scope
std::stringstream ss;
ss << maxNumDestItems;
code += std::string(" __local volatile ")+aTypeName+" destMemory["+ss.str()+"];\n";
}
code += " uint tid = get_global_id(0);\n"
"\n";
if(LocalMemory())
{
code +=
" // initialize atomics not reachable from host (first thread is doing this, other threads are waiting on barrier)\n"
" if(get_local_id(0) == 0)\n"
" for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)\n"
" {\n";
if(aTypeName == "atomic_flag")
{
code +=
" if(finalDest[dstItemIdx])\n"
" atomic_flag_test_and_set(destMemory+dstItemIdx);\n"
" else\n"
" atomic_flag_clear(destMemory+dstItemIdx);\n";
}
else
{
code +=
" atomic_store(destMemory+dstItemIdx, finalDest[dstItemIdx]);\n";
}
code +=
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n";
}
if (LocalRefValues())
{
code +=
" // Copy input reference values into local memory\n";
if (NumNonAtomicVariablesPerThread() == 1)
code += " localValues[get_local_id(0)] = oldValues[tid];\n";
else
{
std::stringstream ss;
ss << NumNonAtomicVariablesPerThread();
code +=
" for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
" localValues[get_local_id(0)*" + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n";
}
code +=
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n";
}
if (UsedInFunction())
code += std::string(" test_atomic_function(tid, threadCount, numDestItems, destMemory, oldValues")+
(LocalRefValues() ? ", localValues" : "")+");\n";
else
code += ProgramCore();
code += "\n";
if (LocalRefValues())
{
code +=
" // Copy local reference values into output array\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n";
if (NumNonAtomicVariablesPerThread() == 1)
code += " oldValues[tid] = localValues[get_local_id(0)];\n";
else
{
std::stringstream ss;
ss << NumNonAtomicVariablesPerThread();
code +=
" for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
" oldValues[tid*" + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str() + "+rfId];\n";
}
code += "\n";
}
if(LocalMemory() || DeclaredInProgram())
{
code += " // Copy final values to host reachable buffer\n";
if(LocalMemory())
code +=
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(get_local_id(0) == 0) // first thread in workgroup\n";
else
// global atomics declared in program scope
code +=
" if(atomic_fetch_add(&finishedThreads, 1) == get_global_size(0)-1)\n"
" // last finished thread\n";
code +=
" for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)\n";
if(aTypeName == "atomic_flag")
{
code +=
" finalDest[dstItemIdx] = atomic_flag_test_and_set(destMemory+dstItemIdx);\n";
}
else
{
code +=
" finalDest[dstItemIdx] = atomic_load(destMemory+dstItemIdx);\n";
}
}
code += "}\n"
"\n";
return code;
}
template <typename HostAtomicType, typename HostDataType>
int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
{
int error;
clProgramWrapper program;
clKernelWrapper kernel;
size_t threadNum[1];
clMemWrapper streams[2];
std::vector<HostAtomicType> destItems;
HostAtomicType *svmAtomicBuffer = 0;
std::vector<HostDataType> refValues, startRefValues;
HostDataType *svmDataBuffer = 0;
cl_uint deviceThreadCount, hostThreadCount, threadCount;
size_t groupSize = 0;
std::string programSource;
const char *programLine;
MTdata d;
size_t typeSize = DataType().Size(deviceID);
deviceThreadCount = _maxDeviceThreads;
hostThreadCount = MaxHostThreads();
threadCount = deviceThreadCount+hostThreadCount;
//log_info("\t%s %s%s...\n", local ? "local" : "global", DataType().AtomicTypeName(), memoryOrderScope.c_str());
log_info("\t%s...\n", SingleTestName().c_str());
if(!LocalMemory() && DeclaredInProgram() && gNoGlobalVariables) // no support for program scope global variables
{
log_info("\t\tTest disabled\n");
return 0;
}
if(UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace)
{
log_info("\t\tTest disabled\n");
return 0;
}
// set up work sizes based on device capabilities and test configuration
error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL);
test_error(error, "Unable to obtain max work group size for device");
CurrentGroupSize((cl_uint)groupSize);
if(CurrentGroupSize() > deviceThreadCount)
CurrentGroupSize(deviceThreadCount);
if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
threadCount = deviceThreadCount+hostThreadCount;
// If we're given a num_results function, we need to determine how many result objects we need.
// This is the first assessment for current maximum number of threads (exact thread count is not known here)
// - needed for program source code generation (arrays of atomics declared in program)
cl_uint numDestItems = NumResults(threadCount, deviceID);
if(deviceThreadCount > 0)
{
cl_ulong usedLocalMemory;
cl_ulong totalLocalMemory;
cl_uint maxWorkGroupSize;
// Set up the kernel code
programSource = PragmaHeader(deviceID)+ProgramHeader(numDestItems)+FunctionCode()+KernelCode(numDestItems);
programLine = programSource.c_str();
if(create_single_kernel_helper_with_build_options(context, &program, &kernel, 1, &programLine, "test_atomic_kernel",
gOldAPI ? "" : "-cl-std=CL2.0"))
{
return -1;
}
if(gDebug)
{
log_info("Program source:\n");
log_info("%s\n", programLine);
}
// tune up work sizes based on kernel info
error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL);
test_error(error, "Unable to obtain max work group size for device and kernel combo");
if(LocalMemory())
{
error = clGetKernelWorkGroupInfo (kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(usedLocalMemory), &usedLocalMemory, NULL);
test_error(error, "clGetKernelWorkGroupInfo failed");
error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(totalLocalMemory), &totalLocalMemory, NULL);
test_error(error, "clGetDeviceInfo failed");
// We know that each work-group is going to use typeSize * deviceThreadCount bytes of local memory
// so pick the maximum value for deviceThreadCount that uses all the local memory.
maxWorkGroupSize = ((totalLocalMemory - usedLocalMemory) / typeSize);
if(maxWorkGroupSize < groupSize)
groupSize = maxWorkGroupSize;
}
CurrentGroupSize((cl_uint)groupSize);
if(CurrentGroupSize() > deviceThreadCount)
CurrentGroupSize(deviceThreadCount);
if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
threadCount = deviceThreadCount+hostThreadCount;
}
if(deviceThreadCount > 0)
log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, CurrentGroupSize());
if(hostThreadCount > 0)
log_info("\t\t(host threads %u)\n", hostThreadCount);
refValues.resize(threadCount*NumNonAtomicVariablesPerThread());
// Generate ref data if we have a ref generator provided
d = init_genrand(gRandomSeed);
startRefValues.resize(threadCount*NumNonAtomicVariablesPerThread());
if(GenerateRefs(threadCount, &startRefValues[0], d))
{
//copy ref values for host threads
memcpy(&refValues[0], &startRefValues[0], sizeof(HostDataType)*threadCount*NumNonAtomicVariablesPerThread());
}
else
{
startRefValues.resize(0);
}
free_mtdata(d);
d = NULL;
// If we're given a num_results function, we need to determine how many result objects we need. If
// we don't have it, we assume it's just 1
// This is final value (exact thread count is known in this place)
numDestItems = NumResults(threadCount, deviceID);
destItems.resize(numDestItems);
for(cl_uint i = 0; i < numDestItems; i++)
destItems[i] = _startValue;
// Create main buffer with atomic variables (array size dependent on particular test)
if(UseSVM())
{
if(gUseHostPtr)
svmAtomicBuffer = (HostAtomicType*)malloc(typeSize * numDestItems);
else
svmAtomicBuffer = (HostAtomicType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, typeSize * numDestItems, 0);
if(!svmAtomicBuffer)
{
log_error("ERROR: clSVMAlloc failed!\n");
return -1;
}
memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems);
streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_USE_HOST_PTR), typeSize * numDestItems, svmAtomicBuffer, NULL);
}
else
{
streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), typeSize * numDestItems, &destItems[0], NULL);
}
if (!streams[0])
{
log_error("ERROR: Creating output array failed!\n");
return -1;
}
// Create buffer for per-thread input/output data
if(UseSVM())
{
if(gUseHostPtr)
svmDataBuffer = (HostDataType*)malloc(typeSize*threadCount*NumNonAtomicVariablesPerThread());
else
svmDataBuffer = (HostDataType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS : 0), typeSize*threadCount*NumNonAtomicVariablesPerThread(), 0);
if(!svmDataBuffer)
{
log_error("ERROR: clSVMAlloc failed!\n");
return -1;
}
if(startRefValues.size())
memcpy(svmDataBuffer, &startRefValues[0], typeSize*threadCount*NumNonAtomicVariablesPerThread());
streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_USE_HOST_PTR), typeSize*threadCount*NumNonAtomicVariablesPerThread(), svmDataBuffer, NULL);
}
else
{
streams[1] = clCreateBuffer(context, (cl_mem_flags)((startRefValues.size() ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)),
typeSize * threadCount*NumNonAtomicVariablesPerThread(), startRefValues.size() ? &startRefValues[0] : 0, NULL);
}
if (!streams[1])
{
log_error("ERROR: Creating reference array failed!\n");
return -1;
}
if(deviceThreadCount > 0)
{
cl_uint argInd = 0;
/* Set the arguments */
error = clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount);
test_error(error, "Unable to set kernel argument");
error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems), &numDestItems);
test_error(error, "Unable to set indexed kernel argument");
error = clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]);
test_error(error, "Unable to set indexed kernel arguments");
error = clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]);
test_error(error, "Unable to set indexed kernel arguments");
if(LocalMemory())
{
error = clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL);
test_error(error, "Unable to set indexed local kernel argument");
}
if(LocalRefValues())
{
error = clSetKernelArg(kernel, argInd++, LocalRefValues() ? typeSize*CurrentGroupSize()*NumNonAtomicVariablesPerThread() : 1, NULL);
test_error(error, "Unable to set indexed kernel argument");
}
}
/* Configure host threads */
std::vector<THostThreadContext> hostThreadContexts(hostThreadCount);
for(unsigned int t = 0; t < hostThreadCount; t++)
{
hostThreadContexts[t].test = this;
hostThreadContexts[t].tid = deviceThreadCount+t;
hostThreadContexts[t].threadCount = threadCount;
hostThreadContexts[t].destMemory = UseSVM() ? svmAtomicBuffer : &destItems[0];
hostThreadContexts[t].oldValues = UseSVM() ? svmDataBuffer : &refValues[0];
}
if(deviceThreadCount > 0)
{
/* Run the kernel */
threadNum[0] = deviceThreadCount;
groupSize = CurrentGroupSize();
error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, &groupSize, 0, NULL, NULL);
test_error(error, "Unable to execute test kernel");
/* start device threads */
error = clFlush(queue);
test_error(error, "clFlush failed");
}
/* Start host threads and wait for finish */
if(hostThreadCount > 0)
ThreadPool_Do(HostThreadFunction, hostThreadCount, &hostThreadContexts[0]);
if(UseSVM())
{
error = clFinish(queue);
test_error(error, "clFinish failed");
memcpy(&destItems[0], svmAtomicBuffer, typeSize*numDestItems);
memcpy(&refValues[0], svmDataBuffer, typeSize*threadCount*NumNonAtomicVariablesPerThread());
}
else
{
if(deviceThreadCount > 0)
{
error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
test_error(error, "Unable to read result value!");
error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize * deviceThreadCount*NumNonAtomicVariablesPerThread(), &refValues[0], 0, NULL, NULL);
test_error(error, "Unable to read reference values!");
}
}
bool dataVerified = false;
// If we have an expectedFn, then we need to generate a final value to compare against. If we don't
// have one, it's because we're comparing ref values only
for(cl_uint i = 0; i < numDestItems; i++)
{
HostDataType expected;
if(!ExpectedValue(expected, threadCount, startRefValues.size() ? &startRefValues[0] : 0, i))
break; // no expected value function provided
if(expected != destItems[i])
{
std::stringstream logLine;
logLine << "ERROR: Result " << i << " from kernel does not validate! (should be " << expected << ", was " << destItems[i] << ")\n";
log_error("%s", logLine.str().c_str());
for(i = 0; i < threadCount; i++)
{
logLine.str("");
logLine << " --- " << i << " - ";
if(startRefValues.size())
logLine << startRefValues[i] << " -> " << refValues[i];
else
logLine << refValues[i];
logLine << " --- ";
if(i < numDestItems)
logLine << destItems[i];
logLine << "\n";
log_info("%s", logLine.str().c_str());
}
if(!gDebug)
{
log_info("Program source:\n");
log_info("%s\n", programLine);
}
return -1;
}
dataVerified = true;
}
bool dataCorrect = false;
/* Use the verify function (if provided) to also check the results */
if(VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0]))
{
if(!dataCorrect)
{
log_error("ERROR: Reference values did not validate!\n");
std::stringstream logLine;
for(cl_uint i = 0; i < threadCount; i++)
for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++)
{
logLine.str("");
logLine << " --- " << i << " - " << refValues[i*NumNonAtomicVariablesPerThread()+j] << " --- ";
if(j == 0 && i < numDestItems)
logLine << destItems[i];
logLine << "\n";
log_info("%s", logLine.str().c_str());
}
if(!gDebug)
{
log_info("Program source:\n");
log_info("%s\n", programLine);
}
return -1;
}
}
else if(!dataVerified)
{
log_error("ERROR: Test doesn't check total or refs; no values are verified!\n");
return -1;
}
if(OldValueCheck() &&
!(DeclaredInProgram() && !LocalMemory())) // don't test for programs scope global atomics
// 'old' value has been overwritten by previous clEnqueueNDRangeKernel
{
/* Re-write the starting value */
for(size_t i = 0; i < numDestItems; i++)
destItems[i] = _startValue;
refValues[0] = 0;
if(deviceThreadCount > 0)
{
error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
test_error(error, "Unable to write starting values!");
/* Run the kernel once for a single thread, so we can verify that the returned value is the original one */
threadNum[0] = 1;
error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, threadNum, 0, NULL, NULL);
test_error(error, "Unable to execute test kernel");
error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize, &refValues[0], 0, NULL, NULL);
test_error(error, "Unable to read reference values!");
}
else
{
/* Start host thread */
HostFunction(0, 1, &destItems[0], &refValues[0]);
}
if(refValues[0] != _startValue)//destItems[0])
{
std::stringstream logLine;
logLine << "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
" (should have been " << destItems[0] << ", returned " << refValues[0] << ")!\n";
log_error("%s", logLine.str().c_str());
if(!gDebug)
{
log_info("Program source:\n");
log_info("%s\n", programLine);
}
return -1;
}
}
if(UseSVM())
{
// the buffer object must first be released before the SVM buffer is freed
error = clReleaseMemObject(streams[0]);
streams[0] = 0;
test_error(error, "clReleaseMemObject failed");
if(gUseHostPtr)
free(svmAtomicBuffer);
else
clSVMFree(context, svmAtomicBuffer);
error = clReleaseMemObject(streams[1]);
streams[1] = 0;
test_error(error, "clReleaseMemObject failed");
if(gUseHostPtr)
free(svmDataBuffer);
else
clSVMFree(context, svmDataBuffer);
}
_passCount++;
return 0;
}
#endif //_COMMON_H_