// // Copyright (c) 2017 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #ifndef _COMMON_H_ #define _COMMON_H_ #include "../../test_common/harness/testHarness.h" #include "../../test_common/harness/typeWrappers.h" #include "../../test_common/harness/ThreadPool.h" #include "host_atomics.h" #include #include #define MAX_DEVICE_THREADS (gHost ? 0U : gMaxDeviceThreads) #define MAX_HOST_THREADS GetThreadCount() #define EXECUTE_TEST(error, test)\ error |= test;\ if(error && !gContinueOnError)\ return error; enum TExplicitAtomicType { TYPE_ATOMIC_INT, TYPE_ATOMIC_UINT, TYPE_ATOMIC_LONG, TYPE_ATOMIC_ULONG, TYPE_ATOMIC_FLOAT, TYPE_ATOMIC_DOUBLE, TYPE_ATOMIC_INTPTR_T, TYPE_ATOMIC_UINTPTR_T, TYPE_ATOMIC_SIZE_T, TYPE_ATOMIC_PTRDIFF_T, TYPE_ATOMIC_FLAG }; enum TExplicitMemoryScopeType { MEMORY_SCOPE_EMPTY, MEMORY_SCOPE_WORK_GROUP, MEMORY_SCOPE_DEVICE, MEMORY_SCOPE_ALL_SVM_DEVICES }; extern bool gHost; // temporary flag for testing native host threads (test verification) extern bool gOldAPI; // temporary flag for testing with old API (OpenCL 1.2) extern bool gContinueOnError; // execute all cases even when errors detected extern bool gNoGlobalVariables; // disable cases with global atomics in program scope extern bool gNoGenericAddressSpace; // disable cases with generic address space extern bool gUseHostPtr; // use malloc/free instead of clSVMAlloc/clSVMFree extern bool gDebug; // print OpenCL kernel code extern int gInternalIterations; // internal test iterations for atomic operation, sufficient to verify atomicity extern int gMaxDeviceThreads; // maximum number of threads executed on OCL device extern cl_uint gRandomSeed; extern const char *get_memory_order_type_name(TExplicitMemoryOrderType orderType); extern const char *get_memory_scope_type_name(TExplicitMemoryScopeType scopeType); class AtomicTypeInfo { public: TExplicitAtomicType _type; AtomicTypeInfo(TExplicitAtomicType type): _type(type) {} cl_uint Size(cl_device_id device); const char* AtomicTypeName(); const char* RegularTypeName(); const char* AddSubOperandTypeName(); int IsSupported(cl_device_id device); }; template class AtomicTypeExtendedInfo : public AtomicTypeInfo { public: AtomicTypeExtendedInfo(TExplicitAtomicType type) : AtomicTypeInfo(type) {} HostDataType MinValue(); HostDataType MaxValue(); HostDataType SpecialValue(cl_uchar x) { HostDataType tmp; cl_uchar *ptr = (cl_uchar*)&tmp; for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_uchar); i++) ptr[i] = x; return tmp; } HostDataType SpecialValue(cl_ushort x) { HostDataType tmp; cl_ushort *ptr = (cl_ushort*)&tmp; for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_ushort); i++) ptr[i] = x; return tmp; } }; class CTest { public: virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) = 0; }; template class CBasicTest : CTest { public: typedef struct { CBasicTest *test; cl_uint tid; cl_uint threadCount; volatile HostAtomicType *destMemory; HostDataType *oldValues; } THostThreadContext; static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id, void *userInfo) { THostThreadContext *threadContext = ((THostThreadContext*)userInfo)+job_id; threadContext->test->HostFunction(threadContext->tid, threadContext->threadCount, threadContext->destMemory, threadContext->oldValues); return 0; } CBasicTest(TExplicitAtomicType dataType, bool useSVM) : CTest(), _maxDeviceThreads(MAX_DEVICE_THREADS), _dataType(dataType), _useSVM(useSVM), _startValue(255), _localMemory(false), _declaredInProgram(false), _usedInFunction(false), _genericAddrSpace(false), _oldValueCheck(true), _localRefValues(false), _maxGroupSize(0), _passCount(0), _iterations(gInternalIterations) { } virtual ~CBasicTest() { if(_passCount) log_info(" %u tests executed successfully for %s\n", _passCount, DataType().AtomicTypeName()); } virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) { return 1; } virtual cl_uint NumNonAtomicVariablesPerThread() { return 1; } virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) { return false; } virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d) { return false; } virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) { return false; } virtual std::string PragmaHeader(cl_device_id deviceID); virtual std::string ProgramHeader(cl_uint maxNumDestItems); virtual std::string FunctionCode(); virtual std::string KernelCode(cl_uint maxNumDestItems); virtual std::string ProgramCore() = 0; virtual std::string SingleTestName() { std::string testName = LocalMemory() ? "local" : "global"; testName += " "; testName += DataType().AtomicTypeName(); if(DeclaredInProgram()) { testName += " declared in program"; } if(DeclaredInProgram() && UsedInFunction()) testName += ","; if(UsedInFunction()) { testName += " used in "; if(GenericAddrSpace()) testName += "generic "; testName += "function"; } return testName; } virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue); int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context, cl_command_queue queue) { int error = 0; UsedInFunction(false); EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); UsedInFunction(true); GenericAddrSpace(false); EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); GenericAddrSpace(true); EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); GenericAddrSpace(false); return error; } int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context, cl_command_queue queue) { int error = 0; DeclaredInProgram(false); EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue)); if(!UseSVM()) { DeclaredInProgram(true); EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue)); } return error; } virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue) { int error = 0; if(_maxDeviceThreads > 0 && !UseSVM()) { LocalMemory(true); EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue)); } if(_maxDeviceThreads+MaxHostThreads() > 0) { LocalMemory(false); EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue)); } return error; } virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) { if(sizeof(HostAtomicType) != DataType().Size(deviceID)) { log_info("Invalid test: Host atomic type size (%u) is different than OpenCL type size (%u)\n", (cl_uint)sizeof(HostAtomicType), DataType().Size(deviceID)); return -1; } if(sizeof(HostAtomicType) != sizeof(HostDataType)) { log_info("Invalid test: Host atomic type size (%u) is different than corresponding type size (%u)\n", (cl_uint)sizeof(HostAtomicType), (cl_uint)sizeof(HostDataType)); return -1; } // Verify we can run first if(UseSVM() && !gUseHostPtr) { cl_device_svm_capabilities caps; cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, sizeof(caps), &caps, 0); test_error(error, "clGetDeviceInfo failed"); if((caps & CL_DEVICE_SVM_ATOMICS) == 0) { log_info("\t%s - SVM_ATOMICS not supported\n", DataType().AtomicTypeName()); // implicit pass return 0; } } if(!DataType().IsSupported(deviceID)) { log_info("\t%s not supported\n", DataType().AtomicTypeName()); // implicit pass or host test (debug feature) if(UseSVM()) return 0; _maxDeviceThreads = 0; } if(_maxDeviceThreads+MaxHostThreads() == 0) return 0; return ExecuteForEachParameterSet(deviceID, context, queue); } virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) { log_info("Empty thread function %u\n", (cl_uint)tid); } AtomicTypeExtendedInfo DataType() const { return AtomicTypeExtendedInfo(_dataType); } cl_uint _maxDeviceThreads; virtual cl_uint MaxHostThreads() { if(UseSVM() || gHost) return MAX_HOST_THREADS; else return 0; } virtual bool SVMDataBufferAllSVMConsistent() {return false;} bool UseSVM() {return _useSVM;} void StartValue(HostDataType startValue) {_startValue = startValue;} HostDataType StartValue() {return _startValue;} void LocalMemory(bool local) {_localMemory = local;} bool LocalMemory() {return _localMemory;} void DeclaredInProgram(bool declaredInProgram) {_declaredInProgram = declaredInProgram;} bool DeclaredInProgram() {return _declaredInProgram;} void UsedInFunction(bool local) {_usedInFunction = local;} bool UsedInFunction() {return _usedInFunction;} void GenericAddrSpace(bool genericAddrSpace) {_genericAddrSpace = genericAddrSpace;} bool GenericAddrSpace() {return _genericAddrSpace;} void OldValueCheck(bool check) {_oldValueCheck = check;} bool OldValueCheck() {return _oldValueCheck;} void LocalRefValues(bool localRefValues) {_localRefValues = localRefValues;} bool LocalRefValues() {return _localRefValues;} void MaxGroupSize(cl_uint maxGroupSize) {_maxGroupSize = maxGroupSize;} cl_uint MaxGroupSize() {return _maxGroupSize;} void CurrentGroupSize(cl_uint currentGroupSize) { if(MaxGroupSize() && MaxGroupSize() < currentGroupSize) _currentGroupSize = MaxGroupSize(); else _currentGroupSize = currentGroupSize; } cl_uint CurrentGroupSize() {return _currentGroupSize;} virtual cl_uint CurrentGroupNum(cl_uint threadCount) { if(threadCount == 0) return 0; if(LocalMemory()) return 1; return threadCount/CurrentGroupSize(); } cl_int Iterations() {return _iterations;} std::string IterationsStr() {std::stringstream ss; ss << _iterations; return ss.str();} private: const TExplicitAtomicType _dataType; const bool _useSVM; HostDataType _startValue; bool _localMemory; bool _declaredInProgram; bool _usedInFunction; bool _genericAddrSpace; bool _oldValueCheck; bool _localRefValues; cl_uint _maxGroupSize; cl_uint _currentGroupSize; cl_uint _passCount; const cl_int _iterations; }; template class CBasicTestMemOrderScope : public CBasicTest { public: using CBasicTest::LocalMemory; using CBasicTest::MaxGroupSize; CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTest(dataType, useSVM) { } virtual std::string ProgramHeader(cl_uint maxNumDestItems) { std::string header; if(gOldAPI) { std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s"; header += "#define atomic_store_explicit(x,y,o"+s+") atomic_store(x,y)\n" "#define atomic_load_explicit(x,o"+s+") atomic_load(x)\n" "#define atomic_exchange_explicit(x,y,o"+s+") atomic_exchange(x,y)\n" "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_strong(x,y,z)\n" "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_weak(x,y,z)\n" "#define atomic_fetch_add_explicit(x,y,o"+s+") atomic_fetch_add(x,y)\n" "#define atomic_fetch_sub_explicit(x,y,o"+s+") atomic_fetch_sub(x,y)\n" "#define atomic_fetch_or_explicit(x,y,o"+s+") atomic_fetch_or(x,y)\n" "#define atomic_fetch_xor_explicit(x,y,o"+s+") atomic_fetch_xor(x,y)\n" "#define atomic_fetch_and_explicit(x,y,o"+s+") atomic_fetch_and(x,y)\n" "#define atomic_fetch_min_explicit(x,y,o"+s+") atomic_fetch_min(x,y)\n" "#define atomic_fetch_max_explicit(x,y,o"+s+") atomic_fetch_max(x,y)\n" "#define atomic_flag_test_and_set_explicit(x,o"+s+") atomic_flag_test_and_set(x)\n" "#define atomic_flag_clear_explicit(x,o"+s+") atomic_flag_clear(x)\n"; } return header+CBasicTest::ProgramHeader(maxNumDestItems); } virtual std::string SingleTestName() { std::string testName = CBasicTest::SingleTestName(); if(MemoryOrder() != MEMORY_ORDER_EMPTY) { testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory")); } if(MemoryScope() != MEMORY_SCOPE_EMPTY) { testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory")); } return testName; } virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue) { if(LocalMemory() && MemoryScope() != MEMORY_SCOPE_EMPTY && MemoryScope() != MEMORY_SCOPE_WORK_GROUP) //memory scope should only be used for global memory return 0; if(MemoryScope() == MEMORY_SCOPE_DEVICE) MaxGroupSize(16); // increase number of groups by forcing smaller group size else MaxGroupSize(0); // group size limited by device capabilities return CBasicTest::ExecuteSingleTest(deviceID, context, queue); } virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue) { // repeat test for each reasonable memory order/scope combination std::vector memoryOrder; std::vector memoryScope; int error = 0; memoryOrder.push_back(MEMORY_ORDER_EMPTY); memoryOrder.push_back(MEMORY_ORDER_RELAXED); memoryOrder.push_back(MEMORY_ORDER_ACQUIRE); memoryOrder.push_back(MEMORY_ORDER_RELEASE); memoryOrder.push_back(MEMORY_ORDER_ACQ_REL); memoryOrder.push_back(MEMORY_ORDER_SEQ_CST); memoryScope.push_back(MEMORY_SCOPE_EMPTY); memoryScope.push_back(MEMORY_SCOPE_WORK_GROUP); memoryScope.push_back(MEMORY_SCOPE_DEVICE); memoryScope.push_back(MEMORY_SCOPE_ALL_SVM_DEVICES); for(unsigned oi = 0; oi < memoryOrder.size(); oi++) { for(unsigned si = 0; si < memoryScope.size(); si++) { if(memoryOrder[oi] == MEMORY_ORDER_EMPTY && memoryScope[si] != MEMORY_SCOPE_EMPTY) continue; MemoryOrder(memoryOrder[oi]); MemoryScope(memoryScope[si]); EXECUTE_TEST(error, (CBasicTest::ExecuteForEachParameterSet(deviceID, context, queue))); } } return error; } void MemoryOrder(TExplicitMemoryOrderType memoryOrder) {_memoryOrder = memoryOrder;} TExplicitMemoryOrderType MemoryOrder() {return _memoryOrder;} std::string MemoryOrderStr() { if(MemoryOrder() != MEMORY_ORDER_EMPTY) return std::string(", ")+get_memory_order_type_name(MemoryOrder()); return ""; } void MemoryScope(TExplicitMemoryScopeType memoryScope) {_memoryScope = memoryScope;} TExplicitMemoryScopeType MemoryScope() {return _memoryScope;} std::string MemoryScopeStr() { if(MemoryScope() != MEMORY_SCOPE_EMPTY) return std::string(", ")+get_memory_scope_type_name(MemoryScope()); return ""; } std::string MemoryOrderScopeStr() { return MemoryOrderStr()+MemoryScopeStr(); } virtual cl_uint CurrentGroupNum(cl_uint threadCount) { if(MemoryScope() == MEMORY_SCOPE_WORK_GROUP) return 1; return CBasicTest::CurrentGroupNum(threadCount); } virtual cl_uint MaxHostThreads() { // block host threads execution for memory scope different than memory_scope_all_svm_devices if(MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost) return CBasicTest::MaxHostThreads(); else return 0; } private: TExplicitMemoryOrderType _memoryOrder; TExplicitMemoryScopeType _memoryScope; }; template class CBasicTestMemOrder2Scope : public CBasicTestMemOrderScope { public: using CBasicTestMemOrderScope::LocalMemory; using CBasicTestMemOrderScope::MemoryOrder; using CBasicTestMemOrderScope::MemoryScope; using CBasicTestMemOrderScope::MemoryOrderStr; using CBasicTestMemOrderScope::MemoryScopeStr; CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTestMemOrderScope(dataType, useSVM) { } virtual std::string SingleTestName() { std::string testName = CBasicTest::SingleTestName(); if(MemoryOrder() != MEMORY_ORDER_EMPTY) testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory")); if(MemoryOrder2() != MEMORY_ORDER_EMPTY) testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder2())).substr(sizeof("memory")); if(MemoryScope() != MEMORY_SCOPE_EMPTY) testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory")); return testName; } virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue) { // repeat test for each reasonable memory order/scope combination std::vector memoryOrder; std::vector memoryScope; int error = 0; memoryOrder.push_back(MEMORY_ORDER_EMPTY); memoryOrder.push_back(MEMORY_ORDER_RELAXED); memoryOrder.push_back(MEMORY_ORDER_ACQUIRE); memoryOrder.push_back(MEMORY_ORDER_RELEASE); memoryOrder.push_back(MEMORY_ORDER_ACQ_REL); memoryOrder.push_back(MEMORY_ORDER_SEQ_CST); memoryScope.push_back(MEMORY_SCOPE_EMPTY); memoryScope.push_back(MEMORY_SCOPE_WORK_GROUP); memoryScope.push_back(MEMORY_SCOPE_DEVICE); memoryScope.push_back(MEMORY_SCOPE_ALL_SVM_DEVICES); for(unsigned oi = 0; oi < memoryOrder.size(); oi++) { for(unsigned o2i = 0; o2i < memoryOrder.size(); o2i++) { for(unsigned si = 0; si < memoryScope.size(); si++) { if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY) && memoryOrder[oi] != memoryOrder[o2i]) continue; // both memory order arguments must be set (or none) if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY) && memoryScope[si] != MEMORY_SCOPE_EMPTY) continue; // memory scope without memory order is not allowed MemoryOrder(memoryOrder[oi]); MemoryOrder2(memoryOrder[o2i]); MemoryScope(memoryScope[si]); EXECUTE_TEST(error, (CBasicTest::ExecuteForEachParameterSet(deviceID, context, queue))); } } } return error; } void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail) {_memoryOrder2 = memoryOrderFail;} TExplicitMemoryOrderType MemoryOrder2() {return _memoryOrder2;} std::string MemoryOrderFailStr() { if(MemoryOrder2() != MEMORY_ORDER_EMPTY) return std::string(", ")+get_memory_order_type_name(MemoryOrder2()); return ""; } std::string MemoryOrderScope() { return MemoryOrderStr()+MemoryOrderFailStr()+MemoryScopeStr(); } private: TExplicitMemoryOrderType _memoryOrder2; }; template std::string CBasicTest::PragmaHeader(cl_device_id deviceID) { std::string pragma; if(gOldAPI) { pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"; pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"; pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"; pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"; } // Create the pragma lines for this kernel if(DataType().Size(deviceID) == 8) { pragma += "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"; pragma += "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"; } if(_dataType == TYPE_ATOMIC_DOUBLE) pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; return pragma; } template std::string CBasicTest::ProgramHeader(cl_uint maxNumDestItems) { // Create the program header std::string header; std::string aTypeName = DataType().AtomicTypeName(); std::string cTypeName = DataType().RegularTypeName(); std::string argListForKernel; std::string argListForFunction; std::string argListNoTypes; std::string functionPrototype; std::string addressSpace = LocalMemory() ? "__local " : "__global "; if(gOldAPI) { header += std::string("#define ")+aTypeName+" "+cTypeName+"\n" "#define atomic_store(x,y) (*(x) = y)\n" "#define atomic_load(x) (*(x))\n" "#define ATOMIC_VAR_INIT(x) (x)\n" "#define ATOMIC_FLAG_INIT 0\n" "#define atomic_init(x,y) atomic_store(x,y)\n"; if(aTypeName == "atomic_float") header += "#define atomic_exchange(x,y) atomic_xchg(x,y)\n"; else if(aTypeName == "atomic_double") header += "double atomic_exchange(volatile "+addressSpace+"atomic_double *x, double y)\n" "{\n" " long tmp = *(long*)&y, res;\n" " volatile "+addressSpace+"long *tmpA = (volatile "+addressSpace+"long)x;\n" " res = atom_xchg(tmpA,tmp);\n" " return *(double*)&res;\n" "}\n"; else header += "#define atomic_exchange(x,y) atom_xchg(x,y)\n"; if(aTypeName != "atomic_float" && aTypeName != "atomic_double") header += "bool atomic_compare_exchange_strong(volatile "+addressSpace+" "+aTypeName+" *a, "+cTypeName+" *expected, "+cTypeName+" desired)\n" "{\n" " "+cTypeName+" old = atom_cmpxchg(a, *expected, desired);\n" " if(old == *expected)\n" " return true;\n" " *expected = old;\n" " return false;\n" "}\n" "#define atomic_compare_exchange_weak atomic_compare_exchange_strong\n"; header += "#define atomic_fetch_add(x,y) atom_add(x,y)\n" "#define atomic_fetch_sub(x,y) atom_sub(x,y)\n" "#define atomic_fetch_or(x,y) atom_or(x,y)\n" "#define atomic_fetch_xor(x,y) atom_xor(x,y)\n" "#define atomic_fetch_and(x,y) atom_and(x,y)\n" "#define atomic_fetch_min(x,y) atom_min(x,y)\n" "#define atomic_fetch_max(x,y) atom_max(x,y)\n" "#define atomic_flag_test_and_set(x) atomic_exchange(x,1)\n" "#define atomic_flag_clear(x) atomic_store(x,0)\n" "\n"; } if(!LocalMemory() && DeclaredInProgram()) { // additional atomic variable for results copying (last thread will do this) header += "__global volatile atomic_uint finishedThreads = ATOMIC_VAR_INIT(0);\n"; // atomic variables declared in program scope - test data std::stringstream ss; ss << maxNumDestItems; header += std::string("__global volatile ")+aTypeName+" destMemory["+ss.str()+"] = {\n"; ss.str(""); ss << _startValue; for(cl_uint i = 0; i < maxNumDestItems; i++) { if(aTypeName == "atomic_flag") header += " ATOMIC_FLAG_INIT"; else header += " ATOMIC_VAR_INIT("+ss.str()+")"; if(i+1 < maxNumDestItems) header += ","; header += "\n"; } header+= "};\n" "\n"; } return header; } template std::string CBasicTest::FunctionCode() { if(!UsedInFunction()) return ""; std::string addressSpace = LocalMemory() ? "__local " : "__global "; std::string code = "void test_atomic_function(uint tid, uint threadCount, uint numDestItems, volatile "; if(!GenericAddrSpace()) code += addressSpace; code += std::string(DataType().AtomicTypeName())+" *destMemory, __global "+DataType().RegularTypeName()+ " *oldValues"; if(LocalRefValues()) code += std::string(", __local ")+DataType().RegularTypeName()+" *localValues"; code += ")\n" "{\n"; code += ProgramCore(); code += "}\n" "\n"; return code; } template std::string CBasicTest::KernelCode(cl_uint maxNumDestItems) { std::string aTypeName = DataType().AtomicTypeName(); std::string cTypeName = DataType().RegularTypeName(); std::string addressSpace = LocalMemory() ? "__local " : "__global "; std::string code = "__kernel void test_atomic_kernel(uint threadCount, uint numDestItems, "; // prepare list of arguments for kernel if(LocalMemory()) { code += std::string("__global ")+cTypeName+" *finalDest, __global "+cTypeName+" *oldValues," " volatile "+addressSpace+aTypeName+" *"+(DeclaredInProgram() ? "notUsed" : "")+"destMemory"; } else { code += "volatile "+addressSpace+(DeclaredInProgram() ? (cTypeName+" *finalDest") : (aTypeName+" *destMemory"))+ ", __global "+cTypeName+" *oldValues"; } if(LocalRefValues()) code += std::string(", __local ")+cTypeName+" *localValues"; code += ")\n" "{\n"; if(LocalMemory() && DeclaredInProgram()) { // local atomics declared in kernel scope std::stringstream ss; ss << maxNumDestItems; code += std::string(" __local volatile ")+aTypeName+" destMemory["+ss.str()+"];\n"; } code += " uint tid = get_global_id(0);\n" "\n"; if(LocalMemory()) { code += " // initialize atomics not reachable from host (first thread is doing this, other threads are waiting on barrier)\n" " if(get_local_id(0) == 0)\n" " for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)\n" " {\n"; if(aTypeName == "atomic_flag") { code += " if(finalDest[dstItemIdx])\n" " atomic_flag_test_and_set(destMemory+dstItemIdx);\n" " else\n" " atomic_flag_clear(destMemory+dstItemIdx);\n"; } else { code += " atomic_store(destMemory+dstItemIdx, finalDest[dstItemIdx]);\n"; } code += " }\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" "\n"; } if (LocalRefValues()) { code += " // Copy input reference values into local memory\n"; if (NumNonAtomicVariablesPerThread() == 1) code += " localValues[get_local_id(0)] = oldValues[tid];\n"; else { std::stringstream ss; ss << NumNonAtomicVariablesPerThread(); code += " for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n" " localValues[get_local_id(0)*" + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n"; } code += " barrier(CLK_LOCAL_MEM_FENCE);\n" "\n"; } if (UsedInFunction()) code += std::string(" test_atomic_function(tid, threadCount, numDestItems, destMemory, oldValues")+ (LocalRefValues() ? ", localValues" : "")+");\n"; else code += ProgramCore(); code += "\n"; if (LocalRefValues()) { code += " // Copy local reference values into output array\n" " barrier(CLK_LOCAL_MEM_FENCE);\n"; if (NumNonAtomicVariablesPerThread() == 1) code += " oldValues[tid] = localValues[get_local_id(0)];\n"; else { std::stringstream ss; ss << NumNonAtomicVariablesPerThread(); code += " for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n" " oldValues[tid*" + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str() + "+rfId];\n"; } code += "\n"; } if(LocalMemory() || DeclaredInProgram()) { code += " // Copy final values to host reachable buffer\n"; if(LocalMemory()) code += " barrier(CLK_LOCAL_MEM_FENCE);\n" " if(get_local_id(0) == 0) // first thread in workgroup\n"; else // global atomics declared in program scope code += " if(atomic_fetch_add(&finishedThreads, 1) == get_global_size(0)-1)\n" " // last finished thread\n"; code += " for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)\n"; if(aTypeName == "atomic_flag") { code += " finalDest[dstItemIdx] = atomic_flag_test_and_set(destMemory+dstItemIdx);\n"; } else { code += " finalDest[dstItemIdx] = atomic_load(destMemory+dstItemIdx);\n"; } } code += "}\n" "\n"; return code; } template int CBasicTest::ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue) { int error; clProgramWrapper program; clKernelWrapper kernel; size_t threadNum[1]; clMemWrapper streams[2]; std::vector destItems; HostAtomicType *svmAtomicBuffer = 0; std::vector refValues, startRefValues; HostDataType *svmDataBuffer = 0; cl_uint deviceThreadCount, hostThreadCount, threadCount; size_t groupSize = 0; std::string programSource; const char *programLine; MTdata d; size_t typeSize = DataType().Size(deviceID); deviceThreadCount = _maxDeviceThreads; hostThreadCount = MaxHostThreads(); threadCount = deviceThreadCount+hostThreadCount; //log_info("\t%s %s%s...\n", local ? "local" : "global", DataType().AtomicTypeName(), memoryOrderScope.c_str()); log_info("\t%s...\n", SingleTestName().c_str()); if(!LocalMemory() && DeclaredInProgram() && gNoGlobalVariables) // no support for program scope global variables { log_info("\t\tTest disabled\n"); return 0; } if(UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace) { log_info("\t\tTest disabled\n"); return 0; } // set up work sizes based on device capabilities and test configuration error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL); test_error(error, "Unable to obtain max work group size for device"); CurrentGroupSize((cl_uint)groupSize); if(CurrentGroupSize() > deviceThreadCount) CurrentGroupSize(deviceThreadCount); if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI) deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount); threadCount = deviceThreadCount+hostThreadCount; // If we're given a num_results function, we need to determine how many result objects we need. // This is the first assessment for current maximum number of threads (exact thread count is not known here) // - needed for program source code generation (arrays of atomics declared in program) cl_uint numDestItems = NumResults(threadCount, deviceID); // This loop iteratively reduces the workgroup size by 2 and then re-generates the kernel with the reduced // workgroup size until we find a size which is admissible for the kernel being run or reduce the wg size // to the trivial case of 1 (which was separately verified to be accurate for the kernel being run) if(deviceThreadCount > 0) { while((CurrentGroupSize() > 1)) { // Re-generate the kernel code with the current group size if (kernel) clReleaseKernel(kernel); programSource = PragmaHeader(deviceID)+ProgramHeader(numDestItems)+FunctionCode()+KernelCode(numDestItems); programLine = programSource.c_str(); if(create_single_kernel_helper_with_build_options(context, &program, &kernel, 1, &programLine, "test_atomic_kernel", gOldAPI ? "" : "-cl-std=CL2.0")) { return -1; } // Get work group size for the new kernel error = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL); test_error(error, "Unable to obtain max work group size for device and kernel combo"); if (CurrentGroupSize() <= groupSize) break; else CurrentGroupSize(CurrentGroupSize()/2); } if(CurrentGroupSize() > deviceThreadCount) CurrentGroupSize(deviceThreadCount); if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI) deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount); threadCount = deviceThreadCount+hostThreadCount; } if(gDebug) { log_info("Program source:\n"); log_info("%s\n", programLine); } if(deviceThreadCount > 0) log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, CurrentGroupSize()); if(hostThreadCount > 0) log_info("\t\t(host threads %u)\n", hostThreadCount); refValues.resize(threadCount*NumNonAtomicVariablesPerThread()); // Generate ref data if we have a ref generator provided d = init_genrand(gRandomSeed); startRefValues.resize(threadCount*NumNonAtomicVariablesPerThread()); if(GenerateRefs(threadCount, &startRefValues[0], d)) { //copy ref values for host threads memcpy(&refValues[0], &startRefValues[0], sizeof(HostDataType)*threadCount*NumNonAtomicVariablesPerThread()); } else { startRefValues.resize(0); } free_mtdata(d); d = NULL; // If we're given a num_results function, we need to determine how many result objects we need. If // we don't have it, we assume it's just 1 // This is final value (exact thread count is known in this place) numDestItems = NumResults(threadCount, deviceID); destItems.resize(numDestItems); for(cl_uint i = 0; i < numDestItems; i++) destItems[i] = _startValue; // Create main buffer with atomic variables (array size dependent on particular test) if(UseSVM()) { if(gUseHostPtr) svmAtomicBuffer = (HostAtomicType*)malloc(typeSize * numDestItems); else svmAtomicBuffer = (HostAtomicType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, typeSize * numDestItems, 0); if(!svmAtomicBuffer) { log_error("ERROR: clSVMAlloc failed!\n"); return -1; } memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems); streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_USE_HOST_PTR), typeSize * numDestItems, svmAtomicBuffer, NULL); } else { streams[0] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_COPY_HOST_PTR), typeSize * numDestItems, &destItems[0], NULL); } if (!streams[0]) { log_error("ERROR: Creating output array failed!\n"); return -1; } // Create buffer for per-thread input/output data if(UseSVM()) { if(gUseHostPtr) svmDataBuffer = (HostDataType*)malloc(typeSize*threadCount*NumNonAtomicVariablesPerThread()); else svmDataBuffer = (HostDataType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS : 0), typeSize*threadCount*NumNonAtomicVariablesPerThread(), 0); if(!svmDataBuffer) { log_error("ERROR: clSVMAlloc failed!\n"); return -1; } if(startRefValues.size()) memcpy(svmDataBuffer, &startRefValues[0], typeSize*threadCount*NumNonAtomicVariablesPerThread()); streams[1] = clCreateBuffer(context, (cl_mem_flags)(CL_MEM_USE_HOST_PTR), typeSize*threadCount*NumNonAtomicVariablesPerThread(), svmDataBuffer, NULL); } else { streams[1] = clCreateBuffer(context, (cl_mem_flags)((startRefValues.size() ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)), typeSize * threadCount*NumNonAtomicVariablesPerThread(), startRefValues.size() ? &startRefValues[0] : 0, NULL); } if (!streams[1]) { log_error("ERROR: Creating reference array failed!\n"); return -1; } if(deviceThreadCount > 0) { cl_uint argInd = 0; /* Set the arguments */ error = clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount); test_error(error, "Unable to set kernel argument"); error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems), &numDestItems); test_error(error, "Unable to set indexed kernel argument"); error = clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]); test_error(error, "Unable to set indexed kernel arguments"); error = clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]); test_error(error, "Unable to set indexed kernel arguments"); if(LocalMemory()) { error = clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL); test_error(error, "Unable to set indexed local kernel argument"); } if(LocalRefValues()) { error = clSetKernelArg(kernel, argInd++, LocalRefValues() ? typeSize*CurrentGroupSize()*NumNonAtomicVariablesPerThread() : 1, NULL); test_error(error, "Unable to set indexed kernel argument"); } } /* Configure host threads */ std::vector hostThreadContexts(hostThreadCount); for(unsigned int t = 0; t < hostThreadCount; t++) { hostThreadContexts[t].test = this; hostThreadContexts[t].tid = deviceThreadCount+t; hostThreadContexts[t].threadCount = threadCount; hostThreadContexts[t].destMemory = UseSVM() ? svmAtomicBuffer : &destItems[0]; hostThreadContexts[t].oldValues = UseSVM() ? svmDataBuffer : &refValues[0]; } if(deviceThreadCount > 0) { /* Run the kernel */ threadNum[0] = deviceThreadCount; groupSize = CurrentGroupSize(); error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, &groupSize, 0, NULL, NULL); test_error(error, "Unable to execute test kernel"); /* start device threads */ error = clFlush(queue); test_error(error, "clFlush failed"); } /* Start host threads and wait for finish */ if(hostThreadCount > 0) ThreadPool_Do(HostThreadFunction, hostThreadCount, &hostThreadContexts[0]); if(UseSVM()) { error = clFinish(queue); test_error(error, "clFinish failed"); memcpy(&destItems[0], svmAtomicBuffer, typeSize*numDestItems); memcpy(&refValues[0], svmDataBuffer, typeSize*threadCount*NumNonAtomicVariablesPerThread()); } else { if(deviceThreadCount > 0) { error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL); test_error(error, "Unable to read result value!"); error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize * deviceThreadCount*NumNonAtomicVariablesPerThread(), &refValues[0], 0, NULL, NULL); test_error(error, "Unable to read reference values!"); } } bool dataVerified = false; // If we have an expectedFn, then we need to generate a final value to compare against. If we don't // have one, it's because we're comparing ref values only for(cl_uint i = 0; i < numDestItems; i++) { HostDataType expected; if(!ExpectedValue(expected, threadCount, startRefValues.size() ? &startRefValues[0] : 0, i)) break; // no expected value function provided if(expected != destItems[i]) { std::stringstream logLine; logLine << "ERROR: Result " << i << " from kernel does not validate! (should be " << expected << ", was " << destItems[i] << ")\n"; log_error("%s", logLine.str().c_str()); for(i = 0; i < threadCount; i++) { logLine.str(""); logLine << " --- " << i << " - "; if(startRefValues.size()) logLine << startRefValues[i] << " -> " << refValues[i]; else logLine << refValues[i]; logLine << " --- "; if(i < numDestItems) logLine << destItems[i]; logLine << "\n"; log_info("%s", logLine.str().c_str()); } if(!gDebug) { log_info("Program source:\n"); log_info("%s\n", programLine); } return -1; } dataVerified = true; } bool dataCorrect = false; /* Use the verify function (if provided) to also check the results */ if(VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0])) { if(!dataCorrect) { log_error("ERROR: Reference values did not validate!\n"); std::stringstream logLine; for(cl_uint i = 0; i < threadCount; i++) for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++) { logLine.str(""); logLine << " --- " << i << " - " << refValues[i*NumNonAtomicVariablesPerThread()+j] << " --- "; if(j == 0 && i < numDestItems) logLine << destItems[i]; logLine << "\n"; log_info("%s", logLine.str().c_str()); } if(!gDebug) { log_info("Program source:\n"); log_info("%s\n", programLine); } return -1; } } else if(!dataVerified) { log_error("ERROR: Test doesn't check total or refs; no values are verified!\n"); return -1; } if(OldValueCheck() && !(DeclaredInProgram() && !LocalMemory())) // don't test for programs scope global atomics // 'old' value has been overwritten by previous clEnqueueNDRangeKernel { /* Re-write the starting value */ for(size_t i = 0; i < numDestItems; i++) destItems[i] = _startValue; refValues[0] = 0; if(deviceThreadCount > 0) { error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL); test_error(error, "Unable to write starting values!"); /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */ threadNum[0] = 1; error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, threadNum, 0, NULL, NULL); test_error(error, "Unable to execute test kernel"); error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize, &refValues[0], 0, NULL, NULL); test_error(error, "Unable to read reference values!"); } else { /* Start host thread */ HostFunction(0, 1, &destItems[0], &refValues[0]); } if(refValues[0] != _startValue)//destItems[0]) { std::stringstream logLine; logLine << "ERROR: atomic function operated correctly but did NOT return correct 'old' value " " (should have been " << destItems[0] << ", returned " << refValues[0] << ")!\n"; log_error("%s", logLine.str().c_str()); if(!gDebug) { log_info("Program source:\n"); log_info("%s\n", programLine); } return -1; } } if(UseSVM()) { // the buffer object must first be released before the SVM buffer is freed error = clReleaseMemObject(streams[0]); streams[0] = 0; test_error(error, "clReleaseMemObject failed"); if(gUseHostPtr) free(svmAtomicBuffer); else clSVMFree(context, svmAtomicBuffer); error = clReleaseMemObject(streams[1]); streams[1] = 0; test_error(error, "clReleaseMemObject failed"); if(gUseHostPtr) free(svmDataBuffer); else clSVMFree(context, svmDataBuffer); } _passCount++; return 0; } #endif //_COMMON_H_