From 6da9c6b68f9643a077f7281451b59f444a77a991 Mon Sep 17 00:00:00 2001 From: Grzegorz Wawiorko Date: Wed, 11 Aug 2021 19:06:10 +0200 Subject: [PATCH] Fix double free in c11_atomics tests for SVM allocations (#1286) * Only Clang format changes * Fix double free object for SVM allocations * Fix double free - review fixes --- test_conformance/c11_atomics/common.h | 2510 +++++++++++++------------ 1 file changed, 1358 insertions(+), 1152 deletions(-) diff --git a/test_conformance/c11_atomics/common.h b/test_conformance/c11_atomics/common.h index bbcc68c6..d30259f0 100644 --- a/test_conformance/c11_atomics/common.h +++ b/test_conformance/c11_atomics/common.h @@ -28,10 +28,9 @@ #define MAX_DEVICE_THREADS (gHost ? 0U : gMaxDeviceThreads) #define MAX_HOST_THREADS GetThreadCount() -#define EXECUTE_TEST(error, test)\ - error |= test;\ - if(error && !gContinueOnError)\ - return error; +#define EXECUTE_TEST(error, test) \ + error |= test; \ + if (error && !gContinueOnError) return error; enum TExplicitAtomicType { @@ -57,764 +56,918 @@ enum TExplicitMemoryScopeType MEMORY_SCOPE_ALL_SVM_DEVICES }; -extern bool gHost; // temporary flag for testing native host threads (test verification) +extern bool + gHost; // temporary flag for testing native host threads (test verification) extern bool gOldAPI; // temporary flag for testing with old API (OpenCL 1.2) extern bool gContinueOnError; // execute all cases even when errors detected -extern bool gNoGlobalVariables; // disable cases with global atomics in program scope +extern bool + gNoGlobalVariables; // disable cases with global atomics in program scope extern bool gNoGenericAddressSpace; // disable cases with generic address space extern bool gUseHostPtr; // use malloc/free instead of clSVMAlloc/clSVMFree extern bool gDebug; // print OpenCL kernel code -extern int gInternalIterations; // internal test iterations for atomic operation, sufficient to verify atomicity -extern int gMaxDeviceThreads; // maximum number of threads executed on OCL device +extern int gInternalIterations; // internal test iterations for atomic + // operation, sufficient to verify atomicity +extern int + gMaxDeviceThreads; // maximum number of threads executed on OCL device extern cl_device_atomic_capabilities gAtomicMemCap, gAtomicFenceCap; // atomic memory and fence capabilities for this device -extern const char *get_memory_order_type_name(TExplicitMemoryOrderType orderType); -extern const char *get_memory_scope_type_name(TExplicitMemoryScopeType scopeType); +extern const char * +get_memory_order_type_name(TExplicitMemoryOrderType orderType); +extern const char * +get_memory_scope_type_name(TExplicitMemoryScopeType scopeType); extern cl_int getSupportedMemoryOrdersAndScopes( cl_device_id device, std::vector &memoryOrders, std::vector &memoryScopes); -class AtomicTypeInfo -{ +class AtomicTypeInfo { public: - TExplicitAtomicType _type; - AtomicTypeInfo(TExplicitAtomicType type): _type(type) {} - cl_uint Size(cl_device_id device); - const char* AtomicTypeName(); - const char* RegularTypeName(); - const char* AddSubOperandTypeName(); - int IsSupported(cl_device_id device); + TExplicitAtomicType _type; + AtomicTypeInfo(TExplicitAtomicType type): _type(type) {} + cl_uint Size(cl_device_id device); + const char *AtomicTypeName(); + const char *RegularTypeName(); + const char *AddSubOperandTypeName(); + int IsSupported(cl_device_id device); }; -template -class AtomicTypeExtendedInfo : public AtomicTypeInfo -{ +template +class AtomicTypeExtendedInfo : public AtomicTypeInfo { public: - AtomicTypeExtendedInfo(TExplicitAtomicType type) : AtomicTypeInfo(type) {} - HostDataType MinValue(); - HostDataType MaxValue(); - HostDataType SpecialValue(cl_uchar x) - { - HostDataType tmp; - cl_uchar *ptr = (cl_uchar*)&tmp; - for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_uchar); i++) - ptr[i] = x; - return tmp; - } - HostDataType SpecialValue(cl_ushort x) - { - HostDataType tmp; - cl_ushort *ptr = (cl_ushort*)&tmp; - for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_ushort); i++) - ptr[i] = x; - return tmp; - } + AtomicTypeExtendedInfo(TExplicitAtomicType type): AtomicTypeInfo(type) {} + HostDataType MinValue(); + HostDataType MaxValue(); + HostDataType SpecialValue(cl_uchar x) + { + HostDataType tmp; + cl_uchar *ptr = (cl_uchar *)&tmp; + for (cl_uint i = 0; i < sizeof(HostDataType) / sizeof(cl_uchar); i++) + ptr[i] = x; + return tmp; + } + HostDataType SpecialValue(cl_ushort x) + { + HostDataType tmp; + cl_ushort *ptr = (cl_ushort *)&tmp; + for (cl_uint i = 0; i < sizeof(HostDataType) / sizeof(cl_ushort); i++) + ptr[i] = x; + return tmp; + } }; -class CTest { +class CTest { public: - virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) = 0; + virtual int Execute(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) = 0; }; -template -class CBasicTest : CTest -{ +template +class CBasicTest : CTest { public: - typedef struct { - CBasicTest *test; - cl_uint tid; - cl_uint threadCount; - volatile HostAtomicType *destMemory; - HostDataType *oldValues; - } THostThreadContext; - static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id, void *userInfo) - { - THostThreadContext *threadContext = ((THostThreadContext*)userInfo)+job_id; - threadContext->test->HostFunction(threadContext->tid, threadContext->threadCount, threadContext->destMemory, threadContext->oldValues); - return 0; - } - CBasicTest(TExplicitAtomicType dataType, bool useSVM) : CTest(), - _maxDeviceThreads(MAX_DEVICE_THREADS), - _dataType(dataType), _useSVM(useSVM), _startValue(255), - _localMemory(false), _declaredInProgram(false), - _usedInFunction(false), _genericAddrSpace(false), - _oldValueCheck(true), _localRefValues(false), - _maxGroupSize(0), _passCount(0), _iterations(gInternalIterations) - { - } - virtual ~CBasicTest() - { - if(_passCount) - log_info(" %u tests executed successfully for %s\n", _passCount, DataType().AtomicTypeName()); - } - virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) - { - return 1; - } - virtual cl_uint NumNonAtomicVariablesPerThread() - { - return 1; - } - virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue) - { - return false; - } - virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d) - { - return false; - } - virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) - { - return false; - } - virtual std::string PragmaHeader(cl_device_id deviceID); - virtual std::string ProgramHeader(cl_uint maxNumDestItems); - virtual std::string FunctionCode(); - virtual std::string KernelCode(cl_uint maxNumDestItems); - virtual std::string ProgramCore() = 0; - virtual std::string SingleTestName() - { - std::string testName = LocalMemory() ? "local" : "global"; - testName += " "; - testName += DataType().AtomicTypeName(); - if(DeclaredInProgram()) + typedef struct { - testName += " declared in program"; - } - if(DeclaredInProgram() && UsedInFunction()) - testName += ","; - if(UsedInFunction()) + CBasicTest *test; + cl_uint tid; + cl_uint threadCount; + volatile HostAtomicType *destMemory; + HostDataType *oldValues; + } THostThreadContext; + static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id, + void *userInfo) { - testName += " used in "; - if(GenericAddrSpace()) - testName += "generic "; - testName += "function"; - } - return testName; - } - virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue); - int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - int error = 0; - UsedInFunction(false); - EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); - UsedInFunction(true); - GenericAddrSpace(false); - EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); - GenericAddrSpace(true); - EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); - GenericAddrSpace(false); - return error; - } - int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - int error = 0; - DeclaredInProgram(false); - EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue)); - if(!UseSVM()) - { - DeclaredInProgram(true); - EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue)); - } - return error; - } - virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - int error = 0; - if(_maxDeviceThreads > 0 && !UseSVM()) - { - LocalMemory(true); - EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue)); - } - if(_maxDeviceThreads+MaxHostThreads() > 0) - { - LocalMemory(false); - EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue)); - } - return error; - } - virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) - { - if(sizeof(HostAtomicType) != DataType().Size(deviceID)) - { - log_info("Invalid test: Host atomic type size (%u) is different than OpenCL type size (%u)\n", (cl_uint)sizeof(HostAtomicType), DataType().Size(deviceID)); - return -1; - } - if(sizeof(HostAtomicType) != sizeof(HostDataType)) - { - log_info("Invalid test: Host atomic type size (%u) is different than corresponding type size (%u)\n", (cl_uint)sizeof(HostAtomicType), (cl_uint)sizeof(HostDataType)); - return -1; - } - // Verify we can run first - if(UseSVM() && !gUseHostPtr) - { - cl_device_svm_capabilities caps; - cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, sizeof(caps), &caps, 0); - test_error(error, "clGetDeviceInfo failed"); - if((caps & CL_DEVICE_SVM_ATOMICS) == 0) - { - log_info("\t%s - SVM_ATOMICS not supported\n", DataType().AtomicTypeName()); - // implicit pass + THostThreadContext *threadContext = + ((THostThreadContext *)userInfo) + job_id; + threadContext->test->HostFunction( + threadContext->tid, threadContext->threadCount, + threadContext->destMemory, threadContext->oldValues); return 0; - } } - if(!DataType().IsSupported(deviceID)) + CBasicTest(TExplicitAtomicType dataType, bool useSVM) + : CTest(), _maxDeviceThreads(MAX_DEVICE_THREADS), _dataType(dataType), + _useSVM(useSVM), _startValue(255), _localMemory(false), + _declaredInProgram(false), _usedInFunction(false), + _genericAddrSpace(false), _oldValueCheck(true), + _localRefValues(false), _maxGroupSize(0), _passCount(0), + _iterations(gInternalIterations) + {} + virtual ~CBasicTest() { - log_info("\t%s not supported\n", DataType().AtomicTypeName()); - // implicit pass or host test (debug feature) - if(UseSVM()) - return 0; - _maxDeviceThreads = 0; + if (_passCount) + log_info(" %u tests executed successfully for %s\n", _passCount, + DataType().AtomicTypeName()); } - if(_maxDeviceThreads+MaxHostThreads() == 0) - return 0; - return ExecuteForEachParameterSet(deviceID, context, queue); - } - virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues) - { - log_info("Empty thread function %u\n", (cl_uint)tid); - } - AtomicTypeExtendedInfo DataType() const - { - return AtomicTypeExtendedInfo(_dataType); - } - cl_uint _maxDeviceThreads; - virtual cl_uint MaxHostThreads() - { - if(UseSVM() || gHost) - return MAX_HOST_THREADS; - else - return 0; - } - - int CheckCapabilities(TExplicitMemoryScopeType memoryScope, - TExplicitMemoryOrderType memoryOrder) - { - /* - Differentiation between atomic fence and other atomic operations - does not need to occur here. - - The initialisation of this test checks that the minimum required - capabilities are supported by this device. - - The following switches allow the test to skip if optional capabilites - are not supported by the device. - */ - switch (memoryScope) - { - case MEMORY_SCOPE_EMPTY: { - break; - } - case MEMORY_SCOPE_WORK_GROUP: { - if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0) - { - return TEST_SKIPPED_ITSELF; - } - break; - } - case MEMORY_SCOPE_DEVICE: { - if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0) - { - return TEST_SKIPPED_ITSELF; - } - break; - } - case MEMORY_SCOPE_ALL_DEVICES: // fallthough - case MEMORY_SCOPE_ALL_SVM_DEVICES: { - if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0) - { - return TEST_SKIPPED_ITSELF; - } - break; - } - default: { - log_info("Invalid memory scope\n"); - break; - } - } - - switch (memoryOrder) - { - case MEMORY_ORDER_EMPTY: { - break; - } - case MEMORY_ORDER_RELAXED: { - if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0) - { - return TEST_SKIPPED_ITSELF; - } - break; - } - case MEMORY_ORDER_ACQUIRE: - case MEMORY_ORDER_RELEASE: - case MEMORY_ORDER_ACQ_REL: { - if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0) - { - return TEST_SKIPPED_ITSELF; - } - break; - } - case MEMORY_ORDER_SEQ_CST: { - if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0) - { - return TEST_SKIPPED_ITSELF; - } - break; - } - default: { - log_info("Invalid memory order\n"); - break; - } - } - - return 0; - } - virtual bool SVMDataBufferAllSVMConsistent() {return false;} - bool UseSVM() {return _useSVM;} - void StartValue(HostDataType startValue) {_startValue = startValue;} - HostDataType StartValue() {return _startValue;} - void LocalMemory(bool local) {_localMemory = local;} - bool LocalMemory() {return _localMemory;} - void DeclaredInProgram(bool declaredInProgram) {_declaredInProgram = declaredInProgram;} - bool DeclaredInProgram() {return _declaredInProgram;} - void UsedInFunction(bool local) {_usedInFunction = local;} - bool UsedInFunction() {return _usedInFunction;} - void GenericAddrSpace(bool genericAddrSpace) {_genericAddrSpace = genericAddrSpace;} - bool GenericAddrSpace() {return _genericAddrSpace;} - void OldValueCheck(bool check) {_oldValueCheck = check;} - bool OldValueCheck() {return _oldValueCheck;} - void LocalRefValues(bool localRefValues) {_localRefValues = localRefValues;} - bool LocalRefValues() {return _localRefValues;} - void MaxGroupSize(cl_uint maxGroupSize) {_maxGroupSize = maxGroupSize;} - cl_uint MaxGroupSize() {return _maxGroupSize;} - void CurrentGroupSize(cl_uint currentGroupSize) - { - if(MaxGroupSize() && MaxGroupSize() < currentGroupSize) - _currentGroupSize = MaxGroupSize(); - else - _currentGroupSize = currentGroupSize; - } - cl_uint CurrentGroupSize() {return _currentGroupSize;} - virtual cl_uint CurrentGroupNum(cl_uint threadCount) - { - if(threadCount == 0) - return 0; - if(LocalMemory()) - return 1; - return threadCount/CurrentGroupSize(); - } - cl_int Iterations() {return _iterations;} - std::string IterationsStr() {std::stringstream ss; ss << _iterations; return ss.str();} -private: - const TExplicitAtomicType _dataType; - const bool _useSVM; - HostDataType _startValue; - bool _localMemory; - bool _declaredInProgram; - bool _usedInFunction; - bool _genericAddrSpace; - bool _oldValueCheck; - bool _localRefValues; - cl_uint _maxGroupSize; - cl_uint _currentGroupSize; - cl_uint _passCount; - const cl_int _iterations; -}; - -template -class CBasicTestMemOrderScope : public CBasicTest -{ -public: - using CBasicTest::LocalMemory; - using CBasicTest::MaxGroupSize; - using CBasicTest::CheckCapabilities; - CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTest(dataType, useSVM) - { - } - virtual std::string ProgramHeader(cl_uint maxNumDestItems) - { - std::string header; - if(gOldAPI) + virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) { - std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s"; - header += - "#define atomic_store_explicit(x,y,o"+s+") atomic_store(x,y)\n" - "#define atomic_load_explicit(x,o"+s+") atomic_load(x)\n" - "#define atomic_exchange_explicit(x,y,o"+s+") atomic_exchange(x,y)\n" - "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_strong(x,y,z)\n" - "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_weak(x,y,z)\n" - "#define atomic_fetch_add_explicit(x,y,o"+s+") atomic_fetch_add(x,y)\n" - "#define atomic_fetch_sub_explicit(x,y,o"+s+") atomic_fetch_sub(x,y)\n" - "#define atomic_fetch_or_explicit(x,y,o"+s+") atomic_fetch_or(x,y)\n" - "#define atomic_fetch_xor_explicit(x,y,o"+s+") atomic_fetch_xor(x,y)\n" - "#define atomic_fetch_and_explicit(x,y,o"+s+") atomic_fetch_and(x,y)\n" - "#define atomic_fetch_min_explicit(x,y,o"+s+") atomic_fetch_min(x,y)\n" - "#define atomic_fetch_max_explicit(x,y,o"+s+") atomic_fetch_max(x,y)\n" - "#define atomic_flag_test_and_set_explicit(x,o"+s+") atomic_flag_test_and_set(x)\n" - "#define atomic_flag_clear_explicit(x,o"+s+") atomic_flag_clear(x)\n"; + return 1; } - return header+CBasicTest::ProgramHeader(maxNumDestItems); - } - virtual std::string SingleTestName() - { - std::string testName = CBasicTest::SingleTestName(); - if(MemoryOrder() != MEMORY_ORDER_EMPTY) + virtual cl_uint NumNonAtomicVariablesPerThread() { return 1; } + virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) { - testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory")); + return false; } - if(MemoryScope() != MEMORY_SCOPE_EMPTY) + virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, + MTdata d) { - testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory")); + return false; } - return testName; - } - virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - if(LocalMemory() && - MemoryScope() != MEMORY_SCOPE_EMPTY && - MemoryScope() != MEMORY_SCOPE_WORK_GROUP) //memory scope should only be used for global memory - return 0; - if(MemoryScope() == MEMORY_SCOPE_DEVICE) - MaxGroupSize(16); // increase number of groups by forcing smaller group size - else - MaxGroupSize(0); // group size limited by device capabilities - - if (CheckCapabilities(MemoryScope(), MemoryOrder()) == TEST_SKIPPED_ITSELF) - return 0; // skip test - not applicable - - return CBasicTest::ExecuteSingleTest(deviceID, context, queue); - } - virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - // repeat test for each reasonable memory order/scope combination - std::vector memoryOrder; - std::vector memoryScope; - int error = 0; - - // For OpenCL-3.0 and later some orderings and scopes are optional, so here - // we query for the supported ones. - test_error_ret( - getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope), - "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL); - - for(unsigned oi = 0; oi < memoryOrder.size(); oi++) + virtual bool VerifyRefs(bool &correct, cl_uint threadCount, + HostDataType *refValues, + HostAtomicType *finalValues) { - for(unsigned si = 0; si < memoryScope.size(); si++) - { - if(memoryOrder[oi] == MEMORY_ORDER_EMPTY && memoryScope[si] != MEMORY_SCOPE_EMPTY) - continue; - MemoryOrder(memoryOrder[oi]); - MemoryScope(memoryScope[si]); - EXECUTE_TEST(error, (CBasicTest::ExecuteForEachParameterSet(deviceID, context, queue))); - } + return false; } - return error; - } - void MemoryOrder(TExplicitMemoryOrderType memoryOrder) {_memoryOrder = memoryOrder;} - TExplicitMemoryOrderType MemoryOrder() {return _memoryOrder;} - std::string MemoryOrderStr() - { - if(MemoryOrder() != MEMORY_ORDER_EMPTY) - return std::string(", ")+get_memory_order_type_name(MemoryOrder()); - return ""; - } - void MemoryScope(TExplicitMemoryScopeType memoryScope) {_memoryScope = memoryScope;} - TExplicitMemoryScopeType MemoryScope() {return _memoryScope;} - std::string MemoryScopeStr() - { - if(MemoryScope() != MEMORY_SCOPE_EMPTY) - return std::string(", ")+get_memory_scope_type_name(MemoryScope()); - return ""; - } - std::string MemoryOrderScopeStr() - { - return MemoryOrderStr()+MemoryScopeStr(); - } - virtual cl_uint CurrentGroupNum(cl_uint threadCount) - { - if(MemoryScope() == MEMORY_SCOPE_WORK_GROUP) - return 1; - return CBasicTest::CurrentGroupNum(threadCount); - } - virtual cl_uint MaxHostThreads() - { - // block host threads execution for memory scope different than - // memory_scope_all_svm_devices - if (MemoryScope() == MEMORY_SCOPE_ALL_DEVICES - || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost) - { - return CBasicTest::MaxHostThreads(); - } - else - { - return 0; - } - } -private: - TExplicitMemoryOrderType _memoryOrder; - TExplicitMemoryScopeType _memoryScope; -}; - -template -class CBasicTestMemOrder2Scope : public CBasicTestMemOrderScope -{ -public: - using CBasicTestMemOrderScope::LocalMemory; - using CBasicTestMemOrderScope::MemoryOrder; - using CBasicTestMemOrderScope::MemoryScope; - using CBasicTestMemOrderScope::MemoryOrderStr; - using CBasicTestMemOrderScope::MemoryScopeStr; - using CBasicTest::CheckCapabilities; - - CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTestMemOrderScope(dataType, useSVM) - { - } - virtual std::string SingleTestName() - { - std::string testName = CBasicTest::SingleTestName(); - if(MemoryOrder() != MEMORY_ORDER_EMPTY) - testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory")); - if(MemoryOrder2() != MEMORY_ORDER_EMPTY) - testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder2())).substr(sizeof("memory")); - if(MemoryScope() != MEMORY_SCOPE_EMPTY) - testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory")); - return testName; - } - virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue) - { - // repeat test for each reasonable memory order/scope combination - std::vector memoryOrder; - std::vector memoryScope; - int error = 0; - - // For OpenCL-3.0 and later some orderings and scopes are optional, so here - // we query for the supported ones. - test_error_ret( - getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope), - "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL); - - for(unsigned oi = 0; oi < memoryOrder.size(); oi++) + virtual std::string PragmaHeader(cl_device_id deviceID); + virtual std::string ProgramHeader(cl_uint maxNumDestItems); + virtual std::string FunctionCode(); + virtual std::string KernelCode(cl_uint maxNumDestItems); + virtual std::string ProgramCore() = 0; + virtual std::string SingleTestName() { - for(unsigned o2i = 0; o2i < memoryOrder.size(); o2i++) - { - for(unsigned si = 0; si < memoryScope.size(); si++) + std::string testName = LocalMemory() ? "local" : "global"; + testName += " "; + testName += DataType().AtomicTypeName(); + if (DeclaredInProgram()) { - if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY) - && memoryOrder[oi] != memoryOrder[o2i]) - continue; // both memory order arguments must be set (or none) - if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY) - && memoryScope[si] != MEMORY_SCOPE_EMPTY) - continue; // memory scope without memory order is not allowed - MemoryOrder(memoryOrder[oi]); - MemoryOrder2(memoryOrder[o2i]); - MemoryScope(memoryScope[si]); - - if (CheckCapabilities(MemoryScope(), MemoryOrder()) - == TEST_SKIPPED_ITSELF) - continue; // skip test - not applicable - - if (CheckCapabilities(MemoryScope(), MemoryOrder2()) - == TEST_SKIPPED_ITSELF) - continue; // skip test - not applicable - - EXECUTE_TEST(error, (CBasicTest::ExecuteForEachParameterSet(deviceID, context, queue))); + testName += " declared in program"; } - } + if (DeclaredInProgram() && UsedInFunction()) testName += ","; + if (UsedInFunction()) + { + testName += " used in "; + if (GenericAddrSpace()) testName += "generic "; + testName += "function"; + } + return testName; } - return error; - } - void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail) {_memoryOrder2 = memoryOrderFail;} - TExplicitMemoryOrderType MemoryOrder2() {return _memoryOrder2;} - std::string MemoryOrderFailStr() - { - if(MemoryOrder2() != MEMORY_ORDER_EMPTY) - return std::string(", ")+get_memory_order_type_name(MemoryOrder2()); - return ""; - } - std::string MemoryOrderScope() - { - return MemoryOrderStr()+MemoryOrderFailStr()+MemoryScopeStr(); - } + virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, + cl_command_queue queue); + int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context, + cl_command_queue queue) + { + int error = 0; + UsedInFunction(false); + EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); + UsedInFunction(true); + GenericAddrSpace(false); + EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); + GenericAddrSpace(true); + EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue)); + GenericAddrSpace(false); + return error; + } + int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context, + cl_command_queue queue) + { + int error = 0; + DeclaredInProgram(false); + EXECUTE_TEST(error, + ExecuteForEachPointerType(deviceID, context, queue)); + if (!UseSVM()) + { + DeclaredInProgram(true); + EXECUTE_TEST(error, + ExecuteForEachPointerType(deviceID, context, queue)); + } + return error; + } + virtual int ExecuteForEachParameterSet(cl_device_id deviceID, + cl_context context, + cl_command_queue queue) + { + int error = 0; + if (_maxDeviceThreads > 0 && !UseSVM()) + { + LocalMemory(true); + EXECUTE_TEST( + error, ExecuteForEachDeclarationType(deviceID, context, queue)); + } + if (_maxDeviceThreads + MaxHostThreads() > 0) + { + LocalMemory(false); + EXECUTE_TEST( + error, ExecuteForEachDeclarationType(deviceID, context, queue)); + } + return error; + } + virtual int Execute(cl_device_id deviceID, cl_context context, + cl_command_queue queue, int num_elements) + { + if (sizeof(HostAtomicType) != DataType().Size(deviceID)) + { + log_info("Invalid test: Host atomic type size (%u) is different " + "than OpenCL type size (%u)\n", + (cl_uint)sizeof(HostAtomicType), + DataType().Size(deviceID)); + return -1; + } + if (sizeof(HostAtomicType) != sizeof(HostDataType)) + { + log_info("Invalid test: Host atomic type size (%u) is different " + "than corresponding type size (%u)\n", + (cl_uint)sizeof(HostAtomicType), + (cl_uint)sizeof(HostDataType)); + return -1; + } + // Verify we can run first + if (UseSVM() && !gUseHostPtr) + { + cl_device_svm_capabilities caps; + cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, + sizeof(caps), &caps, 0); + test_error(error, "clGetDeviceInfo failed"); + if ((caps & CL_DEVICE_SVM_ATOMICS) == 0) + { + log_info("\t%s - SVM_ATOMICS not supported\n", + DataType().AtomicTypeName()); + // implicit pass + return 0; + } + } + if (!DataType().IsSupported(deviceID)) + { + log_info("\t%s not supported\n", DataType().AtomicTypeName()); + // implicit pass or host test (debug feature) + if (UseSVM()) return 0; + _maxDeviceThreads = 0; + } + if (_maxDeviceThreads + MaxHostThreads() == 0) return 0; + return ExecuteForEachParameterSet(deviceID, context, queue); + } + virtual void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) + { + log_info("Empty thread function %u\n", (cl_uint)tid); + } + AtomicTypeExtendedInfo DataType() const + { + return AtomicTypeExtendedInfo(_dataType); + } + cl_uint _maxDeviceThreads; + virtual cl_uint MaxHostThreads() + { + if (UseSVM() || gHost) + return MAX_HOST_THREADS; + else + return 0; + } + + int CheckCapabilities(TExplicitMemoryScopeType memoryScope, + TExplicitMemoryOrderType memoryOrder) + { + /* + Differentiation between atomic fence and other atomic operations + does not need to occur here. + + The initialisation of this test checks that the minimum required + capabilities are supported by this device. + + The following switches allow the test to skip if optional + capabilites are not supported by the device. + */ + switch (memoryScope) + { + case MEMORY_SCOPE_EMPTY: { + break; + } + case MEMORY_SCOPE_WORK_GROUP: { + if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0) + { + return TEST_SKIPPED_ITSELF; + } + break; + } + case MEMORY_SCOPE_DEVICE: { + if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0) + { + return TEST_SKIPPED_ITSELF; + } + break; + } + case MEMORY_SCOPE_ALL_DEVICES: // fallthough + case MEMORY_SCOPE_ALL_SVM_DEVICES: { + if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0) + { + return TEST_SKIPPED_ITSELF; + } + break; + } + default: { + log_info("Invalid memory scope\n"); + break; + } + } + + switch (memoryOrder) + { + case MEMORY_ORDER_EMPTY: { + break; + } + case MEMORY_ORDER_RELAXED: { + if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0) + { + return TEST_SKIPPED_ITSELF; + } + break; + } + case MEMORY_ORDER_ACQUIRE: + case MEMORY_ORDER_RELEASE: + case MEMORY_ORDER_ACQ_REL: { + if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0) + { + return TEST_SKIPPED_ITSELF; + } + break; + } + case MEMORY_ORDER_SEQ_CST: { + if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0) + { + return TEST_SKIPPED_ITSELF; + } + break; + } + default: { + log_info("Invalid memory order\n"); + break; + } + } + + return 0; + } + virtual bool SVMDataBufferAllSVMConsistent() { return false; } + bool UseSVM() { return _useSVM; } + void StartValue(HostDataType startValue) { _startValue = startValue; } + HostDataType StartValue() { return _startValue; } + void LocalMemory(bool local) { _localMemory = local; } + bool LocalMemory() { return _localMemory; } + void DeclaredInProgram(bool declaredInProgram) + { + _declaredInProgram = declaredInProgram; + } + bool DeclaredInProgram() { return _declaredInProgram; } + void UsedInFunction(bool local) { _usedInFunction = local; } + bool UsedInFunction() { return _usedInFunction; } + void GenericAddrSpace(bool genericAddrSpace) + { + _genericAddrSpace = genericAddrSpace; + } + bool GenericAddrSpace() { return _genericAddrSpace; } + void OldValueCheck(bool check) { _oldValueCheck = check; } + bool OldValueCheck() { return _oldValueCheck; } + void LocalRefValues(bool localRefValues) + { + _localRefValues = localRefValues; + } + bool LocalRefValues() { return _localRefValues; } + void MaxGroupSize(cl_uint maxGroupSize) { _maxGroupSize = maxGroupSize; } + cl_uint MaxGroupSize() { return _maxGroupSize; } + void CurrentGroupSize(cl_uint currentGroupSize) + { + if (MaxGroupSize() && MaxGroupSize() < currentGroupSize) + _currentGroupSize = MaxGroupSize(); + else + _currentGroupSize = currentGroupSize; + } + cl_uint CurrentGroupSize() { return _currentGroupSize; } + virtual cl_uint CurrentGroupNum(cl_uint threadCount) + { + if (threadCount == 0) return 0; + if (LocalMemory()) return 1; + return threadCount / CurrentGroupSize(); + } + cl_int Iterations() { return _iterations; } + std::string IterationsStr() + { + std::stringstream ss; + ss << _iterations; + return ss.str(); + } + private: - TExplicitMemoryOrderType _memoryOrder2; + const TExplicitAtomicType _dataType; + const bool _useSVM; + HostDataType _startValue; + bool _localMemory; + bool _declaredInProgram; + bool _usedInFunction; + bool _genericAddrSpace; + bool _oldValueCheck; + bool _localRefValues; + cl_uint _maxGroupSize; + cl_uint _currentGroupSize; + cl_uint _passCount; + const cl_int _iterations; }; -template -std::string CBasicTest::PragmaHeader(cl_device_id deviceID) -{ - std::string pragma; - - if(gOldAPI) - { - pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"; - pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"; - pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"; - pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"; - } - // Create the pragma lines for this kernel - if(DataType().Size(deviceID) == 8) - { - pragma += "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"; - pragma += "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"; - } - if(_dataType == TYPE_ATOMIC_DOUBLE) - pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; - return pragma; -} - -template -std::string CBasicTest::ProgramHeader(cl_uint maxNumDestItems) -{ - // Create the program header - std::string header; - std::string aTypeName = DataType().AtomicTypeName(); - std::string cTypeName = DataType().RegularTypeName(); - std::string argListForKernel; - std::string argListForFunction; - std::string argListNoTypes; - std::string functionPrototype; - std::string addressSpace = LocalMemory() ? "__local " : "__global "; - - if(gOldAPI) - { - header += std::string("#define ")+aTypeName+" "+cTypeName+"\n" - "#define atomic_store(x,y) (*(x) = y)\n" - "#define atomic_load(x) (*(x))\n" - "#define ATOMIC_VAR_INIT(x) (x)\n" - "#define ATOMIC_FLAG_INIT 0\n" - "#define atomic_init(x,y) atomic_store(x,y)\n"; - if(aTypeName == "atomic_float") - header += "#define atomic_exchange(x,y) atomic_xchg(x,y)\n"; - else if(aTypeName == "atomic_double") - header += "double atomic_exchange(volatile "+addressSpace+"atomic_double *x, double y)\n" - "{\n" - " long tmp = *(long*)&y, res;\n" - " volatile "+addressSpace+"long *tmpA = (volatile "+addressSpace+"long)x;\n" - " res = atom_xchg(tmpA,tmp);\n" - " return *(double*)&res;\n" - "}\n"; - else - header += "#define atomic_exchange(x,y) atom_xchg(x,y)\n"; - if(aTypeName != "atomic_float" && aTypeName != "atomic_double") - header += - "bool atomic_compare_exchange_strong(volatile "+addressSpace+" "+aTypeName+" *a, "+cTypeName+" *expected, "+cTypeName+" desired)\n" - "{\n" - " "+cTypeName+" old = atom_cmpxchg(a, *expected, desired);\n" - " if(old == *expected)\n" - " return true;\n" - " *expected = old;\n" - " return false;\n" - "}\n" - "#define atomic_compare_exchange_weak atomic_compare_exchange_strong\n"; - header += - "#define atomic_fetch_add(x,y) atom_add(x,y)\n" - "#define atomic_fetch_sub(x,y) atom_sub(x,y)\n" - "#define atomic_fetch_or(x,y) atom_or(x,y)\n" - "#define atomic_fetch_xor(x,y) atom_xor(x,y)\n" - "#define atomic_fetch_and(x,y) atom_and(x,y)\n" - "#define atomic_fetch_min(x,y) atom_min(x,y)\n" - "#define atomic_fetch_max(x,y) atom_max(x,y)\n" - "#define atomic_flag_test_and_set(x) atomic_exchange(x,1)\n" - "#define atomic_flag_clear(x) atomic_store(x,0)\n" - "\n"; - } - if(!LocalMemory() && DeclaredInProgram()) - { - // additional atomic variable for results copying (last thread will do this) - header += "__global volatile atomic_uint finishedThreads = ATOMIC_VAR_INIT(0);\n"; - // atomic variables declared in program scope - test data - std::stringstream ss; - ss << maxNumDestItems; - header += std::string("__global volatile ")+aTypeName+" destMemory["+ss.str()+"] = {\n"; - ss.str(""); - ss << _startValue; - for(cl_uint i = 0; i < maxNumDestItems; i++) +template +class CBasicTestMemOrderScope + : public CBasicTest { +public: + using CBasicTest::LocalMemory; + using CBasicTest::MaxGroupSize; + using CBasicTest::CheckCapabilities; + CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false) + : CBasicTest(dataType, useSVM) + {} + virtual std::string ProgramHeader(cl_uint maxNumDestItems) { - if(aTypeName == "atomic_flag") - header += " ATOMIC_FLAG_INIT"; - else - header += " ATOMIC_VAR_INIT("+ss.str()+")"; - if(i+1 < maxNumDestItems) - header += ","; - header += "\n"; + std::string header; + if (gOldAPI) + { + std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s"; + header += "#define atomic_store_explicit(x,y,o" + s + + ") atomic_store(x,y)\n" + "#define atomic_load_explicit(x,o" + + s + + ") atomic_load(x)\n" + "#define atomic_exchange_explicit(x,y,o" + + s + + ") atomic_exchange(x,y)\n" + "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of" + + s + + ") atomic_compare_exchange_strong(x,y,z)\n" + "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of" + + s + + ") atomic_compare_exchange_weak(x,y,z)\n" + "#define atomic_fetch_add_explicit(x,y,o" + + s + + ") atomic_fetch_add(x,y)\n" + "#define atomic_fetch_sub_explicit(x,y,o" + + s + + ") atomic_fetch_sub(x,y)\n" + "#define atomic_fetch_or_explicit(x,y,o" + + s + + ") atomic_fetch_or(x,y)\n" + "#define atomic_fetch_xor_explicit(x,y,o" + + s + + ") atomic_fetch_xor(x,y)\n" + "#define atomic_fetch_and_explicit(x,y,o" + + s + + ") atomic_fetch_and(x,y)\n" + "#define atomic_fetch_min_explicit(x,y,o" + + s + + ") atomic_fetch_min(x,y)\n" + "#define atomic_fetch_max_explicit(x,y,o" + + s + + ") atomic_fetch_max(x,y)\n" + "#define atomic_flag_test_and_set_explicit(x,o" + + s + + ") atomic_flag_test_and_set(x)\n" + "#define atomic_flag_clear_explicit(x,o" + + s + ") atomic_flag_clear(x)\n"; + } + return header + + CBasicTest::ProgramHeader( + maxNumDestItems); } - header+= - "};\n" - "\n"; - } - return header; + virtual std::string SingleTestName() + { + std::string testName = + CBasicTest::SingleTestName(); + if (MemoryOrder() != MEMORY_ORDER_EMPTY) + { + testName += std::string(", ") + + std::string(get_memory_order_type_name(MemoryOrder())) + .substr(sizeof("memory")); + } + if (MemoryScope() != MEMORY_SCOPE_EMPTY) + { + testName += std::string(", ") + + std::string(get_memory_scope_type_name(MemoryScope())) + .substr(sizeof("memory")); + } + return testName; + } + virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, + cl_command_queue queue) + { + if (LocalMemory() && MemoryScope() != MEMORY_SCOPE_EMPTY + && MemoryScope() + != MEMORY_SCOPE_WORK_GROUP) // memory scope should only be used + // for global memory + return 0; + if (MemoryScope() == MEMORY_SCOPE_DEVICE) + MaxGroupSize( + 16); // increase number of groups by forcing smaller group size + else + MaxGroupSize(0); // group size limited by device capabilities + + if (CheckCapabilities(MemoryScope(), MemoryOrder()) + == TEST_SKIPPED_ITSELF) + return 0; // skip test - not applicable + + return CBasicTest::ExecuteSingleTest( + deviceID, context, queue); + } + virtual int ExecuteForEachParameterSet(cl_device_id deviceID, + cl_context context, + cl_command_queue queue) + { + // repeat test for each reasonable memory order/scope combination + std::vector memoryOrder; + std::vector memoryScope; + int error = 0; + + // For OpenCL-3.0 and later some orderings and scopes are optional, so + // here we query for the supported ones. + test_error_ret(getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, + memoryScope), + "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL); + + for (unsigned oi = 0; oi < memoryOrder.size(); oi++) + { + for (unsigned si = 0; si < memoryScope.size(); si++) + { + if (memoryOrder[oi] == MEMORY_ORDER_EMPTY + && memoryScope[si] != MEMORY_SCOPE_EMPTY) + continue; + MemoryOrder(memoryOrder[oi]); + MemoryScope(memoryScope[si]); + EXECUTE_TEST( + error, + (CBasicTest:: + ExecuteForEachParameterSet(deviceID, context, queue))); + } + } + return error; + } + void MemoryOrder(TExplicitMemoryOrderType memoryOrder) + { + _memoryOrder = memoryOrder; + } + TExplicitMemoryOrderType MemoryOrder() { return _memoryOrder; } + std::string MemoryOrderStr() + { + if (MemoryOrder() != MEMORY_ORDER_EMPTY) + return std::string(", ") + + get_memory_order_type_name(MemoryOrder()); + return ""; + } + void MemoryScope(TExplicitMemoryScopeType memoryScope) + { + _memoryScope = memoryScope; + } + TExplicitMemoryScopeType MemoryScope() { return _memoryScope; } + std::string MemoryScopeStr() + { + if (MemoryScope() != MEMORY_SCOPE_EMPTY) + return std::string(", ") + + get_memory_scope_type_name(MemoryScope()); + return ""; + } + std::string MemoryOrderScopeStr() + { + return MemoryOrderStr() + MemoryScopeStr(); + } + virtual cl_uint CurrentGroupNum(cl_uint threadCount) + { + if (MemoryScope() == MEMORY_SCOPE_WORK_GROUP) return 1; + return CBasicTest::CurrentGroupNum( + threadCount); + } + virtual cl_uint MaxHostThreads() + { + // block host threads execution for memory scope different than + // memory_scope_all_svm_devices + if (MemoryScope() == MEMORY_SCOPE_ALL_DEVICES + || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost) + { + return CBasicTest::MaxHostThreads(); + } + else + { + return 0; + } + } + +private: + TExplicitMemoryOrderType _memoryOrder; + TExplicitMemoryScopeType _memoryScope; +}; + +template +class CBasicTestMemOrder2Scope + : public CBasicTestMemOrderScope { +public: + using CBasicTestMemOrderScope::LocalMemory; + using CBasicTestMemOrderScope::MemoryOrder; + using CBasicTestMemOrderScope::MemoryScope; + using CBasicTestMemOrderScope::MemoryOrderStr; + using CBasicTestMemOrderScope::MemoryScopeStr; + using CBasicTest::CheckCapabilities; + + CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false) + : CBasicTestMemOrderScope(dataType, + useSVM) + {} + virtual std::string SingleTestName() + { + std::string testName = + CBasicTest::SingleTestName(); + if (MemoryOrder() != MEMORY_ORDER_EMPTY) + testName += std::string(", ") + + std::string(get_memory_order_type_name(MemoryOrder())) + .substr(sizeof("memory")); + if (MemoryOrder2() != MEMORY_ORDER_EMPTY) + testName += std::string(", ") + + std::string(get_memory_order_type_name(MemoryOrder2())) + .substr(sizeof("memory")); + if (MemoryScope() != MEMORY_SCOPE_EMPTY) + testName += std::string(", ") + + std::string(get_memory_scope_type_name(MemoryScope())) + .substr(sizeof("memory")); + return testName; + } + virtual int ExecuteForEachParameterSet(cl_device_id deviceID, + cl_context context, + cl_command_queue queue) + { + // repeat test for each reasonable memory order/scope combination + std::vector memoryOrder; + std::vector memoryScope; + int error = 0; + + // For OpenCL-3.0 and later some orderings and scopes are optional, so + // here we query for the supported ones. + test_error_ret(getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, + memoryScope), + "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL); + + for (unsigned oi = 0; oi < memoryOrder.size(); oi++) + { + for (unsigned o2i = 0; o2i < memoryOrder.size(); o2i++) + { + for (unsigned si = 0; si < memoryScope.size(); si++) + { + if ((memoryOrder[oi] == MEMORY_ORDER_EMPTY + || memoryOrder[o2i] == MEMORY_ORDER_EMPTY) + && memoryOrder[oi] != memoryOrder[o2i]) + continue; // both memory order arguments must be set (or + // none) + if ((memoryOrder[oi] == MEMORY_ORDER_EMPTY + || memoryOrder[o2i] == MEMORY_ORDER_EMPTY) + && memoryScope[si] != MEMORY_SCOPE_EMPTY) + continue; // memory scope without memory order is not + // allowed + MemoryOrder(memoryOrder[oi]); + MemoryOrder2(memoryOrder[o2i]); + MemoryScope(memoryScope[si]); + + if (CheckCapabilities(MemoryScope(), MemoryOrder()) + == TEST_SKIPPED_ITSELF) + continue; // skip test - not applicable + + if (CheckCapabilities(MemoryScope(), MemoryOrder2()) + == TEST_SKIPPED_ITSELF) + continue; // skip test - not applicable + + EXECUTE_TEST(error, + (CBasicTest:: + ExecuteForEachParameterSet( + deviceID, context, queue))); + } + } + } + return error; + } + void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail) + { + _memoryOrder2 = memoryOrderFail; + } + TExplicitMemoryOrderType MemoryOrder2() { return _memoryOrder2; } + std::string MemoryOrderFailStr() + { + if (MemoryOrder2() != MEMORY_ORDER_EMPTY) + return std::string(", ") + + get_memory_order_type_name(MemoryOrder2()); + return ""; + } + std::string MemoryOrderScope() + { + return MemoryOrderStr() + MemoryOrderFailStr() + MemoryScopeStr(); + } + +private: + TExplicitMemoryOrderType _memoryOrder2; +}; + +template +std::string +CBasicTest::PragmaHeader(cl_device_id deviceID) +{ + std::string pragma; + + if (gOldAPI) + { + pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : " + "enable\n"; + pragma += "#pragma OPENCL EXTENSION " + "cl_khr_local_int32_extended_atomics : enable\n"; + pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : " + "enable\n"; + pragma += "#pragma OPENCL EXTENSION " + "cl_khr_global_int32_extended_atomics : enable\n"; + } + // Create the pragma lines for this kernel + if (DataType().Size(deviceID) == 8) + { + pragma += + "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"; + pragma += + "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"; + } + if (_dataType == TYPE_ATOMIC_DOUBLE) + pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; + return pragma; } -template +template +std::string +CBasicTest::ProgramHeader(cl_uint maxNumDestItems) +{ + // Create the program header + std::string header; + std::string aTypeName = DataType().AtomicTypeName(); + std::string cTypeName = DataType().RegularTypeName(); + std::string argListForKernel; + std::string argListForFunction; + std::string argListNoTypes; + std::string functionPrototype; + std::string addressSpace = LocalMemory() ? "__local " : "__global "; + + if (gOldAPI) + { + header += std::string("#define ") + aTypeName + " " + cTypeName + + "\n" + "#define atomic_store(x,y) (*(x) " + "= y)\n" + "#define atomic_load(x) " + "(*(x))\n" + "#define ATOMIC_VAR_INIT(x) (x)\n" + "#define ATOMIC_FLAG_INIT 0\n" + "#define atomic_init(x,y) " + "atomic_store(x,y)\n"; + if (aTypeName == "atomic_float") + header += "#define atomic_exchange(x,y) " + " atomic_xchg(x,y)\n"; + else if (aTypeName == "atomic_double") + header += "double atomic_exchange(volatile " + addressSpace + + "atomic_double *x, double y)\n" + "{\n" + " long tmp = *(long*)&y, res;\n" + " volatile " + + addressSpace + "long *tmpA = (volatile " + addressSpace + + "long)x;\n" + " res = atom_xchg(tmpA,tmp);\n" + " return *(double*)&res;\n" + "}\n"; + else + header += "#define atomic_exchange(x,y) " + " atom_xchg(x,y)\n"; + if (aTypeName != "atomic_float" && aTypeName != "atomic_double") + header += "bool atomic_compare_exchange_strong(volatile " + + addressSpace + " " + aTypeName + " *a, " + cTypeName + + " *expected, " + cTypeName + + " desired)\n" + "{\n" + " " + + cTypeName + + " old = atom_cmpxchg(a, *expected, desired);\n" + " if(old == *expected)\n" + " return true;\n" + " *expected = old;\n" + " return false;\n" + "}\n" + "#define atomic_compare_exchange_weak " + "atomic_compare_exchange_strong\n"; + header += "#define atomic_fetch_add(x,y) " + "atom_add(x,y)\n" + "#define atomic_fetch_sub(x,y) " + "atom_sub(x,y)\n" + "#define atomic_fetch_or(x,y) " + "atom_or(x,y)\n" + "#define atomic_fetch_xor(x,y) " + "atom_xor(x,y)\n" + "#define atomic_fetch_and(x,y) " + "atom_and(x,y)\n" + "#define atomic_fetch_min(x,y) " + "atom_min(x,y)\n" + "#define atomic_fetch_max(x,y) " + "atom_max(x,y)\n" + "#define atomic_flag_test_and_set(x) " + "atomic_exchange(x,1)\n" + "#define atomic_flag_clear(x) " + "atomic_store(x,0)\n" + "\n"; + } + if (!LocalMemory() && DeclaredInProgram()) + { + // additional atomic variable for results copying (last thread will do + // this) + header += "__global volatile atomic_uint finishedThreads = " + "ATOMIC_VAR_INIT(0);\n"; + // atomic variables declared in program scope - test data + std::stringstream ss; + ss << maxNumDestItems; + header += std::string("__global volatile ") + aTypeName + " destMemory[" + + ss.str() + "] = {\n"; + ss.str(""); + ss << _startValue; + for (cl_uint i = 0; i < maxNumDestItems; i++) + { + if (aTypeName == "atomic_flag") + header += " ATOMIC_FLAG_INIT"; + else + header += " ATOMIC_VAR_INIT(" + ss.str() + ")"; + if (i + 1 < maxNumDestItems) header += ","; + header += "\n"; + } + header += "};\n" + "\n"; + } + return header; +} + +template std::string CBasicTest::FunctionCode() { - if(!UsedInFunction()) - return ""; - std::string addressSpace = LocalMemory() ? "__local " : "__global "; - std::string code = "void test_atomic_function(uint tid, uint threadCount, uint numDestItems, volatile "; - if(!GenericAddrSpace()) - code += addressSpace; - code += std::string(DataType().AtomicTypeName())+" *destMemory, __global "+DataType().RegularTypeName()+ - " *oldValues"; - if(LocalRefValues()) - code += std::string(", __local ")+DataType().RegularTypeName()+" *localValues"; - code += ")\n" - "{\n"; - code += ProgramCore(); - code += "}\n" - "\n"; - return code; + if (!UsedInFunction()) return ""; + std::string addressSpace = LocalMemory() ? "__local " : "__global "; + std::string code = "void test_atomic_function(uint tid, uint threadCount, " + "uint numDestItems, volatile "; + if (!GenericAddrSpace()) code += addressSpace; + code += std::string(DataType().AtomicTypeName()) + " *destMemory, __global " + + DataType().RegularTypeName() + " *oldValues"; + if (LocalRefValues()) + code += std::string(", __local ") + DataType().RegularTypeName() + + " *localValues"; + code += ")\n" + "{\n"; + code += ProgramCore(); + code += "}\n" + "\n"; + return code; } -template -std::string CBasicTest::KernelCode(cl_uint maxNumDestItems) +template +std::string +CBasicTest::KernelCode(cl_uint maxNumDestItems) { - std::string aTypeName = DataType().AtomicTypeName(); - std::string cTypeName = DataType().RegularTypeName(); - std::string addressSpace = LocalMemory() ? "__local " : "__global "; - std::string code = "__kernel void test_atomic_kernel(uint threadCount, uint numDestItems, "; + std::string aTypeName = DataType().AtomicTypeName(); + std::string cTypeName = DataType().RegularTypeName(); + std::string addressSpace = LocalMemory() ? "__local " : "__global "; + std::string code = "__kernel void test_atomic_kernel(uint threadCount, " + "uint numDestItems, "; - // prepare list of arguments for kernel - if(LocalMemory()) - { - code += std::string("__global ")+cTypeName+" *finalDest, __global "+cTypeName+" *oldValues," - " volatile "+addressSpace+aTypeName+" *"+(DeclaredInProgram() ? "notUsed" : "")+"destMemory"; - } - else - { - code += "volatile "+addressSpace+(DeclaredInProgram() ? (cTypeName+" *finalDest") : (aTypeName+" *destMemory"))+ - ", __global "+cTypeName+" *oldValues"; - } - if(LocalRefValues()) - code += std::string(", __local ")+cTypeName+" *localValues"; - code += ")\n" - "{\n"; - if(LocalMemory() && DeclaredInProgram()) - { - // local atomics declared in kernel scope - std::stringstream ss; - ss << maxNumDestItems; - code += std::string(" __local volatile ")+aTypeName+" destMemory["+ss.str()+"];\n"; - } - code += " uint tid = get_global_id(0);\n" - "\n"; - if(LocalMemory()) - { - // memory_order_relaxed is sufficient for these initialization operations - // as the barrier below will act as a fence, providing an order to the - // operations. memory_scope_work_group is sufficient as local memory is - // only visible within the work-group. - code += R"( + // prepare list of arguments for kernel + if (LocalMemory()) + { + code += std::string("__global ") + cTypeName + " *finalDest, __global " + + cTypeName + + " *oldValues," + " volatile " + + addressSpace + aTypeName + " *" + + (DeclaredInProgram() ? "notUsed" : "") + "destMemory"; + } + else + { + code += "volatile " + addressSpace + + (DeclaredInProgram() ? (cTypeName + " *finalDest") + : (aTypeName + " *destMemory")) + + ", __global " + cTypeName + " *oldValues"; + } + if (LocalRefValues()) + code += std::string(", __local ") + cTypeName + " *localValues"; + code += ")\n" + "{\n"; + if (LocalMemory() && DeclaredInProgram()) + { + // local atomics declared in kernel scope + std::stringstream ss; + ss << maxNumDestItems; + code += std::string(" __local volatile ") + aTypeName + " destMemory[" + + ss.str() + "];\n"; + } + code += " uint tid = get_global_id(0);\n" + "\n"; + if (LocalMemory()) + { + // memory_order_relaxed is sufficient for these initialization + // operations as the barrier below will act as a fence, providing an + // order to the operations. memory_scope_work_group is sufficient as + // local memory is only visible within the work-group. + code += R"( // initialize atomics not reachable from host (first thread // is doing this, other threads are waiting on barrier) if(get_local_id(0) == 0) for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++) {)"; - if (aTypeName == "atomic_flag") - { - code += R"( + if (aTypeName == "atomic_flag") + { + code += R"( if(finalDest[dstItemIdx]) atomic_flag_test_and_set_explicit(destMemory+dstItemIdx, memory_order_relaxed, @@ -823,512 +976,565 @@ std::string CBasicTest::KernelCode(cl_uint maxNumD atomic_flag_clear_explicit(destMemory+dstItemIdx, memory_order_relaxed, memory_scope_work_group);)"; - } - else - { - code += R"( + } + else + { + code += R"( atomic_store_explicit(destMemory+dstItemIdx, finalDest[dstItemIdx], memory_order_relaxed, memory_scope_work_group);)"; + } + code += " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + "\n"; } - code += - " }\n" - " barrier(CLK_LOCAL_MEM_FENCE);\n" - "\n"; - } - if (LocalRefValues()) - { - code += - " // Copy input reference values into local memory\n"; - if (NumNonAtomicVariablesPerThread() == 1) - code += " localValues[get_local_id(0)] = oldValues[tid];\n"; - else + if (LocalRefValues()) { - std::stringstream ss; - ss << NumNonAtomicVariablesPerThread(); - code += - " for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n" - " localValues[get_local_id(0)*" + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n"; + code += " // Copy input reference values into local memory\n"; + if (NumNonAtomicVariablesPerThread() == 1) + code += " localValues[get_local_id(0)] = oldValues[tid];\n"; + else + { + std::stringstream ss; + ss << NumNonAtomicVariablesPerThread(); + code += " for(uint rfId = 0; rfId < " + ss.str() + + "; rfId++)\n" + " localValues[get_local_id(0)*" + + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n"; + } + code += " barrier(CLK_LOCAL_MEM_FENCE);\n" + "\n"; } - code += - " barrier(CLK_LOCAL_MEM_FENCE);\n" - "\n"; - } - if (UsedInFunction()) - code += std::string(" test_atomic_function(tid, threadCount, numDestItems, destMemory, oldValues")+ - (LocalRefValues() ? ", localValues" : "")+");\n"; - else - code += ProgramCore(); - code += "\n"; - if (LocalRefValues()) - { - code += - " // Copy local reference values into output array\n" - " barrier(CLK_LOCAL_MEM_FENCE);\n"; - if (NumNonAtomicVariablesPerThread() == 1) - code += " oldValues[tid] = localValues[get_local_id(0)];\n"; + if (UsedInFunction()) + code += std::string(" test_atomic_function(tid, threadCount, " + "numDestItems, destMemory, oldValues") + + (LocalRefValues() ? ", localValues" : "") + ");\n"; else - { - std::stringstream ss; - ss << NumNonAtomicVariablesPerThread(); - code += - " for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n" - " oldValues[tid*" + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str() + "+rfId];\n"; - } + code += ProgramCore(); code += "\n"; - } - if(LocalMemory() || DeclaredInProgram()) - { - code += " // Copy final values to host reachable buffer\n"; - if(LocalMemory()) - code += - " barrier(CLK_LOCAL_MEM_FENCE);\n" - " if(get_local_id(0) == 0) // first thread in workgroup\n"; - else - // global atomics declared in program scope - code += R"( + if (LocalRefValues()) + { + code += " // Copy local reference values into output array\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n"; + if (NumNonAtomicVariablesPerThread() == 1) + code += " oldValues[tid] = localValues[get_local_id(0)];\n"; + else + { + std::stringstream ss; + ss << NumNonAtomicVariablesPerThread(); + code += " for(uint rfId = 0; rfId < " + ss.str() + + "; rfId++)\n" + " oldValues[tid*" + + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str() + + "+rfId];\n"; + } + code += "\n"; + } + if (LocalMemory() || DeclaredInProgram()) + { + code += " // Copy final values to host reachable buffer\n"; + if (LocalMemory()) + code += " barrier(CLK_LOCAL_MEM_FENCE);\n" + " if(get_local_id(0) == 0) // first thread in workgroup\n"; + else + // global atomics declared in program scope + code += R"( if(atomic_fetch_add_explicit(&finishedThreads, 1u, memory_order_relaxed, memory_scope_work_group) == get_global_size(0)-1) // last finished thread )"; - code += - " for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)\n"; - if(aTypeName == "atomic_flag") - { - code += R"( + code += " for(uint dstItemIdx = 0; dstItemIdx < numDestItems; " + "dstItemIdx++)\n"; + if (aTypeName == "atomic_flag") + { + code += R"( finalDest[dstItemIdx] = atomic_flag_test_and_set_explicit(destMemory+dstItemIdx, memory_order_relaxed, memory_scope_work_group);)"; - } - else - { - code += R"( + } + else + { + code += R"( finalDest[dstItemIdx] = atomic_load_explicit(destMemory+dstItemIdx, memory_order_relaxed, memory_scope_work_group);)"; + } } - } - code += "}\n" - "\n"; - return code; + code += "}\n" + "\n"; + return code; } template -int CBasicTest::ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue) +int CBasicTest::ExecuteSingleTest( + cl_device_id deviceID, cl_context context, cl_command_queue queue) { - int error; - clProgramWrapper program; - clKernelWrapper kernel; - size_t threadNum[1]; - clMemWrapper streams[2]; - std::vector destItems; - HostAtomicType *svmAtomicBuffer = 0; - std::vector refValues, startRefValues; - HostDataType *svmDataBuffer = 0; - cl_uint deviceThreadCount, hostThreadCount, threadCount; - size_t groupSize = 0; - std::string programSource; - const char *programLine; - MTdata d; - size_t typeSize = DataType().Size(deviceID); + int error; + clProgramWrapper program; + clKernelWrapper kernel; + size_t threadNum[1]; + clMemWrapper streams[2]; + std::vector destItems; + HostAtomicType *svmAtomicBuffer = 0; + std::vector refValues, startRefValues; + HostDataType *svmDataBuffer = 0; + cl_uint deviceThreadCount, hostThreadCount, threadCount; + size_t groupSize = 0; + std::string programSource; + const char *programLine; + MTdata d; + size_t typeSize = DataType().Size(deviceID); - deviceThreadCount = _maxDeviceThreads; - hostThreadCount = MaxHostThreads(); - threadCount = deviceThreadCount+hostThreadCount; + deviceThreadCount = _maxDeviceThreads; + hostThreadCount = MaxHostThreads(); + threadCount = deviceThreadCount + hostThreadCount; - //log_info("\t%s %s%s...\n", local ? "local" : "global", DataType().AtomicTypeName(), memoryOrderScope.c_str()); - log_info("\t%s...\n", SingleTestName().c_str()); + // log_info("\t%s %s%s...\n", local ? "local" : "global", + // DataType().AtomicTypeName(), memoryOrderScope.c_str()); + log_info("\t%s...\n", SingleTestName().c_str()); - if(!LocalMemory() && DeclaredInProgram() && gNoGlobalVariables) // no support for program scope global variables - { - log_info("\t\tTest disabled\n"); - return 0; - } - if(UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace) - { - log_info("\t\tTest disabled\n"); - return 0; - } + if (!LocalMemory() && DeclaredInProgram() + && gNoGlobalVariables) // no support for program scope global variables + { + log_info("\t\tTest disabled\n"); + return 0; + } + if (UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace) + { + log_info("\t\tTest disabled\n"); + return 0; + } - // set up work sizes based on device capabilities and test configuration - error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL); - test_error(error, "Unable to obtain max work group size for device"); - CurrentGroupSize((cl_uint)groupSize); - if(CurrentGroupSize() > deviceThreadCount) - CurrentGroupSize(deviceThreadCount); - if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI) - deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount); - threadCount = deviceThreadCount+hostThreadCount; + // set up work sizes based on device capabilities and test configuration + error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(groupSize), &groupSize, NULL); + test_error(error, "Unable to obtain max work group size for device"); + CurrentGroupSize((cl_uint)groupSize); + if (CurrentGroupSize() > deviceThreadCount) + CurrentGroupSize(deviceThreadCount); + if (CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI) + deviceThreadCount = + CurrentGroupSize() * CurrentGroupNum(deviceThreadCount); + threadCount = deviceThreadCount + hostThreadCount; - // If we're given a num_results function, we need to determine how many result objects we need. - // This is the first assessment for current maximum number of threads (exact thread count is not known here) - // - needed for program source code generation (arrays of atomics declared in program) - cl_uint numDestItems = NumResults(threadCount, deviceID); + // If we're given a num_results function, we need to determine how many + // result objects we need. This is the first assessment for current maximum + // number of threads (exact thread count is not known here) + // - needed for program source code generation (arrays of atomics declared + // in program) + cl_uint numDestItems = NumResults(threadCount, deviceID); - if(deviceThreadCount > 0) - { - // This loop iteratively reduces the workgroup size by 2 and then - // re-generates the kernel with the reduced - // workgroup size until we find a size which is admissible for the kernel - // being run or reduce the wg size - // to the trivial case of 1 (which was separately verified to be accurate - // for the kernel being run) + if (deviceThreadCount > 0) + { + // This loop iteratively reduces the workgroup size by 2 and then + // re-generates the kernel with the reduced + // workgroup size until we find a size which is admissible for the + // kernel being run or reduce the wg size to the trivial case of 1 + // (which was separately verified to be accurate for the kernel being + // run) - while ((CurrentGroupSize() > 1)) - { - // Re-generate the kernel code with the current group size - if (kernel) clReleaseKernel(kernel); - if (program) clReleaseProgram(program); - programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems) - + FunctionCode() + KernelCode(numDestItems); - programLine = programSource.c_str(); - if (create_single_kernel_helper_with_build_options( - context, &program, &kernel, 1, &programLine, - "test_atomic_kernel", gOldAPI ? "" : nullptr)) - { - return -1; - } - // Get work group size for the new kernel - error = clGetKernelWorkGroupInfo(kernel, deviceID, - CL_KERNEL_WORK_GROUP_SIZE, - sizeof(groupSize), &groupSize, NULL); - test_error(error, - "Unable to obtain max work group size for device and " - "kernel combo"); + while ((CurrentGroupSize() > 1)) + { + // Re-generate the kernel code with the current group size + if (kernel) clReleaseKernel(kernel); + if (program) clReleaseProgram(program); + programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems) + + FunctionCode() + KernelCode(numDestItems); + programLine = programSource.c_str(); + if (create_single_kernel_helper_with_build_options( + context, &program, &kernel, 1, &programLine, + "test_atomic_kernel", gOldAPI ? "" : nullptr)) + { + return -1; + } + // Get work group size for the new kernel + error = clGetKernelWorkGroupInfo( + kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(groupSize), + &groupSize, NULL); + test_error(error, + "Unable to obtain max work group size for device and " + "kernel combo"); - if (LocalMemory()) - { - cl_ulong usedLocalMemory; - cl_ulong totalLocalMemory; - cl_uint maxWorkGroupSize; + if (LocalMemory()) + { + cl_ulong usedLocalMemory; + cl_ulong totalLocalMemory; + cl_uint maxWorkGroupSize; - error = clGetKernelWorkGroupInfo( - kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE, - sizeof(usedLocalMemory), &usedLocalMemory, NULL); - test_error(error, "clGetKernelWorkGroupInfo failed"); + error = clGetKernelWorkGroupInfo( + kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE, + sizeof(usedLocalMemory), &usedLocalMemory, NULL); + test_error(error, "clGetKernelWorkGroupInfo failed"); - error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, - sizeof(totalLocalMemory), - &totalLocalMemory, NULL); - test_error(error, "clGetDeviceInfo failed"); + error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE, + sizeof(totalLocalMemory), + &totalLocalMemory, NULL); + test_error(error, "clGetDeviceInfo failed"); - // We know that each work-group is going to use typeSize * - // deviceThreadCount bytes of local memory - // so pick the maximum value for deviceThreadCount that uses all - // the local memory. - maxWorkGroupSize = - ((totalLocalMemory - usedLocalMemory) / typeSize); + // We know that each work-group is going to use typeSize * + // deviceThreadCount bytes of local memory + // so pick the maximum value for deviceThreadCount that uses all + // the local memory. + maxWorkGroupSize = + ((totalLocalMemory - usedLocalMemory) / typeSize); - if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize; - } - if (CurrentGroupSize() <= groupSize) - break; - else - CurrentGroupSize(CurrentGroupSize() / 2); - } - if(CurrentGroupSize() > deviceThreadCount) - CurrentGroupSize(deviceThreadCount); - if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI) - deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount); - threadCount = deviceThreadCount+hostThreadCount; - } - if (gDebug) - { - log_info("Program source:\n"); - log_info("%s\n", programLine); - } - if(deviceThreadCount > 0) - log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, CurrentGroupSize()); - if(hostThreadCount > 0) - log_info("\t\t(host threads %u)\n", hostThreadCount); + if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize; + } + if (CurrentGroupSize() <= groupSize) + break; + else + CurrentGroupSize(CurrentGroupSize() / 2); + } + if (CurrentGroupSize() > deviceThreadCount) + CurrentGroupSize(deviceThreadCount); + if (CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI) + deviceThreadCount = + CurrentGroupSize() * CurrentGroupNum(deviceThreadCount); + threadCount = deviceThreadCount + hostThreadCount; + } + if (gDebug) + { + log_info("Program source:\n"); + log_info("%s\n", programLine); + } + if (deviceThreadCount > 0) + log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, + CurrentGroupSize()); + if (hostThreadCount > 0) + log_info("\t\t(host threads %u)\n", hostThreadCount); - refValues.resize(threadCount*NumNonAtomicVariablesPerThread()); + refValues.resize(threadCount * NumNonAtomicVariablesPerThread()); - // Generate ref data if we have a ref generator provided - d = init_genrand(gRandomSeed); - startRefValues.resize(threadCount*NumNonAtomicVariablesPerThread()); - if(GenerateRefs(threadCount, &startRefValues[0], d)) - { - //copy ref values for host threads - memcpy(&refValues[0], &startRefValues[0], sizeof(HostDataType)*threadCount*NumNonAtomicVariablesPerThread()); - } - else - { - startRefValues.resize(0); - } - free_mtdata(d); - d = NULL; - - // If we're given a num_results function, we need to determine how many result objects we need. If - // we don't have it, we assume it's just 1 - // This is final value (exact thread count is known in this place) - numDestItems = NumResults(threadCount, deviceID); - - destItems.resize(numDestItems); - for(cl_uint i = 0; i < numDestItems; i++) - destItems[i] = _startValue; - - // Create main buffer with atomic variables (array size dependent on particular test) - if(UseSVM()) - { - if(gUseHostPtr) - svmAtomicBuffer = (HostAtomicType*)malloc(typeSize * numDestItems); + // Generate ref data if we have a ref generator provided + d = init_genrand(gRandomSeed); + startRefValues.resize(threadCount * NumNonAtomicVariablesPerThread()); + if (GenerateRefs(threadCount, &startRefValues[0], d)) + { + // copy ref values for host threads + memcpy(&refValues[0], &startRefValues[0], + sizeof(HostDataType) * threadCount + * NumNonAtomicVariablesPerThread()); + } else - svmAtomicBuffer = (HostAtomicType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, typeSize * numDestItems, 0); - if(!svmAtomicBuffer) { - log_error("ERROR: clSVMAlloc failed!\n"); - return -1; + startRefValues.resize(0); } - memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems); - streams[0] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, - typeSize * numDestItems, svmAtomicBuffer, NULL); - } - else - { - streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, - typeSize * numDestItems, &destItems[0], NULL); - } - if (!streams[0]) - { - log_error("ERROR: Creating output array failed!\n"); - return -1; - } - // Create buffer for per-thread input/output data - if(UseSVM()) - { - if(gUseHostPtr) - svmDataBuffer = (HostDataType*)malloc(typeSize*threadCount*NumNonAtomicVariablesPerThread()); - else - svmDataBuffer = (HostDataType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS : 0), typeSize*threadCount*NumNonAtomicVariablesPerThread(), 0); - if(!svmDataBuffer) - { - log_error("ERROR: clSVMAlloc failed!\n"); - return -1; - } - if(startRefValues.size()) - memcpy(svmDataBuffer, &startRefValues[0], typeSize*threadCount*NumNonAtomicVariablesPerThread()); - streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, - typeSize * threadCount - * NumNonAtomicVariablesPerThread(), - svmDataBuffer, NULL); - } - else - { - streams[1] = clCreateBuffer( - context, - ((startRefValues.size() ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)), - typeSize * threadCount * NumNonAtomicVariablesPerThread(), - startRefValues.size() ? &startRefValues[0] : 0, NULL); - } - if (!streams[1]) - { - log_error("ERROR: Creating reference array failed!\n"); - return -1; - } - if(deviceThreadCount > 0) - { - cl_uint argInd = 0; - /* Set the arguments */ - error = clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount); - test_error(error, "Unable to set kernel argument"); - error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems), &numDestItems); - test_error(error, "Unable to set indexed kernel argument"); - error = clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]); - test_error(error, "Unable to set indexed kernel arguments"); - error = clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]); - test_error(error, "Unable to set indexed kernel arguments"); - if(LocalMemory()) - { - error = clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL); - test_error(error, "Unable to set indexed local kernel argument"); - } - if(LocalRefValues()) - { - error = clSetKernelArg(kernel, argInd++, LocalRefValues() ? typeSize*CurrentGroupSize()*NumNonAtomicVariablesPerThread() : 1, NULL); - test_error(error, "Unable to set indexed kernel argument"); - } - } - /* Configure host threads */ - std::vector hostThreadContexts(hostThreadCount); - for(unsigned int t = 0; t < hostThreadCount; t++) - { - hostThreadContexts[t].test = this; - hostThreadContexts[t].tid = deviceThreadCount+t; - hostThreadContexts[t].threadCount = threadCount; - hostThreadContexts[t].destMemory = UseSVM() ? svmAtomicBuffer : &destItems[0]; - hostThreadContexts[t].oldValues = UseSVM() ? svmDataBuffer : &refValues[0]; - } + free_mtdata(d); + d = NULL; - if(deviceThreadCount > 0) - { - /* Run the kernel */ - threadNum[0] = deviceThreadCount; - groupSize = CurrentGroupSize(); - error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, &groupSize, 0, NULL, NULL); - test_error(error, "Unable to execute test kernel"); - /* start device threads */ - error = clFlush(queue); - test_error(error, "clFlush failed"); - } + // If we're given a num_results function, we need to determine how many + // result objects we need. If we don't have it, we assume it's just 1 This + // is final value (exact thread count is known in this place) + numDestItems = NumResults(threadCount, deviceID); - /* Start host threads and wait for finish */ - if(hostThreadCount > 0) - ThreadPool_Do(HostThreadFunction, hostThreadCount, &hostThreadContexts[0]); + destItems.resize(numDestItems); + for (cl_uint i = 0; i < numDestItems; i++) destItems[i] = _startValue; - if(UseSVM()) - { - error = clFinish(queue); - test_error(error, "clFinish failed"); - memcpy(&destItems[0], svmAtomicBuffer, typeSize*numDestItems); - memcpy(&refValues[0], svmDataBuffer, typeSize*threadCount*NumNonAtomicVariablesPerThread()); - } - else - { - if(deviceThreadCount > 0) + // Create main buffer with atomic variables (array size dependent on + // particular test) + if (UseSVM()) { - error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL); - test_error(error, "Unable to read result value!"); - error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize * deviceThreadCount*NumNonAtomicVariablesPerThread(), &refValues[0], 0, NULL, NULL); - test_error(error, "Unable to read reference values!"); - } - } - bool dataVerified = false; - // If we have an expectedFn, then we need to generate a final value to compare against. If we don't - // have one, it's because we're comparing ref values only - for(cl_uint i = 0; i < numDestItems; i++) - { - HostDataType expected; - - if(!ExpectedValue(expected, threadCount, startRefValues.size() ? &startRefValues[0] : 0, i)) - break; // no expected value function provided - - if(expected != destItems[i]) - { - std::stringstream logLine; - logLine << "ERROR: Result " << i << " from kernel does not validate! (should be " << expected << ", was " << destItems[i] << ")\n"; - log_error("%s", logLine.str().c_str()); - for(i = 0; i < threadCount; i++) - { - logLine.str(""); - logLine << " --- " << i << " - "; - if(startRefValues.size()) - logLine << startRefValues[i] << " -> " << refValues[i]; + if (gUseHostPtr) + svmAtomicBuffer = (HostAtomicType *)malloc(typeSize * numDestItems); else - logLine << refValues[i]; - logLine << " --- "; - if(i < numDestItems) - logLine << destItems[i]; - logLine << "\n"; - log_info("%s", logLine.str().c_str()); - } - if(!gDebug) - { - log_info("Program source:\n"); - log_info("%s\n", programLine); - } - return -1; - } - dataVerified = true; - } - - bool dataCorrect = false; - /* Use the verify function (if provided) to also check the results */ - if(VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0])) - { - if(!dataCorrect) - { - log_error("ERROR: Reference values did not validate!\n"); - std::stringstream logLine; - for(cl_uint i = 0; i < threadCount; i++) - for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++) - { - logLine.str(""); - logLine << " --- " << i << " - " << refValues[i*NumNonAtomicVariablesPerThread()+j] << " --- "; - if(j == 0 && i < numDestItems) - logLine << destItems[i]; - logLine << "\n"; - log_info("%s", logLine.str().c_str()); - } - if(!gDebug) - { - log_info("Program source:\n"); - log_info("%s\n", programLine); - } - return -1; - } - } - else if(!dataVerified) - { - log_error("ERROR: Test doesn't check total or refs; no values are verified!\n"); - return -1; - } - - if(OldValueCheck() && - !(DeclaredInProgram() && !LocalMemory())) // don't test for programs scope global atomics - // 'old' value has been overwritten by previous clEnqueueNDRangeKernel - { - /* Re-write the starting value */ - for(size_t i = 0; i < numDestItems; i++) - destItems[i] = _startValue; - refValues[0] = 0; - if(deviceThreadCount > 0) - { - error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL); - test_error(error, "Unable to write starting values!"); - - /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */ - threadNum[0] = 1; - error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, threadNum, 0, NULL, NULL); - test_error(error, "Unable to execute test kernel"); - - error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize, &refValues[0], 0, NULL, NULL); - test_error(error, "Unable to read reference values!"); + svmAtomicBuffer = (HostAtomicType *)clSVMAlloc( + context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, + typeSize * numDestItems, 0); + if (!svmAtomicBuffer) + { + log_error("ERROR: clSVMAlloc failed!\n"); + return -1; + } + memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems); + streams[0] = + clCreateBuffer(context, CL_MEM_USE_HOST_PTR, + typeSize * numDestItems, svmAtomicBuffer, NULL); } else { - /* Start host thread */ - HostFunction(0, 1, &destItems[0], &refValues[0]); + streams[0] = + clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + typeSize * numDestItems, &destItems[0], NULL); + } + if (!streams[0]) + { + log_error("ERROR: Creating output array failed!\n"); + return -1; + } + // Create buffer for per-thread input/output data + if (UseSVM()) + { + if (gUseHostPtr) + svmDataBuffer = (HostDataType *)malloc( + typeSize * threadCount * NumNonAtomicVariablesPerThread()); + else + svmDataBuffer = (HostDataType *)clSVMAlloc( + context, + CL_MEM_SVM_FINE_GRAIN_BUFFER + | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS + : 0), + typeSize * threadCount * NumNonAtomicVariablesPerThread(), 0); + if (!svmDataBuffer) + { + log_error("ERROR: clSVMAlloc failed!\n"); + return -1; + } + if (startRefValues.size()) + memcpy(svmDataBuffer, &startRefValues[0], + typeSize * threadCount * NumNonAtomicVariablesPerThread()); + streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, + typeSize * threadCount + * NumNonAtomicVariablesPerThread(), + svmDataBuffer, NULL); + } + else + { + streams[1] = clCreateBuffer( + context, + ((startRefValues.size() ? CL_MEM_COPY_HOST_PTR + : CL_MEM_READ_WRITE)), + typeSize * threadCount * NumNonAtomicVariablesPerThread(), + startRefValues.size() ? &startRefValues[0] : 0, NULL); + } + if (!streams[1]) + { + log_error("ERROR: Creating reference array failed!\n"); + return -1; + } + if (deviceThreadCount > 0) + { + cl_uint argInd = 0; + /* Set the arguments */ + error = + clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount); + test_error(error, "Unable to set kernel argument"); + error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems), + &numDestItems); + test_error(error, "Unable to set indexed kernel argument"); + error = + clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]); + test_error(error, "Unable to set indexed kernel arguments"); + error = + clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]); + test_error(error, "Unable to set indexed kernel arguments"); + if (LocalMemory()) + { + error = + clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL); + test_error(error, "Unable to set indexed local kernel argument"); + } + if (LocalRefValues()) + { + error = + clSetKernelArg(kernel, argInd++, + LocalRefValues() ? typeSize * CurrentGroupSize() + * NumNonAtomicVariablesPerThread() + : 1, + NULL); + test_error(error, "Unable to set indexed kernel argument"); + } + } + /* Configure host threads */ + std::vector hostThreadContexts(hostThreadCount); + for (unsigned int t = 0; t < hostThreadCount; t++) + { + hostThreadContexts[t].test = this; + hostThreadContexts[t].tid = deviceThreadCount + t; + hostThreadContexts[t].threadCount = threadCount; + hostThreadContexts[t].destMemory = + UseSVM() ? svmAtomicBuffer : &destItems[0]; + hostThreadContexts[t].oldValues = + UseSVM() ? svmDataBuffer : &refValues[0]; } - if(refValues[0] != _startValue)//destItems[0]) + if (deviceThreadCount > 0) { - std::stringstream logLine; - logLine << "ERROR: atomic function operated correctly but did NOT return correct 'old' value " - " (should have been " << destItems[0] << ", returned " << refValues[0] << ")!\n"; - log_error("%s", logLine.str().c_str()); - if(!gDebug) - { - log_info("Program source:\n"); - log_info("%s\n", programLine); - } - return -1; + /* Run the kernel */ + threadNum[0] = deviceThreadCount; + groupSize = CurrentGroupSize(); + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, + &groupSize, 0, NULL, NULL); + test_error(error, "Unable to execute test kernel"); + /* start device threads */ + error = clFlush(queue); + test_error(error, "clFlush failed"); + } + + /* Start host threads and wait for finish */ + if (hostThreadCount > 0) + ThreadPool_Do(HostThreadFunction, hostThreadCount, + &hostThreadContexts[0]); + + if (UseSVM()) + { + error = clFinish(queue); + test_error(error, "clFinish failed"); + memcpy(&destItems[0], svmAtomicBuffer, typeSize * numDestItems); + memcpy(&refValues[0], svmDataBuffer, + typeSize * threadCount * NumNonAtomicVariablesPerThread()); } - } - if(UseSVM()) - { - // the buffer object must first be released before the SVM buffer is freed - error = clReleaseMemObject(streams[0]); - streams[0] = 0; - test_error(error, "clReleaseMemObject failed"); - if(gUseHostPtr) - free(svmAtomicBuffer); else - clSVMFree(context, svmAtomicBuffer); - error = clReleaseMemObject(streams[1]); - streams[1] = 0; - test_error(error, "clReleaseMemObject failed"); - if(gUseHostPtr) - free(svmDataBuffer); - else - clSVMFree(context, svmDataBuffer); - } - _passCount++; - return 0; + { + if (deviceThreadCount > 0) + { + error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0, + typeSize * numDestItems, &destItems[0], + 0, NULL, NULL); + test_error(error, "Unable to read result value!"); + error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, + typeSize * deviceThreadCount + * NumNonAtomicVariablesPerThread(), + &refValues[0], 0, NULL, NULL); + test_error(error, "Unable to read reference values!"); + } + } + bool dataVerified = false; + // If we have an expectedFn, then we need to generate a final value to + // compare against. If we don't have one, it's because we're comparing ref + // values only + for (cl_uint i = 0; i < numDestItems; i++) + { + HostDataType expected; + + if (!ExpectedValue(expected, threadCount, + startRefValues.size() ? &startRefValues[0] : 0, i)) + break; // no expected value function provided + + if (expected != destItems[i]) + { + std::stringstream logLine; + logLine << "ERROR: Result " << i + << " from kernel does not validate! (should be " << expected + << ", was " << destItems[i] << ")\n"; + log_error("%s", logLine.str().c_str()); + for (i = 0; i < threadCount; i++) + { + logLine.str(""); + logLine << " --- " << i << " - "; + if (startRefValues.size()) + logLine << startRefValues[i] << " -> " << refValues[i]; + else + logLine << refValues[i]; + logLine << " --- "; + if (i < numDestItems) logLine << destItems[i]; + logLine << "\n"; + log_info("%s", logLine.str().c_str()); + } + if (!gDebug) + { + log_info("Program source:\n"); + log_info("%s\n", programLine); + } + return -1; + } + dataVerified = true; + } + + bool dataCorrect = false; + /* Use the verify function (if provided) to also check the results */ + if (VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0])) + { + if (!dataCorrect) + { + log_error("ERROR: Reference values did not validate!\n"); + std::stringstream logLine; + for (cl_uint i = 0; i < threadCount; i++) + for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++) + { + logLine.str(""); + logLine + << " --- " << i << " - " + << refValues[i * NumNonAtomicVariablesPerThread() + j] + << " --- "; + if (j == 0 && i < numDestItems) logLine << destItems[i]; + logLine << "\n"; + log_info("%s", logLine.str().c_str()); + } + if (!gDebug) + { + log_info("Program source:\n"); + log_info("%s\n", programLine); + } + return -1; + } + } + else if (!dataVerified) + { + log_error("ERROR: Test doesn't check total or refs; no values are " + "verified!\n"); + return -1; + } + + if (OldValueCheck() + && !(DeclaredInProgram() + && !LocalMemory())) // don't test for programs scope global atomics + // 'old' value has been overwritten by previous + // clEnqueueNDRangeKernel + { + /* Re-write the starting value */ + for (size_t i = 0; i < numDestItems; i++) destItems[i] = _startValue; + refValues[0] = 0; + if (deviceThreadCount > 0) + { + error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, + typeSize * numDestItems, &destItems[0], + 0, NULL, NULL); + test_error(error, "Unable to write starting values!"); + + /* Run the kernel once for a single thread, so we can verify that + * the returned value is the original one */ + threadNum[0] = 1; + error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, + threadNum, 0, NULL, NULL); + test_error(error, "Unable to execute test kernel"); + + error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize, + &refValues[0], 0, NULL, NULL); + test_error(error, "Unable to read reference values!"); + } + else + { + /* Start host thread */ + HostFunction(0, 1, &destItems[0], &refValues[0]); + } + + if (refValues[0] != _startValue) // destItems[0]) + { + std::stringstream logLine; + logLine << "ERROR: atomic function operated correctly but did NOT " + "return correct 'old' value " + " (should have been " + << destItems[0] << ", returned " << refValues[0] << ")!\n"; + log_error("%s", logLine.str().c_str()); + if (!gDebug) + { + log_info("Program source:\n"); + log_info("%s\n", programLine); + } + return -1; + } + } + if (UseSVM()) + { + // the buffer object must first be released before the SVM buffer is + // freed. The Wrapper Class method reset() will do that + streams[0].reset(); + if (gUseHostPtr) + free(svmAtomicBuffer); + else + clSVMFree(context, svmAtomicBuffer); + streams[1].reset(); + if (gUseHostPtr) + free(svmDataBuffer); + else + clSVMFree(context, svmDataBuffer); + } + _passCount++; + return 0; } #endif //_COMMON_H_