From 6f38c799c15852feaaae879c1113083dc767025f Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Tue, 27 Jan 2026 17:46:57 +0100 Subject: [PATCH] Added support for cl_ext_float_atomics in CBasicTestFetchAddSpecialFloats with atomic_half (#2386) Related to #2142, according to the work plan, extending CBasicTestFetchAddSpecialFloats with support for atomic_half. --- test_conformance/c11_atomics/common.h | 54 +++-- test_conformance/c11_atomics/main.cpp | 6 + test_conformance/c11_atomics/test_atomics.cpp | 224 ++++++++++++++++++ 3 files changed, 266 insertions(+), 18 deletions(-) diff --git a/test_conformance/c11_atomics/common.h b/test_conformance/c11_atomics/common.h index c9494c6c..5bf4cd0b 100644 --- a/test_conformance/c11_atomics/common.h +++ b/test_conformance/c11_atomics/common.h @@ -25,6 +25,7 @@ #include "CL/cl_half.h" #include +#include #include #include @@ -75,6 +76,10 @@ extern int gMaxDeviceThreads; // maximum number of threads executed on OCL device extern cl_device_atomic_capabilities gAtomicMemCap, gAtomicFenceCap; // atomic memory and fence capabilities for this device +extern cl_half_rounding_mode gHalfRoundingMode; +extern bool gFloatAtomicsSupported; +extern cl_device_fp_atomic_capabilities_ext gHalfAtomicCaps; +extern cl_device_fp_config gHalfFPConfig; extern cl_half_rounding_mode gHalfRoundingMode; extern bool gFloatAtomicsSupported; @@ -154,12 +159,12 @@ public: return 0; } CBasicTest(TExplicitAtomicType dataType, bool useSVM) - : CTest(), _maxDeviceThreads(MAX_DEVICE_THREADS), _dataType(dataType), - _useSVM(useSVM), _startValue(255), _localMemory(false), - _declaredInProgram(false), _usedInFunction(false), - _genericAddrSpace(false), _oldValueCheck(true), - _localRefValues(false), _maxGroupSize(0), _passCount(0), - _iterations(gInternalIterations) + : CTest(), _dataType(dataType), _useSVM(useSVM), _startValue(255), + _localMemory(false), _declaredInProgram(false), + _usedInFunction(false), _genericAddrSpace(false), + _oldValueCheck(true), _localRefValues(false), _maxGroupSize(0), + _passCount(0), _iterations(gInternalIterations), + _maxDeviceThreads(MAX_DEVICE_THREADS), _deviceThreads(0) {} virtual ~CBasicTest() { @@ -240,12 +245,12 @@ public: cl_command_queue queue) { int error = 0; - DeclaredInProgram(false); + SetDeclaredInProgram(false); EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue)); if (!UseSVM()) { - DeclaredInProgram(true); + SetDeclaredInProgram(true); EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue)); } @@ -256,13 +261,13 @@ public: cl_command_queue queue) { int error = 0; - if (_maxDeviceThreads > 0 && !UseSVM()) + if (_deviceThreads > 0 && !UseSVM()) { SetLocalMemory(true); EXECUTE_TEST( error, ExecuteForEachDeclarationType(deviceID, context, queue)); } - if (_maxDeviceThreads + MaxHostThreads() > 0) + if (_deviceThreads + MaxHostThreads() > 0) { SetLocalMemory(false); EXECUTE_TEST( @@ -271,7 +276,7 @@ public: return error; } virtual int Execute(cl_device_id deviceID, cl_context context, - cl_command_queue queue, int num_elements) + cl_command_queue queue, int num_elements) override { if (sizeof(HostAtomicType) != DataType().Size(deviceID)) { @@ -311,7 +316,12 @@ public: if (UseSVM()) return 0; _maxDeviceThreads = 0; } - if (_maxDeviceThreads + MaxHostThreads() == 0) return 0; + + _deviceThreads = (num_elements > 0) + ? std::min(cl_uint(num_elements), _maxDeviceThreads) + : _maxDeviceThreads; + + if (_deviceThreads + MaxHostThreads() == 0) return 0; return ExecuteForEachParameterSet(deviceID, context, queue); } virtual void HostFunction(cl_uint tid, cl_uint threadCount, @@ -324,7 +334,7 @@ public: { return AtomicTypeExtendedInfo(_dataType); } - cl_uint _maxDeviceThreads; + virtual cl_uint MaxHostThreads() { if (UseSVM() || gHost) @@ -421,7 +431,7 @@ public: HostDataType StartValue() { return _startValue; } void SetLocalMemory(bool local) { _localMemory = local; } bool LocalMemory() { return _localMemory; } - void DeclaredInProgram(bool declaredInProgram) + void SetDeclaredInProgram(bool declaredInProgram) { _declaredInProgram = declaredInProgram; } @@ -478,6 +488,8 @@ private: cl_uint _currentGroupSize; cl_uint _passCount; const cl_int _iterations; + cl_uint _maxDeviceThreads; + cl_uint _deviceThreads; }; template @@ -912,9 +924,15 @@ CBasicTest::ProgramHeader(cl_uint maxNumDestItems) + ss.str() + "] = {\n"; ss.str(""); - if constexpr (is_host_fp_v) - ss << std::hexfloat - << _startValue; // use hex format for accurate representation + if constexpr (std::is_same_v) + { + ss << std::setprecision(10) << _startValue; + } + else if constexpr (std::is_same_v) + { + ss << std::setprecision(std::numeric_limits::max_digits10) + << cl_half_to_float(_startValue); + } else ss << _startValue; @@ -1151,7 +1169,7 @@ int CBasicTest::ExecuteSingleTest( MTdata d; size_t typeSize = DataType().Size(deviceID); - deviceThreadCount = _maxDeviceThreads; + deviceThreadCount = _deviceThreads; hostThreadCount = MaxHostThreads(); threadCount = deviceThreadCount + hostThreadCount; diff --git a/test_conformance/c11_atomics/main.cpp b/test_conformance/c11_atomics/main.cpp index 78291f06..1d20bb47 100644 --- a/test_conformance/c11_atomics/main.cpp +++ b/test_conformance/c11_atomics/main.cpp @@ -36,6 +36,7 @@ bool gFloatAtomicsSupported = false; cl_device_fp_atomic_capabilities_ext gHalfAtomicCaps = 0; cl_device_fp_atomic_capabilities_ext gDoubleAtomicCaps = 0; cl_device_fp_atomic_capabilities_ext gFloatAtomicCaps = 0; +cl_device_fp_config gHalfFPConfig = 0; test_status InitCL(cl_device_id device) { auto version = get_device_cl_version(device); @@ -169,6 +170,11 @@ test_status InitCL(cl_device_id device) { log_error("Error while acquiring half rounding mode\n"); return TEST_FAIL; } + + error = + clGetDeviceInfo(device, CL_DEVICE_HALF_FP_CONFIG, + sizeof(gHalfFPConfig), &gHalfFPConfig, NULL); + test_error_ret(error, "clGetDeviceInfo failed!", TEST_FAIL); } } diff --git a/test_conformance/c11_atomics/test_atomics.cpp b/test_conformance/c11_atomics/test_atomics.cpp index 18c86a3b..135574d8 100644 --- a/test_conformance/c11_atomics/test_atomics.cpp +++ b/test_conformance/c11_atomics/test_atomics.cpp @@ -1418,6 +1418,219 @@ public: } }; +template +class CBasicTestFetchAddSpecialFloats + : public CBasicTestMemOrderScope { + + std::vector ref_vals; + +public: + using CBasicTestMemOrderScope::MemoryOrder; + using CBasicTestMemOrderScope::MemoryOrderScopeStr; + using CBasicTestMemOrderScope::StartValue; + using CBasicTestMemOrderScope::DataType; + using CBasicTestMemOrderScope::LocalMemory; + using CBasicTestMemOrderScope::DeclaredInProgram; + CBasicTestFetchAddSpecialFloats(TExplicitAtomicType dataType, bool useSVM) + : CBasicTestMemOrderScope(dataType, + useSVM) + { + if constexpr (std::is_same_v) + { + // StartValue is used as an index divisor in the following test + // logic. It is set to the number of special values, which allows + // threads to be mapped deterministically onto the input data array. + // This enables repeated add operations arranged so that every + // special value is added to every other one (“all-to-all”). + + auto spec_vals = GetSpecialValues(); + StartValue(cl_half_from_float(spec_vals.size(), gHalfRoundingMode)); + CBasicTestMemOrderScope::OldValueCheck(false); + } + } + + static std::vector &GetSpecialValues() + { + static std::vector special_values; + if (special_values.empty()) + { + if constexpr (std::is_same_v) + { + special_values = { + 0xffff, 0x0000, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ + }; + + if (0 != (CL_FP_DENORM & gHalfFPConfig)) + { + special_values.push_back(0x0001 /* Smallest denormal */); + special_values.push_back(0x03ff /* Largest denormal */); + } + } + } + + return special_values; + } + + bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, + MTdata d) override + { + if constexpr (std::is_same_v) + { + if (threadCount > ref_vals.size()) + { + ref_vals.assign(threadCount, 0); + auto spec_vals = GetSpecialValues(); + + cl_uint total_cnt = 0; + while (total_cnt < threadCount) + { + cl_uint block_cnt = + std::min((cl_int)(threadCount - total_cnt), + (cl_int)spec_vals.size()); + memcpy(&ref_vals.at(total_cnt), spec_vals.data(), + sizeof(HostDataType) * block_cnt); + total_cnt += block_cnt; + } + } + + memcpy(startRefValues, ref_vals.data(), + sizeof(HostDataType) * threadCount); + + return true; + } + return false; + } + std::string ProgramCore() override + { + std::string memoryOrderScope = MemoryOrderScopeStr(); + std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); + + if constexpr (std::is_same_v) + { + // The start_value variable (set by StartValue) is used + // as a divisor of the thread index when selecting the operand for + // atomic_fetch_add. This groups threads into blocks corresponding + // to the number of special values and implements an “all-to-all” + // addition pattern. As a result, each destination element is + // updated using different combinations of input values, enabling + // consistent comparison between host and device execution. + + return std::string(DataType().AddSubOperandTypeName()) + + " start_value = atomic_load_explicit(destMemory+tid, " + "memory_order_relaxed, memory_scope_work_group);\n" + " atomic_store_explicit(destMemory+tid, oldValues[tid], " + "memory_order_relaxed, memory_scope_work_group);\n" + " atomic_fetch_add" + + postfix + "(&destMemory[tid], (" + + DataType().AddSubOperandTypeName() + + ")oldValues[tid/(int)start_value]" + memoryOrderScope + + ");\n"; + } + } + void HostFunction(cl_uint tid, cl_uint threadCount, + volatile HostAtomicType *destMemory, + HostDataType *oldValues) override + { + if constexpr (std::is_same_v) + { + auto spec_vals = GetSpecialValues(); + host_atomic_store(&destMemory[tid], (HostDataType)oldValues[tid], + MEMORY_ORDER_SEQ_CST); + host_atomic_fetch_add( + &destMemory[tid], + (HostDataType)oldValues[tid / spec_vals.size()], MemoryOrder()); + } + } + bool ExpectedValue(HostDataType &expected, cl_uint threadCount, + HostDataType *startRefValues, + cl_uint whichDestValue) override + { + expected = StartValue(); + if constexpr (std::is_same_v) + { + auto spec_vals = GetSpecialValues(); + expected = cl_half_from_float( + cl_half_to_float(startRefValues[whichDestValue]) + + cl_half_to_float( + startRefValues[whichDestValue / spec_vals.size()]), + gHalfRoundingMode); + } + + return true; + } + + bool IsTestNotAsExpected(const HostDataType &expected, + const std::vector &testValues, + cl_uint whichDestValue) override + { + + if constexpr (std::is_same_v) + { + return static_cast(expected) != testValues[whichDestValue]; + } + + return CBasicTestMemOrderScope< + HostAtomicType, HostDataType>::IsTestNotAsExpected(expected, + testValues, + whichDestValue); + } + + int ExecuteSingleTest(cl_device_id deviceID, cl_context context, + cl_command_queue queue) override + { + if constexpr (std::is_same_v) + { + if (DeclaredInProgram()) return 0; // skip test - not applicable + + if (LocalMemory() + && (gHalfAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT) == 0) + return 0; // skip test - not applicable + + if (!LocalMemory() + && (gHalfAtomicCaps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT) == 0) + return 0; + + if (!CBasicTestMemOrderScope::LocalMemory() + && CBasicTestMemOrderScope::DeclaredInProgram()) + { + if ((gHalfFPConfig & CL_FP_INF_NAN) == 0) return 0; + } + } + return CBasicTestMemOrderScope< + HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, + queue); + } + cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) override + { + if constexpr (std::is_same_v) + { + return threadCount; + } + return CBasicTestMemOrderScope::NumResults(threadCount, + deviceID); + } +}; + static int test_atomic_fetch_add_generic(cl_device_id deviceID, cl_context context, cl_command_queue queue, @@ -1443,6 +1656,17 @@ static int test_atomic_fetch_add_generic(cl_device_id deviceID, if (gFloatAtomicsSupported) { + auto spec_vals_halfs = + CBasicTestFetchAddSpecialFloats::GetSpecialValues(); + + CBasicTestFetchAddSpecialFloats + test_spec_half(TYPE_ATOMIC_HALF, useSVM); + EXECUTE_TEST(error, + test_spec_half.Execute(deviceID, context, queue, + spec_vals_halfs.size() + * spec_vals_halfs.size())); + CBasicTestFetchAdd test_half( TYPE_ATOMIC_HALF, useSVM); EXECUTE_TEST(error,