Added support for cl_ext_float_atomics in CBasicTestFetchAddSpecialFloats with atomic_half (#2386)

Related to #2142, according to the work plan, extending
CBasicTestFetchAddSpecialFloats with support for atomic_half.
This commit is contained in:
Marcin Hajder
2026-01-27 17:46:57 +01:00
committed by GitHub
parent 584f27afd4
commit 6f38c799c1
3 changed files with 266 additions and 18 deletions

View File

@@ -25,6 +25,7 @@
#include "CL/cl_half.h" #include "CL/cl_half.h"
#include <iomanip> #include <iomanip>
#include <limits>
#include <sstream> #include <sstream>
#include <vector> #include <vector>
@@ -75,6 +76,10 @@ extern int
gMaxDeviceThreads; // maximum number of threads executed on OCL device gMaxDeviceThreads; // maximum number of threads executed on OCL device
extern cl_device_atomic_capabilities gAtomicMemCap, extern cl_device_atomic_capabilities gAtomicMemCap,
gAtomicFenceCap; // atomic memory and fence capabilities for this device gAtomicFenceCap; // atomic memory and fence capabilities for this device
extern cl_half_rounding_mode gHalfRoundingMode;
extern bool gFloatAtomicsSupported;
extern cl_device_fp_atomic_capabilities_ext gHalfAtomicCaps;
extern cl_device_fp_config gHalfFPConfig;
extern cl_half_rounding_mode gHalfRoundingMode; extern cl_half_rounding_mode gHalfRoundingMode;
extern bool gFloatAtomicsSupported; extern bool gFloatAtomicsSupported;
@@ -154,12 +159,12 @@ public:
return 0; return 0;
} }
CBasicTest(TExplicitAtomicType dataType, bool useSVM) CBasicTest(TExplicitAtomicType dataType, bool useSVM)
: CTest(), _maxDeviceThreads(MAX_DEVICE_THREADS), _dataType(dataType), : CTest(), _dataType(dataType), _useSVM(useSVM), _startValue(255),
_useSVM(useSVM), _startValue(255), _localMemory(false), _localMemory(false), _declaredInProgram(false),
_declaredInProgram(false), _usedInFunction(false), _usedInFunction(false), _genericAddrSpace(false),
_genericAddrSpace(false), _oldValueCheck(true), _oldValueCheck(true), _localRefValues(false), _maxGroupSize(0),
_localRefValues(false), _maxGroupSize(0), _passCount(0), _passCount(0), _iterations(gInternalIterations),
_iterations(gInternalIterations) _maxDeviceThreads(MAX_DEVICE_THREADS), _deviceThreads(0)
{} {}
virtual ~CBasicTest() virtual ~CBasicTest()
{ {
@@ -240,12 +245,12 @@ public:
cl_command_queue queue) cl_command_queue queue)
{ {
int error = 0; int error = 0;
DeclaredInProgram(false); SetDeclaredInProgram(false);
EXECUTE_TEST(error, EXECUTE_TEST(error,
ExecuteForEachPointerType(deviceID, context, queue)); ExecuteForEachPointerType(deviceID, context, queue));
if (!UseSVM()) if (!UseSVM())
{ {
DeclaredInProgram(true); SetDeclaredInProgram(true);
EXECUTE_TEST(error, EXECUTE_TEST(error,
ExecuteForEachPointerType(deviceID, context, queue)); ExecuteForEachPointerType(deviceID, context, queue));
} }
@@ -256,13 +261,13 @@ public:
cl_command_queue queue) cl_command_queue queue)
{ {
int error = 0; int error = 0;
if (_maxDeviceThreads > 0 && !UseSVM()) if (_deviceThreads > 0 && !UseSVM())
{ {
SetLocalMemory(true); SetLocalMemory(true);
EXECUTE_TEST( EXECUTE_TEST(
error, ExecuteForEachDeclarationType(deviceID, context, queue)); error, ExecuteForEachDeclarationType(deviceID, context, queue));
} }
if (_maxDeviceThreads + MaxHostThreads() > 0) if (_deviceThreads + MaxHostThreads() > 0)
{ {
SetLocalMemory(false); SetLocalMemory(false);
EXECUTE_TEST( EXECUTE_TEST(
@@ -271,7 +276,7 @@ public:
return error; return error;
} }
virtual int Execute(cl_device_id deviceID, cl_context context, virtual int Execute(cl_device_id deviceID, cl_context context,
cl_command_queue queue, int num_elements) cl_command_queue queue, int num_elements) override
{ {
if (sizeof(HostAtomicType) != DataType().Size(deviceID)) if (sizeof(HostAtomicType) != DataType().Size(deviceID))
{ {
@@ -311,7 +316,12 @@ public:
if (UseSVM()) return 0; if (UseSVM()) return 0;
_maxDeviceThreads = 0; _maxDeviceThreads = 0;
} }
if (_maxDeviceThreads + MaxHostThreads() == 0) return 0;
_deviceThreads = (num_elements > 0)
? std::min(cl_uint(num_elements), _maxDeviceThreads)
: _maxDeviceThreads;
if (_deviceThreads + MaxHostThreads() == 0) return 0;
return ExecuteForEachParameterSet(deviceID, context, queue); return ExecuteForEachParameterSet(deviceID, context, queue);
} }
virtual void HostFunction(cl_uint tid, cl_uint threadCount, virtual void HostFunction(cl_uint tid, cl_uint threadCount,
@@ -324,7 +334,7 @@ public:
{ {
return AtomicTypeExtendedInfo<HostDataType>(_dataType); return AtomicTypeExtendedInfo<HostDataType>(_dataType);
} }
cl_uint _maxDeviceThreads;
virtual cl_uint MaxHostThreads() virtual cl_uint MaxHostThreads()
{ {
if (UseSVM() || gHost) if (UseSVM() || gHost)
@@ -421,7 +431,7 @@ public:
HostDataType StartValue() { return _startValue; } HostDataType StartValue() { return _startValue; }
void SetLocalMemory(bool local) { _localMemory = local; } void SetLocalMemory(bool local) { _localMemory = local; }
bool LocalMemory() { return _localMemory; } bool LocalMemory() { return _localMemory; }
void DeclaredInProgram(bool declaredInProgram) void SetDeclaredInProgram(bool declaredInProgram)
{ {
_declaredInProgram = declaredInProgram; _declaredInProgram = declaredInProgram;
} }
@@ -478,6 +488,8 @@ private:
cl_uint _currentGroupSize; cl_uint _currentGroupSize;
cl_uint _passCount; cl_uint _passCount;
const cl_int _iterations; const cl_int _iterations;
cl_uint _maxDeviceThreads;
cl_uint _deviceThreads;
}; };
template <typename HostAtomicType, typename HostDataType> template <typename HostAtomicType, typename HostDataType>
@@ -912,9 +924,15 @@ CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
+ ss.str() + "] = {\n"; + ss.str() + "] = {\n";
ss.str(""); ss.str("");
if constexpr (is_host_fp_v<HostDataType>) if constexpr (std::is_same_v<HostDataType, HOST_FLOAT>)
ss << std::hexfloat {
<< _startValue; // use hex format for accurate representation ss << std::setprecision(10) << _startValue;
}
else if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
{
ss << std::setprecision(std::numeric_limits<float>::max_digits10)
<< cl_half_to_float(_startValue);
}
else else
ss << _startValue; ss << _startValue;
@@ -1151,7 +1169,7 @@ int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
MTdata d; MTdata d;
size_t typeSize = DataType().Size(deviceID); size_t typeSize = DataType().Size(deviceID);
deviceThreadCount = _maxDeviceThreads; deviceThreadCount = _deviceThreads;
hostThreadCount = MaxHostThreads(); hostThreadCount = MaxHostThreads();
threadCount = deviceThreadCount + hostThreadCount; threadCount = deviceThreadCount + hostThreadCount;

View File

@@ -36,6 +36,7 @@ bool gFloatAtomicsSupported = false;
cl_device_fp_atomic_capabilities_ext gHalfAtomicCaps = 0; cl_device_fp_atomic_capabilities_ext gHalfAtomicCaps = 0;
cl_device_fp_atomic_capabilities_ext gDoubleAtomicCaps = 0; cl_device_fp_atomic_capabilities_ext gDoubleAtomicCaps = 0;
cl_device_fp_atomic_capabilities_ext gFloatAtomicCaps = 0; cl_device_fp_atomic_capabilities_ext gFloatAtomicCaps = 0;
cl_device_fp_config gHalfFPConfig = 0;
test_status InitCL(cl_device_id device) { test_status InitCL(cl_device_id device) {
auto version = get_device_cl_version(device); auto version = get_device_cl_version(device);
@@ -169,6 +170,11 @@ test_status InitCL(cl_device_id device) {
log_error("Error while acquiring half rounding mode\n"); log_error("Error while acquiring half rounding mode\n");
return TEST_FAIL; return TEST_FAIL;
} }
error =
clGetDeviceInfo(device, CL_DEVICE_HALF_FP_CONFIG,
sizeof(gHalfFPConfig), &gHalfFPConfig, NULL);
test_error_ret(error, "clGetDeviceInfo failed!", TEST_FAIL);
} }
} }

View File

@@ -1418,6 +1418,219 @@ public:
} }
}; };
template <typename HostAtomicType, typename HostDataType>
// Exercises atomic_fetch_add on fp16 "special" values (infinities, zeros,
// denormals, extrema, ...) in an all-to-all pattern: every special value is
// added to every other one, and the device result is compared against a host
// reference computed with cl_half_from_float/cl_half_to_float.
class CBasicTestFetchAddSpecialFloats
    : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
    // Host-side reference operands, cached so repeated invocations with the
    // same or smaller thread count reuse one allocation.
    std::vector<HostDataType> ref_vals;

public:
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
    using CBasicTestMemOrderScope<HostAtomicType,
                                  HostDataType>::MemoryOrderScopeStr;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
    using CBasicTestMemOrderScope<HostAtomicType,
                                  HostDataType>::DeclaredInProgram;
    CBasicTestFetchAddSpecialFloats(TExplicitAtomicType dataType, bool useSVM)
        : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
                                                                useSVM)
    {
        if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
        {
            // StartValue is used as an index divisor in the following test
            // logic. It is set to the number of special values, which allows
            // threads to be mapped deterministically onto the input data array.
            // This enables repeated add operations arranged so that every
            // special value is added to every other one ("all-to-all").
            auto spec_vals = GetSpecialValues();
            StartValue(cl_half_from_float(spec_vals.size(), gHalfRoundingMode));
            // The kernel overwrites destMemory before the fetch_add, so the
            // generic old-value verification does not apply here.
            CBasicTestMemOrderScope<HostAtomicType,
                                    HostDataType>::OldValueCheck(false);
        }
    }
    // Returns the shared table of fp16 special-value bit patterns. Denormal
    // entries are appended only when the device supports CL_FP_DENORM.
    // Lazily initialized on first call; not thread-safe during that first
    // call, matching the original single-threaded test setup.
    static std::vector<HostDataType> &GetSpecialValues()
    {
        static std::vector<HostDataType> special_values;
        if (special_values.empty())
        {
            if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
            {
                special_values = {
                    0xffff, 0x0000, 0x7c00, /*INFINITY*/
                    0xfc00, /*-INFINITY*/
                    0x8000, /*-0*/
                    0x7bff, /*HALF_MAX*/
                    0x0400, /*HALF_MIN*/
                    0x3c00, /* 1 */
                    0xbc00, /* -1 */
                    0x3555, /*nearest value to 1/3*/
                    0x3bff, /*largest number less than one*/
                    0xc000, /* -2 */
                    0xfbff, /* -HALF_MAX */
                    0x8400, /* -HALF_MIN */
                    0x4248, /* M_PI_H */
                    0xc248, /* -M_PI_H */
                    0xbbff, /* Largest negative fraction */
                };
                if (0 != (CL_FP_DENORM & gHalfFPConfig))
                {
                    special_values.push_back(0x0001 /* Smallest denormal */);
                    special_values.push_back(0x03ff /* Largest denormal */);
                }
            }
        }
        return special_values;
    }
    // Fills startRefValues with the special-value table repeated until
    // threadCount entries are produced. Returns true when reference values
    // were generated (half path), false otherwise.
    bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
                      MTdata d) override
    {
        if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
        {
            if (threadCount > ref_vals.size())
            {
                ref_vals.assign(threadCount, 0);
                auto spec_vals = GetSpecialValues();
                cl_uint total_cnt = 0;
                while (total_cnt < threadCount)
                {
                    // Copy whole table blocks; the final block may be partial.
                    const size_t block_cnt = std::min<size_t>(
                        threadCount - total_cnt, spec_vals.size());
                    memcpy(&ref_vals.at(total_cnt), spec_vals.data(),
                           sizeof(HostDataType) * block_cnt);
                    total_cnt += static_cast<cl_uint>(block_cnt);
                }
            }
            memcpy(startRefValues, ref_vals.data(),
                   sizeof(HostDataType) * threadCount);
            return true;
        }
        return false;
    }
    std::string ProgramCore() override
    {
        std::string memoryOrderScope = MemoryOrderScopeStr();
        std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
        if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
        {
            // The start_value variable (set by StartValue) is used
            // as a divisor of the thread index when selecting the operand for
            // atomic_fetch_add. This groups threads into blocks corresponding
            // to the number of special values and implements an "all-to-all"
            // addition pattern. As a result, each destination element is
            // updated using different combinations of input values, enabling
            // consistent comparison between host and device execution.
            return std::string(DataType().AddSubOperandTypeName())
                + " start_value = atomic_load_explicit(destMemory+tid, "
                  "memory_order_relaxed, memory_scope_work_group);\n"
                  "  atomic_store_explicit(destMemory+tid, oldValues[tid], "
                  "memory_order_relaxed, memory_scope_work_group);\n"
                  "  atomic_fetch_add"
                + postfix + "(&destMemory[tid], ("
                + DataType().AddSubOperandTypeName()
                + ")oldValues[tid/(int)start_value]" + memoryOrderScope
                + ");\n";
        }
        // Fix: the original fell off the end of a value-returning function
        // for non-HOST_HALF instantiations (undefined behavior). Only the
        // half specialization is instantiated today, so an empty kernel core
        // is a safe, behavior-preserving fallback.
        return std::string();
    }
    // Host mirror of the kernel: store the reference value, then add the
    // block-selected operand, so host and device perform identical updates.
    void HostFunction(cl_uint tid, cl_uint threadCount,
                      volatile HostAtomicType *destMemory,
                      HostDataType *oldValues) override
    {
        if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
        {
            auto spec_vals = GetSpecialValues();
            host_atomic_store(&destMemory[tid], (HostDataType)oldValues[tid],
                              MEMORY_ORDER_SEQ_CST);
            host_atomic_fetch_add(
                &destMemory[tid],
                (HostDataType)oldValues[tid / spec_vals.size()], MemoryOrder());
        }
    }
    // Computes the expected fp16 result by adding in float precision and
    // rounding back with the device's half rounding mode.
    bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
                       HostDataType *startRefValues,
                       cl_uint whichDestValue) override
    {
        expected = StartValue();
        if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
        {
            auto spec_vals = GetSpecialValues();
            expected = cl_half_from_float(
                cl_half_to_float(startRefValues[whichDestValue])
                    + cl_half_to_float(
                        startRefValues[whichDestValue / spec_vals.size()]),
                gHalfRoundingMode);
        }
        return true;
    }
    bool IsTestNotAsExpected(const HostDataType &expected,
                             const std::vector<HostAtomicType> &testValues,
                             cl_uint whichDestValue) override
    {
        if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
        {
            // Bitwise comparison of half patterns.
            // NOTE(review): this assumes the device produces the same NaN
            // encoding as cl_half_from_float for INF + (-INF) style sums —
            // confirm against the cl_ext_float_atomics requirements.
            return static_cast<cl_half>(expected) != testValues[whichDestValue];
        }
        return CBasicTestMemOrderScope<
            HostAtomicType, HostDataType>::IsTestNotAsExpected(expected,
                                                               testValues,
                                                               whichDestValue);
    }
    int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
                          cl_command_queue queue) override
    {
        if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
        {
            // Program-scope atomics are not applicable to this test.
            if (DeclaredInProgram()) return 0; // skip test - not applicable
            // Skip when the device lacks the fp16 atomic-add capability for
            // the memory region being exercised.
            if (LocalMemory()
                && (gHalfAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT) == 0)
                return 0; // skip test - not applicable
            if (!LocalMemory()
                && (gHalfAtomicCaps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT) == 0)
                return 0;
            // Fix: the original additionally gated on CL_FP_INF_NAN inside a
            // `!LocalMemory() && DeclaredInProgram()` branch, which is
            // unreachable after the DeclaredInProgram early return above.
            // The dead code was removed with no behavior change.
        }
        return CBasicTestMemOrderScope<
            HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context,
                                                             queue);
    }
    // One result slot per thread: each thread owns its own destination
    // element in the all-to-all pattern.
    cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) override
    {
        if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
        {
            return threadCount;
        }
        return CBasicTestMemOrderScope<HostAtomicType,
                                       HostDataType>::NumResults(threadCount,
                                                                 deviceID);
    }
};
static int test_atomic_fetch_add_generic(cl_device_id deviceID, static int test_atomic_fetch_add_generic(cl_device_id deviceID,
cl_context context, cl_context context,
cl_command_queue queue, cl_command_queue queue,
@@ -1443,6 +1656,17 @@ static int test_atomic_fetch_add_generic(cl_device_id deviceID,
if (gFloatAtomicsSupported) if (gFloatAtomicsSupported)
{ {
auto spec_vals_halfs =
CBasicTestFetchAddSpecialFloats<HOST_ATOMIC_HALF,
HOST_HALF>::GetSpecialValues();
CBasicTestFetchAddSpecialFloats<HOST_ATOMIC_HALF, HOST_HALF>
test_spec_half(TYPE_ATOMIC_HALF, useSVM);
EXECUTE_TEST(error,
test_spec_half.Execute(deviceID, context, queue,
spec_vals_halfs.size()
* spec_vals_halfs.size()));
CBasicTestFetchAdd<HOST_ATOMIC_HALF, HOST_HALF> test_half( CBasicTestFetchAdd<HOST_ATOMIC_HALF, HOST_HALF> test_half(
TYPE_ATOMIC_HALF, useSVM); TYPE_ATOMIC_HALF, useSVM);
EXECUTE_TEST(error, EXECUTE_TEST(error,