Added support for cl_ext_float_atomics in CBasicTestFetchSub with atomic_half (#2366)

Related to #2142: following the work plan, this extends
CBasicTestFetchSub with support for atomic_half.

I wasn't able to test this PR fully because the
CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT/CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT
capabilities for atomic_half were not available to me. I appreciate the reviewers' attention, thanks.
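
For context, the new skip logic keys off the device's cl_ext_float_atomics capabilities (gHalfAtomicCaps in the harness). Below is a minimal sketch of the underlying query; the helper name and error handling are illustrative only, not part of this PR:

```cpp
#include <CL/cl_ext.h>

// Query the half-precision FP atomic capabilities defined by
// cl_ext_float_atomics and check the add/sub bit for the requested address
// space (the ADD capability covers atomic_fetch_add and atomic_fetch_sub).
static bool half_fp_atomic_add_sub_supported(cl_device_id device,
                                             bool local_memory)
{
    cl_device_fp_atomic_capabilities_ext caps = 0;
    if (clGetDeviceInfo(device, CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT,
                        sizeof(caps), &caps, NULL)
        != CL_SUCCESS)
        return false;
    return (caps
            & (local_memory ? CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT
                            : CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT))
        != 0;
}
```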
Marcin Hajder, 2025-09-09 17:40:50 +02:00 (committed by GitHub)
parent 913e6e4388, commit 1aeca1360b
2 changed files with 245 additions and 27 deletions


@@ -19,6 +19,8 @@
#include "harness/testHarness.h"
#include <mutex>
#include "CL/cl_half.h"
#ifdef WIN32
#include "Windows.h"
#endif
@@ -88,6 +90,8 @@ enum TExplicitMemoryOrderType
#define HOST_FLAG cl_int
extern cl_half_rounding_mode gHalfRoundingMode;
// host atomic functions
void host_atomic_thread_fence(TExplicitMemoryOrderType order);
@@ -120,7 +124,18 @@ template <typename AtomicType, typename CorrespondingType>
CorrespondingType host_atomic_fetch_sub(volatile AtomicType *a, CorrespondingType c,
TExplicitMemoryOrderType order)
{
if constexpr (std::is_same_v<AtomicType, HOST_ATOMIC_HALF>)
{
static std::mutex mx;
std::lock_guard<std::mutex> lock(mx);
CorrespondingType old_value = *a;
*a = cl_half_from_float((cl_half_to_float(*a) - cl_half_to_float(c)),
gHalfRoundingMode);
return old_value;
}
else
{
#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
return InterlockedExchangeSubtract(a, c);
#elif defined(__GNUC__)
return __sync_fetch_and_sub(a, c);
@@ -128,6 +143,7 @@ CorrespondingType host_atomic_fetch_sub(volatile AtomicType *a, CorrespondingTyp
log_info("Host function not implemented: atomic_fetch_sub\n");
return 0;
#endif
}
}
template <typename AtomicType, typename CorrespondingType>

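The host emulation above serializes the half read-modify-write with a mutex and does the arithmetic in float, rounding back to half on every step. A standalone sketch of that arithmetic, assuming CL_HALF_RTE in place of the harness's gHalfRoundingMode:

```cpp
#include <CL/cl_half.h>
#include <cstdio>

int main()
{
    // Emulated fetch_sub on a half value: read the old value, subtract in
    // float, round the result back to half.
    cl_half dest = cl_half_from_float(10.0f, CL_HALF_RTE);
    const float operands[] = { 1.5f, 2.25f, 0.125f };
    for (float f : operands)
    {
        cl_half old_value = dest; // what fetch_sub returns
        dest = cl_half_from_float(cl_half_to_float(dest) - f, CL_HALF_RTE);
        printf("old=%g new=%g\n", cl_half_to_float(old_value),
               cl_half_to_float(dest));
    }
    return 0;
}
```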

@@ -1488,46 +1488,239 @@ REGISTER_TEST(svm_atomic_fetch_add)
template <typename HostAtomicType, typename HostDataType>
class CBasicTestFetchSub
: public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
double min_range;
double max_range;
double max_error;
std::vector<HostDataType> ref_vals;
public:
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
using CBasicTestMemOrderScope<HostAtomicType,
HostDataType>::MemoryOrderScopeStr;
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
CBasicTestFetchSub(TExplicitAtomicType dataType, bool useSVM)
: CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
useSVM),
min_range(-999.0), max_range(999.0), max_error(0.0)
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
StartValue(0);
CBasicTestMemOrderScope<HostAtomicType,
HostDataType>::OldValueCheck(false);
}
}
template <typename Iterator>
float subtract_halfs(Iterator begin, Iterator end)
{
cl_half res = 0;
for (auto it = begin; it != end; ++it)
{
res = cl_half_from_float(cl_half_to_float(res)
- cl_half_to_float(*it),
gHalfRoundingMode);
}
return cl_half_to_float(res);
}
bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
MTdata d) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
if (threadCount > ref_vals.size())
{
ref_vals.resize(threadCount);
for (cl_uint i = 0; i < threadCount; i++)
ref_vals[i] = cl_half_from_float(
get_random_float(min_range, max_range, d),
gHalfRoundingMode);
memcpy(startRefValues, ref_vals.data(),
sizeof(HostDataType) * ref_vals.size());
// Estimate highest possible summation error for given set.
std::vector<float> sums;
std::sort(ref_vals.begin(), ref_vals.end(),
[](cl_half a, cl_half b) {
return cl_half_to_float(a) < cl_half_to_float(b);
});
sums.push_back(
subtract_halfs(ref_vals.begin(), ref_vals.end()));
sums.push_back(
subtract_halfs(ref_vals.rbegin(), ref_vals.rend()));
std::sort(ref_vals.begin(), ref_vals.end(),
[](cl_half a, cl_half b) {
return std::abs(cl_half_to_float(a))
< std::abs(cl_half_to_float(b));
});
float precise = 0.f;
for (auto elem : ref_vals) precise -= cl_half_to_float(elem);
sums.push_back(precise);
sums.push_back(
subtract_halfs(ref_vals.begin(), ref_vals.end()));
sums.push_back(
subtract_halfs(ref_vals.rbegin(), ref_vals.rend()));
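// The spread between the smallest and largest of these ordering-dependent
// results becomes the tolerance used later when checking destMemory[0].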
std::sort(sums.begin(), sums.end());
max_error = std::abs(sums.front() - sums.back());
// restore unsorted order
memcpy(ref_vals.data(), startRefValues,
sizeof(HostDataType) * ref_vals.size());
}
else
{
memcpy(startRefValues, ref_vals.data(),
sizeof(HostDataType) * threadCount);
}
return true;
}
return false;
}
std::string ProgramCore() override
{
std::string memoryOrderScope = MemoryOrderScopeStr();
std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
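// Half variant: each work-item subtracts its random operand from
// destMemory[0], then reads destMemory[tid] back with a fetch_sub of 0 so
// the per-thread slots can be checked against the start value.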
return " atomic_fetch_sub" + postfix + "(&destMemory[0], ("
+ DataType().AddSubOperandTypeName() + ")oldValues[tid]"
+ memoryOrderScope + ");\n"
+ " oldValues[tid] = atomic_fetch_sub" + postfix
+ "(&destMemory[tid], (" + DataType().AddSubOperandTypeName()
+ ")0" + memoryOrderScope + ");\n";
}
else
{
return " oldValues[tid] = atomic_fetch_sub" + postfix
+ "(&destMemory[0], tid + 3 +((("
+ DataType().AddSubOperandTypeName() + ")tid + 3) << (sizeof("
+ DataType().AddSubOperandTypeName() + ")-1)*8)"
+ memoryOrderScope + ");\n";
}
}
void HostFunction(cl_uint tid, cl_uint threadCount,
volatile HostAtomicType *destMemory,
HostDataType *oldValues) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
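// Mirror the kernel: subtract this thread's operand from destMemory[0],
// then fetch destMemory[tid] with a no-op subtraction of 0.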
host_atomic_fetch_sub(&destMemory[0], (HostDataType)oldValues[tid],
MemoryOrder());
oldValues[tid] = host_atomic_fetch_sub(
&destMemory[tid], (HostDataType)0, MemoryOrder());
}
else
{
oldValues[tid] =
host_atomic_fetch_sub(&destMemory[0],
(HostDataType)tid + 3
+ (((HostDataType)tid + 3)
<< (sizeof(HostDataType) - 1) * 8),
MemoryOrder());
}
}
bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
HostDataType *startRefValues,
cl_uint whichDestValue) override
{
expected = StartValue();
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
if (whichDestValue == 0)
{
for (cl_uint i = 0; i < threadCount; i++)
{
expected = cl_half_from_float(
cl_half_to_float(expected)
- cl_half_to_float(startRefValues[i]),
gHalfRoundingMode);
}
}
}
else
{
for (cl_uint i = 0; i < threadCount; i++)
expected -= (HostDataType)i + 3
+ (((HostDataType)i + 3) << (sizeof(HostDataType) - 1) * 8);
}
return true;
}
bool IsTestNotAsExpected(const HostDataType &expected,
const std::vector<HostAtomicType> &testValues,
cl_uint whichDestValue) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
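// destMemory[0] accumulates half subtractions in a non-deterministic
// order, so accept any result within the estimated rounding tolerance
// rather than requiring bit-exact equality.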
if (whichDestValue == 0)
return std::abs(cl_half_to_float(expected)
- cl_half_to_float(testValues[whichDestValue]))
> max_error;
}
return CBasicTestMemOrderScope<
HostAtomicType, HostDataType>::IsTestNotAsExpected(expected,
testValues,
whichDestValue);
}
bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues,
HostAtomicType *finalValues) override
{
if (std::is_same<HostDataType, HOST_ATOMIC_HALF>::value)
{
correct = true;
for (cl_uint i = 1; i < threadCount; i++)
{
if (refValues[i] != StartValue())
{
log_error("Thread %d found %d mismatch(es)\n", i,
(cl_uint)refValues[i]);
correct = false;
}
}
return !correct;
}
return CBasicTestMemOrderScope<HostAtomicType,
HostDataType>::VerifyRefs(correct,
threadCount,
refValues,
finalValues);
}
int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
cl_command_queue queue) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
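// cl_ext_float_atomics makes half fetch_add/fetch_sub optional per
// address space; skip the variant the device does not support.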
if (LocalMemory()
&& (gHalfAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT) == 0)
return 0; // skip test - not applicable
if (!LocalMemory()
&& (gHalfAtomicCaps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT) == 0)
return 0; // skip test - not applicable
}
return CBasicTestMemOrderScope<
HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context,
queue);
}
cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
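// One destination slot per thread for the half test: destMemory[0] holds
// the accumulated difference, the remaining slots must keep StartValue.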
return threadCount;
}
return CBasicTestMemOrderScope<HostAtomicType,
HostDataType>::NumResults(threadCount,
deviceID);
}
};
static int test_atomic_fetch_sub_generic(cl_device_id deviceID,
@@ -1552,6 +1745,15 @@ static int test_atomic_fetch_sub_generic(cl_device_id deviceID,
TYPE_ATOMIC_ULONG, useSVM);
EXECUTE_TEST(error,
test_ulong.Execute(deviceID, context, queue, num_elements));
if (gFloatAtomicsSupported)
{
CBasicTestFetchSub<HOST_ATOMIC_HALF, HOST_HALF> test_half(
TYPE_ATOMIC_HALF, useSVM);
EXECUTE_TEST(error,
test_half.Execute(deviceID, context, queue, num_elements));
}
if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
{
CBasicTestFetchSub<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>