From 62972418c3c9c9a06e84738c926f278f59a518f2 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Wed, 22 Oct 2025 16:01:48 +0200 Subject: [PATCH] Added support for cl_ext_float_atomics in CBasicTestFetchAdd with atomic_half (#2350) Related to https://github.com/KhronosGroup/OpenCL-CTS/issues/2142. According to the work plan, this extends CBasicTestFetchAdd with support for atomic_half. I wasn't able to test this PR completely due to missing `CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT`/`CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT` capabilities for atomic_half. I would appreciate the reviewers' attention, thanks. --- test_conformance/c11_atomics/host_atomics.h | 13 +- test_conformance/c11_atomics/test_atomics.cpp | 125 ++++++++++++++++-- 2 files changed, 123 insertions(+), 15 deletions(-) diff --git a/test_conformance/c11_atomics/host_atomics.h b/test_conformance/c11_atomics/host_atomics.h index 4471897b..27560822 100644 --- a/test_conformance/c11_atomics/host_atomics.h +++ b/test_conformance/c11_atomics/host_atomics.h @@ -99,7 +99,16 @@ template CorrespondingType host_atomic_fetch_add(volatile AtomicType *a, CorrespondingType c, TExplicitMemoryOrderType order) { - if constexpr ( + if constexpr (std::is_same_v) + { + static std::mutex mx; + std::lock_guard lock(mx); + CorrespondingType old_value = *a; + *a = cl_half_from_float((cl_half_to_float(*a) + cl_half_to_float(c)), + gHalfRoundingMode); + return old_value; + } + else if constexpr ( std::is_same_v< AtomicType, HOST_ATOMIC_FLOAT> || std::is_same_v) @@ -112,7 +121,7 @@ CorrespondingType host_atomic_fetch_add(volatile AtomicType *a, CorrespondingTyp } else { -#if defined( _MSC_VER ) || (defined( __INTEL_COMPILER ) && defined(WIN32)) +#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32)) return InterlockedExchangeAdd(a, c); #elif defined(__GNUC__) return __sync_fetch_and_add(a, c); diff --git a/test_conformance/c11_atomics/test_atomics.cpp b/test_conformance/c11_atomics/test_atomics.cpp index d73bb6b8..0712f109 100644 --- 
a/test_conformance/c11_atomics/test_atomics.cpp +++ b/test_conformance/c11_atomics/test_atomics.cpp @@ -417,7 +417,7 @@ public: correct = true; for (cl_uint i = 0; i < threadCount; i++) { - if constexpr (std::is_same::value) + if constexpr (std::is_same_v) { HostDataType test = cl_half_from_float(static_cast(i), gHalfRoundingMode); @@ -1204,17 +1204,79 @@ public: if constexpr ( std::is_same_v< HostDataType, - HOST_FLOAT> || std::is_same_v) + HOST_HALF> || std::is_same_v || std::is_same_v) { StartValue((HostDataType)0.0); CBasicTestMemOrderScope::OldValueCheck(false); } } + template float accum_halfs(Iterator begin, Iterator end) + { + cl_half sum = 0; + for (auto it = begin; it != end; ++it) + { + sum = cl_half_from_float(cl_half_to_float(sum) + + cl_half_to_float(*it), + gHalfRoundingMode); + } + return cl_half_to_float(sum); + } bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d) override { - if constexpr ( + if constexpr (std::is_same_v) + { + if (threadCount > ref_vals.size()) + { + ref_vals.resize(threadCount); + + for (cl_uint i = 0; i < threadCount; i++) + ref_vals[i] = cl_half_from_float( + get_random_float(min_range, max_range, d), + gHalfRoundingMode); + + memcpy(startRefValues, ref_vals.data(), + sizeof(HostDataType) * ref_vals.size()); + + // Estimate highest possible summation error for given set. 
+ std::vector sums; + std::sort(ref_vals.begin(), ref_vals.end(), + [](cl_half a, cl_half b) { + return cl_half_to_float(a) < cl_half_to_float(b); + }); + + sums.push_back(accum_halfs(ref_vals.begin(), ref_vals.end())); + sums.push_back(accum_halfs(ref_vals.rbegin(), ref_vals.rend())); + + std::sort(ref_vals.begin(), ref_vals.end(), + [](cl_half a, cl_half b) { + return std::abs(cl_half_to_float(a)) + < std::abs(cl_half_to_float(b)); + }); + + float precise = 0.f; + for (auto elem : ref_vals) precise += cl_half_to_float(elem); + sums.push_back(precise); + + sums.push_back(accum_halfs(ref_vals.begin(), ref_vals.end())); + sums.push_back(accum_halfs(ref_vals.rbegin(), ref_vals.rend())); + + std::sort(sums.begin(), sums.end()); + max_error = std::abs(sums.front() - sums.back()); + + // restore unsorted order + memcpy(ref_vals.data(), startRefValues, + sizeof(HostDataType) * ref_vals.size()); + } + else + { + memcpy(startRefValues, ref_vals.data(), + sizeof(HostDataType) * threadCount); + } + return true; + } + else if constexpr ( std::is_same_v< HostDataType, HOST_FLOAT> || std::is_same_v) @@ -1286,7 +1348,7 @@ public: if constexpr ( std::is_same_v< HostDataType, - HOST_DOUBLE> || std::is_same_v) + HOST_HALF> || std::is_same_v || std::is_same_v) { return " atomic_fetch_add" + postfix + "(&destMemory[0], (" + DataType().AddSubOperandTypeName() + ")oldValues[tid]" @@ -1323,7 +1385,7 @@ public: if constexpr ( std::is_same_v< HostDataType, - HOST_DOUBLE> || std::is_same_v) + HOST_HALF> || std::is_same_v || std::is_same_v) { host_atomic_fetch_add(&destMemory[0], (HostDataType)oldValues[tid], MemoryOrder()); @@ -1349,7 +1411,20 @@ public: cl_uint whichDestValue) override { expected = StartValue(); - if constexpr ( + if constexpr (std::is_same_v) + { + if (whichDestValue == 0) + { + for (cl_uint i = 0; i < threadCount; i++) + { + expected = cl_half_from_float( + cl_half_to_float(expected) + + cl_half_to_float(startRefValues[i]), + gHalfRoundingMode); + } + } + } + else if 
constexpr ( std::is_same_v< HostDataType, HOST_DOUBLE> || std::is_same_v) @@ -1371,10 +1446,17 @@ public: const std::vector &testValues, cl_uint whichDestValue) override { - if constexpr ( + if constexpr (std::is_same_v) + { + if (whichDestValue == 0) + return std::abs(cl_half_to_float(expected) + - cl_half_to_float(testValues[whichDestValue])) + > max_error; + } + else if constexpr ( std::is_same_v< HostDataType, - HOST_DOUBLE> || std::is_same::value) + HOST_DOUBLE> || std::is_same_v) { if (whichDestValue == 0) return std::abs((HostDataType)expected @@ -1389,8 +1471,10 @@ public: bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) override { - if (std::is_same::value - || std::is_same::value) + if constexpr ( + std::is_same_v< + HostDataType, + HOST_HALF> || std::is_same_v || std::is_same_v) { correct = true; for (cl_uint i = 1; i < threadCount; i++) @@ -1413,7 +1497,17 @@ public: int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue) override { - if constexpr (std::is_same_v) + if constexpr (std::is_same_v) + { + if (LocalMemory() + && (gHalfAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT) == 0) + return 0; // skip test - not applicable + + if (!LocalMemory() + && (gHalfAtomicCaps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT) == 0) + return 0; + } + else if constexpr (std::is_same_v) { if (LocalMemory() && (gDoubleAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT) == 0) @@ -1443,7 +1537,7 @@ public: if constexpr ( std::is_same_v< HostDataType, - HOST_DOUBLE> || std::is_same_v) + HOST_HALF> || std::is_same_v || std::is_same_v) { return threadCount; } @@ -1478,6 +1572,11 @@ static int test_atomic_fetch_add_generic(cl_device_id deviceID, if (gFloatAtomicsSupported) { + CBasicTestFetchAdd test_half( + TYPE_ATOMIC_HALF, useSVM); + EXECUTE_TEST(error, + test_half.Execute(deviceID, context, queue, num_elements)); + CBasicTestFetchAdd test_double( TYPE_ATOMIC_DOUBLE, useSVM); EXECUTE_TEST( @@ 
-1737,7 +1836,7 @@ public: bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) override { - if (std::is_same::value) + if (std::is_same_v) { correct = true; for (cl_uint i = 1; i < threadCount; i++)