From 119af24d54d1ba75b03abdf056aa5dc2d3cbefb7 Mon Sep 17 00:00:00 2001 From: Yilong Guo Date: Wed, 17 Dec 2025 00:37:33 +0800 Subject: [PATCH] c11_atomics: unify host half representation and conversion with wrapper class (#2503) Introduce `HostHalf` wrapper class to eliminate explicit `cl_half_from_float` and `cl_half_to_float` conversions throughout the test code. The wrapper provides semantic value constructors/operators and automatic conversions, simplifying half-precision arithmetic operations. Key improvements: - `HostHalf` class with operator overloading for arithmetic and comparisons - Type traits `is_host_atomic_fp_v` and `is_host_fp_v` for generic FP handling - Unified floating-point atomic operations (add/sub/min/max/exchange) - Removed 300+ lines of half-specific conditional branches - Consistent calculation for all FP types --- test_conformance/c11_atomics/common.cpp | 8 +- test_conformance/c11_atomics/common.h | 21 +- test_conformance/c11_atomics/host_atomics.h | 173 ++++-- test_conformance/c11_atomics/test_atomics.cpp | 540 ++++-------------- 4 files changed, 248 insertions(+), 494 deletions(-) diff --git a/test_conformance/c11_atomics/common.cpp b/test_conformance/c11_atomics/common.cpp index 3be3fbc1..7bb2da76 100644 --- a/test_conformance/c11_atomics/common.cpp +++ b/test_conformance/c11_atomics/common.cpp @@ -194,9 +194,9 @@ template<> cl_int AtomicTypeExtendedInfo::MinValue() {return CL_INT_MIN; template<> cl_uint AtomicTypeExtendedInfo::MinValue() {return 0;} template<> cl_long AtomicTypeExtendedInfo::MinValue() {return CL_LONG_MIN;} template <> cl_ulong AtomicTypeExtendedInfo::MinValue() { return 0; } -template <> cl_half AtomicTypeExtendedInfo::MinValue() +template <> HostHalf AtomicTypeExtendedInfo::MinValue() { - return cl_half_from_float(-CL_HALF_MAX, gHalfRoundingMode); + return -CL_HALF_MAX; } template <> cl_float AtomicTypeExtendedInfo::MinValue() { @@ -217,9 +217,9 @@ template <> cl_uint AtomicTypeExtendedInfo::MaxValue() } template<> cl_long AtomicTypeExtendedInfo::MaxValue() {return CL_LONG_MAX;} template<> cl_ulong AtomicTypeExtendedInfo::MaxValue() {return CL_ULONG_MAX;} -template <> cl_half AtomicTypeExtendedInfo::MaxValue() +template <> HostHalf AtomicTypeExtendedInfo::MaxValue() { - return cl_half_from_float(CL_HALF_MAX, gHalfRoundingMode); + return CL_HALF_MAX; } template <> cl_float AtomicTypeExtendedInfo::MaxValue() { diff --git a/test_conformance/c11_atomics/common.h b/test_conformance/c11_atomics/common.h index 09535a88..c9494c6c 100644 --- a/test_conformance/c11_atomics/common.h +++ b/test_conformance/c11_atomics/common.h @@ -183,7 +183,8 @@ public: const std::vector &testValues, cl_uint whichDestValue) { - return expected != testValues[whichDestValue]; + return expected + != static_cast(testValues[whichDestValue]); } virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d) @@ -911,12 +912,9 @@ CBasicTest::ProgramHeader(cl_uint maxNumDestItems) + ss.str() + "] = {\n"; ss.str(""); - if (CBasicTest::DataType()._type - == TYPE_ATOMIC_FLOAT) - ss << std::setprecision(10) << _startValue; - else if (CBasicTest::DataType()._type - == TYPE_ATOMIC_HALF) - ss << cl_half_to_float(static_cast(_startValue)); + if constexpr (is_host_fp_v) + ss << std::hexfloat + << _startValue; // use hex format for accurate representation else ss << _startValue; @@ -1305,7 +1303,8 @@ int CBasicTest::ExecuteSingleTest( numDestItems = NumResults(threadCount, deviceID); destItems.resize(numDestItems); - for (cl_uint i = 0; i < numDestItems; i++) destItems[i] = _startValue; + for (cl_uint i = 0; i < numDestItems; i++) + destItems[i] = static_cast(_startValue); // Create main buffer with atomic variables (array size dependent on // particular test) @@ -1483,7 +1482,8 @@ int CBasicTest::ExecuteSingleTest( std::stringstream logLine; logLine << "ERROR: Result " << i << " from kernel does not validate! (should be " << expected - << ", was " << destItems[i] << ")\n"; + << ", was " << static_cast(destItems[i]) + << ")\n"; log_error("%s", logLine.str().c_str()); for (i = 0; i < threadCount; i++) { @@ -1550,7 +1550,8 @@ int CBasicTest::ExecuteSingleTest( // clEnqueueNDRangeKernel { /* Re-write the starting value */ - for (size_t i = 0; i < numDestItems; i++) destItems[i] = _startValue; + for (size_t i = 0; i < numDestItems; i++) + destItems[i] = static_cast(_startValue); refValues[0] = 0; if (deviceThreadCount > 0) { diff --git a/test_conformance/c11_atomics/host_atomics.h b/test_conformance/c11_atomics/host_atomics.h index fac21642..8d875bc9 100644 --- a/test_conformance/c11_atomics/host_atomics.h +++ b/test_conformance/c11_atomics/host_atomics.h @@ -24,6 +24,8 @@ #include "Windows.h" #endif +extern cl_half_rounding_mode gHalfRoundingMode; + //flag for test verification (good test should discover non-atomic functions and fail) //#define NON_ATOMIC_FUNCTIONS @@ -37,6 +39,93 @@ enum TExplicitMemoryOrderType MEMORY_ORDER_SEQ_CST }; +// Wrapper class for half-precision +class HostHalf { +public: + // Convert from semantic values + HostHalf(cl_uint value = 0) + : value( + cl_half_from_float(static_cast(value), gHalfRoundingMode)) + {} + HostHalf(int value): HostHalf(static_cast(value)) {} + HostHalf(float value): value(cl_half_from_float(value, gHalfRoundingMode)) + {} + HostHalf(double value): HostHalf(static_cast(value)) {} + + // Convert to semantic values + operator cl_uint() const + { + return static_cast(cl_half_to_float(value)); + } + operator float() const { return cl_half_to_float(value); } + operator double() const + { + return static_cast(cl_half_to_float(value)); + } + + // Construct from bit representation + HostHalf(cl_half value): value(value) {} + + // Get the underlying bit representation + operator cl_half() const { return value; } + + HostHalf operator-() const + { + return HostHalf( + cl_half_from_float(-cl_half_to_float(value), gHalfRoundingMode)); + } + +#define GENERIC_OP(RetType, op) \ + RetType operator op(const HostHalf &other) const \ + { \ + return RetType(cl_half_to_float(value) \ + op cl_half_to_float(other.value)); \ + } + + GENERIC_OP(bool, ==) + GENERIC_OP(bool, !=) + GENERIC_OP(bool, <) + GENERIC_OP(bool, <=) + GENERIC_OP(bool, >) + GENERIC_OP(bool, >=) + GENERIC_OP(HostHalf, +) + GENERIC_OP(HostHalf, -) + GENERIC_OP(HostHalf, *) + GENERIC_OP(HostHalf, /) +#undef GENERIC_OP + +#define INPLACE_OP(op) \ + HostHalf &operator op##=(const HostHalf &other) \ + { \ + value = cl_half_from_float(cl_half_to_float(value) \ + op cl_half_to_float(other.value), \ + gHalfRoundingMode); \ + return *this; \ + } + INPLACE_OP(+) + INPLACE_OP(-) + INPLACE_OP(*) + INPLACE_OP(/) +#undef INPLACE_OP + + friend std::ostream &operator<<(std::ostream &os, const HostHalf &hh) + { + float f = cl_half_to_float(hh.value); + os << f; + return os; + } + +private: + cl_half value; +}; + +namespace std { +inline HostHalf abs(const HostHalf &value) +{ + return value < HostHalf(0) ? -value : value; +} +} // namespace std + // host atomic types (applicable for atomic functions supported on host OS) #ifdef WIN32 #define HOST_ATOMIC_INT unsigned long @@ -73,7 +162,7 @@ enum TExplicitMemoryOrderType #define HOST_UINT cl_uint #define HOST_LONG cl_long #define HOST_ULONG cl_ulong -#define HOST_HALF cl_half +#define HOST_HALF HostHalf #define HOST_FLOAT cl_float #define HOST_DOUBLE cl_double @@ -91,6 +180,18 @@ enum TExplicitMemoryOrderType extern cl_half_rounding_mode gHalfRoundingMode; +template +constexpr bool is_host_atomic_fp_v = + std::disjunction_v, + std::is_same, + std::is_same>; + +template +constexpr bool is_host_fp_v = + std::disjunction_v, + std::is_same, + std::is_same>; + // host atomic functions void host_atomic_thread_fence(TExplicitMemoryOrderType order); @@ -98,24 +199,13 @@ template CorrespondingType host_atomic_fetch_add(volatile AtomicType *a, CorrespondingType c, TExplicitMemoryOrderType order) { - if constexpr (std::is_same_v) + if constexpr (is_host_atomic_fp_v) { static std::mutex mx; std::lock_guard lock(mx); CorrespondingType old_value = *a; - *a = cl_half_from_float((cl_half_to_float(*a) + cl_half_to_float(c)), - gHalfRoundingMode); - return old_value; - } - else if constexpr ( - std::is_same_v< - AtomicType, - HOST_ATOMIC_FLOAT> || std::is_same_v) - { - static std::mutex mx; - std::lock_guard lock(mx); - CorrespondingType old_value = *a; - *a += c; + CorrespondingType new_value = old_value + c; + *a = static_cast(new_value); return old_value; } else @@ -135,24 +225,13 @@ template CorrespondingType host_atomic_fetch_sub(volatile AtomicType *a, CorrespondingType c, TExplicitMemoryOrderType order) { - if constexpr ( - std::is_same_v< - AtomicType, - HOST_ATOMIC_DOUBLE> || std::is_same_v) + if constexpr (is_host_atomic_fp_v) { static std::mutex mx; std::lock_guard lock(mx); CorrespondingType old_value = *a; - *a -= c; - return old_value; - } - else if constexpr (std::is_same_v) - { - static std::mutex mx; - std::lock_guard lock(mx); - CorrespondingType old_value = *a; - *a = cl_half_from_float((cl_half_to_float(*a) - cl_half_to_float(c)), - gHalfRoundingMode); + CorrespondingType new_value = old_value - c; + *a = static_cast(new_value); return old_value; } else @@ -173,12 +252,14 @@ CorrespondingType host_atomic_exchange(volatile AtomicType *a, CorrespondingType TExplicitMemoryOrderType order) { #if defined( _MSC_VER ) || (defined( __INTEL_COMPILER ) && defined(WIN32)) - if (sizeof(CorrespondingType) == 2) - return InterlockedExchange16(reinterpret_cast(a), c); + if constexpr (sizeof(CorrespondingType) == 2) + return InterlockedExchange16(reinterpret_cast(a), + *reinterpret_cast(&c)); else - return InterlockedExchange(reinterpret_cast(a), c); + return InterlockedExchange(reinterpret_cast(a), + *reinterpret_cast(&c)); #elif defined(__GNUC__) - return __sync_lock_test_and_set(a, c); + return __sync_lock_test_and_set(a, *reinterpret_cast(&c)); #else log_info("Host function not implemented: atomic_exchange\n"); return 0; @@ -195,30 +276,14 @@ bool host_atomic_compare_exchange(volatile AtomicType *a, CorrespondingType *exp TExplicitMemoryOrderType order_failure) { CorrespondingType tmp; - if constexpr (std::is_same_v) + if constexpr (is_host_atomic_fp_v) { static std::mutex mtx; std::lock_guard lock(mtx); - tmp = *reinterpret_cast(a); - - if (cl_half_to_float(tmp) == cl_half_to_float(*expected)) - { - *reinterpret_cast(a) = desired; - return true; - } - *expected = tmp; - } - else if constexpr ( - std::is_same_v< - AtomicType, - HOST_ATOMIC_DOUBLE> || std::is_same_v) - { - static std::mutex mtx; - std::lock_guard lock(mtx); - tmp = *reinterpret_cast(a); + tmp = static_cast(*a); if (tmp == *expected) { - *a = desired; + *a = static_cast(desired); return true; } *expected = tmp; @@ -244,8 +309,8 @@ CorrespondingType host_atomic_load(volatile AtomicType *a, TExplicitMemoryOrderType order) { #if defined( _MSC_VER ) || (defined( __INTEL_COMPILER ) && defined(WIN32)) - if (sizeof(CorrespondingType) == 2) - auto prev = InterlockedOr16(reinterpret_cast(a), 0); + if constexpr (sizeof(CorrespondingType) == 2) + return InterlockedOr16(reinterpret_cast(a), 0); else return InterlockedExchangeAdd(reinterpret_cast(a), 0); #elif defined(__GNUC__) diff --git a/test_conformance/c11_atomics/test_atomics.cpp b/test_conformance/c11_atomics/test_atomics.cpp index 56f350ad..b1326e28 100644 --- a/test_conformance/c11_atomics/test_atomics.cpp +++ b/test_conformance/c11_atomics/test_atomics.cpp @@ -94,13 +94,7 @@ public: HostDataType *startRefValues, cl_uint whichDestValue) { - if (CBasicTestMemOrderScope::DataType() - ._type - != TYPE_ATOMIC_HALF) - expected = (HostDataType)whichDestValue; - else - expected = cl_half_from_float(static_cast(whichDestValue), - gHalfRoundingMode); + expected = static_cast(whichDestValue); return true; } }; @@ -401,13 +395,7 @@ public: HostDataType *startRefValues, cl_uint whichDestValue) { - if (CBasicTestMemOrderScope::DataType() - ._type - != TYPE_ATOMIC_HALF) - expected = (HostDataType)whichDestValue; - else - expected = cl_half_from_float(static_cast(whichDestValue), - gHalfRoundingMode); + expected = static_cast(whichDestValue); return true; } virtual bool VerifyRefs(bool &correct, cl_uint threadCount, @@ -417,25 +405,11 @@ public: correct = true; for (cl_uint i = 0; i < threadCount; i++) { - if constexpr (std::is_same_v) + if (refValues[i] != (HostDataType)i) { - HostDataType test = cl_half_from_float(static_cast(i), - gHalfRoundingMode); - if (refValues[i] != test) - { - log_error("Invalid value for thread %u\n", (cl_uint)i); - correct = false; - return true; - } - } - else - { - if (refValues[i] != (HostDataType)i) - { - log_error("Invalid value for thread %u\n", (cl_uint)i); - correct = false; - return true; - } + log_error("Invalid value for thread %u\n", (cl_uint)i); + correct = false; + return true; } } return true; @@ -553,11 +527,7 @@ public: : CBasicTestMemOrderScope(dataType, useSVM) { - if constexpr (std::is_same_v) - StartValue(cl_half_from_float(static_cast(1234), - gHalfRoundingMode)); - else - StartValue(123456); + StartValue(1234); } virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue) @@ -619,32 +589,19 @@ public: /* Any repeated value is treated as an error */ std::vector tidFound(threadCount); bool startValueFound = false; - cl_uint startVal = StartValue(); - - if constexpr (std::is_same_v) - startVal = static_cast( - cl_half_to_float(static_cast(StartValue()))); + cl_uint startVal = static_cast(StartValue()); for (cl_uint i = 0; i <= threadCount; i++) { cl_uint value = 0; if (i == threadCount) { - if constexpr (!std::is_same_v) - value = - (cl_uint)finalValues[0]; // additional value from atomic - // variable (last written) - else - value = - cl_half_to_float(static_cast(finalValues[0])); + value = static_cast( + static_cast(finalValues[0])); } else { - if constexpr (!std::is_same_v) - value = (cl_uint)refValues[i]; - else - value = - cl_half_to_float(static_cast(refValues[i])); + value = static_cast(refValues[i]); } if (value == startVal) @@ -1201,85 +1158,24 @@ public: useSVM), min_range(-999.0), max_range(999.0), max_error(0.0) { - if constexpr ( - std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { StartValue((HostDataType)0.0); CBasicTestMemOrderScope::OldValueCheck(false); + + // Narrow down range for half to avoid overflow to infinity + if constexpr (std::is_same_v) + { + min_range = -50.0; + max_range = 50.0; + } } } - template float accum_halfs(Iterator begin, Iterator end) - { - cl_half sum = 0; - for (auto it = begin; it != end; ++it) - { - sum = cl_half_from_float(cl_half_to_float(sum) - + cl_half_to_float(*it), - gHalfRoundingMode); - } - return cl_half_to_float(sum); - } bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d) override { - if constexpr (std::is_same_v) - { - if (threadCount > ref_vals.size()) - { - ref_vals.resize(threadCount); - - for (cl_uint i = 0; i < threadCount; i++) - ref_vals[i] = cl_half_from_float( - get_random_float(min_range, max_range, d), - gHalfRoundingMode); - - memcpy(startRefValues, ref_vals.data(), - sizeof(HostDataType) * ref_vals.size()); - - // Estimate highest possible summation error for given set. - std::vector sums; - std::sort(ref_vals.begin(), ref_vals.end(), - [](cl_half a, cl_half b) { - return cl_half_to_float(a) < cl_half_to_float(b); - }); - - sums.push_back(accum_halfs(ref_vals.begin(), ref_vals.end())); - sums.push_back(accum_halfs(ref_vals.rbegin(), ref_vals.rend())); - - std::sort(ref_vals.begin(), ref_vals.end(), - [](cl_half a, cl_half b) { - return std::abs(cl_half_to_float(a)) - < std::abs(cl_half_to_float(b)); - }); - - float precise = 0.f; - for (auto elem : ref_vals) precise += cl_half_to_float(elem); - sums.push_back(precise); - - sums.push_back(accum_halfs(ref_vals.begin(), ref_vals.end())); - sums.push_back(accum_halfs(ref_vals.rbegin(), ref_vals.rend())); - - std::sort(sums.begin(), sums.end()); - max_error = std::abs(sums.front() - sums.back()); - - // restore unsorted order - memcpy(ref_vals.data(), startRefValues, - sizeof(HostDataType) * ref_vals.size()); - } - else - { - memcpy(startRefValues, ref_vals.data(), - sizeof(HostDataType) * threadCount); - } - return true; - } - else if constexpr ( - std::is_same_v< - HostDataType, - HOST_FLOAT> || std::is_same_v) + if constexpr (is_host_fp_v) { if (threadCount > ref_vals.size()) { @@ -1299,11 +1195,12 @@ public: std::vector sums; std::sort(ref_vals.begin(), ref_vals.end()); - sums.push_back( - std::accumulate(ref_vals.begin(), ref_vals.end(), 0.f)); + sums.push_back(std::accumulate(ref_vals.begin(), ref_vals.end(), + static_cast(0.f))); - sums.push_back( - std::accumulate(ref_vals.rbegin(), ref_vals.rend(), 0.f)); + sums.push_back(std::accumulate(ref_vals.rbegin(), + ref_vals.rend(), + static_cast(0.f))); std::sort(ref_vals.begin(), ref_vals.end(), [](HostDataType a, HostDataType b) { @@ -1318,15 +1215,25 @@ public: sums.push_back(precise); - sums.push_back( - std::accumulate(ref_vals.begin(), ref_vals.end(), 0.f)); + sums.push_back(std::accumulate(ref_vals.begin(), ref_vals.end(), + static_cast(0.f))); - sums.push_back( - std::accumulate(ref_vals.rbegin(), ref_vals.rend(), 0.f)); + sums.push_back(std::accumulate(ref_vals.rbegin(), + ref_vals.rend(), + static_cast(0.f))); std::sort(sums.begin(), sums.end()); + assert(std::all_of(sums.begin(), sums.end(), + [](const HostDataType &val) { + return std::isfinite( + static_cast(val)); + }) + && "Infinite summation value detected!"); max_error = std::abs(sums.front() - sums.back()); + log_info("Max allowed error for %u elements: %.10f\n", + threadCount, max_error); + // restore unsorted order memcpy(ref_vals.data(), startRefValues, sizeof(HostDataType) * ref_vals.size()); @@ -1345,10 +1252,7 @@ public: std::string memoryOrderScope = MemoryOrderScopeStr(); std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); - if constexpr ( - std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { return " atomic_fetch_add" + postfix + "(&destMemory[0], (" + DataType().AddSubOperandTypeName() + ")oldValues[tid]" @@ -1382,10 +1286,7 @@ public: volatile HostAtomicType *destMemory, HostDataType *oldValues) override { - if constexpr ( - std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { host_atomic_fetch_add(&destMemory[0], (HostDataType)oldValues[tid], MemoryOrder()); @@ -1411,23 +1312,7 @@ public: cl_uint whichDestValue) override { expected = StartValue(); - if constexpr (std::is_same_v) - { - if (whichDestValue == 0) - { - for (cl_uint i = 0; i < threadCount; i++) - { - expected = cl_half_from_float( - cl_half_to_float(expected) - + cl_half_to_float(startRefValues[i]), - gHalfRoundingMode); - } - } - } - else if constexpr ( - std::is_same_v< - HostDataType, - HOST_DOUBLE> || std::is_same_v) + if constexpr (is_host_fp_v) { if (whichDestValue == 0) for (cl_uint i = 0; i < threadCount; i++) @@ -1446,21 +1331,13 @@ public: const std::vector &testValues, cl_uint whichDestValue) override { - if constexpr (std::is_same_v) + if constexpr (is_host_fp_v) { if (whichDestValue == 0) - return std::abs(cl_half_to_float(expected) - - cl_half_to_float(testValues[whichDestValue])) - > max_error; - } - else if constexpr ( - std::is_same_v< - HostDataType, - HOST_DOUBLE> || std::is_same_v) - { - if (whichDestValue == 0) - return std::abs((HostDataType)expected - - testValues[whichDestValue]) + return std::abs( + static_cast(expected + - static_cast( + testValues[whichDestValue]))) > max_error; } return CBasicTestMemOrderScope< @@ -1471,10 +1348,7 @@ public: bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) override { - if constexpr ( - std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { correct = true; for (cl_uint i = 1; i < threadCount; i++) @@ -1534,10 +1408,7 @@ public: } cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) override { - if constexpr ( - std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { return threadCount; } @@ -1657,8 +1528,8 @@ template <> double kahan_sub(const std::vector &nums) double compensation = 0.0; for (double num : nums) { - double y = num - compensation; - double t = sum - y; + double y = -num - compensation; + double t = sum + y; compensation = (t - sum) - y; sum = t; } @@ -1685,14 +1556,18 @@ public: useSVM), min_range(-999.0), max_range(999.0), max_error(0.0) { - if constexpr ( - std::is_same_v< - HostDataType, - HOST_DOUBLE> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { StartValue(0); CBasicTestMemOrderScope::OldValueCheck(false); + + // Narrow down range for half to avoid overflow to infinity + if constexpr (std::is_same_v) + { + min_range = -50.0; + max_range = 50.0; + } } } template @@ -1702,25 +1577,10 @@ public: for (auto it = begin; it != end; ++it) res = res - *it; return res; } - template - float subtract_halfs(Iterator begin, Iterator end) - { - cl_half res = 0; - for (auto it = begin; it != end; ++it) - { - res = cl_half_from_float(cl_half_to_float(res) - - cl_half_to_float(*it), - gHalfRoundingMode); - } - return cl_half_to_float(res); - } bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d) override { - if constexpr ( - std::is_same_v< - HostDataType, - HOST_DOUBLE> || std::is_same_v) + if constexpr (is_host_fp_v) { if (threadCount > ref_vals.size()) { @@ -1736,105 +1596,36 @@ public: // Estimate highest possible subtraction error for given set. std::vector sums; std::sort(ref_vals.begin(), ref_vals.end()); + sums.push_back(subtract(ref_vals.begin(), ref_vals.end())); + sums.push_back(subtract(ref_vals.rbegin(), ref_vals.rend())); + std::sort(ref_vals.begin(), ref_vals.end(), + [](HostDataType a, HostDataType b) { + return std::abs(a) < std::abs(b); + }); + + double precise = 0.0; if constexpr (std::is_same_v) - { - sums.push_back(subtract(ref_vals.begin(), ref_vals.end())); - - sums.push_back( - subtract(ref_vals.rbegin(), ref_vals.rend())); - - std::sort(ref_vals.begin(), ref_vals.end(), - [](double a, double b) { - return std::abs(a) < std::abs(b); - }); - - double precise = kahan_sub(ref_vals); - sums.push_back(precise); - - sums.push_back(subtract(ref_vals.begin(), ref_vals.end())); - - sums.push_back( - subtract(ref_vals.rbegin(), ref_vals.rend())); - - std::sort(sums.begin(), sums.end()); - max_error = std::abs((double)sums.front() - sums.back()); - } + precise = kahan_sub(ref_vals); else - { - sums.push_back(subtract(ref_vals.begin(), ref_vals.end())); - sums.push_back( - subtract(ref_vals.rbegin(), ref_vals.rend())); - - std::sort(ref_vals.begin(), ref_vals.end(), - [](float a, float b) { - return std::abs(a) < std::abs(b); - }); - - double precise = 0.0; for (auto elem : ref_vals) precise += double(elem); - sums.push_back(precise); - sums.push_back(subtract(ref_vals.begin(), ref_vals.end())); - sums.push_back( - subtract(ref_vals.rbegin(), ref_vals.rend())); - - std::sort(sums.begin(), sums.end()); - max_error = - std::abs((HOST_ATOMIC_FLOAT)sums.front() - sums.back()); - } - - // restore unsorted order - memcpy(ref_vals.data(), startRefValues, - sizeof(HostDataType) * ref_vals.size()); - } - else - { - memcpy(startRefValues, ref_vals.data(), - sizeof(HostDataType) * threadCount); - } - return true; - } - else if constexpr (std::is_same_v) - { - if (threadCount > ref_vals.size()) - { - ref_vals.resize(threadCount); - for (cl_uint i = 0; i < threadCount; i++) - ref_vals[i] = cl_half_from_float( - get_random_float(min_range, max_range, d), - gHalfRoundingMode); - - memcpy(startRefValues, ref_vals.data(), - sizeof(HostDataType) * ref_vals.size()); - - // Estimate highest possible summation error for given set. - std::vector sums; - std::sort(ref_vals.begin(), ref_vals.end(), - [](cl_half a, cl_half b) { - return cl_half_to_float(a) < cl_half_to_float(b); - }); - - sums.push_back( - subtract_halfs(ref_vals.begin(), ref_vals.end())); - sums.push_back( - subtract_halfs(ref_vals.rbegin(), ref_vals.rend())); - - std::sort(ref_vals.begin(), ref_vals.end(), - [](cl_half a, cl_half b) { - return std::abs(cl_half_to_float(a)) - < std::abs(cl_half_to_float(b)); - }); - - float precise = 0.f; - for (auto elem : ref_vals) precise -= cl_half_to_float(elem); sums.push_back(precise); - sums.push_back( - subtract_halfs(ref_vals.begin(), ref_vals.end())); - sums.push_back( - subtract_halfs(ref_vals.rbegin(), ref_vals.rend())); + + sums.push_back(subtract(ref_vals.begin(), ref_vals.end())); + sums.push_back(subtract(ref_vals.rbegin(), ref_vals.rend())); std::sort(sums.begin(), sums.end()); + assert(std::all_of(sums.begin(), sums.end(), + [](const HostDataType &val) { + return std::isfinite( + static_cast(val)); + }) + && "Infinite subtraction value detected!"); max_error = std::abs(sums.front() - sums.back()); + + log_info("Max allowed error for %u elements: %.10f\n", + threadCount, max_error); + // restore unsorted order memcpy(ref_vals.data(), startRefValues, sizeof(HostDataType) * ref_vals.size()); @@ -1853,10 +1644,7 @@ public: std::string memoryOrderScope = MemoryOrderScopeStr(); std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); - if constexpr ( - std::is_same_v< - HostDataType, - HOST_DOUBLE> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { return " atomic_fetch_sub" + postfix + "(&destMemory[0], (" + DataType().AddSubOperandTypeName() + ")oldValues[tid]" @@ -1878,10 +1666,7 @@ public: volatile HostAtomicType *destMemory, HostDataType *oldValues) override { - if constexpr ( - std::is_same_v< - HostDataType, - HOST_DOUBLE> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { host_atomic_fetch_sub(&destMemory[0], (HostDataType)oldValues[tid], MemoryOrder()); @@ -1903,29 +1688,12 @@ public: cl_uint whichDestValue) override { expected = StartValue(); - - if constexpr ( - std::is_same_v< - HostDataType, - HOST_DOUBLE> || std::is_same_v) + if constexpr (is_host_fp_v) { if (whichDestValue == 0) for (cl_uint i = 0; i < threadCount; i++) expected -= startRefValues[i]; } - else if constexpr (std::is_same_v) - { - if (whichDestValue == 0) - { - for (cl_uint i = 0; i < threadCount; i++) - { - expected = cl_half_from_float( - cl_half_to_float(expected) - - cl_half_to_float(startRefValues[i]), - gHalfRoundingMode); - } - } - } else { for (cl_uint i = 0; i < threadCount; i++) @@ -1938,21 +1706,13 @@ public: const std::vector &testValues, cl_uint whichDestValue) override { - if constexpr ( - std::is_same_v< - HostDataType, - HOST_DOUBLE> || std::is_same_v) + if constexpr (is_host_fp_v) { if (whichDestValue == 0) - return std::abs((HOST_ATOMIC_FLOAT)expected - - testValues[whichDestValue]) - > max_error; - } - else if constexpr (std::is_same_v) - { - if (whichDestValue == 0) - return std::abs(cl_half_to_float(expected) - - cl_half_to_float(testValues[whichDestValue])) + return std::abs( + static_cast(expected + - static_cast( + testValues[whichDestValue]))) > max_error; } return CBasicTestMemOrderScope< @@ -2023,10 +1783,7 @@ public: } cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) override { - if constexpr ( - std::is_same_v< - HostDataType, - HOST_DOUBLE> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { return threadCount; } @@ -2947,10 +2704,7 @@ public: min_range(-999.0), max_range(999.0) { StartValue(DataType().MaxValue()); - if constexpr ( - std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v) + if constexpr (is_host_fp_v) { CBasicTestMemOrderScope::OldValueCheck(false); @@ -2960,10 +2714,7 @@ public: { std::string memoryOrderScope = MemoryOrderScopeStr(); std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); - if constexpr ( - std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { return " atomic_fetch_min" + postfix + "(&destMemory[0], oldValues[tid] " + memoryOrderScope + ");\n" @@ -2982,10 +2733,7 @@ public: volatile HostAtomicType *destMemory, HostDataType *oldValues) override { - if constexpr ( - std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { host_atomic_fetch_min(&destMemory[0], oldValues[tid], MemoryOrder()); @@ -3040,19 +2788,7 @@ public: cl_uint whichDestValue) override { expected = StartValue(); - if constexpr (std::is_same_v) - { - if (whichDestValue == 0) - { - for (cl_uint i = 0; i < threadCount; i++) - { - if (cl_half_to_float(startRefValues[i]) - < cl_half_to_float(expected)) - expected = startRefValues[i]; - } - } - } - else if constexpr (std::is_same_v) + if constexpr (is_host_fp_v) { if (whichDestValue == 0) for (cl_uint i = 0; i < threadCount; i++) @@ -3072,9 +2808,7 @@ public: const std::vector &testValues, cl_uint whichDestValue) override { - if (std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { if (whichDestValue == 0) return CBasicTestMemOrderScope:: @@ -3089,17 +2823,16 @@ public: bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) override { - if (std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same::value) + if constexpr (is_host_fp_v) { correct = true; for (cl_uint i = 1; i < threadCount; i++) { if (refValues[i] != StartValue()) { - log_error("Thread %d found %d mismatch(es)\n", i, - (cl_uint)refValues[i]); + log_error( + "Thread %d found %lf mismatch(es), start value=%lf\n", + i, (double)refValues[i], (double)StartValue()); correct = false; } } @@ -3156,10 +2889,7 @@ public: } cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) override { - if constexpr ( - std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { return threadCount; } @@ -3287,21 +3017,11 @@ public: useSVM), min_range(-999.0), max_range(999.0) { - if constexpr ( - std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v) + StartValue(DataType().MinValue()); + if constexpr (is_host_fp_v) { CBasicTestMemOrderScope::OldValueCheck(false); - if constexpr (std::is_same_v) - StartValue(cl_half_from_float(-CL_HALF_MAX, gHalfRoundingMode)); - else - StartValue(-DataType().MaxValue()); - } - else - { - StartValue(DataType().MinValue()); } } std::string ProgramCore() override @@ -3330,10 +3050,7 @@ public: volatile HostAtomicType *destMemory, HostDataType *oldValues) override { - if constexpr ( - std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { host_atomic_fetch_max(&destMemory[0], oldValues[tid], MemoryOrder()); @@ -3349,23 +3066,12 @@ public: bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d) override { - if constexpr (std::is_same_v) + if constexpr (is_host_fp_v) { for (cl_uint i = 0; i < threadCount; i++) { - startRefValues[i] = cl_half_from_float( - get_random_float(min_range, max_range, d), - gHalfRoundingMode); - } - } - else if constexpr ( - std::is_same_v< - HostDataType, - HOST_FLOAT> || std::is_same_v) - { - for (cl_uint i = 0; i < threadCount; i++) - { - startRefValues[i] = get_random_float(min_range, max_range, d); + startRefValues[i] = static_cast( + get_random_float(min_range, max_range, d)); } } else @@ -3388,19 +3094,7 @@ public: cl_uint whichDestValue) override { expected = StartValue(); - if constexpr (std::is_same_v) - { - if (whichDestValue == 0) - { - for (cl_uint i = 0; i < threadCount; i++) - { - if (cl_half_to_float(startRefValues[i]) - > cl_half_to_float(expected)) - expected = startRefValues[i]; - } - } - } - else if constexpr (std::is_same_v) + if constexpr (is_host_fp_v) { if (whichDestValue == 0) for (cl_uint i = 0; i < threadCount; i++) @@ -3420,9 +3114,7 @@ public: const std::vector &testValues, cl_uint whichDestValue) override { - if (std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { if (whichDestValue == 0) return CBasicTestMemOrderScope:: @@ -3437,17 +3129,16 @@ public: bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues) override { - if (std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { correct = true; for (cl_uint i = 1; i < threadCount; i++) { if (refValues[i] != StartValue()) { - log_error("Thread %d found %d mismatch(es)\n", i, - (cl_uint)refValues[i]); + log_error( + "Thread %d found %lf mismatch(es), start value=%lf\n", + i, (double)refValues[i], (double)StartValue()); correct = false; } } @@ -3504,10 +3195,7 @@ public: } cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) override { - if constexpr ( - std::is_same_v< - HostDataType, - HOST_HALF> || std::is_same_v || std::is_same_v) + if constexpr (is_host_fp_v) { return threadCount; }