Added support for cl_ext_float_atomics in CBasicTestFetchSub with atomic_half (#2366)

Related to #2142: following the work plan, this extends
CBasicTestFetchSub with support for atomic_half.

I wasn't able to test this PR fully because the
CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT/CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT
capabilities for atomic_half were not available to me. I appreciate the reviewers' attention, thanks.
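
For context, the new skip logic keys off the device's cl_ext_float_atomics capabilities (gHalfAtomicCaps in the harness). Below is a minimal sketch of the underlying query; the helper name and error handling are illustrative only, not part of this PR:

```cpp
#include <CL/cl_ext.h>

// Query the half-precision FP atomic capabilities defined by
// cl_ext_float_atomics and check the add/sub bit for the requested address
// space (the ADD capability covers atomic_fetch_add and atomic_fetch_sub).
static bool half_fp_atomic_add_sub_supported(cl_device_id device,
                                             bool local_memory)
{
    cl_device_fp_atomic_capabilities_ext caps = 0;
    if (clGetDeviceInfo(device, CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT,
                        sizeof(caps), &caps, NULL)
        != CL_SUCCESS)
        return false;
    return (caps
            & (local_memory ? CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT
                            : CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT))
        != 0;
}
```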
Marcin Hajder, 2025-09-09 17:40:50 +02:00 (committed by GitHub)
parent 913e6e4388, commit 1aeca1360b
2 changed files with 245 additions and 27 deletions


@@ -19,6 +19,8 @@
#include "harness/testHarness.h"
#include <mutex>
#include "CL/cl_half.h"
#ifdef WIN32
#include "Windows.h"
#endif
@@ -88,6 +90,8 @@ enum TExplicitMemoryOrderType
#define HOST_FLAG cl_int
extern cl_half_rounding_mode gHalfRoundingMode;
// host atomic functions
void host_atomic_thread_fence(TExplicitMemoryOrderType order);
@@ -120,7 +124,18 @@ template <typename AtomicType, typename CorrespondingType>
CorrespondingType host_atomic_fetch_sub(volatile AtomicType *a, CorrespondingType c,
TExplicitMemoryOrderType order)
{
if constexpr (std::is_same_v<AtomicType, HOST_ATOMIC_HALF>)
{
static std::mutex mx;
std::lock_guard<std::mutex> lock(mx);
CorrespondingType old_value = *a;
*a = cl_half_from_float((cl_half_to_float(*a) - cl_half_to_float(c)),
gHalfRoundingMode);
return old_value;
}
else
{
#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
return InterlockedExchangeSubtract(a, c);
#elif defined(__GNUC__)
return __sync_fetch_and_sub(a, c);
@@ -128,6 +143,7 @@ CorrespondingType host_atomic_fetch_sub(volatile AtomicType *a, CorrespondingTyp
log_info("Host function not implemented: atomic_fetch_sub\n");
return 0;
#endif
}
}
template <typename AtomicType, typename CorrespondingType>

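The host emulation above serializes the half read-modify-write with a mutex and does the arithmetic in float, rounding back to half on every step. A standalone sketch of that arithmetic, assuming CL_HALF_RTE in place of the harness's gHalfRoundingMode:

```cpp
#include <CL/cl_half.h>
#include <cstdio>

int main()
{
    // Emulated fetch_sub on a half value: read the old value, subtract in
    // float, round the result back to half.
    cl_half dest = cl_half_from_float(10.0f, CL_HALF_RTE);
    const float operands[] = { 1.5f, 2.25f, 0.125f };
    for (float f : operands)
    {
        cl_half old_value = dest; // what fetch_sub returns
        dest = cl_half_from_float(cl_half_to_float(dest) - f, CL_HALF_RTE);
        printf("old=%g new=%g\n", cl_half_to_float(old_value),
               cl_half_to_float(dest));
    }
    return 0;
}
```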

@@ -1488,46 +1488,239 @@ REGISTER_TEST(svm_atomic_fetch_add)
template <typename HostAtomicType, typename HostDataType>
class CBasicTestFetchSub
: public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
double min_range;
double max_range;
double max_error;
std::vector<HostDataType> ref_vals;
public:
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
using CBasicTestMemOrderScope<HostAtomicType,
HostDataType>::MemoryOrderScopeStr;
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
CBasicTestFetchSub(TExplicitAtomicType dataType, bool useSVM)
: CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
useSVM),
min_range(-999.0), max_range(999.0), max_error(0.0)
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
StartValue(0);
CBasicTestMemOrderScope<HostAtomicType,
HostDataType>::OldValueCheck(false);
}
}
template <typename Iterator>
float subtract_halfs(Iterator begin, Iterator end)
{
cl_half res = 0;
for (auto it = begin; it != end; ++it)
{
res = cl_half_from_float(cl_half_to_float(res)
- cl_half_to_float(*it),
gHalfRoundingMode);
}
return cl_half_to_float(res);
}
bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
MTdata d) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
if (threadCount > ref_vals.size())
{
ref_vals.resize(threadCount);
for (cl_uint i = 0; i < threadCount; i++)
ref_vals[i] = cl_half_from_float(
get_random_float(min_range, max_range, d),
gHalfRoundingMode);
memcpy(startRefValues, ref_vals.data(),
sizeof(HostDataType) * ref_vals.size());
// Estimate highest possible summation error for given set.
std::vector<float> sums;
std::sort(ref_vals.begin(), ref_vals.end(),
[](cl_half a, cl_half b) {
return cl_half_to_float(a) < cl_half_to_float(b);
});
sums.push_back(
subtract_halfs(ref_vals.begin(), ref_vals.end()));
sums.push_back(
subtract_halfs(ref_vals.rbegin(), ref_vals.rend()));
std::sort(ref_vals.begin(), ref_vals.end(),
[](cl_half a, cl_half b) {
return std::abs(cl_half_to_float(a))
< std::abs(cl_half_to_float(b));
});
float precise = 0.f;
for (auto elem : ref_vals) precise -= cl_half_to_float(elem);
sums.push_back(precise);
sums.push_back(
subtract_halfs(ref_vals.begin(), ref_vals.end()));
sums.push_back(
subtract_halfs(ref_vals.rbegin(), ref_vals.rend()));
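// The spread between the smallest and largest of these ordering-dependent
// results becomes the tolerance used later when checking destMemory[0].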
std::sort(sums.begin(), sums.end());
max_error = std::abs(sums.front() - sums.back());
// restore unsorted order
memcpy(ref_vals.data(), startRefValues,
sizeof(HostDataType) * ref_vals.size());
}
else
{
memcpy(startRefValues, ref_vals.data(),
sizeof(HostDataType) * threadCount);
}
return true;
}
return false;
}
std::string ProgramCore() override
{
std::string memoryOrderScope = MemoryOrderScopeStr();
std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
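// Half variant: each work-item subtracts its random operand from
// destMemory[0], then reads destMemory[tid] back with a fetch_sub of 0 so
// the per-thread slots can be checked against the start value.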
return " atomic_fetch_sub" + postfix + "(&destMemory[0], ("
+ DataType().AddSubOperandTypeName() + ")oldValues[tid]"
+ memoryOrderScope + ");\n"
+ " oldValues[tid] = atomic_fetch_sub" + postfix
+ "(&destMemory[tid], (" + DataType().AddSubOperandTypeName()
+ ")0" + memoryOrderScope + ");\n";
}
else
{
return " oldValues[tid] = atomic_fetch_sub" + postfix
+ "(&destMemory[0], tid + 3 +((("
+ DataType().AddSubOperandTypeName() + ")tid + 3) << (sizeof("
+ DataType().AddSubOperandTypeName() + ")-1)*8)"
+ memoryOrderScope + ");\n";
}
}
void HostFunction(cl_uint tid, cl_uint threadCount,
volatile HostAtomicType *destMemory,
HostDataType *oldValues) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
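// Mirror the kernel: subtract this thread's operand from destMemory[0],
// then fetch destMemory[tid] with a no-op subtraction of 0.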
host_atomic_fetch_sub(&destMemory[0], (HostDataType)oldValues[tid],
MemoryOrder());
oldValues[tid] = host_atomic_fetch_sub(
&destMemory[tid], (HostDataType)0, MemoryOrder());
}
else
{
oldValues[tid] =
host_atomic_fetch_sub(&destMemory[0],
(HostDataType)tid + 3
+ (((HostDataType)tid + 3)
<< (sizeof(HostDataType) - 1) * 8),
MemoryOrder());
}
}
bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
HostDataType *startRefValues,
cl_uint whichDestValue) override
{
expected = StartValue();
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
if (whichDestValue == 0)
{
for (cl_uint i = 0; i < threadCount; i++)
{
expected = cl_half_from_float(
cl_half_to_float(expected)
- cl_half_to_float(startRefValues[i]),
gHalfRoundingMode);
}
}
}
else
{
for (cl_uint i = 0; i < threadCount; i++)
expected -= (HostDataType)i + 3
+ (((HostDataType)i + 3) << (sizeof(HostDataType) - 1) * 8);
}
return true;
}
bool IsTestNotAsExpected(const HostDataType &expected,
const std::vector<HostAtomicType> &testValues,
cl_uint whichDestValue) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
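// destMemory[0] accumulates half subtractions in a non-deterministic
// order, so accept any result within the estimated rounding tolerance
// rather than requiring bit-exact equality.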
if (whichDestValue == 0)
return std::abs(cl_half_to_float(expected)
- cl_half_to_float(testValues[whichDestValue]))
> max_error;
}
return CBasicTestMemOrderScope<
HostAtomicType, HostDataType>::IsTestNotAsExpected(expected,
testValues,
whichDestValue);
}
bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues,
HostAtomicType *finalValues) override
{
if (std::is_same<HostDataType, HOST_ATOMIC_HALF>::value)
{
correct = true;
for (cl_uint i = 1; i < threadCount; i++)
{
if (refValues[i] != StartValue())
{
log_error("Thread %d found %d mismatch(es)\n", i,
(cl_uint)refValues[i]);
correct = false;
}
}
return !correct;
}
return CBasicTestMemOrderScope<HostAtomicType,
HostDataType>::VerifyRefs(correct,
threadCount,
refValues,
finalValues);
}
int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
cl_command_queue queue) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
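// cl_ext_float_atomics makes half fetch_add/fetch_sub optional per
// address space; skip the variant the device does not support.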
if (LocalMemory()
&& (gHalfAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT) == 0)
return 0; // skip test - not applicable
if (!LocalMemory()
&& (gHalfAtomicCaps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT) == 0)
return 0; // skip test - not applicable
}
return CBasicTestMemOrderScope<
HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context,
queue);
}
cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
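// One destination slot per thread for the half test: destMemory[0] holds
// the accumulated difference, the remaining slots must keep StartValue.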
return threadCount;
}
return CBasicTestMemOrderScope<HostAtomicType,
HostDataType>::NumResults(threadCount,
deviceID);
}
};
static int test_atomic_fetch_sub_generic(cl_device_id deviceID,
@@ -1552,6 +1745,15 @@ static int test_atomic_fetch_sub_generic(cl_device_id deviceID,
TYPE_ATOMIC_ULONG, useSVM);
EXECUTE_TEST(error,
test_ulong.Execute(deviceID, context, queue, num_elements));
if (gFloatAtomicsSupported)
{
CBasicTestFetchSub<HOST_ATOMIC_HALF, HOST_HALF> test_half(
TYPE_ATOMIC_HALF, useSVM);
EXECUTE_TEST(error,
test_half.Execute(deviceID, context, queue, num_elements));
}
if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
{
CBasicTestFetchSub<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>