Added support for cl_ext_float_atomics in CBasicTestFetchAdd with atomic_float (#2345)

Related to #2142, according to the work plan, extending
CBasicTestFetchAdd with support for atomic_float.
This commit is contained in:
Marcin Hajder
2025-09-02 17:38:56 +02:00
committed by GitHub
parent d417d7670d
commit fbba22770d
4 changed files with 253 additions and 42 deletions

View File

@@ -74,9 +74,11 @@ extern int
gMaxDeviceThreads; // maximum number of threads executed on OCL device gMaxDeviceThreads; // maximum number of threads executed on OCL device
extern cl_device_atomic_capabilities gAtomicMemCap, extern cl_device_atomic_capabilities gAtomicMemCap,
gAtomicFenceCap; // atomic memory and fence capabilities for this device gAtomicFenceCap; // atomic memory and fence capabilities for this device
extern cl_half_rounding_mode gHalfRoundingMode; extern cl_half_rounding_mode gHalfRoundingMode;
extern bool gFloatAtomicsSupported; extern bool gFloatAtomicsSupported;
extern cl_device_fp_atomic_capabilities_ext gHalfAtomicCaps; extern cl_device_fp_atomic_capabilities_ext gHalfAtomicCaps;
extern cl_device_fp_atomic_capabilities_ext gFloatAtomicCaps;
extern const char * extern const char *
get_memory_order_type_name(TExplicitMemoryOrderType orderType); get_memory_order_type_name(TExplicitMemoryOrderType orderType);
@@ -174,6 +176,13 @@ public:
{ {
return false; return false;
} }
// Per-result failure predicate: returns true when the value read back from
// the device does NOT match the host-computed expectation for slot
// `whichDestValue`. The default is exact equality; derived tests (e.g. the
// floating-point fetch_add test) override this to compare within an error
// bound instead.
virtual bool
IsTestNotAsExpected(const HostDataType &expected,
const std::vector<HostAtomicType> &testValues,
cl_uint whichDestValue)
{
return expected != testValues[whichDestValue];
}
virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
MTdata d) MTdata d)
{ {
@@ -1449,7 +1458,7 @@ int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
startRefValues.size() ? &startRefValues[0] : 0, i)) startRefValues.size() ? &startRefValues[0] : 0, i))
break; // no expected value function provided break; // no expected value function provided
if (expected != destItems[i]) if (IsTestNotAsExpected(expected, destItems, i))
{ {
std::stringstream logLine; std::stringstream logLine;
logLine << "ERROR: Result " << i logLine << "ERROR: Result " << i

View File

@@ -17,6 +17,7 @@
#define HOST_ATOMICS_H_ #define HOST_ATOMICS_H_
#include "harness/testHarness.h" #include "harness/testHarness.h"
#include <mutex>
#ifdef WIN32 #ifdef WIN32
#include "Windows.h" #include "Windows.h"
@@ -94,14 +95,25 @@ template <typename AtomicType, typename CorrespondingType>
CorrespondingType host_atomic_fetch_add(volatile AtomicType *a, CorrespondingType c, CorrespondingType host_atomic_fetch_add(volatile AtomicType *a, CorrespondingType c,
TExplicitMemoryOrderType order) TExplicitMemoryOrderType order)
{ {
if constexpr (std::is_same_v<AtomicType, HOST_ATOMIC_FLOAT>)
{
static std::mutex mx;
std::lock_guard<std::mutex> lock(mx);
CorrespondingType old_value = *a;
*a += c;
return old_value;
}
else
{
#if defined( _MSC_VER ) || (defined( __INTEL_COMPILER ) && defined(WIN32)) #if defined( _MSC_VER ) || (defined( __INTEL_COMPILER ) && defined(WIN32))
return InterlockedExchangeAdd(a, c); return InterlockedExchangeAdd(a, c);
#elif defined(__GNUC__) #elif defined(__GNUC__)
return __sync_fetch_and_add(a, c); return __sync_fetch_and_add(a, c);
#else #else
log_info("Host function not implemented: atomic_fetch_add\n"); log_info("Host function not implemented: atomic_fetch_add\n");
return 0; return 0;
#endif #endif
}
} }
template <typename AtomicType, typename CorrespondingType> template <typename AtomicType, typename CorrespondingType>

View File

@@ -34,6 +34,7 @@ cl_device_atomic_capabilities gAtomicMemCap,
cl_half_rounding_mode gHalfRoundingMode = CL_HALF_RTE; cl_half_rounding_mode gHalfRoundingMode = CL_HALF_RTE;
bool gFloatAtomicsSupported = false; bool gFloatAtomicsSupported = false;
cl_device_fp_atomic_capabilities_ext gHalfAtomicCaps = 0; cl_device_fp_atomic_capabilities_ext gHalfAtomicCaps = 0;
cl_device_fp_atomic_capabilities_ext gFloatAtomicCaps = 0;
test_status InitCL(cl_device_id device) { test_status InitCL(cl_device_id device) {
auto version = get_device_cl_version(device); auto version = get_device_cl_version(device);
@@ -132,6 +133,11 @@ test_status InitCL(cl_device_id device) {
if (is_extension_available(device, "cl_ext_float_atomics")) if (is_extension_available(device, "cl_ext_float_atomics"))
{ {
gFloatAtomicsSupported = true; gFloatAtomicsSupported = true;
cl_int error = clGetDeviceInfo(
device, CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT,
sizeof(gFloatAtomicCaps), &gFloatAtomicCaps, nullptr);
test_error_ret(error, "clGetDeviceInfo failed!", TEST_FAIL);
if (is_extension_available(device, "cl_khr_fp16")) if (is_extension_available(device, "cl_khr_fp16"))
{ {
cl_int error = clGetDeviceInfo( cl_int error = clGetDeviceInfo(

View File

@@ -16,10 +16,13 @@
#include "harness/testHarness.h" #include "harness/testHarness.h"
#include "harness/kernelHelpers.h" #include "harness/kernelHelpers.h"
#include "harness/typeWrappers.h" #include "harness/typeWrappers.h"
#include "harness/conversions.h"
#include "common.h" #include "common.h"
#include "host_atomics.h" #include "host_atomics.h"
#include <algorithm>
#include <numeric>
#include <sstream> #include <sstream>
#include <vector> #include <vector>
@@ -1163,61 +1166,233 @@ REGISTER_TEST(svm_atomic_compare_exchange_weak)
template <typename HostAtomicType, typename HostDataType> template <typename HostAtomicType, typename HostDataType>
class CBasicTestFetchAdd class CBasicTestFetchAdd
: public CBasicTestMemOrderScope<HostAtomicType, HostDataType> { : public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
double min_range;
double max_range;
double max_error_fp32;
std::vector<HostDataType> ref_vals;
public: public:
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder; using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
using CBasicTestMemOrderScope<HostAtomicType, using CBasicTestMemOrderScope<HostAtomicType,
HostDataType>::MemoryOrderScopeStr; HostDataType>::MemoryOrderScopeStr;
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue; using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::StartValue;
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType; using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::DataType;
using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
CBasicTestFetchAdd(TExplicitAtomicType dataType, bool useSVM) CBasicTestFetchAdd(TExplicitAtomicType dataType, bool useSVM)
: CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
useSVM) useSVM),
{} min_range(-999.0), max_range(999.0), max_error_fp32(0.0)
virtual std::string ProgramCore() {
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
{
StartValue(0.f);
CBasicTestMemOrderScope<HostAtomicType,
HostDataType>::OldValueCheck(false);
}
}
// Generates the per-thread random float addends for the atomic_float
// fetch_add test and caches them in `ref_vals`, so repeated executions with
// the same (or smaller) thread count reuse an identical input set. Also
// derives `max_error_fp32`, an upper bound on the rounding error of summing
// the set in any order, used later by IsTestNotAsExpected.
// Returns true when it produced the reference values itself; false for
// non-float instantiations so the harness falls back to its default
// generation.
bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
MTdata d) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
{
if (threadCount > ref_vals.size())
{
// Fresh (or larger) run: regenerate the whole set in range
// [min_range, max_range] and publish it to the harness buffer.
ref_vals.resize(threadCount);
for (cl_uint i = 0; i < threadCount; i++)
ref_vals[i] = get_random_float(min_range, max_range, d);
memcpy(startRefValues, ref_vals.data(),
sizeof(HostDataType) * ref_vals.size());
// Estimate highest possible summation error for given set.
// Several summation orders are tried (ascending, descending,
// and by increasing magnitude), in single precision, against a
// double-precision reference; the spread between the smallest
// and largest result bounds the order-dependent rounding error.
std::vector<HostDataType> sums;
std::sort(ref_vals.begin(), ref_vals.end());
sums.push_back(
std::accumulate(ref_vals.begin(), ref_vals.end(), 0.f));
sums.push_back(
std::accumulate(ref_vals.rbegin(), ref_vals.rend(), 0.f));
std::sort(
ref_vals.begin(), ref_vals.end(),
[](float a, float b) { return std::abs(a) < std::abs(b); });
double precise = 0.0;
for (auto elem : ref_vals) precise += double(elem);
sums.push_back(precise);
sums.push_back(
std::accumulate(ref_vals.begin(), ref_vals.end(), 0.f));
sums.push_back(
std::accumulate(ref_vals.rbegin(), ref_vals.rend(), 0.f));
std::sort(sums.begin(), sums.end());
max_error_fp32 =
std::abs((HOST_ATOMIC_FLOAT)sums.front() - sums.back());
// restore unsorted order
// (startRefValues still holds the original ordering saved above)
memcpy(ref_vals.data(), startRefValues,
sizeof(HostDataType) * ref_vals.size());
}
else
{
// Cached run: replay the first threadCount values verbatim.
memcpy(startRefValues, ref_vals.data(),
sizeof(HostDataType) * threadCount);
}
return true;
}
return false;
}
std::string ProgramCore() override
{ {
std::string memoryOrderScope = MemoryOrderScopeStr(); std::string memoryOrderScope = MemoryOrderScopeStr();
std::string postfix(memoryOrderScope.empty() ? "" : "_explicit"); std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
return " oldValues[tid] = atomic_fetch_add" + postfix
+ "(&destMemory[0], (" + DataType().AddSubOperandTypeName() if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
+ ")tid + 3" + memoryOrderScope + ");\n" + " atomic_fetch_add" {
+ postfix + "(&destMemory[0], (" return " atomic_fetch_add" + postfix + "(&destMemory[0], ("
+ DataType().AddSubOperandTypeName() + ")tid + 3" + memoryOrderScope + DataType().AddSubOperandTypeName() + ")oldValues[tid]"
+ ");\n" + memoryOrderScope + ");\n"
" atomic_fetch_add" + " oldValues[tid] = atomic_fetch_add" + postfix
+ postfix + "(&destMemory[0], (" + "(&destMemory[tid], (" + DataType().AddSubOperandTypeName()
+ DataType().AddSubOperandTypeName() + ")tid + 3" + memoryOrderScope + ")0" + memoryOrderScope + ");\n";
+ ");\n" }
" atomic_fetch_add" else
+ postfix + "(&destMemory[0], ((" {
+ DataType().AddSubOperandTypeName() + ")tid + 3) << (sizeof(" return " oldValues[tid] = atomic_fetch_add" + postfix
+ DataType().AddSubOperandTypeName() + ")-1)*8" + memoryOrderScope + "(&destMemory[0], (" + DataType().AddSubOperandTypeName()
+ ");\n"; + ")tid + 3" + memoryOrderScope + ");\n" + " atomic_fetch_add"
+ postfix + "(&destMemory[0], ("
+ DataType().AddSubOperandTypeName() + ")tid + 3"
+ memoryOrderScope
+ ");\n"
" atomic_fetch_add"
+ postfix + "(&destMemory[0], ("
+ DataType().AddSubOperandTypeName() + ")tid + 3"
+ memoryOrderScope
+ ");\n"
" atomic_fetch_add"
+ postfix + "(&destMemory[0], (("
+ DataType().AddSubOperandTypeName() + ")tid + 3) << (sizeof("
+ DataType().AddSubOperandTypeName() + ")-1)*8"
+ memoryOrderScope + ");\n";
}
} }
virtual void HostFunction(cl_uint tid, cl_uint threadCount, void HostFunction(cl_uint tid, cl_uint threadCount,
volatile HostAtomicType *destMemory, volatile HostAtomicType *destMemory,
HostDataType *oldValues) HostDataType *oldValues) override
{ {
oldValues[tid] = host_atomic_fetch_add( if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
&destMemory[0], (HostDataType)tid + 3, MemoryOrder()); {
host_atomic_fetch_add(&destMemory[0], (HostDataType)tid + 3, host_atomic_fetch_add(&destMemory[0], (HostDataType)oldValues[tid],
MemoryOrder()); MemoryOrder());
host_atomic_fetch_add(&destMemory[0], (HostDataType)tid + 3, oldValues[tid] = host_atomic_fetch_add(
MemoryOrder()); &destMemory[tid], (HostDataType)0, MemoryOrder());
host_atomic_fetch_add(&destMemory[0], }
((HostDataType)tid + 3) else
<< (sizeof(HostDataType) - 1) * 8, {
MemoryOrder()); oldValues[tid] = host_atomic_fetch_add(
&destMemory[0], (HostDataType)tid + 3, MemoryOrder());
host_atomic_fetch_add(&destMemory[0], (HostDataType)tid + 3,
MemoryOrder());
host_atomic_fetch_add(&destMemory[0], (HostDataType)tid + 3,
MemoryOrder());
host_atomic_fetch_add(
&destMemory[0],
(((HostDataType)tid + 3) << (sizeof(HostDataType) - 1) * 8),
MemoryOrder());
}
} }
virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
HostDataType *startRefValues, HostDataType *startRefValues,
cl_uint whichDestValue) cl_uint whichDestValue) override
{ {
expected = StartValue(); expected = StartValue();
for (cl_uint i = 0; i < threadCount; i++) if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
expected += ((HostDataType)i + 3) * 3 {
+ (((HostDataType)i + 3) << (sizeof(HostDataType) - 1) * 8); if (whichDestValue == 0)
for (cl_uint i = 0; i < threadCount; i++)
expected += startRefValues[i];
}
else
{
for (cl_uint i = 0; i < threadCount; i++)
expected += ((HostDataType)i + 3) * 3
+ (((HostDataType)i + 3) << (sizeof(HostDataType) - 1) * 8);
}
return true; return true;
} }
bool IsTestNotAsExpected(const HostDataType &expected,
const std::vector<HostAtomicType> &testValues,
cl_uint whichDestValue) override
{
if (std::is_same<HostDataType, HOST_ATOMIC_FLOAT>::value)
{
if (whichDestValue == 0)
return std::abs((HOST_ATOMIC_FLOAT)expected
- testValues[whichDestValue])
> max_error_fp32;
}
return CBasicTestMemOrderScope<
HostAtomicType, HostDataType>::IsTestNotAsExpected(expected,
testValues,
whichDestValue);
}
// Verifies the per-thread oldValues buffer for the float test: after the
// kernel, every slot except 0 must still hold StartValue() (each thread's
// second fetch_add read back its own untouched destMemory[tid] slot).
// Sets `correct` accordingly.
// NOTE(review): returns !correct, i.e. reports "handled" only when a
// mismatch was found, otherwise falls through to the harness's default
// verification — confirm this matches the VerifyRefs contract.
// Fix vs. previous revision: the log_error format string passed cl_uint
// (unsigned) arguments to %d and mislabeled the observed value as a
// mismatch count; it now uses %u / %f and states what was actually seen.
bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues,
                HostAtomicType *finalValues) override
{
    if (std::is_same<HostDataType, HOST_ATOMIC_FLOAT>::value)
    {
        correct = true;
        // Slot 0 is the accumulator and is checked elsewhere; start at 1.
        for (cl_uint i = 1; i < threadCount; i++)
        {
            if (refValues[i] != StartValue())
            {
                log_error("Thread %u observed unexpected value %f "
                          "(expected start value)\n",
                          i, (double)refValues[i]);
                correct = false;
            }
        }
        return !correct;
    }
    return CBasicTestMemOrderScope<HostAtomicType,
                                   HostDataType>::VerifyRefs(correct,
                                                             threadCount,
                                                             refValues,
                                                             finalValues);
}
// Float atomic add support is an optional cl_ext_float_atomics capability,
// reported separately for local and global memory. When the device lacks the
// capability for the memory region under test, return 0 (skip, not a
// failure); otherwise run the regular base-class test path.
int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
                      cl_command_queue queue) override
{
    if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
    {
        const cl_device_fp_atomic_capabilities_ext required =
            LocalMemory() ? CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT
                          : CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT;
        if ((gFloatAtomicCaps & required) == 0)
            return 0; // skip test - not applicable
    }
    return CBasicTestMemOrderScope<HostAtomicType, HostDataType>::
        ExecuteSingleTest(deviceID, context, queue);
}
// The float variant reads back one destination slot per thread (slot 0 is
// the shared accumulator, slots 1..N-1 are per-thread), so it needs
// threadCount results; all other data types keep the base-class sizing.
cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) override
{
    if constexpr (!std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
    {
        return CBasicTestMemOrderScope<HostAtomicType, HostDataType>::
            NumResults(threadCount, deviceID);
    }
    return threadCount;
}
}; };
static int test_atomic_fetch_add_generic(cl_device_id deviceID, static int test_atomic_fetch_add_generic(cl_device_id deviceID,
@@ -1242,6 +1417,15 @@ static int test_atomic_fetch_add_generic(cl_device_id deviceID,
TYPE_ATOMIC_ULONG, useSVM); TYPE_ATOMIC_ULONG, useSVM);
EXECUTE_TEST(error, EXECUTE_TEST(error,
test_ulong.Execute(deviceID, context, queue, num_elements)); test_ulong.Execute(deviceID, context, queue, num_elements));
if (gFloatAtomicsSupported)
{
CBasicTestFetchAdd<HOST_ATOMIC_FLOAT, HOST_FLOAT> test_float(
TYPE_ATOMIC_FLOAT, useSVM);
EXECUTE_TEST(
error, test_float.Execute(deviceID, context, queue, num_elements));
}
if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4) if (AtomicTypeInfo(TYPE_ATOMIC_SIZE_T).Size(deviceID) == 4)
{ {
CBasicTestFetchAdd<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32> CBasicTestFetchAdd<HOST_ATOMIC_INTPTR_T32, HOST_INTPTR_T32>