Files
Ahmed c6e0f416e7 Specify memory scope and memory order for the atomic operations in generic_address_space generic_atomics_variant generic_atomics_invariant (#2550)
Use the explicit version of the atomic_load/store and atomic_fetch_add
with memory order relaxed and memory scope workgroup to allow devices
that only support the minimum CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES which
are (CL_DEVICE_ATOMIC_ORDER_RELAXED | CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP)
to run the tests.

The test should only require the relaxed ordering and memory scope
workgroup anyway.

From the specificiation:

"The non-explicit atomic_store function requires support for OpenCL C
2.0, or OpenCL C 3.0 or newer and both the
__opencl_c_atomic_order_seq_cst and __opencl_c_atomic_scope_device
features"

"The non-explicit atomic_load function requires support for OpenCL C 2.0
or OpenCL C 3.0 or newer and both the __opencl_c_atomic_order_seq_cst
and __opencl_c_atomic_scope_device features."

"The non-explicit atomic_fetch_key functions require support for OpenCL
C 2.0, or OpenCL C 3.0 or newer and both the
__opencl_c_atomic_order_seq_cst and __opencl_c_atomic_scope_device
features."
2025-10-28 09:04:51 -07:00

226 lines
7.9 KiB
C++

//
// Copyright (c) 2024 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "errorHelpers.h"
#include "typeWrappers.h"
namespace {
// In this source, each workgroup will generate one value.
// Every other workgroup will use either a global or local
// pointer on an atomic operation.
const char* KernelSourceInvariant = R"OpenCLC(
kernel void testKernel(global atomic_int* globalPtr, local atomic_int* localPtr) {
int gid = get_global_id(0);
int tid = get_local_id(0);
int wgid = get_group_id(0);
int wgsize = get_local_size(0);
if (tid == 0) atomic_store_explicit(localPtr, 0, memory_order_relaxed, memory_scope_work_group);
barrier(CLK_LOCAL_MEM_FENCE);
// Initialise the generic pointer to
// the global.
generic atomic_int* ptr = globalPtr + wgid;
// In a workgroup-invariant way, select a localPtr instead.
if ((wgid % 2) == 0)
ptr = localPtr;
int inc = atomic_fetch_add_explicit(ptr, 1, memory_order_relaxed, memory_scope_work_group);
// In the cases where the local memory ptr was used,
// save off the final value.
if ((wgid % 2) == 0 && inc == (wgsize-1))
atomic_store_explicit(&globalPtr[wgid], inc, memory_order_relaxed, memory_scope_work_group);
}
)OpenCLC";
// In this source, each workgroup will generate two values.
// Every other work item in the workgroup will select either
// a local or global memory pointer and perform an atomic
// operation on that.
const char* KernelSourceVariant = R"OpenCLC(
kernel void testKernel(global atomic_int* globalPtr, local atomic_int* localPtr) {
int gid = get_global_id(0);
int tid = get_local_id(0);
int wgid = get_group_id(0);
int wgsize = get_local_size(0);
if (tid == 0) atomic_store_explicit(localPtr, 0, memory_order_relaxed, memory_scope_work_group);
barrier(CLK_LOCAL_MEM_FENCE);
// Initialise the generic pointer to
// the global. Two values are written per WG.
generic atomic_int* ptr = globalPtr + (wgid * 2);
// In a workgroup-invariant way, select a localPtr instead.
if ((tid % 2) == 0)
ptr = localPtr;
atomic_fetch_add_explicit(ptr, 1, memory_order_relaxed, memory_scope_work_group);
barrier(CLK_LOCAL_MEM_FENCE);
// In the cases where the local memory ptr was used,
// save off the final value.
if (tid == 0)
atomic_store_explicit(&globalPtr[(wgid * 2) + 1],
atomic_load_explicit(localPtr, memory_order_relaxed, memory_scope_work_group),
memory_order_relaxed,
memory_scope_work_group);
}
)OpenCLC";
}
REGISTER_TEST(generic_atomics_invariant)
{
const auto version = get_device_cl_version(device);
if (version < Version(2, 0)) return TEST_SKIPPED_ITSELF;
cl_int err = CL_SUCCESS;
clProgramWrapper program;
clKernelWrapper kernel;
err = create_single_kernel_helper(context, &program, &kernel, 1,
&KernelSourceInvariant, "testKernel");
test_error(err, "Failed to create test kernel");
size_t wgSize, retSize;
// Attempt to find the simd unit size for the device.
err = clGetKernelWorkGroupInfo(kernel, device,
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
sizeof(wgSize), &wgSize, &retSize);
test_error(err, "clGetKernelWorkGroupInfo failed");
// How many workgroups to run for the test.
const int numWGs = 2;
const size_t bufferSize = numWGs * sizeof(cl_uint);
clMemWrapper buffer =
clCreateBuffer(context, CL_MEM_WRITE_ONLY, bufferSize, nullptr, &err);
test_error(err, "clCreateBuffer failed");
const cl_int zero = 0;
err = clEnqueueFillBuffer(queue, buffer, &zero, sizeof(zero), 0, bufferSize,
0, nullptr, nullptr);
test_error(err, "clEnqueueFillBuffer failed");
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
test_error(err, "clSetKernelArg failed");
err = clSetKernelArg(kernel, 1, bufferSize, nullptr);
test_error(err, "clSetKernelArg failed");
const size_t globalSize = wgSize * numWGs;
err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, &globalSize,
&wgSize, 0, nullptr, nullptr);
test_error(err, "clEnqueueNDRangeKernel failed");
std::vector<cl_int> results(numWGs);
err = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, bufferSize,
results.data(), 0, nullptr, nullptr);
test_error(err, "clEnqueueReadBuffer failed");
clFinish(queue);
for (size_t i = 0; i < numWGs; ++i)
{
const cl_int expected = ((i % 2) == 0) ? wgSize - 1 : wgSize;
if (results[i] != expected)
{
log_error("Verification on device failed at index %zu\n", i);
return TEST_FAIL;
}
}
return CL_SUCCESS;
}
REGISTER_TEST(generic_atomics_variant)
{
const auto version = get_device_cl_version(device);
if (version < Version(2, 0)) return TEST_SKIPPED_ITSELF;
cl_int err = CL_SUCCESS;
clProgramWrapper program;
clKernelWrapper kernel;
err = create_single_kernel_helper(context, &program, &kernel, 1,
&KernelSourceVariant, "testKernel");
test_error(err, "Failed to create test kernel");
size_t wgSize, retSize;
// Attempt to find the simd unit size for the device.
err = clGetKernelWorkGroupInfo(kernel, device,
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
sizeof(wgSize), &wgSize, &retSize);
test_error(err, "clGetKernelWorkGroupInfo failed");
// How many workgroups to run for the test.
const int numWGs = 2;
const size_t bufferSize = numWGs * sizeof(cl_uint) * 2;
clMemWrapper buffer =
clCreateBuffer(context, CL_MEM_WRITE_ONLY, bufferSize, nullptr, &err);
test_error(err, "clCreateBuffer failed");
const cl_int zero = 0;
err = clEnqueueFillBuffer(queue, buffer, &zero, sizeof(zero), 0, bufferSize,
0, nullptr, nullptr);
test_error(err, "clEnqueueFillBuffer failed");
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
test_error(err, "clSetKernelArg failed");
err = clSetKernelArg(kernel, 1, bufferSize, nullptr);
test_error(err, "clSetKernelArg failed");
const size_t globalSize = wgSize * numWGs;
err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr, &globalSize,
&wgSize, 0, nullptr, nullptr);
test_error(err, "clEnqueueNDRangeKernel failed");
std::vector<cl_int> results(numWGs * 2);
err = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, bufferSize,
results.data(), 0, nullptr, nullptr);
test_error(err, "clEnqueueReadBuffer failed");
clFinish(queue);
const cl_int expected = wgSize / 2;
for (size_t i = 0; i < (numWGs * 2); i += 2)
{
if (results[i] != expected)
{
log_error("Verification on device failed at index %zu\n", i);
return TEST_FAIL;
}
if (results[i + 1] != expected)
{
const size_t index = i + 1;
log_error("Verification on device failed at index %zu\n", index);
return TEST_FAIL;
}
}
return CL_SUCCESS;
}