Change memory order and scope for atomics that gate final results being stored. (#1377)

* Change memory order and scope for atomics that gate final results being stored.

memory_order_acq_rel with memory_scope_device is now used to guarantee that the correct memory consistency is observed before final results are stored.

Previously it was possible for kernels to be generated that all used relaxed memory ordering, which could lead to false-positive failures.

Fixes #1370

* Disable atomics tests with global, in-program atomics.

If the device does not support `memory_order_relaxed` or `memory_scope_device`, disable the atomics tests whose atomics are declared at program scope in global memory.

There is now an implicit requirement to support `memory_order_relaxed` and `memory_scope_device` for these tests.

* Fix misplaced parentheses.

* Change memory scope for atomic fetch and load calls in kernel.

Change the memory scope from memory_scope_work_group to
memory_scope_device, so that the ordering applies across all work items
rather than only within a single work-group.

Co-authored-by: Sreelakshmi Haridas <sharidas@quicinc.com>
This commit is contained in:
Jeremy Kemp
2022-04-28 23:34:08 +01:00
committed by GitHub
parent 35c21a8e06
commit 5d6ca3e9d1

View File

@@ -1031,20 +1031,11 @@ CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
}
code += "\n";
}
if (LocalMemory() || DeclaredInProgram())
if (LocalMemory())
{
code += " // Copy final values to host reachable buffer\n";
if (LocalMemory())
code += " barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(get_local_id(0) == 0) // first thread in workgroup\n";
else
// global atomics declared in program scope
code += R"(
if(atomic_fetch_add_explicit(&finishedThreads, 1u,
memory_order_relaxed,
memory_scope_work_group)
== get_global_size(0)-1) // last finished thread
)";
code += " for(uint dstItemIdx = 0; dstItemIdx < numDestItems; "
"dstItemIdx++)\n";
if (aTypeName == "atomic_flag")
@@ -1064,6 +1055,35 @@ CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
memory_scope_work_group);)";
}
}
else if (DeclaredInProgram())
{
// global atomics declared in program scope
code += " // Copy final values to host reachable buffer\n";
code += R"(
if(atomic_fetch_add_explicit(&finishedThreads, 1u,
memory_order_acq_rel,
memory_scope_device)
== get_global_size(0)-1) // last finished thread
)";
code += " for(uint dstItemIdx = 0; dstItemIdx < numDestItems; "
"dstItemIdx++)\n";
if (aTypeName == "atomic_flag")
{
code += R"(
finalDest[dstItemIdx] =
atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
memory_order_relaxed,
memory_scope_device);)";
}
else
{
code += R"(
finalDest[dstItemIdx] =
atomic_load_explicit(destMemory+dstItemIdx,
memory_order_relaxed,
memory_scope_device);)";
}
}
code += "}\n"
"\n";
return code;
@@ -1108,6 +1128,15 @@ int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
log_info("\t\tTest disabled\n");
return 0;
}
if (!LocalMemory() && DeclaredInProgram())
{
if (((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
|| ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0))
{
log_info("\t\tTest disabled\n");
return 0;
}
}
// set up work sizes based on device capabilities and test configuration
error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,