Change memory order and scope for atomics that gate final results being stored. (#1377)

* Change memory order and scope for atomics that gate final results being stored.

memory_order_acq_rel with memory_scope_device is now used to guarantee that the correct memory consistency is observed before final results are stored.

Previously it was possible for kernels to be generated that all used relaxed memory ordering, which could lead to false-positive failures.

Fixes #1370

* Disable atomics tests with global, in-program atomics.

If the device does not support `memory_order_relaxed` or `memory_scope_device`, disable the atomics tests whose atomics are declared at program scope in global memory.

There is now an implicit requirement to support `memory_order_relaxed` and `memory_scope_device` for these tests.

* Fix misplaced parentheses.

* Change memory scope for atomic fetch and load calls in kernel.

Change the memory scope from memory_scope_work_group to
memory_scope_device, so that the ordering applies across all work items
rather than only within a single work-group.

Co-authored-by: Sreelakshmi Haridas <sharidas@quicinc.com>
This commit is contained in:
Jeremy Kemp
2022-04-28 23:34:08 +01:00
committed by GitHub
parent 35c21a8e06
commit 5d6ca3e9d1

View File

@@ -1031,20 +1031,11 @@ CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
}
code += "\n";
}
if (LocalMemory() || DeclaredInProgram())
if (LocalMemory())
{
code += " // Copy final values to host reachable buffer\n";
if (LocalMemory())
code += " barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(get_local_id(0) == 0) // first thread in workgroup\n";
else
// global atomics declared in program scope
code += R"(
if(atomic_fetch_add_explicit(&finishedThreads, 1u,
memory_order_relaxed,
memory_scope_work_group)
== get_global_size(0)-1) // last finished thread
)";
code += " for(uint dstItemIdx = 0; dstItemIdx < numDestItems; "
"dstItemIdx++)\n";
if (aTypeName == "atomic_flag")
@@ -1064,6 +1055,35 @@ CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
memory_scope_work_group);)";
}
}
else if (DeclaredInProgram())
{
// global atomics declared in program scope
code += " // Copy final values to host reachable buffer\n";
code += R"(
if(atomic_fetch_add_explicit(&finishedThreads, 1u,
memory_order_acq_rel,
memory_scope_device)
== get_global_size(0)-1) // last finished thread
)";
code += " for(uint dstItemIdx = 0; dstItemIdx < numDestItems; "
"dstItemIdx++)\n";
if (aTypeName == "atomic_flag")
{
code += R"(
finalDest[dstItemIdx] =
atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
memory_order_relaxed,
memory_scope_device);)";
}
else
{
code += R"(
finalDest[dstItemIdx] =
atomic_load_explicit(destMemory+dstItemIdx,
memory_order_relaxed,
memory_scope_device);)";
}
}
code += "}\n"
"\n";
return code;
@@ -1108,6 +1128,15 @@ int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(
log_info("\t\tTest disabled\n");
return 0;
}
if (!LocalMemory() && DeclaredInProgram())
{
if (((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
|| ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0))
{
log_info("\t\tTest disabled\n");
return 0;
}
}
// set up work sizes based on device capabilities and test configuration
error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,