Merge branch 'main' into cl_khr_unified_svm

This commit is contained in:
Ben Ashbaugh
2025-11-06 14:34:30 -08:00
63 changed files with 4934 additions and 2745 deletions

View File

@@ -25,11 +25,18 @@ if(USE_CL_EXPERIMENTAL)
add_definitions(-DCL_EXPERIMENTAL)
endif(USE_CL_EXPERIMENTAL)
# Optionally instrument the whole build with ThreadSanitizer (TSan) to
# detect data races at runtime; both compile and link steps need the flag.
option(SANITIZER_THREAD "Build with the thread sanitiser" OFF)
if (SANITIZER_THREAD)
    add_compile_options(-fsanitize=thread)
    add_link_options(-fsanitize=thread)
endif (SANITIZER_THREAD)
#-----------------------------------------------------------
# Default Configurable Test Set
#-----------------------------------------------------------
option(D3D10_IS_SUPPORTED "Run DirectX 10 interop tests" OFF)
option(D3D11_IS_SUPPORTED "Run DirectX 11 interop tests" OFF)
option(D3D12_IS_SUPPORTED "Run DirectX 12 interop tests" OFF)
option(GL_IS_SUPPORTED "Run OpenGL interop tests" OFF)
option(GLES_IS_SUPPORTED "Run OpenGL ES interop tests" OFF)
option(VULKAN_IS_SUPPORTED "Run Vulkan interop tests" OFF)

View File

@@ -137,10 +137,10 @@ uint32_t get_channel_order_channel_count(cl_channel_order order)
case CL_RGx: return 2;
case CL_RGB:
case CL_RGBx:
case CL_sRGB:
case CL_sRGBx: return 3;
case CL_sRGB: return 3;
case CL_RGBx:
case CL_sRGBx:
case CL_RGBA:
case CL_ARGB:
case CL_BGRA:

View File

@@ -39,4 +39,20 @@ inline std::string str_sprintf(const std::string &str, Args... args)
return std::string(buffer.get(), buffer.get() + s - 1);
}
// Returns the argument, converted to std::string.
// The return type of std::filesystem::path::u8string() was
// std::string in C++17, but became std::u8string in C++20.
// Use this method to wrap the result when a std::string
// is desired.
//
// A generic template plus a non-template overload for std::string is used:
// overload resolution prefers the exact non-template overload, so the
// std::string case is a pass-through, while the generic template handles
// std::u8string (and any other character container) when it exists and is
// used.
template <typename STRING_TYPE>
inline std::string to_string(const STRING_TYPE &str)
{
    // Element-wise conversion; relies on the element type being implicitly
    // convertible to char (true for char8_t/char16_t code units).
    return std::string(str.begin(), str.end());
}
inline std::string to_string(const std::string &str) { return str; }
#endif // STRING_HELPERS_H

View File

@@ -56,6 +56,9 @@ if(VULKAN_IS_SUPPORTED)
add_subdirectory( common/vulkan_wrapper )
add_subdirectory( vulkan )
endif()
# The DirectX wrapper library is only required when the D3D12 interop
# tests are part of the build.
if(D3D12_IS_SUPPORTED)
    add_subdirectory(common/directx_wrapper)
endif ()
file(GLOB CSV_FILES "opencl_conformance_tests_*.csv")

View File

@@ -15,142 +15,167 @@
//
#include "common.h"
typedef struct {
cl_int *pA;
cl_int *pB;
cl_int *pC;
typedef struct
{
cl_int* pA;
cl_int* pB;
cl_int* pC;
} BufPtrs;
const char *set_kernel_exec_info_svm_ptrs_kernel[] = {
"struct BufPtrs;\n"
"\n"
"typedef struct {\n"
" __global int *pA;\n"
" __global int *pB;\n"
" __global int *pC;\n"
"} BufPtrs;\n"
"\n"
"__kernel void set_kernel_exec_info_test(__global BufPtrs* pBufs)\n"
"{\n"
" size_t i;\n"
" i = get_global_id(0);\n"
" pBufs->pA[i]++;\n"
" pBufs->pB[i]++;\n"
" pBufs->pC[i]++;\n"
"}\n"
const char* set_kernel_exec_info_svm_ptrs_kernel[] = {
"struct BufPtrs;\n"
"\n"
"typedef struct {\n"
" __global int *pA;\n"
" __global int *pB;\n"
" __global int *pC;\n"
"} BufPtrs;\n"
"\n"
"__kernel void set_kernel_exec_info_test(__global BufPtrs* pBufs)\n"
"{\n"
" size_t i;\n"
" i = get_global_id(0);\n"
" pBufs->pA[i]++;\n"
" pBufs->pB[i]++;\n"
" pBufs->pC[i]++;\n"
"}\n"
};
// Test that clSetKernelExecInfo works correctly with CL_KERNEL_EXEC_INFO_SVM_PTRS flag.
// Test that clSetKernelExecInfo works correctly with
// CL_KERNEL_EXEC_INFO_SVM_PTRS flag.
//
REGISTER_TEST(svm_set_kernel_exec_info_svm_ptrs)
{
clContextWrapper c = NULL;
clProgramWrapper program = NULL;
cl_uint num_devices = 0;
cl_int error = CL_SUCCESS;
clCommandQueueWrapper queues[MAXQ];
clContextWrapper c = NULL;
clProgramWrapper program = NULL;
cl_uint num_devices = 0;
cl_int error = CL_SUCCESS;
clCommandQueueWrapper queues[MAXQ];
// error = create_cl_objects(device, &set_kernel_exec_info_svm_ptrs_kernel[0],
// &context, &program, &q, &num_devices, CL_DEVICE_SVM_FINE_GRAIN);
error = create_cl_objects(device, &set_kernel_exec_info_svm_ptrs_kernel[0],
&c, &program, &queues[0], &num_devices,
CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
if(error == 1) return 0; // no devices capable of requested SVM level, so don't execute but count test as passing.
if(error < 0) return -1; // fail test.
// error = create_cl_objects(device,
// &set_kernel_exec_info_svm_ptrs_kernel[0], &context, &program, &q,
// &num_devices, CL_DEVICE_SVM_FINE_GRAIN);
error = create_cl_objects(device, &set_kernel_exec_info_svm_ptrs_kernel[0],
&c, &program, &queues[0], &num_devices,
CL_DEVICE_SVM_COARSE_GRAIN_BUFFER);
if (error == 1)
return 0; // no devices capable of requested SVM level, so don't execute
// but count test as passing.
if (error < 0) return -1; // fail test.
clKernelWrapper k = clCreateKernel(program, "set_kernel_exec_info_test", &error);
test_error(error, "clCreateKernel failed");
clKernelWrapper k =
clCreateKernel(program, "set_kernel_exec_info_test", &error);
test_error(error, "clCreateKernel failed");
size_t size = num_elements*sizeof(int);
//int* pA = (int*) clSVMalloc(c, CL_MEM_READ_WRITE | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM, sizeof(int)*num_elements, 0);
//int* pB = (int*) clSVMalloc(c, CL_MEM_READ_WRITE | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM, sizeof(int)*num_elements, 0);
//int* pC = (int*) clSVMalloc(c, CL_MEM_READ_WRITE | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM, sizeof(int)*num_elements, 0);
int* pA = (int*) clSVMAlloc(c, CL_MEM_READ_WRITE, size, 0);
int* pB = (int*) clSVMAlloc(c, CL_MEM_READ_WRITE, size, 0);
int* pC = (int*) clSVMAlloc(c, CL_MEM_READ_WRITE, size, 0);
BufPtrs* pBuf = (BufPtrs*) clSVMAlloc(c, CL_MEM_READ_WRITE, sizeof(BufPtrs), 0);
size_t size = num_elements * sizeof(int);
// int* pA = (int*) clSVMalloc(c, CL_MEM_READ_WRITE |
// CL_DEVICE_SVM_FINE_GRAIN_SYSTEM, sizeof(int)*num_elements, 0); int* pB =
// (int*) clSVMalloc(c, CL_MEM_READ_WRITE | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM,
// sizeof(int)*num_elements, 0); int* pC = (int*) clSVMalloc(c,
// CL_MEM_READ_WRITE | CL_DEVICE_SVM_FINE_GRAIN_SYSTEM,
// sizeof(int)*num_elements, 0);
int* pA = (int*)clSVMAlloc(c, CL_MEM_READ_WRITE, size, 0);
int* pB = (int*)clSVMAlloc(c, CL_MEM_READ_WRITE, size, 0);
int* pC = (int*)clSVMAlloc(c, CL_MEM_READ_WRITE, size, 0);
BufPtrs* pBuf =
(BufPtrs*)clSVMAlloc(c, CL_MEM_READ_WRITE, sizeof(BufPtrs), 0);
bool failed = false;
{
clMemWrapper ba,bb,bc,bBuf;
ba = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, size, pA, &error);
test_error(error, "clCreateBuffer failed");
bb = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, size, pB, &error);
test_error(error, "clCreateBuffer failed");
bc = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, size, pC, &error);
test_error(error, "clCreateBuffer failed");
bBuf = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, sizeof(BufPtrs), pBuf, &error);
test_error(error, "clCreateBuffer failed");
clEnqueueMapBuffer(queues[0], ba, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bb, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bc, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bBuf, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(BufPtrs), 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
for(int i = 0; i < num_elements; i++) pA[i] = pB[i] = pC[i] = 0;
pBuf->pA = pA;
pBuf->pB = pB;
pBuf->pC = pC;
error = clEnqueueUnmapMemObject(queues[0], ba, pA, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bb, pB, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bc, pC, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bBuf, pBuf, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clSetKernelArgSVMPointer(k, 0, pBuf);
test_error(error, "clSetKernelArg failed");
error = clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_SVM_PTRS, sizeof(BufPtrs), pBuf);
test_error(error, "clSetKernelExecInfo failed");
size_t range = num_elements;
error = clEnqueueNDRangeKernel(queues[0], k, 1, NULL, &range, NULL, 0, NULL, NULL);
test_error(error,"clEnqueueNDRangeKernel failed");
error = clFinish(queues[0]);
test_error(error, "clFinish failed.");
clEnqueueMapBuffer(queues[0], ba, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bb, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bc, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
for(int i = 0; i < num_elements; i++)
bool failed = false;
{
if(pA[i] + pB[i] + pC[i] != 3)
failed = true;
clMemWrapper ba, bb, bc, bBuf;
ba = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, size, pA, &error);
test_error(error, "clCreateBuffer failed");
bb = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, size, pB, &error);
test_error(error, "clCreateBuffer failed");
bc = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, size, pC, &error);
test_error(error, "clCreateBuffer failed");
bBuf = clCreateBuffer(c, CL_MEM_USE_HOST_PTR, sizeof(BufPtrs), pBuf,
&error);
test_error(error, "clCreateBuffer failed");
clEnqueueMapBuffer(queues[0], ba, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bb, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bc, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bBuf, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
0, sizeof(BufPtrs), 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
for (int i = 0; i < num_elements; i++) pA[i] = pB[i] = pC[i] = 0;
pBuf->pA = pA;
pBuf->pB = pB;
pBuf->pC = pC;
error = clEnqueueUnmapMemObject(queues[0], ba, pA, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bb, pB, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bc, pC, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bBuf, pBuf, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clSetKernelArgSVMPointer(k, 0, pBuf);
test_error(error, "clSetKernelArg failed");
error = clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_SVM_PTRS,
sizeof(BufPtrs), pBuf);
test_error(error, "clSetKernelExecInfo failed");
size_t range = num_elements;
error = clEnqueueNDRangeKernel(queues[0], k, 1, NULL, &range, NULL, 0,
NULL, NULL);
test_error(error, "clEnqueueNDRangeKernel failed");
error = clFinish(queues[0]);
test_error(error, "clFinish failed.");
// Special case testing of unsetting previously set SVM pointers
error = clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_SVM_PTRS, 0, NULL);
test_error(error,
"Unsetting previously set SVM pointers using "
"clSetKernelExecInfo failed");
clEnqueueMapBuffer(queues[0], ba, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bb, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
clEnqueueMapBuffer(queues[0], bc, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
0, size, 0, NULL, NULL, &error);
test_error(error, "clEnqueueMapBuffer failed");
for (int i = 0; i < num_elements; i++)
{
if (pA[i] + pB[i] + pC[i] != 3) failed = true;
}
error = clEnqueueUnmapMemObject(queues[0], ba, pA, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bb, pB, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bc, pC, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
}
error = clEnqueueUnmapMemObject(queues[0], ba, pA, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bb, pB, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
error = clEnqueueUnmapMemObject(queues[0], bc, pC, 0, NULL, NULL);
test_error(error, " clEnqueueUnmapMemObject failed.");
}
error = clFinish(queues[0]);
test_error(error, " clFinish failed.");
error = clFinish(queues[0]);
test_error(error, " clFinish failed.");
clSVMFree(c, pA);
clSVMFree(c, pB);
clSVMFree(c, pC);
clSVMFree(c, pBuf);
clSVMFree(c, pA);
clSVMFree(c, pB);
clSVMFree(c, pC);
clSVMFree(c, pBuf);
if (failed) return -1;
if(failed) return -1;
return 0;
return 0;
}

View File

@@ -16,92 +16,184 @@
#include "testBase.h"
#include "harness/typeWrappers.h"
#include <vector>
REGISTER_TEST(negative_create_command_queue)
{
cl_command_queue_properties device_props = 0;
cl_int error = clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES,
sizeof(device_props), &device_props, NULL);
test_error(error, "clGetDeviceInfo for CL_DEVICE_QUEUE_PROPERTIES failed");
cl_int err = 0;
clCreateCommandQueue(nullptr, device, 0, &err);
test_failure_error_ret(
err, CL_INVALID_CONTEXT,
"clCreateCommandQueue should return CL_INVALID_CONTEXT when: \"context "
"is not a valid context\" using a nullptr",
TEST_FAIL);
// CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE is the only optional property to
// clCreateCommandQueue, CL_QUEUE_PROFILING_ENABLE is mandatory.
const bool out_of_order_device_support =
device_props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
if (out_of_order_device_support)
clCreateCommandQueue(context, nullptr, 0, &err);
test_failure_error_ret(
err, CL_INVALID_DEVICE,
"clCreateCommandQueue should return CL_INVALID_DEVICE when: \"device "
"is not a valid device\" using a nullptr",
TEST_FAIL);
cl_device_id different_device = GetOpposingDevice(device);
if (different_device && device != different_device)
{
// Early return as we can't check correct error is returned for
// unsupported property.
return TEST_PASS;
clCreateCommandQueue(context, different_device, 0, &err);
test_failure_error_ret(
err, CL_INVALID_DEVICE,
"clCreateCommandQueue should return CL_INVALID_DEVICE when: "
"\"device is not associated with context\"",
TEST_FAIL);
}
// Try create a command queue with out-of-order property and check return
// code
cl_int test_error = CL_SUCCESS;
clCommandQueueWrapper test_queue = clCreateCommandQueue(
context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &test_error);
cl_queue_properties invalid_property{ static_cast<cl_queue_properties>(
-1) };
clCreateCommandQueue(context, device, invalid_property, &err);
test_failure_error_ret(
test_error, CL_INVALID_QUEUE_PROPERTIES,
"clCreateCommandQueue should return CL_INVALID_QUEUE_PROPERTIES if "
"values specified in properties are valid but are not supported by "
"the "
"device.",
err, CL_INVALID_VALUE,
"clCreateCommandQueue should return CL_INVALID_VALUE when: \"values "
"specified in properties are not valid\"",
TEST_FAIL);
cl_command_queue_properties device_queue_properties = 0;
err = clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES,
sizeof(device_queue_properties),
&device_queue_properties, nullptr);
test_error(err, "clGetDeviceInfo");
cl_command_queue_properties valid_properties[] = {
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, CL_QUEUE_PROFILING_ENABLE
};
cl_queue_properties property{ 0 };
bool missing_property = false;
// Iterate through all possible properties to find one which isn't supported
for (auto prop : valid_properties)
{
if ((device_queue_properties & prop) == 0)
{
missing_property = true;
property = prop;
break;
}
}
// This test can only run when a device does not support a property
if (missing_property)
{
clCreateCommandQueue(context, device, property, &err);
test_failure_error_ret(
err, CL_INVALID_QUEUE_PROPERTIES,
"clCreateCommandQueue should return CL_INVALID_QUEUE_PROPERTIES "
"when: \"values specified in properties are valid but are not "
"supported by the device\"",
TEST_FAIL);
}
return TEST_PASS;
}
REGISTER_TEST_VERSION(negative_create_command_queue_with_properties,
Version(2, 0))
{
cl_command_queue_properties device_props = 0;
cl_int error = clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES,
sizeof(device_props), &device_props, NULL);
test_error(error, "clGetDeviceInfo for CL_DEVICE_QUEUE_PROPERTIES failed");
cl_int err = 0;
clCreateCommandQueueWithProperties(nullptr, device, nullptr, &err);
test_failure_error_ret(
err, CL_INVALID_CONTEXT,
"clCreateCommandQueueWithProperties should return CL_INVALID_CONTEXT "
"when: \"context is not a valid context\" using a nullptr",
TEST_FAIL);
cl_command_queue_properties device_on_host_props = 0;
error = clGetDeviceInfo(device, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES,
sizeof(device_on_host_props), &device_on_host_props,
NULL);
test_error(error,
"clGetDeviceInfo for CL_DEVICE_QUEUE_ON_HOST_PROPERTIES failed");
clCreateCommandQueueWithProperties(context, nullptr, nullptr, &err);
test_failure_error_ret(
err, CL_INVALID_DEVICE,
"clCreateCommandQueueWithProperties should return CL_INVALID_DEVICE "
"when: \"device is not a valid device\" using a nullptr",
TEST_FAIL);
if (device_on_host_props != device_props)
cl_device_id different_device = GetOpposingDevice(device);
if (different_device && device != different_device)
{
log_error(
"ERROR: CL_DEVICE_QUEUE_PROPERTIES and "
"CL_DEVICE_QUEUE_ON_HOST_PROPERTIES properties should match\n");
return TEST_FAIL;
clCreateCommandQueueWithProperties(context, different_device, nullptr,
&err);
test_failure_error_ret(
err, CL_INVALID_DEVICE,
"clCreateCommandQueueWithProperties should return "
"CL_INVALID_DEVICE when: \"device is not associated with context\"",
TEST_FAIL);
}
// CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE is the only optional host-queue
// property to clCreateCommandQueueWithProperties,
// CL_QUEUE_PROFILING_ENABLE is mandatory.
const bool out_of_order_device_support =
device_props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
if (out_of_order_device_support)
{
// Early return as we can't check correct error is returned for
// unsupported property.
return TEST_PASS;
}
cl_queue_properties invalid_property{ static_cast<cl_queue_properties>(
-1) };
// Try create a command queue with out-of-order property and check return
// code
cl_command_queue_properties queue_prop_def[] = {
CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0
// Depending on the OpenCL Version, there can be up to 2 properties which
// each take values, and the list should be terminated with a 0
cl_queue_properties properties[] = { invalid_property, invalid_property, 0,
0, 0 };
clCreateCommandQueueWithProperties(context, device, properties, &err);
test_failure_error_ret(
err, CL_INVALID_VALUE,
"clCreateCommandQueueWithProperties should return CL_INVALID_VALUE "
"when: \"values specified in properties are not valid\"",
TEST_FAIL);
cl_command_queue_properties device_queue_properties = 0;
err = clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES,
sizeof(device_queue_properties),
&device_queue_properties, nullptr);
test_error(err, "clGetDeviceInfo");
cl_command_queue_properties valid_properties[] = {
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, CL_QUEUE_PROFILING_ENABLE
};
cl_int test_error = CL_SUCCESS;
clCommandQueueWrapper test_queue = clCreateCommandQueueWithProperties(
context, device, queue_prop_def, &test_error);
test_failure_error_ret(test_error, CL_INVALID_QUEUE_PROPERTIES,
"clCreateCommandQueueWithProperties should "
"return CL_INVALID_QUEUE_PROPERTIES if "
"values specified in properties are valid but "
"are not supported by the "
"device.",
TEST_FAIL);
properties[0] = CL_QUEUE_PROPERTIES;
bool missing_property = false;
// Iterate through all possible properties to find one which isn't supported
for (auto property : valid_properties)
{
if ((device_queue_properties & property) == 0)
{
missing_property = true;
properties[1] = property;
break;
}
}
if (missing_property)
{
clCreateCommandQueueWithProperties(context, device, properties, &err);
test_failure_error_ret(
err, CL_INVALID_QUEUE_PROPERTIES,
"clCreateCommandQueueWithProperties should return "
"CL_INVALID_QUEUE_PROPERTIES when: \"values specified in "
"properties are valid but are not supported by the device\"",
TEST_FAIL);
}
else if (get_device_cl_version(device) >= Version(2, 0))
{
cl_uint max_size = -1;
err = clGetDeviceInfo(device, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE,
sizeof(max_size), &max_size, nullptr);
test_error(err, "clGetDeviceInfo");
if (max_size > 0 && max_size < CL_UINT_MAX)
{
properties[0] = CL_QUEUE_PROPERTIES;
properties[1] =
CL_QUEUE_ON_DEVICE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
properties[2] = CL_QUEUE_SIZE;
properties[3] = max_size + 1;
clCreateCommandQueueWithProperties(context, device, properties,
&err);
if (err != CL_INVALID_VALUE && err != CL_INVALID_QUEUE_PROPERTIES)
{
log_error("ERROR: %s! (Got %s, expected (%s) from %s:%d)\n",
"clCreateCommandQueueWithProperties should return "
"CL_INVALID_VALUE or CL_INVALID_QUEUE_PROPERTIES "
"when: \"values specified in properties are not "
"valid\" using a queue size greather than "
"CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE",
IGetErrorString(err),
"CL_INVALID_VALUE or CL_INVALID_QUEUE_PROPERTIES",
__FILE__, __LINE__);
return TEST_FAIL;
}
}
}
return TEST_PASS;
}
@@ -166,3 +258,236 @@ REGISTER_TEST(negative_create_command_queue_with_properties_khr)
TEST_FAIL);
return TEST_PASS;
}
// Negative tests for clSetDefaultDeviceCommandQueue. Each call passes one
// invalid argument and checks that the returned error code is one of the
// values permitted by the specification.
REGISTER_TEST_VERSION(negative_set_default_device_command_queue, Version(2, 1))
{
    cl_int err = 0;
    if (get_device_cl_version(device) >= Version(3, 0))
    {
        // An OpenCL 3.0 device may support on-device queues without
        // supporting a replaceable default queue; in that case setting a
        // default queue must fail with CL_INVALID_OPERATION.
        cl_device_device_enqueue_capabilities device_capabilities = 0;
        cl_int err = clGetDeviceInfo(
            device, CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES,
            sizeof(device_capabilities), &device_capabilities, nullptr);
        test_error(err, "clGetDeviceInfo");
        if (((device_capabilities & CL_DEVICE_QUEUE_REPLACEABLE_DEFAULT) == 0)
            && ((device_capabilities & CL_DEVICE_QUEUE_SUPPORTED) == 1))
        {
            const cl_queue_properties properties[] = {
                CL_QUEUE_PROPERTIES,
                CL_QUEUE_ON_DEVICE_DEFAULT | CL_QUEUE_ON_DEVICE
                    | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
                0
            };
            clCommandQueueWrapper cmd_queue =
                clCreateCommandQueueWithProperties(context, device, properties,
                                                   &err);
            test_error(err, "clCreateCommandQueueWithProperties");
            err = clSetDefaultDeviceCommandQueue(context, device, cmd_queue);
            test_failure_error_ret(
                err, CL_INVALID_OPERATION,
                "clSetDefaultDeviceCommandQueue should return "
                "CL_INVALID_OPERATION when \"device does not support a "
                "replaceable default on-device queue\"",
                TEST_FAIL);
        }
    }
    // "context is not a valid context": nullptr handle.
    err = clSetDefaultDeviceCommandQueue(nullptr, device, queue);
    if (err != CL_INVALID_OPERATION && err != CL_INVALID_CONTEXT)
    {
        log_error("ERROR: %s! (Got %s, expected (%s) from %s:%d)\n",
                  "clSetDefaultDeviceCommandQueue should return "
                  "CL_INVALID_OPERATION or CL_INVALID_CONTEXT when: \"context "
                  "is not a valid context\" using a nullptr",
                  IGetErrorString(err),
                  "CL_INVALID_OPERATION or CL_INVALID_CONTEXT", __FILE__,
                  __LINE__);
        return TEST_FAIL;
    }
    // "device is not a valid device": nullptr handle.
    err = clSetDefaultDeviceCommandQueue(context, nullptr, queue);
    if (err != CL_INVALID_OPERATION && err != CL_INVALID_DEVICE)
    {
        log_error("ERROR: %s! (Got %s, expected (%s) from %s:%d)\n",
                  "clSetDefaultDeviceCommandQueue should return "
                  "CL_INVALID_OPERATION or CL_INVALID_DEVICE when: \"device "
                  "is not a valid device\" using a nullptr",
                  IGetErrorString(err),
                  "CL_INVALID_OPERATION or CL_INVALID_DEVICE", __FILE__,
                  __LINE__);
        return TEST_FAIL;
    }
    // "device is not associated with context": a device from another
    // platform/context, when one is available.
    cl_device_id different_device = GetOpposingDevice(device);
    if (different_device && device != different_device)
    {
        err = clSetDefaultDeviceCommandQueue(context, different_device, queue);
        if (err != CL_INVALID_OPERATION && err != CL_INVALID_DEVICE)
        {
            log_error("ERROR: %s! (Got %s, expected (%s) from %s:%d)\n",
                      "clSetDefaultDeviceCommandQueue should return "
                      "CL_INVALID_OPERATION or CL_INVALID_DEVICE when: "
                      "\"device is not associated with context\"",
                      IGetErrorString(err),
                      "CL_INVALID_OPERATION or CL_INVALID_DEVICE", __FILE__,
                      __LINE__);
            return TEST_FAIL;
        }
    }
    // "command_queue is not a valid command-queue": nullptr handle.
    err = clSetDefaultDeviceCommandQueue(context, device, nullptr);
    if (err != CL_INVALID_OPERATION && err != CL_INVALID_COMMAND_QUEUE)
    {
        log_error(
            "ERROR: %s! (Got %s, expected (%s) from %s:%d)\n",
            "clSetDefaultDeviceCommandQueue should return CL_INVALID_OPERATION "
            "or CL_INVALID_COMMAND_QUEUE when: \"command_queue is not a valid "
            "command-queue for device\" using a nullptr",
            IGetErrorString(err),
            "CL_INVALID_OPERATION or CL_INVALID_COMMAND_QUEUE", __FILE__,
            __LINE__);
        return TEST_FAIL;
    }
    // A host command queue is not a valid default on-device queue.
    {
        constexpr cl_queue_properties props[] = {
            CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0
        };
        clCommandQueueWrapper not_on_device_queue =
            clCreateCommandQueueWithProperties(context, device, props, &err);
        test_error_fail(err, "clCreateCommandQueueWithProperties failed");
        err = clSetDefaultDeviceCommandQueue(context, device,
                                             not_on_device_queue);
        if (err != CL_INVALID_OPERATION && err != CL_INVALID_COMMAND_QUEUE)
        {
            log_error("ERROR: %s! (Got %s, expected (%s) from %s:%d)\n",
                      "clSetDefaultDeviceCommandQueue should return "
                      "CL_INVALID_OPERATION or CL_INVALID_COMMAND_QUEUE when: "
                      "\"command_queue is not a valid command-queue for "
                      "device\" using a command queue that is not on device",
                      IGetErrorString(err),
                      "CL_INVALID_OPERATION or CL_INVALID_COMMAND_QUEUE",
                      __FILE__, __LINE__);
            // Report the failure to the harness; without this return the
            // test would log the error but still report TEST_PASS.
            return TEST_FAIL;
        }
    }
    return TEST_PASS;
}
// Verify clRetainCommandQueue rejects an invalid (null) queue handle.
REGISTER_TEST(negative_retain_command_queue)
{
    // A null handle must be reported as CL_INVALID_COMMAND_QUEUE.
    const cl_int status = clRetainCommandQueue(nullptr);
    test_failure_error_ret(
        status, CL_INVALID_COMMAND_QUEUE,
        "clRetainCommandQueue should return CL_INVALID_COMMAND_QUEUE when: "
        "\"command_queue is not a valid command-queue\" using a nullptr",
        TEST_FAIL);
    return TEST_PASS;
}
// Verify clReleaseCommandQueue rejects an invalid (null) queue handle.
REGISTER_TEST(negative_release_command_queue)
{
    // A null handle must be reported as CL_INVALID_COMMAND_QUEUE.
    const cl_int status = clReleaseCommandQueue(nullptr);
    test_failure_error_ret(
        status, CL_INVALID_COMMAND_QUEUE,
        "clReleaseCommandQueue should return CL_INVALID_COMMAND_QUEUE when: "
        "\"command_queue is not a valid command-queue\" using a nullptr",
        TEST_FAIL);
    return TEST_PASS;
}
// Returns true when the device exposes an out-of-order on-device queue.
// On-device queues only exist from OpenCL 2.0 onwards, so older devices
// always report false.
static bool device_supports_on_device_queue(cl_device_id deviceID)
{
    if (get_device_cl_version(deviceID) < Version(2, 0)) return false;

    cl_command_queue_properties on_device_props = 0;
    cl_int err = clGetDeviceInfo(deviceID, CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES,
                                 sizeof(on_device_props), &on_device_props,
                                 nullptr);
    test_error(err, "clGetDeviceInfo");
    return (on_device_props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
}
// Negative tests for clGetCommandQueueInfo: an invalid queue handle, a
// device-queue-only query issued on a host queue, an unsupported
// param_name, and a too-small param_value_size.
REGISTER_TEST(negative_get_command_queue_info)
{
    // "command_queue is not a valid command-queue": nullptr handle.
    cl_int err =
        clGetCommandQueueInfo(nullptr, CL_QUEUE_CONTEXT, 0, nullptr, nullptr);
    test_failure_error_ret(
        err, CL_INVALID_COMMAND_QUEUE,
        "clGetCommandQueueInfo should return CL_INVALID_COMMAND_QUEUE when: "
        "\"command_queue is not a valid command-queue\" using a nullptr",
        TEST_FAIL);
    if (device_supports_on_device_queue(device))
    {
        // CL_QUEUE_SIZE is only defined for on-device queues, so querying
        // it on a freshly-created host command queue must fail.
        const cl_queue_properties properties[] = {
            CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0
        };
        cl_int err = CL_INVALID_VALUE;
        clCommandQueueWrapper cmd_queue = clCreateCommandQueueWithProperties(
            context, device, properties, &err);
        test_error(err, "clCreateCommandQueueWithProperties");
        cl_uint queue_size = -1;
        err = clGetCommandQueueInfo(cmd_queue, CL_QUEUE_SIZE,
                                    sizeof(queue_size), &queue_size, nullptr);
        test_failure_error_ret(err, CL_INVALID_COMMAND_QUEUE,
                               "clGetCommandQueueInfo should return "
                               "CL_INVALID_COMMAND_QUEUE when: \"command_queue "
                               "is not a valid command-queue for param_name\"",
                               TEST_FAIL);
    }
    // "param_name is not one of the supported values": all-bits-set value.
    constexpr cl_command_queue_info invalid_param = -1;
    err = clGetCommandQueueInfo(queue, invalid_param, 0, nullptr, nullptr);
    test_failure_error_ret(
        err, CL_INVALID_VALUE,
        "clGetCommandQueueInfo should return CL_INVALID_VALUE when: "
        "\"param_name is not one of the supported values\"",
        TEST_FAIL);
    // param_value_size of 0 with a non-NULL param_value is smaller than the
    // cl_uint return type and must be rejected.
    cl_uint ref_count = -1;
    err = clGetCommandQueueInfo(queue, CL_QUEUE_REFERENCE_COUNT, 0, &ref_count,
                                nullptr);
    test_failure_error_ret(
        err, CL_INVALID_VALUE,
        "clGetCommandQueueInfo should return CL_INVALID_VALUE when: \"size in "
        "bytes specified by param_value_size is < size of return type and "
        "param_value is not a NULL value\"",
        TEST_FAIL);
    return TEST_PASS;
}
// Negative tests for clSetCommandQueueProperty, an API that is only
// required to work on OpenCL 1.0 devices.
REGISTER_TEST_VERSION(negative_set_command_queue_property, Version(1, 0))
{
    auto version = get_device_cl_version(device);
    if (version >= Version(1, 1))
    {
        // Implementations are allowed to return an error for
        // non-OpenCL 1.0 devices. In which case, skip the test.
        return TEST_SKIPPED_ITSELF;
    }
    // "command_queue is not a valid command-queue": nullptr handle.
    cl_queue_properties property{ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE };
    cl_int err = clSetCommandQueueProperty(nullptr, property, CL_TRUE, nullptr);
    test_failure_error_ret(
        err, CL_INVALID_COMMAND_QUEUE,
        "clSetCommandQueueProperty should return CL_INVALID_COMMAND_QUEUE "
        "when: \"command_queue is not a valid command-queue\" using a nullptr",
        TEST_FAIL);
    // An all-bits-set value is not a valid property bitfield.
    property = -1;
    err = clSetCommandQueueProperty(queue, property, CL_TRUE, nullptr);
    test_failure_error_ret(
        err, CL_INVALID_VALUE,
        "clSetCommandQueueProperty should return CL_INVALID_VALUE when: "
        "\"values specified in properties are not valid\"",
        TEST_FAIL);
    return TEST_PASS;
}

View File

@@ -336,3 +336,161 @@ REGISTER_TEST(kernel_attributes)
}
return success ? TEST_PASS : TEST_FAIL;
}
REGISTER_TEST(null_required_work_group_size)
{
cl_int error = CL_SUCCESS;
clGetKernelSuggestedLocalWorkSizeKHR_fn
clGetKernelSuggestedLocalWorkSizeKHR = nullptr;
if (is_extension_available(device, "cl_khr_suggested_local_work_size"))
{
cl_platform_id platform = nullptr;
error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform),
&platform, NULL);
test_error(error, "clGetDeviceInfo for platform failed");
clGetKernelSuggestedLocalWorkSizeKHR =
(clGetKernelSuggestedLocalWorkSizeKHR_fn)
clGetExtensionFunctionAddressForPlatform(
platform, "clGetKernelSuggestedLocalWorkSizeKHR");
test_assert_error(clGetKernelSuggestedLocalWorkSizeKHR != nullptr,
"Couldn't get function pointer for "
"clGetKernelSuggestedLocalWorkSizeKHR");
}
cl_uint device_max_dim = 0;
error = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
sizeof(device_max_dim), &device_max_dim, nullptr);
test_error(error,
"clGetDeviceInfo for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed");
test_assert_error(device_max_dim >= 3,
"CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS must be at least 3!");
std::vector<size_t> device_max_work_item_sizes(device_max_dim);
error = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES,
sizeof(size_t) * device_max_dim,
device_max_work_item_sizes.data(), nullptr);
size_t device_max_work_group_size = 0;
error = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(device_max_work_group_size),
&device_max_work_group_size, nullptr);
test_error(error,
"clGetDeviceInfo for CL_DEVICE_MAX_WORK_GROUP_SIZE failed");
clMemWrapper dst;
dst = clCreateBuffer(context, CL_MEM_READ_WRITE, 3 * sizeof(cl_int),
nullptr, &error);
struct KernelAttribInfo
{
std::string str;
cl_uint max_dim;
};
std::vector<KernelAttribInfo> attribs;
attribs.push_back({ "__attribute__((reqd_work_group_size(2,1,1)))", 1 });
attribs.push_back({ "__attribute__((reqd_work_group_size(2,3,1)))", 2 });
attribs.push_back({ "__attribute__((reqd_work_group_size(2,3,4)))", 3 });
const std::string body_str = R"(
__kernel void wg_size(__global int* dst)
{
if (get_global_id(0) == 0 &&
get_global_id(1) == 0 &&
get_global_id(2) == 0) {
dst[0] = get_local_size(0);
dst[1] = get_local_size(1);
dst[2] = get_local_size(2);
}
}
)";
for (auto& attrib : attribs)
{
const std::string source_str = attrib.str + body_str;
const char* source = source_str.c_str();
clProgramWrapper program;
clKernelWrapper kernel;
error = create_single_kernel_helper(context, &program, &kernel, 1,
&source, "wg_size");
test_error(error, "Unable to create test kernel");
error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &dst);
test_error(error, "clSetKernelArg failed");
for (cl_uint work_dim = 1; work_dim <= attrib.max_dim; work_dim++)
{
const cl_int expected[3] = { 2, work_dim >= 2 ? 3 : 1,
work_dim >= 3 ? 4 : 1 };
const size_t test_work_group_size =
expected[0] * expected[1] * expected[2];
if ((size_t)expected[0] > device_max_work_item_sizes[0]
|| (size_t)expected[1] > device_max_work_item_sizes[1]
|| (size_t)expected[2] > device_max_work_item_sizes[2]
|| test_work_group_size > device_max_work_group_size)
{
log_info("Skipping test for work_dim = %u: required work group "
"size (%i, %i, %i) (total %zu) exceeds device max "
"work group size (%zu, %zu, %zu) (total %zu)\n",
work_dim, expected[0], expected[1], expected[2],
test_work_group_size, device_max_work_item_sizes[0],
device_max_work_item_sizes[1],
device_max_work_item_sizes[2],
device_max_work_group_size);
continue;
}
const cl_int zero = 0;
error = clEnqueueFillBuffer(queue, dst, &zero, sizeof(zero), 0,
sizeof(expected), 0, nullptr, nullptr);
const size_t global_work_size[3] = { 2 * 32, 3 * 32, 4 * 32 };
error = clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr,
global_work_size, nullptr, 0,
nullptr, nullptr);
test_error(error, "clEnqueueNDRangeKernel failed");
cl_int results[3] = { -1, -1, -1 };
error = clEnqueueReadBuffer(queue, dst, CL_TRUE, 0, sizeof(results),
results, 0, nullptr, nullptr);
test_error(error, "clEnqueueReadBuffer failed");
if (results[0] != expected[0] || results[1] != expected[1]
|| results[2] != expected[2])
{
log_error("Executed local size mismatch with work_dim = %u: "
"Expected (%d,%d,%d) got (%d,%d,%d)\n",
work_dim, expected[0], expected[1], expected[2],
results[0], results[1], results[2]);
return TEST_FAIL;
}
if (clGetKernelSuggestedLocalWorkSizeKHR != nullptr)
{
size_t suggested[3] = { 1, 1, 1 };
error = clGetKernelSuggestedLocalWorkSizeKHR(
queue, kernel, work_dim, nullptr, global_work_size,
suggested);
test_error(error,
"clGetKernelSuggestedLocalWorkSizeKHR failed");
if ((cl_int)suggested[0] != expected[0]
|| (cl_int)suggested[1] != expected[1]
|| (cl_int)suggested[2] != expected[2])
{
log_error("Suggested local size mismatch with work_dim = "
"%u: Expected (%d,%d,%d) got (%d,%d,%d)\n",
work_dim, expected[0], expected[1], expected[2],
(cl_int)suggested[0], (cl_int)suggested[1],
(cl_int)suggested[2]);
return TEST_FAIL;
}
}
}
}
return TEST_PASS;
}

View File

@@ -16,6 +16,8 @@
#include "testBase.h"
#include "harness/typeWrappers.h"
#include "harness/conversions.h"
#include "harness/stringHelpers.h"
#include <array>
#include <vector>
const char *sample_single_test_kernel[] = {
@@ -87,6 +89,16 @@ const char *sample_two_kernel_program[] = {
"\n"
"}\n" };
const char *sample_sampler_size_test_kernel = R"(
__kernel void sampler_size_test(sampler_t sampler, __read_only image2d_t src, __global float4 *dst)
{
int tid = get_global_id(0);
int2 coord = (int2)(get_global_id(0), get_global_id(1));
float4 data = read_imagef(src, sampler, coord);
dst[tid] = data;
}
)";
const char *sample_mem_obj_size_test_kernel = R"(
__kernel void mem_obj_size_test(__global int *src, __global int *dst)
{
@@ -117,6 +129,14 @@ const char *sample_write_only_image_test_kernel = R"(
}
)";
const char *sample_arg_size_test_kernel = R"(
%s
__kernel void arg_size_test(%s src, __global %s *dst)
{
dst[0]=src;
}
)";
REGISTER_TEST(get_kernel_info)
{
int error;
@@ -734,6 +754,148 @@ REGISTER_TEST(negative_set_immutable_memory_to_writeable_kernel_arg)
return TEST_PASS;
}
REGISTER_TEST(negative_invalid_arg_sampler)
{
    PASSIVE_REQUIRE_IMAGE_SUPPORT(device)

    cl_int err = CL_SUCCESS;
    clProgramWrapper program;
    clKernelWrapper test_kernel;

    // Build a program whose kernel declares a sampler_t as argument 0.
    err = create_single_kernel_helper(context, &program, nullptr, 1,
                                      &sample_sampler_size_test_kernel,
                                      nullptr);
    test_error(err, "Unable to build test program");

    test_kernel = clCreateKernel(program, "sampler_size_test", &err);
    test_error(err,
               "Unable to get sampler_size_test kernel for built program");

    // A null arg_value for a sampler_t argument must be rejected with
    // CL_INVALID_SAMPLER.
    err = clSetKernelArg(test_kernel, 0, sizeof(cl_sampler), nullptr);
    test_failure_error_ret(
        err, CL_INVALID_SAMPLER,
        "clSetKernelArg is supposed to fail with CL_INVALID_SAMPLER when "
        "argument is declared to be of type sampler_t and the specified "
        "arg_value is not a valid sampler object",
        TEST_FAIL);

    return TEST_PASS;
}
REGISTER_TEST(negative_invalid_arg_sampler_size)
{
    PASSIVE_REQUIRE_IMAGE_SUPPORT(device)

    cl_int err = CL_SUCCESS;
    clProgramWrapper program;
    clKernelWrapper test_kernel;

    // Build a program whose kernel declares a sampler_t as argument 0.
    err = create_single_kernel_helper(context, &program, nullptr, 1,
                                      &sample_sampler_size_test_kernel,
                                      nullptr);
    test_error(err, "Unable to build test program");

    test_kernel = clCreateKernel(program, "sampler_size_test", &err);
    test_error(err,
               "Unable to get sampler_size_test kernel for built program");

    clSamplerWrapper sampler = clCreateSampler(
        context, CL_FALSE, CL_ADDRESS_NONE, CL_FILTER_NEAREST, &err);
    test_error(err, "Unable to create sampler");

    // An arg_size larger than sizeof(cl_sampler) must be rejected.
    err = clSetKernelArg(test_kernel, 0, sizeof(cl_sampler) * 2, &sampler);
    test_failure_error_ret(
        err, CL_INVALID_ARG_SIZE,
        "clSetKernelArg is supposed to fail with CL_INVALID_ARG_SIZE when "
        "argument is a sampler object and arg_size > sizeof(cl_sampler)",
        TEST_FAIL);

    // ...and so must an arg_size smaller than sizeof(cl_sampler).
    err = clSetKernelArg(test_kernel, 0, sizeof(cl_sampler) / 2, &sampler);
    test_failure_error_ret(
        err, CL_INVALID_ARG_SIZE,
        "clSetKernelArg is supposed to fail with CL_INVALID_ARG_SIZE when "
        "argument is a sampler object and arg_size < sizeof(cl_sampler)",
        TEST_FAIL);

    return TEST_PASS;
}
REGISTER_TEST(negative_invalid_arg_size)
{
    // Element types to exercise; unsupported ones are skipped below.
    const std::vector<ExplicitType> exp_types = {
        kChar, kUChar, kShort, kUShort, kInt,   kUInt,
        kLong, kULong, kFloat, kHalf,   kDouble
    };

    const bool fp16_supported = is_extension_available(device, "cl_khr_fp16");
    const bool fp64_supported = is_extension_available(device, "cl_khr_fp64");

    for (const auto type : exp_types)
    {
        // Skip types the device does not support, and multi-word type
        // names which cannot be spliced into the kernel source.
        if ((type == kLong || type == kULong) && !gHasLong) continue;
        if (type == kDouble && !fp64_supported) continue;
        if (type == kHalf && !fp16_supported) continue;
        if (strchr(get_explicit_type_name(type), ' ') != 0) continue;

        const std::array<unsigned int, 5> vec_widths = { 1, 2, 4, 8, 16 };
        // Scratch argument storage large enough for the widest type.
        std::vector<char> scratch(sizeof(cl_ulong16), 0);

        for (const unsigned int width : vec_widths)
        {
            clProgramWrapper program;
            clKernelWrapper kernel;

            const size_t arg_bytes = get_explicit_type_size(type) * width;

            // Compose the (possibly vectorized) type name, e.g. "float4".
            std::ostringstream type_name;
            type_name << get_explicit_type_name(type);
            if (width != 1) type_name << width;

            // fp16/fp64 kernels need the matching extension pragma.
            std::string pragma_str;
            if (type == kDouble)
                pragma_str =
                    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
            if (type == kHalf)
                pragma_str =
                    "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";

            const auto tn = type_name.str();
            std::string program_source =
                str_sprintf(std::string(sample_arg_size_test_kernel),
                            pragma_str.c_str(), tn.c_str(), tn.c_str());
            const char *src = program_source.c_str();
            cl_int error = create_single_kernel_helper(
                context, &program, &kernel, 1, &src, "arg_size_test");
            test_error(error, "Unable to build test program!");

            // A too-small arg_size must produce CL_INVALID_ARG_SIZE.
            const size_t reduced = arg_bytes / 2;
            error = clSetKernelArg(kernel, 0, reduced, scratch.data());
            if (error != CL_INVALID_ARG_SIZE)
            {
                std::stringstream sstr;
                sstr << "clSetKernelArg is supposed to fail "
                        "with CL_INVALID_ARG_SIZE with type "
                     << type_name.str() << " and sizeof " << reduced
                     << std::endl;
                log_error("%s", sstr.str().c_str());
                return TEST_FAIL;
            }
        }
    }
    return TEST_PASS;
}
REGISTER_TEST(negative_invalid_arg_mem_obj)
{
cl_int error = CL_SUCCESS;

View File

@@ -255,18 +255,12 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
}
// Initialize our storage
std::unique_ptr<cl_int[]> l_bin_counts(new cl_int[number_of_bins]);
if (!l_bin_counts)
{
log_error("add_index_bin_test FAILED to allocate initial values for "
"bin_counters.\n");
return -1;
}
std::vector<cl_int> l_bin_counts(number_of_bins);
int i;
for (i = 0; i < number_of_bins; i++) l_bin_counts[i] = 0;
err = clEnqueueWriteBuffer(queue, bin_counters, true, 0,
sizeof(cl_int) * number_of_bins,
l_bin_counts.get(), 0, NULL, NULL);
l_bin_counts.data(), 0, NULL, NULL);
if (err)
{
log_error("add_index_bin_test FAILED to set initial values for "
@@ -275,19 +269,12 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
return -1;
}
std::unique_ptr<cl_int[]> values(
new cl_int[number_of_bins * max_counts_per_bin]);
if (!values)
{
log_error(
"add_index_bin_test FAILED to allocate initial values for bins.\n");
return -1;
}
std::vector<cl_int> values(number_of_bins * max_counts_per_bin);
for (i = 0; i < number_of_bins * max_counts_per_bin; i++) values[i] = -1;
err = clEnqueueWriteBuffer(queue, bins, true, 0,
sizeof(cl_int) * number_of_bins
* max_counts_per_bin,
values.get(), 0, NULL, NULL);
values.data(), 0, NULL, NULL);
if (err)
{
log_error(
@@ -296,13 +283,7 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
return -1;
}
std::unique_ptr<cl_int[]> l_bin_assignments(new cl_int[number_of_items]);
if (!l_bin_assignments)
{
log_error("add_index_bin_test FAILED to allocate initial values for "
"l_bin_assignments.\n");
return -1;
}
std::vector<cl_int> l_bin_assignments(number_of_items);
for (i = 0; i < number_of_items; i++)
{
int bin = random_in_range(0, number_of_bins - 1, d);
@@ -326,7 +307,7 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
}
err = clEnqueueWriteBuffer(queue, bin_assignments, true, 0,
sizeof(cl_int) * number_of_items,
l_bin_assignments.get(), 0, NULL, NULL);
l_bin_assignments.data(), 0, NULL, NULL);
if (err)
{
log_error("add_index_bin_test FAILED to set initial values for "
@@ -355,34 +336,22 @@ int add_index_bin_test(size_t *global_threads, cl_command_queue queue,
return -1;
}
std::unique_ptr<cl_int[]> final_bin_assignments(
new cl_int[number_of_bins * max_counts_per_bin]);
if (!final_bin_assignments)
{
log_error("add_index_bin_test FAILED to allocate initial values for "
"final_bin_assignments.\n");
return -1;
}
std::vector<cl_int> final_bin_assignments(number_of_bins
* max_counts_per_bin);
err = clEnqueueReadBuffer(queue, bins, true, 0,
sizeof(cl_int) * number_of_bins
* max_counts_per_bin,
final_bin_assignments.get(), 0, NULL, NULL);
final_bin_assignments.data(), 0, NULL, NULL);
if (err)
{
log_error("add_index_bin_test FAILED to read back bins: %d\n", err);
return -1;
}
std::unique_ptr<cl_int[]> final_bin_counts(new cl_int[number_of_bins]);
if (!final_bin_counts)
{
log_error("add_index_bin_test FAILED to allocate initial values for "
"final_bin_counts.\n");
return -1;
}
std::vector<cl_int> final_bin_counts(number_of_bins);
err = clEnqueueReadBuffer(queue, bin_counters, true, 0,
sizeof(cl_int) * number_of_bins,
final_bin_counts.get(), 0, NULL, NULL);
final_bin_counts.data(), 0, NULL, NULL);
if (err)
{
log_error("add_index_bin_test FAILED to read back bin_counters: %d\n",

View File

@@ -193,15 +193,38 @@ int AtomicTypeInfo::IsSupported(cl_device_id device)
template<> cl_int AtomicTypeExtendedInfo<cl_int>::MinValue() {return CL_INT_MIN;}
template<> cl_uint AtomicTypeExtendedInfo<cl_uint>::MinValue() {return 0;}
template<> cl_long AtomicTypeExtendedInfo<cl_long>::MinValue() {return CL_LONG_MIN;}
template<> cl_ulong AtomicTypeExtendedInfo<cl_ulong>::MinValue() {return 0;}
template<> cl_float AtomicTypeExtendedInfo<cl_float>::MinValue() {return CL_FLT_MIN;}
template<> cl_double AtomicTypeExtendedInfo<cl_double>::MinValue() {return CL_DBL_MIN;}
template <> cl_ulong AtomicTypeExtendedInfo<cl_ulong>::MinValue() { return 0; }
template <> cl_half AtomicTypeExtendedInfo<cl_half>::MinValue()
{
return cl_half_from_float(-CL_HALF_MAX, gHalfRoundingMode);
}
template <> cl_float AtomicTypeExtendedInfo<cl_float>::MinValue()
{
return -CL_FLT_MAX;
}
template <> cl_double AtomicTypeExtendedInfo<cl_double>::MinValue()
{
return -CL_DBL_MAX;
}
template<> cl_int AtomicTypeExtendedInfo<cl_int>::MaxValue() {return CL_INT_MAX;}
template<> cl_uint AtomicTypeExtendedInfo<cl_uint>::MaxValue() {return CL_UINT_MAX;}
template <> cl_int AtomicTypeExtendedInfo<cl_int>::MaxValue()
{
return CL_INT_MAX;
}
template <> cl_uint AtomicTypeExtendedInfo<cl_uint>::MaxValue()
{
return CL_UINT_MAX;
}
template<> cl_long AtomicTypeExtendedInfo<cl_long>::MaxValue() {return CL_LONG_MAX;}
template<> cl_ulong AtomicTypeExtendedInfo<cl_ulong>::MaxValue() {return CL_ULONG_MAX;}
template<> cl_float AtomicTypeExtendedInfo<cl_float>::MaxValue() {return CL_FLT_MAX;}
template <> cl_half AtomicTypeExtendedInfo<cl_half>::MaxValue()
{
return cl_half_from_float(CL_HALF_MAX, gHalfRoundingMode);
}
template <> cl_float AtomicTypeExtendedInfo<cl_float>::MaxValue()
{
return CL_FLT_MAX;
}
template<> cl_double AtomicTypeExtendedInfo<cl_double>::MaxValue() {return CL_DBL_MAX;}
cl_int getSupportedMemoryOrdersAndScopes(

View File

@@ -79,6 +79,7 @@ extern cl_device_atomic_capabilities gAtomicMemCap,
extern cl_half_rounding_mode gHalfRoundingMode;
extern bool gFloatAtomicsSupported;
extern cl_device_fp_atomic_capabilities_ext gHalfAtomicCaps;
extern cl_device_fp_atomic_capabilities_ext gDoubleAtomicCaps;
extern cl_device_fp_atomic_capabilities_ext gFloatAtomicCaps;
extern const char *
@@ -893,15 +894,16 @@ CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
header += std::string("__global volatile ") + aTypeName + " destMemory["
+ ss.str() + "] = {\n";
ss.str("");
if (CBasicTest<HostAtomicType, HostDataType>::DataType()._type
== TYPE_ATOMIC_FLOAT)
ss << std::setprecision(10) << _startValue;
else if (CBasicTest<HostAtomicType, HostDataType>::DataType()._type
== TYPE_ATOMIC_HALF)
ss << static_cast<HostDataType>(
cl_half_to_float(static_cast<cl_half>(_startValue)));
ss << cl_half_to_float(static_cast<cl_half>(_startValue));
else
ss << _startValue;
for (cl_uint i = 0; i < maxNumDestItems; i++)
{
if (aTypeName == "atomic_flag")

View File

@@ -18,7 +18,6 @@
#include "harness/testHarness.h"
#include <mutex>
#include "CL/cl_half.h"
#ifdef WIN32
@@ -99,7 +98,19 @@ template <typename AtomicType, typename CorrespondingType>
CorrespondingType host_atomic_fetch_add(volatile AtomicType *a, CorrespondingType c,
TExplicitMemoryOrderType order)
{
if constexpr (std::is_same_v<AtomicType, HOST_ATOMIC_FLOAT>)
if constexpr (std::is_same_v<AtomicType, HOST_ATOMIC_HALF>)
{
static std::mutex mx;
std::lock_guard<std::mutex> lock(mx);
CorrespondingType old_value = *a;
*a = cl_half_from_float((cl_half_to_float(*a) + cl_half_to_float(c)),
gHalfRoundingMode);
return old_value;
}
else if constexpr (
std::is_same_v<
AtomicType,
HOST_ATOMIC_FLOAT> || std::is_same_v<AtomicType, HOST_ATOMIC_DOUBLE>)
{
static std::mutex mx;
std::lock_guard<std::mutex> lock(mx);
@@ -109,7 +120,7 @@ CorrespondingType host_atomic_fetch_add(volatile AtomicType *a, CorrespondingTyp
}
else
{
#if defined( _MSC_VER ) || (defined( __INTEL_COMPILER ) && defined(WIN32))
#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
return InterlockedExchangeAdd(a, c);
#elif defined(__GNUC__)
return __sync_fetch_and_add(a, c);
@@ -124,7 +135,15 @@ template <typename AtomicType, typename CorrespondingType>
CorrespondingType host_atomic_fetch_sub(volatile AtomicType *a, CorrespondingType c,
TExplicitMemoryOrderType order)
{
if constexpr (std::is_same_v<AtomicType, HOST_ATOMIC_HALF>)
if constexpr (std::is_same_v<AtomicType, HOST_ATOMIC_FLOAT>)
{
static std::mutex mx;
std::lock_guard<std::mutex> lock(mx);
CorrespondingType old_value = *a;
*a -= c;
return old_value;
}
else if constexpr (std::is_same_v<AtomicType, HOST_ATOMIC_HALF>)
{
static std::mutex mx;
std::lock_guard<std::mutex> lock(mx);
@@ -173,14 +192,30 @@ bool host_atomic_compare_exchange(volatile AtomicType *a, CorrespondingType *exp
TExplicitMemoryOrderType order_failure)
{
CorrespondingType tmp;
if constexpr (std::is_same_v<AtomicType, HOST_ATOMIC_FLOAT>)
if constexpr (std::is_same_v<AtomicType, HOST_ATOMIC_HALF>)
{
static std::mutex mtx;
std::lock_guard<std::mutex> lock(mtx);
tmp = *reinterpret_cast<volatile cl_half *>(a);
if (cl_half_to_float(tmp) == cl_half_to_float(*expected))
{
*reinterpret_cast<volatile cl_half *>(a) = desired;
return true;
}
*expected = tmp;
}
else if constexpr (
std::is_same_v<
AtomicType,
HOST_ATOMIC_DOUBLE> || std::is_same_v<AtomicType, HOST_ATOMIC_FLOAT>)
{
static std::mutex mtx;
std::lock_guard<std::mutex> lock(mtx);
tmp = *reinterpret_cast<volatile float *>(a);
if (tmp == *expected)
{
*reinterpret_cast<volatile float *>(a) = desired;
*a = desired;
return true;
}
*expected = tmp;
@@ -188,7 +223,6 @@ bool host_atomic_compare_exchange(volatile AtomicType *a, CorrespondingType *exp
else
{
#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
tmp = InterlockedCompareExchange(a, desired, *expected);
#elif defined(__GNUC__)
tmp = __sync_val_compare_and_swap(a, *expected, desired);

View File

@@ -34,6 +34,7 @@ cl_device_atomic_capabilities gAtomicMemCap,
cl_half_rounding_mode gHalfRoundingMode = CL_HALF_RTE;
bool gFloatAtomicsSupported = false;
cl_device_fp_atomic_capabilities_ext gHalfAtomicCaps = 0;
cl_device_fp_atomic_capabilities_ext gDoubleAtomicCaps = 0;
cl_device_fp_atomic_capabilities_ext gFloatAtomicCaps = 0;
test_status InitCL(cl_device_id device) {
@@ -134,6 +135,14 @@ test_status InitCL(cl_device_id device) {
{
gFloatAtomicsSupported = true;
if (is_extension_available(device, "cl_khr_fp64"))
{
cl_int error = clGetDeviceInfo(
device, CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT,
sizeof(gDoubleAtomicCaps), &gDoubleAtomicCaps, nullptr);
test_error_ret(error, "clGetDeviceInfo failed!", TEST_FAIL);
}
cl_int error = clGetDeviceInfo(
device, CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT,
sizeof(gFloatAtomicCaps), &gFloatAtomicCaps, nullptr);

View File

@@ -417,7 +417,7 @@ public:
correct = true;
for (cl_uint i = 0; i < threadCount; i++)
{
if constexpr (std::is_same<HostDataType, cl_half>::value)
if constexpr (std::is_same_v<HostDataType, cl_half>)
{
HostDataType test = cl_half_from_float(static_cast<float>(i),
gHalfRoundingMode);
@@ -1163,13 +1163,30 @@ REGISTER_TEST(svm_atomic_compare_exchange_weak)
num_elements, true);
}
template <typename T> double kahan_sum(const std::vector<T> &nums)
{
return 0.0;
}
template <> double kahan_sum<double>(const std::vector<double> &nums)
{
double sum = 0.0;
double compensation = 0.0;
for (double num : nums)
{
double y = num - compensation;
double t = sum + y;
compensation = (t - sum) - y;
sum = t;
}
return sum;
}
template <typename HostAtomicType, typename HostDataType>
class CBasicTestFetchAdd
: public CBasicTestMemOrderScope<HostAtomicType, HostDataType> {
double min_range;
double max_range;
double max_error_fp32;
double max_error;
std::vector<HostDataType> ref_vals;
public:
@@ -1182,26 +1199,98 @@ public:
CBasicTestFetchAdd(TExplicitAtomicType dataType, bool useSVM)
: CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType,
useSVM),
min_range(-999.0), max_range(999.0), max_error_fp32(0.0)
min_range(-999.0), max_range(999.0), max_error(0.0)
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT> || std::is_same_v<HostDataType, HOST_DOUBLE>)
{
StartValue(0.f);
StartValue((HostDataType)0.0);
CBasicTestMemOrderScope<HostAtomicType,
HostDataType>::OldValueCheck(false);
}
}
template <typename Iterator> float accum_halfs(Iterator begin, Iterator end)
{
cl_half sum = 0;
for (auto it = begin; it != end; ++it)
{
sum = cl_half_from_float(cl_half_to_float(sum)
+ cl_half_to_float(*it),
gHalfRoundingMode);
}
return cl_half_to_float(sum);
}
bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
MTdata d) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
{
if (threadCount > ref_vals.size())
{
ref_vals.resize(threadCount);
for (cl_uint i = 0; i < threadCount; i++)
ref_vals[i] = get_random_float(min_range, max_range, d);
ref_vals[i] = cl_half_from_float(
get_random_float(min_range, max_range, d),
gHalfRoundingMode);
memcpy(startRefValues, ref_vals.data(),
sizeof(HostDataType) * ref_vals.size());
// Estimate highest possible summation error for given set.
std::vector<float> sums;
std::sort(ref_vals.begin(), ref_vals.end(),
[](cl_half a, cl_half b) {
return cl_half_to_float(a) < cl_half_to_float(b);
});
sums.push_back(accum_halfs(ref_vals.begin(), ref_vals.end()));
sums.push_back(accum_halfs(ref_vals.rbegin(), ref_vals.rend()));
std::sort(ref_vals.begin(), ref_vals.end(),
[](cl_half a, cl_half b) {
return std::abs(cl_half_to_float(a))
< std::abs(cl_half_to_float(b));
});
float precise = 0.f;
for (auto elem : ref_vals) precise += cl_half_to_float(elem);
sums.push_back(precise);
sums.push_back(accum_halfs(ref_vals.begin(), ref_vals.end()));
sums.push_back(accum_halfs(ref_vals.rbegin(), ref_vals.rend()));
std::sort(sums.begin(), sums.end());
max_error = std::abs(sums.front() - sums.back());
// restore unsorted order
memcpy(ref_vals.data(), startRefValues,
sizeof(HostDataType) * ref_vals.size());
}
else
{
memcpy(startRefValues, ref_vals.data(),
sizeof(HostDataType) * threadCount);
}
return true;
}
else if constexpr (
std::is_same_v<
HostDataType,
HOST_FLOAT> || std::is_same_v<HostDataType, HOST_DOUBLE>)
{
if (threadCount > ref_vals.size())
{
ref_vals.resize(threadCount);
for (cl_uint i = 0; i < threadCount; i++)
if constexpr (std::is_same_v<HostDataType, HOST_DOUBLE>)
ref_vals[i] =
get_random_double(min_range, max_range, d);
else
ref_vals[i] = get_random_float(min_range, max_range, d);
memcpy(startRefValues, ref_vals.data(),
sizeof(HostDataType) * ref_vals.size());
@@ -1216,12 +1305,17 @@ public:
sums.push_back(
std::accumulate(ref_vals.rbegin(), ref_vals.rend(), 0.f));
std::sort(
ref_vals.begin(), ref_vals.end(),
[](float a, float b) { return std::abs(a) < std::abs(b); });
std::sort(ref_vals.begin(), ref_vals.end(),
[](HostDataType a, HostDataType b) {
return std::abs(a) < std::abs(b);
});
double precise = 0.0;
for (auto elem : ref_vals) precise += double(elem);
if constexpr (std::is_same_v<HostDataType, HOST_DOUBLE>)
precise = kahan_sum(ref_vals);
else
for (auto elem : ref_vals) precise += double(elem);
sums.push_back(precise);
sums.push_back(
@@ -1231,8 +1325,7 @@ public:
std::accumulate(ref_vals.rbegin(), ref_vals.rend(), 0.f));
std::sort(sums.begin(), sums.end());
max_error_fp32 =
std::abs((HOST_ATOMIC_FLOAT)sums.front() - sums.back());
max_error = std::abs(sums.front() - sums.back());
// restore unsorted order
memcpy(ref_vals.data(), startRefValues,
@@ -1252,7 +1345,10 @@ public:
std::string memoryOrderScope = MemoryOrderScopeStr();
std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_DOUBLE> || std::is_same_v<HostDataType, HOST_FLOAT>)
{
return " atomic_fetch_add" + postfix + "(&destMemory[0], ("
+ DataType().AddSubOperandTypeName() + ")oldValues[tid]"
@@ -1286,7 +1382,10 @@ public:
volatile HostAtomicType *destMemory,
HostDataType *oldValues) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_DOUBLE> || std::is_same_v<HostDataType, HOST_FLOAT>)
{
host_atomic_fetch_add(&destMemory[0], (HostDataType)oldValues[tid],
MemoryOrder());
@@ -1312,7 +1411,23 @@ public:
cl_uint whichDestValue) override
{
expected = StartValue();
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
{
if (whichDestValue == 0)
{
for (cl_uint i = 0; i < threadCount; i++)
{
expected = cl_half_from_float(
cl_half_to_float(expected)
+ cl_half_to_float(startRefValues[i]),
gHalfRoundingMode);
}
}
}
else if constexpr (
std::is_same_v<
HostDataType,
HOST_DOUBLE> || std::is_same_v<HostDataType, HOST_FLOAT>)
{
if (whichDestValue == 0)
for (cl_uint i = 0; i < threadCount; i++)
@@ -1331,12 +1446,22 @@ public:
const std::vector<HostAtomicType> &testValues,
cl_uint whichDestValue) override
{
if (std::is_same<HostDataType, HOST_ATOMIC_FLOAT>::value)
if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
{
if (whichDestValue == 0)
return std::abs((HOST_ATOMIC_FLOAT)expected
return std::abs(cl_half_to_float(expected)
- cl_half_to_float(testValues[whichDestValue]))
> max_error;
}
else if constexpr (
std::is_same_v<
HostDataType,
HOST_DOUBLE> || std::is_same_v<HostDataType, HOST_FLOAT>)
{
if (whichDestValue == 0)
return std::abs((HostDataType)expected
- testValues[whichDestValue])
> max_error_fp32;
> max_error;
}
return CBasicTestMemOrderScope<
HostAtomicType, HostDataType>::IsTestNotAsExpected(expected,
@@ -1346,7 +1471,10 @@ public:
bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues,
HostAtomicType *finalValues) override
{
if (std::is_same<HostDataType, HOST_ATOMIC_FLOAT>::value)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_DOUBLE> || std::is_same_v<HostDataType, HOST_FLOAT>)
{
correct = true;
for (cl_uint i = 1; i < threadCount; i++)
@@ -1369,7 +1497,28 @@ public:
int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
cl_command_queue queue) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
{
if (LocalMemory()
&& (gHalfAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT) == 0)
return 0; // skip test - not applicable
if (!LocalMemory()
&& (gHalfAtomicCaps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT) == 0)
return 0;
}
else if constexpr (std::is_same_v<HostDataType, HOST_DOUBLE>)
{
if (LocalMemory()
&& (gDoubleAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT) == 0)
return 0; // skip test - not applicable
if (!LocalMemory()
&& (gDoubleAtomicCaps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT)
== 0)
return 0;
}
else if constexpr (std::is_same_v<HostDataType, HOST_FLOAT>)
{
if (LocalMemory()
&& (gFloatAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT) == 0)
@@ -1385,7 +1534,10 @@ public:
}
cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_DOUBLE> || std::is_same_v<HostDataType, HOST_FLOAT>)
{
return threadCount;
}
@@ -1420,6 +1572,16 @@ static int test_atomic_fetch_add_generic(cl_device_id deviceID,
if (gFloatAtomicsSupported)
{
CBasicTestFetchAdd<HOST_ATOMIC_HALF, HOST_HALF> test_half(
TYPE_ATOMIC_HALF, useSVM);
EXECUTE_TEST(error,
test_half.Execute(deviceID, context, queue, num_elements));
CBasicTestFetchAdd<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double(
TYPE_ATOMIC_DOUBLE, useSVM);
EXECUTE_TEST(
error, test_double.Execute(deviceID, context, queue, num_elements));
CBasicTestFetchAdd<HOST_ATOMIC_FLOAT, HOST_FLOAT> test_float(
TYPE_ATOMIC_FLOAT, useSVM);
EXECUTE_TEST(
@@ -1506,7 +1668,10 @@ public:
useSVM),
min_range(-999.0), max_range(999.0), max_error(0.0)
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_FLOAT> || std::is_same_v<HostDataType, HOST_HALF>)
{
StartValue(0);
CBasicTestMemOrderScope<HostAtomicType,
@@ -1514,6 +1679,13 @@ public:
}
}
template <typename Iterator>
HostDataType subtract(Iterator begin, Iterator end)
{
HostDataType res = 0;
for (auto it = begin; it != end; ++it) res = res - *it;
return res;
}
template <typename Iterator>
float subtract_halfs(Iterator begin, Iterator end)
{
cl_half res = 0;
@@ -1528,12 +1700,53 @@ public:
bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
MTdata d) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
if constexpr (std::is_same_v<HostDataType, HOST_FLOAT>)
{
if (threadCount > ref_vals.size())
{
ref_vals.resize(threadCount);
for (cl_uint i = 0; i < threadCount; i++)
ref_vals[i] = get_random_float(min_range, max_range, d);
memcpy(startRefValues, ref_vals.data(),
sizeof(HostDataType) * ref_vals.size());
// Estimate highest possible subtraction error for given set.
std::vector<HostDataType> sums;
std::sort(ref_vals.begin(), ref_vals.end());
sums.push_back(subtract(ref_vals.begin(), ref_vals.end()));
sums.push_back(subtract(ref_vals.rbegin(), ref_vals.rend()));
std::sort(
ref_vals.begin(), ref_vals.end(),
[](float a, float b) { return std::abs(a) < std::abs(b); });
double precise = 0.0;
for (auto elem : ref_vals) precise += double(elem);
sums.push_back(precise);
sums.push_back(subtract(ref_vals.begin(), ref_vals.end()));
sums.push_back(subtract(ref_vals.rbegin(), ref_vals.rend()));
std::sort(sums.begin(), sums.end());
max_error =
std::abs((HOST_ATOMIC_FLOAT)sums.front() - sums.back());
// restore unsorted order
memcpy(ref_vals.data(), startRefValues,
sizeof(HostDataType) * ref_vals.size());
}
else
{
memcpy(startRefValues, ref_vals.data(),
sizeof(HostDataType) * threadCount);
}
return true;
}
if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
{
if (threadCount > ref_vals.size())
{
ref_vals.resize(threadCount);
for (cl_uint i = 0; i < threadCount; i++)
ref_vals[i] = cl_half_from_float(
get_random_float(min_range, max_range, d),
@@ -1563,7 +1776,6 @@ public:
float precise = 0.f;
for (auto elem : ref_vals) precise -= cl_half_to_float(elem);
sums.push_back(precise);
sums.push_back(
subtract_halfs(ref_vals.begin(), ref_vals.end()));
sums.push_back(
@@ -1571,7 +1783,6 @@ public:
std::sort(sums.begin(), sums.end());
max_error = std::abs(sums.front() - sums.back());
// restore unsorted order
memcpy(ref_vals.data(), startRefValues,
sizeof(HostDataType) * ref_vals.size());
@@ -1590,7 +1801,10 @@ public:
std::string memoryOrderScope = MemoryOrderScopeStr();
std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT>)
{
return " atomic_fetch_sub" + postfix + "(&destMemory[0], ("
+ DataType().AddSubOperandTypeName() + ")oldValues[tid]"
@@ -1612,7 +1826,10 @@ public:
volatile HostAtomicType *destMemory,
HostDataType *oldValues) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT>)
{
host_atomic_fetch_sub(&destMemory[0], (HostDataType)oldValues[tid],
MemoryOrder());
@@ -1634,7 +1851,13 @@ public:
cl_uint whichDestValue) override
{
expected = StartValue();
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
if constexpr (std::is_same_v<HostDataType, HOST_FLOAT>)
{
if (whichDestValue == 0)
for (cl_uint i = 0; i < threadCount; i++)
expected -= startRefValues[i];
}
else if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
{
if (whichDestValue == 0)
{
@@ -1659,7 +1882,14 @@ public:
const std::vector<HostAtomicType> &testValues,
cl_uint whichDestValue) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
if constexpr (std::is_same_v<HostDataType, HOST_FLOAT>)
{
if (whichDestValue == 0)
return std::abs((HOST_ATOMIC_FLOAT)expected
- testValues[whichDestValue])
> max_error;
}
else if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
{
if (whichDestValue == 0)
return std::abs(cl_half_to_float(expected)
@@ -1674,7 +1904,7 @@ public:
bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues,
HostAtomicType *finalValues) override
{
if (std::is_same<HostDataType, HOST_ATOMIC_FLOAT>::value)
if (std::is_same_v<HostDataType, HOST_FLOAT>)
{
correct = true;
for (cl_uint i = 1; i < threadCount; i++)
@@ -1697,7 +1927,17 @@ public:
int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
cl_command_queue queue) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
if constexpr (std::is_same_v<HostDataType, HOST_FLOAT>)
{
if (LocalMemory()
&& (gFloatAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT) == 0)
return 0; // skip test - not applicable
if (!LocalMemory()
&& (gFloatAtomicCaps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT) == 0)
return 0;
}
else if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
{
if (LocalMemory()
&& (gHalfAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT) == 0)
@@ -1713,7 +1953,10 @@ public:
}
cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_HALF>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT>)
{
return threadCount;
}
@@ -1748,6 +1991,11 @@ static int test_atomic_fetch_sub_generic(cl_device_id deviceID,
if (gFloatAtomicsSupported)
{
CBasicTestFetchSub<HOST_ATOMIC_FLOAT, HOST_FLOAT> test_float(
TYPE_ATOMIC_FLOAT, useSVM);
EXECUTE_TEST(
error, test_float.Execute(deviceID, context, queue, num_elements));
CBasicTestFetchSub<HOST_ATOMIC_HALF, HOST_HALF> test_half(
TYPE_ATOMIC_HALF, useSVM);
EXECUTE_TEST(error,
@@ -2624,7 +2872,10 @@ public:
min_range(-999.0), max_range(999.0)
{
StartValue(DataType().MaxValue());
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT>)
{
CBasicTestMemOrderScope<HostAtomicType,
HostDataType>::OldValueCheck(false);
@@ -2634,7 +2885,10 @@ public:
{
std::string memoryOrderScope = MemoryOrderScopeStr();
std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT> || std::is_same_v<HostDataType, HOST_DOUBLE>)
{
return " atomic_fetch_min" + postfix
+ "(&destMemory[0], oldValues[tid] " + memoryOrderScope + ");\n"
@@ -2653,7 +2907,10 @@ public:
volatile HostAtomicType *destMemory,
HostDataType *oldValues) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT> || std::is_same_v<HostDataType, HOST_DOUBLE>)
{
host_atomic_fetch_min(&destMemory[0], oldValues[tid],
MemoryOrder());
@@ -2669,7 +2926,19 @@ public:
bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
MTdata d) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
{
for (cl_uint i = 0; i < threadCount; i++)
{
startRefValues[i] = cl_half_from_float(
get_random_float(min_range, max_range, d),
gHalfRoundingMode);
}
}
else if constexpr (
std::is_same_v<
HostDataType,
HOST_FLOAT> || std::is_same_v<HostDataType, HOST_DOUBLE>)
{
for (cl_uint i = 0; i < threadCount; i++)
{
@@ -2696,7 +2965,19 @@ public:
cl_uint whichDestValue) override
{
expected = StartValue();
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
{
if (whichDestValue == 0)
{
for (cl_uint i = 0; i < threadCount; i++)
{
if (cl_half_to_float(startRefValues[i])
< cl_half_to_float(expected))
expected = startRefValues[i];
}
}
}
else if constexpr (std::is_same_v<HostDataType, HOST_FLOAT>)
{
if (whichDestValue == 0)
for (cl_uint i = 0; i < threadCount; i++)
@@ -2716,7 +2997,9 @@ public:
const std::vector<HostAtomicType> &testValues,
cl_uint whichDestValue) override
{
if (std::is_same<HostDataType, HOST_ATOMIC_FLOAT>::value)
if (std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT> || std::is_same_v<HostDataType, HOST_DOUBLE>)
{
if (whichDestValue == 0)
return CBasicTestMemOrderScope<HostAtomicType, HostDataType>::
@@ -2731,12 +3014,14 @@ public:
bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues,
HostAtomicType *finalValues) override
{
if (std::is_same<HostDataType, HOST_ATOMIC_FLOAT>::value)
if (std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same<HostDataType, HOST_FLOAT>::value)
{
correct = true;
for (cl_uint i = 1; i < threadCount; i++)
{
for (cl_uint i = 1; i < threadCount; i++)
if (refValues[i] != StartValue())
{
log_error("Thread %d found %d mismatch(es)\n", i,
(cl_uint)refValues[i]);
@@ -2754,7 +3039,31 @@ public:
int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
cl_command_queue queue) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
{
if (LocalMemory()
&& (gHalfAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT)
== 0)
return 0; // skip test - not applicable
if (!LocalMemory()
&& (gHalfAtomicCaps & CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT)
== 0)
return 0;
}
else if constexpr (std::is_same_v<HostDataType, HOST_DOUBLE>)
{
if (LocalMemory()
&& (gDoubleAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT)
== 0)
return 0; // skip test - not applicable
if (!LocalMemory()
&& (gDoubleAtomicCaps & CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT)
== 0)
return 0;
}
else if constexpr (std::is_same_v<HostDataType, HOST_FLOAT>)
{
if (LocalMemory()
&& (gFloatAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT)
@@ -2772,7 +3081,10 @@ public:
}
cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT> || std::is_same_v<HostDataType, HOST_DOUBLE>)
{
return threadCount;
}
@@ -2807,6 +3119,16 @@ static int test_atomic_fetch_min_generic(cl_device_id deviceID,
if (gFloatAtomicsSupported)
{
CBasicTestFetchMin<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double(
TYPE_ATOMIC_DOUBLE, useSVM);
EXECUTE_TEST(
error, test_double.Execute(deviceID, context, queue, num_elements));
CBasicTestFetchMin<HOST_ATOMIC_HALF, HOST_HALF> test_half(
TYPE_ATOMIC_HALF, useSVM);
EXECUTE_TEST(error,
test_half.Execute(deviceID, context, queue, num_elements));
CBasicTestFetchMin<HOST_ATOMIC_FLOAT, HOST_FLOAT> test_float(
TYPE_ATOMIC_FLOAT, useSVM);
EXECUTE_TEST(
@@ -2890,18 +3212,31 @@ public:
useSVM),
min_range(-999.0), max_range(999.0)
{
StartValue(DataType().MinValue());
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT>)
{
CBasicTestMemOrderScope<HostAtomicType,
HostDataType>::OldValueCheck(false);
if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
StartValue(cl_half_from_float(-CL_HALF_MAX, gHalfRoundingMode));
else
StartValue(-DataType().MaxValue());
}
else
{
StartValue(DataType().MinValue());
}
}
std::string ProgramCore() override
{
std::string memoryOrderScope = MemoryOrderScopeStr();
std::string postfix(memoryOrderScope.empty() ? "" : "_explicit");
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT> || std::is_same_v<HostDataType, HOST_DOUBLE>)
{
return " atomic_fetch_max" + postfix
+ "(&destMemory[0], oldValues[tid] " + memoryOrderScope + ");\n"
@@ -2920,7 +3255,10 @@ public:
volatile HostAtomicType *destMemory,
HostDataType *oldValues) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT> || std::is_same_v<HostDataType, HOST_DOUBLE>)
{
host_atomic_fetch_max(&destMemory[0], oldValues[tid],
MemoryOrder());
@@ -2936,7 +3274,19 @@ public:
bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues,
MTdata d) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
{
for (cl_uint i = 0; i < threadCount; i++)
{
startRefValues[i] = cl_half_from_float(
get_random_float(min_range, max_range, d),
gHalfRoundingMode);
}
}
else if constexpr (
std::is_same_v<
HostDataType,
HOST_FLOAT> || std::is_same_v<HostDataType, HOST_DOUBLE>)
{
for (cl_uint i = 0; i < threadCount; i++)
{
@@ -2963,7 +3313,19 @@ public:
cl_uint whichDestValue) override
{
expected = StartValue();
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
{
if (whichDestValue == 0)
{
for (cl_uint i = 0; i < threadCount; i++)
{
if (cl_half_to_float(startRefValues[i])
> cl_half_to_float(expected))
expected = startRefValues[i];
}
}
}
else if constexpr (std::is_same_v<HostDataType, HOST_FLOAT>)
{
if (whichDestValue == 0)
for (cl_uint i = 0; i < threadCount; i++)
@@ -2983,7 +3345,9 @@ public:
const std::vector<HostAtomicType> &testValues,
cl_uint whichDestValue) override
{
if (std::is_same<HostDataType, HOST_ATOMIC_FLOAT>::value)
if (std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT> || std::is_same_v<HostDataType, HOST_DOUBLE>)
{
if (whichDestValue == 0)
return CBasicTestMemOrderScope<HostAtomicType, HostDataType>::
@@ -2998,7 +3362,9 @@ public:
bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues,
HostAtomicType *finalValues) override
{
if (std::is_same<HostDataType, HOST_ATOMIC_FLOAT>::value)
if (std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT> || std::is_same_v<HostDataType, HOST_DOUBLE>)
{
correct = true;
for (cl_uint i = 1; i < threadCount; i++)
@@ -3021,7 +3387,31 @@ public:
int ExecuteSingleTest(cl_device_id deviceID, cl_context context,
cl_command_queue queue) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (std::is_same_v<HostDataType, HOST_HALF>)
{
if (LocalMemory()
&& (gHalfAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT)
== 0)
return 0; // skip test - not applicable
if (!LocalMemory()
&& (gHalfAtomicCaps & CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT)
== 0)
return 0;
}
else if constexpr (std::is_same_v<HostDataType, HOST_DOUBLE>)
{
if (LocalMemory()
&& (gDoubleAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT)
== 0)
return 0; // skip test - not applicable
if (!LocalMemory()
&& (gDoubleAtomicCaps & CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT)
== 0)
return 0;
}
else if constexpr (std::is_same_v<HostDataType, HOST_FLOAT>)
{
if (LocalMemory()
&& (gFloatAtomicCaps & CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT)
@@ -3039,7 +3429,10 @@ public:
}
cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID) override
{
if constexpr (std::is_same_v<HostDataType, HOST_ATOMIC_FLOAT>)
if constexpr (
std::is_same_v<
HostDataType,
HOST_HALF> || std::is_same_v<HostDataType, HOST_FLOAT> || std::is_same_v<HostDataType, HOST_DOUBLE>)
{
return threadCount;
}
@@ -3074,6 +3467,16 @@ static int test_atomic_fetch_max_generic(cl_device_id deviceID,
if (gFloatAtomicsSupported)
{
CBasicTestFetchMax<HOST_ATOMIC_DOUBLE, HOST_DOUBLE> test_double(
TYPE_ATOMIC_DOUBLE, useSVM);
EXECUTE_TEST(
error, test_double.Execute(deviceID, context, queue, num_elements));
CBasicTestFetchMax<HOST_ATOMIC_HALF, HOST_HALF> test_half(
TYPE_ATOMIC_HALF, useSVM);
EXECUTE_TEST(error,
test_half.Execute(deviceID, context, queue, num_elements));
CBasicTestFetchMax<HOST_ATOMIC_FLOAT, HOST_FLOAT> test_float(
TYPE_ATOMIC_FLOAT, useSVM);
EXECUTE_TEST(
@@ -3898,6 +4301,9 @@ private:
struct TestDefinition _subCase;
};
#if 0
// The tests below are likely incorrect and have been disabled.
// See https://github.com/KhronosGroup/OpenCL-CTS/issues/2544
static int test_atomic_fence_generic(cl_device_id deviceID, cl_context context,
cl_command_queue queue, int num_elements,
bool useSVM)
@@ -3977,3 +4383,4 @@ REGISTER_TEST(svm_atomic_fence)
return test_atomic_fence_generic(device, context, queue, num_elements,
true);
}
#endif

View File

@@ -0,0 +1,13 @@
set(DIRECTX_WRAPPER_SOURCES
directx_wrapper.cpp
)
add_library(directx_wrapper STATIC ${DIRECTX_WRAPPER_SOURCES})
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
include_directories(${CLConform_INCLUDE_DIR})
if (WIN32)
target_link_libraries(directx_wrapper d3d12)
endif ()

View File

@@ -0,0 +1,71 @@
//
// Copyright (c) 2025 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "directx_wrapper.hpp"
DirectXWrapper::DirectXWrapper()
{
HRESULT hr = D3D12CreateDevice(nullptr, D3D_FEATURE_LEVEL_12_0,
IID_PPV_ARGS(&dx_device));
if (FAILED(hr))
{
throw std::runtime_error("Failed to create DirectX 12 device");
}
D3D12_COMMAND_QUEUE_DESC desc{};
desc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
hr = dx_device->CreateCommandQueue(&desc, IID_PPV_ARGS(&dx_command_queue));
if (FAILED(hr))
{
throw std::runtime_error("Failed to create DirectX 12 command queue");
}
hr = dx_device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT,
IID_PPV_ARGS(&dx_command_allocator));
if (FAILED(hr))
{
throw std::runtime_error(
"Failed to create DirectX 12 command allocator");
}
}
ID3D12Device* DirectXWrapper::getDXDevice() const { return dx_device.Get(); }
ID3D12CommandQueue* DirectXWrapper::getDXCommandQueue() const
{
return dx_command_queue.Get();
}
ID3D12CommandAllocator* DirectXWrapper::getDXCommandAllocator() const
{
return dx_command_allocator.Get();
}
DirectXFenceWrapper::DirectXFenceWrapper(ID3D12Device* dx_device)
: dx_device(dx_device)
{
if (!dx_device)
{
throw std::runtime_error("ID3D12Device is not valid");
}
const HRESULT hr = dx_device->CreateFence(0, D3D12_FENCE_FLAG_SHARED,
IID_PPV_ARGS(&dx_fence));
if (FAILED(hr))
{
throw std::runtime_error("Failed to create the DirectX fence");
}
}

View File

@@ -0,0 +1,47 @@
//
// Copyright (c) 2025 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#pragma once
#include <d3d12.h>
#include <wrl/client.h>
#include <stdexcept>
using namespace Microsoft::WRL;
class DirectXWrapper {
public:
DirectXWrapper();
ID3D12Device* getDXDevice() const;
ID3D12CommandQueue* getDXCommandQueue() const;
ID3D12CommandAllocator* getDXCommandAllocator() const;
protected:
ComPtr<ID3D12Device> dx_device = nullptr;
ComPtr<ID3D12CommandQueue> dx_command_queue = nullptr;
ComPtr<ID3D12CommandAllocator> dx_command_allocator = nullptr;
};
class DirectXFenceWrapper {
public:
DirectXFenceWrapper(ID3D12Device* dx_device);
ID3D12Fence* operator*() const { return dx_fence.Get(); }
private:
ComPtr<ID3D12Fence> dx_fence = nullptr;
ComPtr<ID3D12Device> dx_device = nullptr;
};

View File

@@ -57,7 +57,7 @@ namespace {
template <typename T>
int verify_degrees(const T *const inptr, const T *const outptr, int n)
{
float error, max_error = 0.0f;
float error, max_error = -INFINITY;
double r, max_val = NAN;
int max_index = 0;
@@ -89,13 +89,16 @@ int verify_degrees(const T *const inptr, const T *const outptr, int n)
}
if (std::is_same<T, half>::value)
log_info("degrees: Max error %f ulps at %d: *%a vs %a (*%g vs %g)\n",
max_error, max_index, max_val, conv_to_flt(outptr[max_index]),
max_val, conv_to_flt(outptr[max_index]));
log_info("degrees: Max error %f ulps at %d, input %a: *%a vs %a (*%g "
"vs %g)\n",
max_error, max_index, conv_to_flt(inptr[max_index]), max_val,
conv_to_flt(outptr[max_index]), max_val,
conv_to_flt(outptr[max_index]));
else
log_info("degrees: Max error %f ulps at %d: *%a vs %a (*%g vs %g)\n",
max_error, max_index, max_val, outptr[max_index], max_val,
outptr[max_index]);
log_info("degrees: Max error %f ulps at %d, input %a: *%a vs %a (*%g "
"vs %g)\n",
max_error, max_index, conv_to_flt(inptr[max_index]), max_val,
outptr[max_index], max_val, outptr[max_index]);
return 0;
}
@@ -103,7 +106,7 @@ int verify_degrees(const T *const inptr, const T *const outptr, int n)
template <typename T>
int verify_radians(const T *const inptr, const T *const outptr, int n)
{
float error, max_error = 0.0f;
float error, max_error = -INFINITY;
double r, max_val = NAN;
int max_index = 0;
@@ -135,13 +138,16 @@ int verify_radians(const T *const inptr, const T *const outptr, int n)
}
if (std::is_same<T, half>::value)
log_info("radians: Max error %f ulps at %d: *%a vs %a (*%g vs %g)\n",
max_error, max_index, max_val, conv_to_flt(outptr[max_index]),
max_val, conv_to_flt(outptr[max_index]));
log_info("radians: Max error %f ulps at %d, input %a: *%a vs %a (*%g "
"vs %g)\n",
max_error, max_index, conv_to_flt(inptr[max_index]), max_val,
conv_to_flt(outptr[max_index]), max_val,
conv_to_flt(outptr[max_index]));
else
log_info("radians: Max error %f ulps at %d: *%a vs %a (*%g vs %g)\n",
max_error, max_index, max_val, outptr[max_index], max_val,
outptr[max_index]);
log_info("radians: Max error %f ulps at %d, input %a: *%a vs %a (*%g "
"vs %g)\n",
max_error, max_index, conv_to_flt(inptr[max_index]), max_val,
outptr[max_index], max_val, outptr[max_index]);
return 0;
}

View File

@@ -1,5 +1,7 @@
set(MODULE_NAME COMPILER)
find_package(Python3 COMPONENTS Interpreter QUIET)
set(${MODULE_NAME}_SOURCES
main.cpp
test_build_helpers.cpp
@@ -52,7 +54,7 @@ add_custom_command(
COMMAND ${CMAKE_COMMAND} -E copy_directory
${CLConform_SOURCE_DIR}/test_conformance/compiler/secondIncludeTestDirectory
${COMPILER_TEST_RESOURCES}/secondIncludeTestDirectory
COMMAND ${COMPILER_ASSEMBLY_SCRIPT} --source-dir "${COMPILER_ASM_PATH}" --output-dir "${COMPILER_SPV_PATH}" ${COMPILER_SPV_EXTRA} --verbose
COMMAND ${Python3_EXECUTABLE} ${COMPILER_ASSEMBLY_SCRIPT} --source-dir "${COMPILER_ASM_PATH}" --output-dir "${COMPILER_SPV_PATH}" ${COMPILER_SPV_EXTRA} --verbose
DEPENDS ${COMPILER_ASSEMBLY_SCRIPT} ${COMPILER_ASM}
VERBATIM)

View File

@@ -13,10 +13,62 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include <string>
#include <filesystem>
#include "harness/testHarness.h"
#include "harness/stringHelpers.h"
std::string spvBinariesPath = "spirv_bin";
const std::string spvBinariesPathArg = "--spirv-binaries-path";
void printUsage()
{
log_info("Reading SPIR-V files from default '%s' path.\n",
spvBinariesPath.c_str());
log_info("In case you want to set other directory use '%s' argument.\n",
spvBinariesPathArg.c_str());
}
int main(int argc, const char *argv[])
{
bool modifiedSpvBinariesPath = false;
bool listTests = false;
for (int i = 0; i < argc; ++i)
{
int argsRemoveNum = 0;
if (argv[i] == spvBinariesPathArg)
{
if (i + 1 == argc)
{
log_error("Missing value for '%s' argument.\n",
spvBinariesPathArg.c_str());
return TEST_FAIL;
}
else
{
spvBinariesPath = std::string(argv[i + 1]);
argsRemoveNum += 2;
modifiedSpvBinariesPath = true;
}
}
if (argsRemoveNum > 0)
{
for (int j = i; j < (argc - argsRemoveNum); ++j)
argv[j] = argv[j + argsRemoveNum];
argc -= argsRemoveNum;
--i;
}
listTests |= (argv[i] == std::string("--list")
|| argv[i] == std::string("-list"));
}
if (modifiedSpvBinariesPath == false && !listTests)
{
printUsage();
}
return runTestHarness(argc, argv, test_registry::getInstance().num_tests(),
test_registry::getInstance().definitions(), false, 0);
}

View File

@@ -24,6 +24,8 @@
#include <unistd.h>
#endif
#include "harness/conversions.h"
#include "harness/stringHelpers.h"
#include "harness/parseParameters.h"
#define MAX_LINE_SIZE_IN_PROGRAM 1024
#define MAX_LOG_SIZE_IN_PROGRAM 2048
@@ -158,6 +160,14 @@ const char *link_static_function_access = // use with compile_static_function
"extern int foo(int, int);\n"
"int access_foo() { int blah = foo(3, 4); return blah + 5; }\n";
const char *multi_build_test_kernel = R"(
__kernel void test_kernel(__global int *dst)
{
int tid = get_global_id(0);
dst[tid] = BUILD_OPT_VAL;
}
)";
static int test_large_single_compile(cl_context context, cl_device_id deviceID,
unsigned int numLines)
{
@@ -3059,12 +3069,13 @@ REGISTER_TEST(execute_after_included_header_link)
}
const auto simple_header_path = temp_dir_path / simple_header_name;
FILE *simple_header_file =
fopen(simple_header_path.u8string().c_str(), "w");
const std::string simple_header_path_str =
to_string(simple_header_path.u8string());
FILE *simple_header_file = fopen(simple_header_path_str.c_str(), "w");
if (simple_header_file == NULL)
{
log_error("ERROR: Unable to create simple header file %s! (in %s:%d)\n",
simple_header_path.u8string().c_str(), __FILE__, __LINE__);
simple_header_path_str.c_str(), __FILE__, __LINE__);
return -1;
}
if (fprintf(simple_header_file, "%s", simple_header) < 0)
@@ -3082,7 +3093,7 @@ REGISTER_TEST(execute_after_included_header_link)
}
const std::string include_path =
std::string("-I") + temp_dir_path.generic_u8string();
std::string("-I") + to_string(temp_dir_path.generic_u8string());
error = clCompileProgram(program, 1, &device, include_path.c_str(), 0, NULL,
NULL, NULL, NULL);
test_error(error,
@@ -3931,3 +3942,100 @@ REGISTER_TEST(compile_and_link_status_options_log)
return 0;
}
REGISTER_TEST(multiple_build_program)
{
if (gCompilationMode != kOnline)
{
log_info(
"Skipping multiple_build_program, compilation mode not online\n");
return TEST_SKIPPED_ITSELF;
}
cl_int error = CL_SUCCESS;
const size_t num_threads = num_elements;
clProgramWrapper program = clCreateProgramWithSource(
context, 1, &multi_build_test_kernel, nullptr, &error);
test_error(error, "clCreateProgramWithSource failed");
clMemWrapper out_stream_0 = clCreateBuffer(
context, CL_MEM_READ_WRITE, sizeof(cl_int) * num_threads, NULL, &error);
test_error(error, "clCreateBuffer failed");
clMemWrapper out_stream_1 = clCreateBuffer(
context, CL_MEM_READ_WRITE, sizeof(cl_int) * num_threads, NULL, &error);
test_error(error, "clCreateBuffer failed");
{
/* Build with the macro defined */
error = clBuildProgram(program, 1, &device, "-DBUILD_OPT_VAL=1 ", NULL,
NULL);
test_error(error, "clBuildProgram failed");
clKernelWrapper kernel0 =
clCreateKernel(program, "test_kernel", &error);
test_error(error, "clCreateKernel failed");
error = clSetKernelArg(kernel0, 0, sizeof(out_stream_0), &out_stream_0);
test_error(error, "clSetKernelArg failed");
error = clEnqueueNDRangeKernel(queue, kernel0, 1, NULL, &num_threads,
NULL, 0, NULL, NULL);
test_error(error, "clEnqueueNDRangeKernel failed");
}
{
/* Rebuild with the macro redefined */
error = clBuildProgram(program, 1, &device, "-DBUILD_OPT_VAL=2 ", NULL,
NULL);
test_error(error, "clBuildProgram failed");
clKernelWrapper kernel1 =
clCreateKernel(program, "test_kernel", &error);
test_error(error, "clCreateKernel failed");
error = clSetKernelArg(kernel1, 0, sizeof(out_stream_1), &out_stream_1);
test_error(error, "clSetKernelArg failed");
error = clEnqueueNDRangeKernel(queue, kernel1, 1, NULL, &num_threads,
NULL, 0, NULL, NULL);
test_error(error, "clEnqueueNDRangeKernel failed");
}
error = clFinish(queue);
test_error(error, "clFinish failed");
std::vector<cl_int> test_values(num_threads, 0);
error = clEnqueueReadBuffer(queue, out_stream_0, true, 0,
sizeof(cl_int) * num_threads,
test_values.data(), 0, NULL, NULL);
test_error(error, "clEnqueueReadBuffer failed");
for (size_t i = 0; i < test_values.size(); i++)
{
if (test_values[i] != 1)
{
log_error("Unexpected test value %d for kernel0 at pos %zu.\n",
test_values[i], i);
return TEST_FAIL;
}
}
error = clEnqueueReadBuffer(queue, out_stream_1, true, 0,
sizeof(cl_int) * num_threads,
test_values.data(), 0, NULL, NULL);
test_error(error, "clEnqueueReadBuffer failed");
for (size_t i = 0; i < test_values.size(); i++)
{
if (test_values[i] != 2)
{
log_error("Unexpected test value %d for kernel1 at pos %zu.\n",
test_values[i], i);
return TEST_FAIL;
}
}
return TEST_PASS;
}

View File

@@ -94,6 +94,7 @@ const char *known_extensions[] = {
"cl_khr_external_memory_dma_buf",
"cl_khr_command_buffer",
"cl_khr_command_buffer_mutable_dispatch",
"cl_khr_command_buffer_mutable_memory_commands",
"cl_khr_command_buffer_multi_device",
"cl_khr_external_memory_android_hardware_buffer",
"cl_khr_unified_svm",

View File

@@ -32,8 +32,6 @@ const std::string slash = "\\";
#else
const std::string slash = "/";
#endif
std::string compilerSpvBinaries = "test_conformance" + slash + "compiler"
+ slash + "spirv_bin" + slash + "write_kernel.spv";
const std::string spvExt = ".spv";
@@ -338,8 +336,8 @@ public:
std::vector<unsigned char> kernel_buffer;
std::string file_name =
compilerSpvBinaries + std::to_string(address_bits);
std::string file_name = spvBinariesPath + slash + "write_kernel.spv"
+ std::to_string(address_bits);
m_spirv_binary = readBinary(file_name.c_str());
m_spirv_size = m_spirv_binary.size();
}

View File

@@ -1,4 +1,6 @@
#include <array>
#include <string>
extern std::string spvBinariesPath;
static const char write_kernel_source[] = R"(
kernel void write_kernel(global unsigned int *p) {

View File

@@ -15,3 +15,6 @@ add_subdirectory( cl_ext_buffer_device_address )
if(VULKAN_IS_SUPPORTED)
add_subdirectory( cl_khr_external_semaphore )
endif()
if(D3D12_IS_SUPPORTED)
add_subdirectory( cl_khr_external_semaphore_dx_fence )
endif()

View File

@@ -17,6 +17,7 @@ set(${MODULE_NAME}_SOURCES
command_buffer_test_barrier.cpp
command_buffer_test_event_info.cpp
command_buffer_finalize.cpp
command_buffer_pipelined_enqueue.cpp
negative_command_buffer_finalize.cpp
negative_command_buffer_svm_mem.cpp
negative_command_buffer_copy_image.cpp

View File

@@ -27,9 +27,6 @@ BasicCommandBufferTest::BasicCommandBufferTest(cl_device_id device,
: CommandBufferTestBase(device), context(context), queue(nullptr),
num_elements(0), simultaneous_use_support(false),
out_of_order_support(false), queue_out_of_order_support(false),
// try to use simultaneous path by default
simultaneous_use_requested(true),
// due to simultaneous cases extend buffer size
buffer_size_multiplier(1), command_buffer(this)
{
cl_int error = clRetainCommandQueue(queue);
@@ -72,9 +69,8 @@ bool BasicCommandBufferTest::Skip()
sizeof(capabilities), &capabilities, NULL);
test_error(error,
"Unable to query CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR");
simultaneous_use_support = simultaneous_use_requested
&& (capabilities & CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR)
!= 0;
simultaneous_use_support =
(capabilities & CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR) != 0;
out_of_order_support =
supported_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
device_side_enqueue_support =
@@ -167,19 +163,7 @@ cl_int BasicCommandBufferTest::SetUp(int elements)
error = SetUpKernelArgs();
test_error(error, "SetUpKernelArgs failed");
if (simultaneous_use_support)
{
cl_command_buffer_properties_khr properties[3] = {
CL_COMMAND_BUFFER_FLAGS_KHR, CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR,
0
};
command_buffer =
clCreateCommandBufferKHR(1, &queue, properties, &error);
}
else
{
command_buffer = clCreateCommandBufferKHR(1, &queue, nullptr, &error);
}
command_buffer = clCreateCommandBufferKHR(1, &queue, nullptr, &error);
test_error(error, "clCreateCommandBufferKHR failed");
return CL_SUCCESS;
@@ -192,11 +176,6 @@ cl_int MultiFlagCreationTest::Run()
// First try to find multiple flags that are supported by the driver and
// device.
if (simultaneous_use_support)
{
flags |= CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR;
}
if (is_extension_available(
device, CL_KHR_COMMAND_BUFFER_MULTI_DEVICE_EXTENSION_NAME))
{
@@ -207,6 +186,11 @@ cl_int MultiFlagCreationTest::Run()
device, CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_NAME))
{
flags |= CL_COMMAND_BUFFER_MUTABLE_KHR;
if (simultaneous_use_support)
{
flags |= CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR;
}
}
cl_command_buffer_properties_khr props[] = { CL_COMMAND_BUFFER_FLAGS_KHR,
@@ -381,11 +365,6 @@ cl_int ExplicitFlushTest::Run()
return CL_SUCCESS;
}
bool ExplicitFlushTest::Skip()
{
return BasicCommandBufferTest::Skip() || !simultaneous_use_support;
}
cl_int InterleavedEnqueueTest::Run()
{
cl_int error = clCommandNDRangeKernelKHR(
@@ -431,11 +410,6 @@ cl_int InterleavedEnqueueTest::Run()
return CL_SUCCESS;
}
bool InterleavedEnqueueTest::Skip()
{
return BasicCommandBufferTest::Skip() || !simultaneous_use_support;
}
cl_int EnqueueAndReleaseTest::Run()
{
cl_int error = clCommandNDRangeKernelKHR(

View File

@@ -78,8 +78,11 @@ protected:
bool queue_out_of_order_support;
bool device_side_enqueue_support;
// user request for simultaneous use
bool simultaneous_use_requested;
// Extends size of created 'in_mem' & 'out_mem' buffers, such that the same
// cl_mem buffer can be used across multiple enqueues of a command-buffer.
// Accessed in the kernel at an offset for each enqueue which is passed as
// a kernel parameter through the 'off_mem' buffer.
// See BasicCommandBufferTest::SetUpKernel() definition.
unsigned buffer_size_multiplier;
clCommandBufferWrapper command_buffer;
};
@@ -116,7 +119,6 @@ struct ExplicitFlushTest : public BasicCommandBufferTest
using BasicCommandBufferTest::BasicCommandBufferTest;
cl_int Run() override;
bool Skip() override;
};
// Test enqueueing a command-buffer twice separated by another enqueue operation
@@ -125,7 +127,6 @@ struct InterleavedEnqueueTest : public BasicCommandBufferTest
using BasicCommandBufferTest::BasicCommandBufferTest;
cl_int Run() override;
bool Skip() override;
};
// Test releasing a command-buffer after it has been submitted for execution,
@@ -156,9 +157,9 @@ int MakeAndRunTest(cl_device_id device, cl_context context,
cl_version extension_version =
get_extension_version(device, "cl_khr_command_buffer");
if (extension_version != CL_MAKE_VERSION(0, 9, 7))
if (extension_version != CL_MAKE_VERSION(0, 9, 8))
{
log_info("cl_khr_command_buffer version 0.9.7 is required to run "
log_info("cl_khr_command_buffer version 0.9.8 is required to run "
"the test, skipping.\n ");
return TEST_SKIPPED_ITSELF;
}

View File

@@ -50,13 +50,14 @@ struct BasicMutableCommandBufferTest : BasicCommandBufferTest
virtual cl_int SetUp(int elements) override
{
BasicCommandBufferTest::SetUp(elements);
cl_int error = BasicCommandBufferTest::SetUp(elements);
test_error(error, "BasicCommandBufferTest::SetUp failed");
cl_int error = init_extension_functions();
error = init_extension_functions();
test_error(error, "Unable to initialise extension functions");
cl_command_buffer_properties_khr prop = CL_COMMAND_BUFFER_MUTABLE_KHR;
if (simultaneous_use_support)
if (simultaneous_use_requested)
{
prop |= CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR;
}
@@ -90,10 +91,10 @@ struct BasicMutableCommandBufferTest : BasicCommandBufferTest
cl_version extension_version = get_extension_version(
device, "cl_khr_command_buffer_mutable_dispatch");
if (extension_version != CL_MAKE_VERSION(0, 9, 3))
if (extension_version != CL_MAKE_VERSION(0, 9, 4))
{
log_info("cl_khr_command_buffer_mutable_dispatch version "
"0.9.3 is "
"0.9.4 is "
"required to run the test, skipping.\n ");
extension_avaliable = false;
}
@@ -128,6 +129,7 @@ struct BasicMutableCommandBufferTest : BasicCommandBufferTest
}
clUpdateMutableCommandsKHR_fn clUpdateMutableCommandsKHR = nullptr;
bool simultaneous_use_requested = false;
const char* kernelString = "__kernel void empty() {}";
const size_t global_work_size = 4 * 16;

View File

@@ -70,9 +70,9 @@ struct MutableDispatchImage1DArguments : public BasicMutableCommandBufferTest
{
int offset = get_global_id(0);
int4 color = read_imagei( source, sampler, offset );
uint4 color = read_imageui( source, sampler, offset );
write_imagei( dest, offset, color );
write_imageui( dest, offset, color );
})";
cl_int error;
@@ -260,9 +260,9 @@ struct MutableDispatchImage2DArguments : public BasicMutableCommandBufferTest
int x = get_global_id(0);
int y = get_global_id(1);
int4 color = read_imagei( source, sampler, (int2) (x, y) );
uint4 color = read_imageui( source, sampler, (int2) (x, y) );
write_imagei( dest, (int2) (x, y), color );
write_imageui( dest, (int2) (x, y), color );
})";
cl_int error;

View File

@@ -35,9 +35,7 @@ struct IterativeArgUpdateDispatch : BasicMutableCommandBufferTest
cl_command_queue queue)
: BasicMutableCommandBufferTest(device, context, queue),
command(nullptr)
{
simultaneous_use_requested = false;
}
{}
bool Skip() override
{

View File

@@ -33,9 +33,7 @@ struct MultipleCommandsDispatch : BasicMutableCommandBufferTest
cl_command_queue queue)
: BasicMutableCommandBufferTest(device, context, queue),
command_pri(nullptr), command_sec(nullptr)
{
simultaneous_use_requested = false;
}
{}
bool Skip() override
{
@@ -47,7 +45,7 @@ struct MultipleCommandsDispatch : BasicMutableCommandBufferTest
sizeof(mutable_capabilities), &mutable_capabilities, nullptr)
&& mutable_capabilities & CL_MUTABLE_DISPATCH_ARGUMENTS_KHR;
// require mutable arguments capabillity
// require mutable arguments capability
return !mutable_support;
}

View File

@@ -34,9 +34,7 @@ struct OverwriteUpdateDispatch : BasicMutableCommandBufferTest
cl_command_queue queue)
: BasicMutableCommandBufferTest(device, context, queue),
command(nullptr)
{
simultaneous_use_requested = false;
}
{}
bool Skip() override
{

View File

@@ -21,10 +21,12 @@
#include <CL/cl.h>
#include <CL/cl_ext.h>
////////////////////////////////////////////////////////////////////////////////
// mutable dispatch tests which handle following cases:
// - out-of-order queue use
// mutable dispatch tests which handles
// - out-of-order queue with dependencies between command-buffer enqueues
// - out-of-order queue with simultaneous use
// - in-order queue with dependencies between command-buffer enqueues
// - in-order queue with simultaneous use
// - cross queue with dependencies between command-buffer enqueues
// - cross-queue with simultaneous use
namespace {
@@ -35,11 +37,10 @@ struct SimultaneousMutableDispatchTest : public BasicMutableCommandBufferTest
SimultaneousMutableDispatchTest(cl_device_id device, cl_context context,
cl_command_queue queue)
: BasicMutableCommandBufferTest(device, context, queue),
work_queue(nullptr), work_command_buffer(this), user_event(nullptr),
wait_pass_event(nullptr), command(nullptr)
work_queue(nullptr), work_command_buffer(this), new_in_mem(nullptr),
command(nullptr)
{
simultaneous_use_requested = simultaneous_request;
if (simultaneous_request) buffer_size_multiplier = 2;
}
cl_int SetUpKernel() override
@@ -48,26 +49,36 @@ struct SimultaneousMutableDispatchTest : public BasicMutableCommandBufferTest
test_error(error, "BasicCommandBufferTest::SetUpKernel failed");
// create additional kernel to properly prepare output buffer for test
const char* kernel_str =
const char *kernel_str =
R"(
__kernel void fill(int pattern, __global int* out, __global int*
offset)
__kernel void mul(__global int* out, __global int* in, int mul_val)
{
size_t id = get_global_id(0);
size_t ind = offset[0] + id ;
out[ind] = pattern;
out[id] = in[id] * mul_val;
})";
error = create_single_kernel_helper_create_program(
context, &program_fill, 1, &kernel_str);
context, &program_mul, 1, &kernel_str);
test_error(error, "Failed to create program with source");
error =
clBuildProgram(program_fill, 1, &device, nullptr, nullptr, nullptr);
clBuildProgram(program_mul, 1, &device, nullptr, nullptr, nullptr);
test_error(error, "Failed to build program");
kernel_fill = clCreateKernel(program_fill, "fill", &error);
test_error(error, "Failed to create copy kernel");
kernel_mul = clCreateKernel(program_mul, "mul", &error);
test_error(error, "Failed to create multiply kernel");
new_out_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_int) * num_elements
* buffer_size_multiplier,
nullptr, &error);
test_error(error, "clCreateBuffer failed");
new_in_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(cl_int) * num_elements
* buffer_size_multiplier,
nullptr, &error);
test_error(error, "clCreateBuffer failed");
return CL_SUCCESS;
}
@@ -77,14 +88,13 @@ struct SimultaneousMutableDispatchTest : public BasicMutableCommandBufferTest
cl_int error = BasicCommandBufferTest::SetUpKernelArgs();
test_error(error, "BasicCommandBufferTest::SetUpKernelArgs failed");
error = clSetKernelArg(kernel_fill, 0, sizeof(cl_int),
&overwritten_pattern);
error = clSetKernelArg(kernel_mul, 0, sizeof(out_mem), &out_mem);
test_error(error, "clSetKernelArg failed");
error = clSetKernelArg(kernel_fill, 1, sizeof(out_mem), &out_mem);
error = clSetKernelArg(kernel_mul, 1, sizeof(off_mem), &in_mem);
test_error(error, "clSetKernelArg failed");
error = clSetKernelArg(kernel_fill, 2, sizeof(off_mem), &off_mem);
error = clSetKernelArg(kernel_mul, 2, sizeof(cl_int), &pattern_pri);
test_error(error, "clSetKernelArg failed");
return CL_SUCCESS;
@@ -101,30 +111,28 @@ struct SimultaneousMutableDispatchTest : public BasicMutableCommandBufferTest
context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
&error);
test_error(error, "Unable to create command queue to test with");
cl_command_buffer_properties_khr prop =
CL_COMMAND_BUFFER_MUTABLE_KHR;
if (simultaneous_use_support)
{
prop |= CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR;
}
const cl_command_buffer_properties_khr props[] = {
CL_COMMAND_BUFFER_FLAGS_KHR,
prop,
0,
};
work_command_buffer =
clCreateCommandBufferKHR(1, &work_queue, props, &error);
test_error(error, "clCreateCommandBufferKHR failed");
}
else
{
work_queue = queue;
work_command_buffer = command_buffer;
}
cl_command_buffer_properties_khr prop = CL_COMMAND_BUFFER_MUTABLE_KHR;
if (simultaneous_use_requested)
{
prop |= CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR;
}
const cl_command_buffer_properties_khr props[] = {
CL_COMMAND_BUFFER_FLAGS_KHR,
prop,
0,
};
work_command_buffer =
clCreateCommandBufferKHR(1, &work_queue, props, &error);
test_error(error, "clCreateCommandBufferKHR failed");
return CL_SUCCESS;
}
@@ -145,293 +153,245 @@ struct SimultaneousMutableDispatchTest : public BasicMutableCommandBufferTest
|| !mutable_support;
}
cl_int RecordCommandBuffer()
{
cl_int error = clCommandNDRangeKernelKHR(
work_command_buffer, nullptr, nullptr, kernel_mul, 1, nullptr,
&num_elements, nullptr, 0, nullptr, nullptr, &command);
test_error(error, "clCommandNDRangeKernelKHR failed");
error = clFinalizeCommandBufferKHR(work_command_buffer);
test_error(error, "clFinalizeCommandBufferKHR failed");
return CL_SUCCESS;
}
cl_int RunSerializedPass(std::vector<cl_int> &first_enqueue_output,
std::vector<cl_int> &second_enqueue_output)
{
/* Serialize command-buffer enqueue, is a linear sequence of
* commands, with dependencies enforced using an in-order queue
* or cl_event dependencies.
*
* 1. Fill input buffer
* 2. Enqueue command-buffer doing: `output = a * input;
* 3. Read output buffer to host data so it can be verified later
* - Update command to new input buffer, new `a` val and use output
* buffer from previous invocation as new input buffer.
* 4. Enqueue command-buffer again.
* 5. Read new output buffer back to host data so it can be verified
* later
*
*/
clEventWrapper E[4];
cl_int error = clEnqueueFillBuffer(
work_queue, in_mem, &pattern_fill, sizeof(cl_int), 0, data_size(),
0, nullptr, (out_of_order_request ? &E[0] : nullptr));
test_error(error, "clEnqueueFillBuffer failed");
error = clEnqueueCommandBufferKHR(
0, nullptr, work_command_buffer, (out_of_order_request ? 1 : 0),
(out_of_order_request ? &E[0] : nullptr),
(out_of_order_request ? &E[1] : nullptr));
test_error(error, "clEnqueueCommandBufferKHR failed");
error = clEnqueueReadBuffer(work_queue, out_mem, CL_FALSE, 0,
data_size(), first_enqueue_output.data(),
(out_of_order_request ? 1 : 0),
(out_of_order_request ? &E[1] : nullptr),
(out_of_order_request ? &E[2] : nullptr));
test_error(error, "clEnqueueReadBuffer failed");
cl_mutable_dispatch_arg_khr arg_1{ 0, sizeof(new_out_mem),
&new_out_mem };
cl_mutable_dispatch_arg_khr arg_2{ 1, sizeof(cl_mem), &out_mem };
cl_mutable_dispatch_arg_khr arg_3{ 2, sizeof(cl_int), &pattern_sec };
cl_mutable_dispatch_arg_khr args[] = { arg_1, arg_2, arg_3 };
cl_mutable_dispatch_config_khr dispatch_config{
command,
3 /* num_args */,
0 /* num_svm_arg */,
0 /* num_exec_infos */,
0 /* work_dim - 0 means no change to dimensions */,
args /* arg_list */,
nullptr /* arg_svm_list - nullptr means no change*/,
nullptr /* exec_info_list */,
nullptr /* global_work_offset */,
nullptr /* global_work_size */,
nullptr /* local_work_size */
};
cl_uint num_configs = 1;
cl_command_buffer_update_type_khr config_types[1] = {
CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR
};
const void* configs[1] = { &dispatch_config };
error = clUpdateMutableCommandsKHR(work_command_buffer, num_configs,
config_types, configs);
test_error(error, "clUpdateMutableCommandsKHR failed");
error = clEnqueueCommandBufferKHR(
0, nullptr, work_command_buffer, (out_of_order_request ? 1 : 0),
(out_of_order_request ? &E[2] : nullptr),
(out_of_order_request ? &E[3] : nullptr));
test_error(error, "clEnqueueCommandBufferKHR failed");
error = clEnqueueReadBuffer(
work_queue, new_out_mem, CL_FALSE, 0, data_size(),
second_enqueue_output.data(), (out_of_order_request ? 1 : 0),
(out_of_order_request ? &E[3] : nullptr), nullptr);
test_error(error, "clEnqueueReadBuffer failed");
return CL_SUCCESS;
}
cl_int RunSimultaneousPass(std::vector<cl_int> &first_enqueue_output,
std::vector<cl_int> &second_enqueue_output)
{
/* Simultaneous command-buffer pass enqueues a command-buffer twice
* without dependencies between the enqueues, but an update so that
* all the parameters are different to avoid race conditions in the
* kernel execution. The asynchronous task graph looks like:
*
* (Fill input A buffer) (Fill input B buffer)
* | |
* (Enqueue command_buffer) (Enqueue updated command_buffer)
* | |
* (Read output A buffer) (Read output B buffer)
*/
clEventWrapper E[4];
cl_int error = clEnqueueFillBuffer(
work_queue, in_mem, &pattern_fill, sizeof(cl_int), 0, data_size(),
0, nullptr, (out_of_order_request ? &E[0] : nullptr));
test_error(error, "clEnqueueFillBuffer failed");
error = clEnqueueFillBuffer(work_queue, new_in_mem, &pattern_fill_2,
sizeof(cl_int), 0, data_size(), 0, nullptr,
(out_of_order_request ? &E[1] : nullptr));
test_error(error, "clEnqueueFillBuffer failed");
error = clEnqueueCommandBufferKHR(
0, nullptr, work_command_buffer, (out_of_order_request ? 1 : 0),
(out_of_order_request ? &E[0] : nullptr),
(out_of_order_request ? &E[2] : nullptr));
test_error(error, "clEnqueueCommandBufferKHR failed");
cl_mutable_dispatch_arg_khr arg_1{ 0, sizeof(new_out_mem),
&new_out_mem };
cl_mutable_dispatch_arg_khr arg_2{ 1, sizeof(cl_mem), &new_in_mem };
cl_mutable_dispatch_arg_khr arg_3{ 2, sizeof(cl_int), &pattern_sec };
cl_mutable_dispatch_arg_khr args[] = { arg_1, arg_2, arg_3 };
cl_mutable_dispatch_config_khr dispatch_config{
command,
3 /* num_args */,
0 /* num_svm_arg */,
0 /* num_exec_infos */,
0 /* work_dim - 0 means no change to dimensions */,
args /* arg_list */,
nullptr /* arg_svm_list - nullptr means no change*/,
nullptr /* exec_info_list */,
nullptr /* global_work_offset */,
nullptr /* global_work_size */,
nullptr /* local_work_size */
};
cl_uint num_configs = 1;
cl_command_buffer_update_type_khr config_types[1] = {
CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR
};
const void* configs[1] = { &dispatch_config };
error = clUpdateMutableCommandsKHR(work_command_buffer, num_configs,
config_types, configs);
test_error(error, "clUpdateMutableCommandsKHR failed");
error = clEnqueueCommandBufferKHR(
0, nullptr, work_command_buffer, (out_of_order_request ? 1 : 0),
(out_of_order_request ? &E[1] : nullptr),
(out_of_order_request ? &E[3] : nullptr));
test_error(error, "clEnqueueCommandBufferKHR failed");
error = clEnqueueReadBuffer(
work_queue, out_mem, CL_FALSE, 0, data_size(),
first_enqueue_output.data(), (out_of_order_request ? 1 : 0),
(out_of_order_request ? &E[2] : nullptr), nullptr);
test_error(error, "clEnqueueReadBuffer failed");
error = clEnqueueReadBuffer(
work_queue, new_out_mem, CL_FALSE, 0, data_size(),
second_enqueue_output.data(), (out_of_order_request ? 1 : 0),
(out_of_order_request ? &E[3] : nullptr), nullptr);
test_error(error, "clEnqueueReadBuffer failed");
return CL_SUCCESS;
}
cl_int VerifySerializedPass(std::vector<cl_int> &first_enqueue_output,
std::vector<cl_int> &second_enqueue_output)
{
const cl_int first_enqueue_ref = pattern_pri * pattern_fill;
const cl_int second_enqueue_ref = pattern_sec * first_enqueue_ref;
for (size_t i = 0; i < num_elements; i++)
{
CHECK_VERIFICATION_ERROR(first_enqueue_ref, first_enqueue_output[i],
i);
CHECK_VERIFICATION_ERROR(second_enqueue_ref,
second_enqueue_output[i], i);
}
return CL_SUCCESS;
}
cl_int VerifySimultaneousPass(std::vector<cl_int> &first_enqueue_output,
std::vector<cl_int> &second_enqueue_output)
{
const cl_int first_enqueue_ref = pattern_pri * pattern_fill;
const cl_int second_enqueue_ref = pattern_sec * pattern_fill_2;
for (size_t i = 0; i < num_elements; i++)
{
CHECK_VERIFICATION_ERROR(first_enqueue_ref, first_enqueue_output[i],
i);
CHECK_VERIFICATION_ERROR(second_enqueue_ref,
second_enqueue_output[i], i);
}
return CL_SUCCESS;
}
cl_int Run() override
{
cl_int error = CL_SUCCESS;
cl_int error = RecordCommandBuffer();
test_error(error, "RecordCommandBuffer failed");
if (simultaneous_use_support)
std::vector<cl_int> first_enqueue_output(num_elements);
std::vector<cl_int> second_enqueue_output(num_elements);
if (simultaneous_use_requested)
{
// enqueue simultaneous command-buffers with out-of-order calls
error = RunSimultaneous();
test_error(error, "RunSimultaneous failed");
error = RunSimultaneousPass(first_enqueue_output,
second_enqueue_output);
test_error(error, "RunSimultaneousPass failed");
}
else
{
// enqueue single command-buffer with out-of-order calls
error = RunSingle();
test_error(error, "RunSingle failed");
error =
RunSerializedPass(first_enqueue_output, second_enqueue_output);
test_error(error, "RunSerializedPass failed");
}
return CL_SUCCESS;
}
cl_int RecordCommandBuffer()
{
cl_sync_point_khr sync_points[2];
const cl_int pattern = pattern_pri;
cl_int error = clCommandFillBufferKHR(
work_command_buffer, nullptr, nullptr, in_mem, &pattern,
sizeof(cl_int), 0, data_size(), 0, nullptr, &sync_points[0],
nullptr);
test_error(error, "clCommandFillBufferKHR failed");
error = clCommandFillBufferKHR(work_command_buffer, nullptr, nullptr,
out_mem, &overwritten_pattern,
sizeof(cl_int), 0, data_size(), 0,
nullptr, &sync_points[1], nullptr);
test_error(error, "clCommandFillBufferKHR failed");
error = clCommandNDRangeKernelKHR(
work_command_buffer, nullptr, nullptr, kernel, 1, nullptr,
&num_elements, nullptr, 2, sync_points, nullptr, &command);
test_error(error, "clCommandNDRangeKernelKHR failed");
error = clFinalizeCommandBufferKHR(work_command_buffer);
test_error(error, "clFinalizeCommandBufferKHR failed");
return CL_SUCCESS;
}
cl_int RunSingle()
{
cl_int error;
error = RecordCommandBuffer();
test_error(error, "RecordCommandBuffer failed");
error = clEnqueueCommandBufferKHR(0, nullptr, work_command_buffer, 0,
nullptr, &single_event);
test_error(error, "clEnqueueCommandBufferKHR failed");
std::vector<cl_int> output_data(num_elements);
error =
clEnqueueReadBuffer(work_queue, out_mem, CL_TRUE, 0, data_size(),
output_data.data(), 1, &single_event, nullptr);
test_error(error, "clEnqueueReadBuffer failed");
for (size_t i = 0; i < num_elements; i++)
{
CHECK_VERIFICATION_ERROR(pattern_pri, output_data[i], i);
}
clMemWrapper new_out_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_int) * num_elements
* buffer_size_multiplier,
nullptr, &error);
test_error(error, "clCreateBuffer failed");
cl_mutable_dispatch_arg_khr arg_1{ 1, sizeof(new_out_mem),
&new_out_mem };
cl_mutable_dispatch_arg_khr args[] = { arg_1 };
cl_mutable_dispatch_config_khr dispatch_config{
command,
1 /* num_args */,
0 /* num_svm_arg */,
0 /* num_exec_infos */,
0 /* work_dim - 0 means no change to dimensions */,
args /* arg_list */,
nullptr /* arg_svm_list - nullptr means no change*/,
nullptr /* exec_info_list */,
nullptr /* global_work_offset */,
nullptr /* global_work_size */,
nullptr /* local_work_size */
};
cl_uint num_configs = 1;
cl_command_buffer_update_type_khr config_types[1] = {
CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR
};
const void* configs[1] = { &dispatch_config };
error = clUpdateMutableCommandsKHR(work_command_buffer, num_configs,
config_types, configs);
test_error(error, "clUpdateMutableCommandsKHR failed");
error = clEnqueueCommandBufferKHR(0, nullptr, work_command_buffer, 0,
nullptr, &single_event);
test_error(error, "clEnqueueCommandBufferKHR failed");
error = clEnqueueReadBuffer(work_queue, new_out_mem, CL_TRUE, 0,
data_size(), output_data.data(), 1,
&single_event, nullptr);
test_error(error, "clEnqueueReadBuffer failed");
for (size_t i = 0; i < num_elements; i++)
{
CHECK_VERIFICATION_ERROR(pattern_pri, output_data[i], i);
}
return CL_SUCCESS;
}
cl_int RecordSimultaneousCommandBuffer()
{
cl_sync_point_khr sync_points[2];
// for both simultaneous passes this call will fill entire in_mem buffer
cl_int error = clCommandFillBufferKHR(
work_command_buffer, nullptr, nullptr, in_mem, &pattern_pri,
sizeof(cl_int), 0, data_size() * buffer_size_multiplier, 0, nullptr,
&sync_points[0], nullptr);
test_error(error, "clCommandFillBufferKHR failed");
// to avoid overwriting the entire result buffer instead of filling
// only relevant part this additional kernel was introduced
error = clCommandNDRangeKernelKHR(
work_command_buffer, nullptr, nullptr, kernel_fill, 1, nullptr,
&num_elements, nullptr, 0, nullptr, &sync_points[1], &command);
test_error(error, "clCommandNDRangeKernelKHR failed");
error = clCommandNDRangeKernelKHR(
work_command_buffer, nullptr, nullptr, kernel, 1, nullptr,
&num_elements, nullptr, 2, sync_points, nullptr, &command);
test_error(error, "clCommandNDRangeKernelKHR failed");
error = clFinalizeCommandBufferKHR(work_command_buffer);
test_error(error, "clFinalizeCommandBufferKHR failed");
return CL_SUCCESS;
}
struct SimulPassData
{
cl_int offset;
std::vector<cl_int> output_buffer;
std::vector<cl_int> updated_output_buffer;
// 0:user event, 1:offset-buffer fill event, 2:kernel done event
clEventWrapper wait_events[3];
};
cl_int EnqueueSimultaneousPass(SimulPassData& pd)
{
cl_int error = CL_SUCCESS;
if (!user_event)
{
user_event = clCreateUserEvent(context, &error);
test_error(error, "clCreateUserEvent failed");
}
pd.wait_events[0] = user_event;
// filling offset buffer must wait for previous pass completeness
error = clEnqueueFillBuffer(
work_queue, off_mem, &pd.offset, sizeof(cl_int), 0, sizeof(cl_int),
(wait_pass_event != nullptr ? 1 : 0),
(wait_pass_event != nullptr ? &wait_pass_event : nullptr),
&pd.wait_events[1]);
test_error(error, "clEnqueueFillBuffer failed");
// command buffer execution must wait for two wait-events
error =
clEnqueueCommandBufferKHR(0, nullptr, work_command_buffer, 2,
&pd.wait_events[0], &pd.wait_events[2]);
test_error(error, "clEnqueueCommandBufferKHR failed");
error = clEnqueueReadBuffer(work_queue, out_mem, CL_FALSE,
pd.offset * sizeof(cl_int), data_size(),
pd.output_buffer.data(), 1,
&pd.wait_events[2], nullptr);
test_error(error, "clEnqueueReadBuffer failed");
clMemWrapper new_out_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_int) * num_elements
* buffer_size_multiplier,
nullptr, &error);
test_error(error, "clCreateBuffer failed");
// Retain new output memory object until the end of the test.
retained_output_buffers.push_back(new_out_mem);
cl_mutable_dispatch_arg_khr arg_1{ 1, sizeof(new_out_mem),
&new_out_mem };
cl_mutable_dispatch_arg_khr args[] = { arg_1 };
cl_mutable_dispatch_config_khr dispatch_config{
command,
1 /* num_args */,
0 /* num_svm_arg */,
0 /* num_exec_infos */,
0 /* work_dim - 0 means no change to dimensions */,
args /* arg_list */,
nullptr /* arg_svm_list - nullptr means no change*/,
nullptr /* exec_info_list */,
nullptr /* global_work_offset */,
nullptr /* global_work_size */,
nullptr /* local_work_size */
};
cl_uint num_configs = 1;
cl_command_buffer_update_type_khr config_types[1] = {
CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR
};
const void* configs[1] = { &dispatch_config };
error = clUpdateMutableCommandsKHR(work_command_buffer, num_configs,
config_types, configs);
test_error(error, "clUpdateMutableCommandsKHR failed");
// command buffer execution must wait for two wait-events
error =
clEnqueueCommandBufferKHR(0, nullptr, work_command_buffer, 2,
&pd.wait_events[0], &pd.wait_events[2]);
test_error(error, "clEnqueueCommandBufferKHR failed");
error = clEnqueueReadBuffer(work_queue, new_out_mem, CL_FALSE,
pd.offset * sizeof(cl_int), data_size(),
pd.updated_output_buffer.data(), 1,
&pd.wait_events[2], nullptr);
test_error(error, "clEnqueueReadBuffer failed");
return CL_SUCCESS;
}
cl_int RunSimultaneous()
{
cl_int error = RecordSimultaneousCommandBuffer();
test_error(error, "RecordSimultaneousCommandBuffer failed");
cl_int offset = static_cast<cl_int>(num_elements);
std::vector<SimulPassData> simul_passes = {
{ 0, std::vector<cl_int>(num_elements),
std::vector<cl_int>(num_elements) },
{ offset, std::vector<cl_int>(num_elements),
std::vector<cl_int>(num_elements) }
};
for (auto&& pass : simul_passes)
{
error = EnqueueSimultaneousPass(pass);
test_error(error, "EnqueueSimultaneousPass failed");
wait_pass_event = pass.wait_events[2];
}
error = clSetUserEventStatus(user_event, CL_COMPLETE);
test_error(error, "clSetUserEventStatus failed");
error = clFinish(work_queue);
test_error(error, "clFinish failed");
// verify the result buffers
auto& first_pass_output = simul_passes[0].output_buffer;
auto& first_pass_updated_output = simul_passes[0].updated_output_buffer;
auto& second_pass_output = simul_passes[1].output_buffer;
auto& second_pass_updated_output =
simul_passes[1].updated_output_buffer;
for (size_t i = 0; i < num_elements; i++)
if (simultaneous_use_requested)
{
// First pass:
// Before updating, out_mem is copied from in_mem (pattern_pri)
CHECK_VERIFICATION_ERROR(pattern_pri, first_pass_output[i], i);
// After updating, new_out_mem is copied from in_mem (pattern_pri)
CHECK_VERIFICATION_ERROR(pattern_pri, first_pass_updated_output[i],
i);
// Second pass:
// Before updating, out_mem is filled with overwritten_pattern
CHECK_VERIFICATION_ERROR(overwritten_pattern, second_pass_output[i],
i);
// After updating, new_out_mem is copied from in_mem (pattern_pri)
CHECK_VERIFICATION_ERROR(pattern_pri, second_pass_updated_output[i],
i);
error = VerifySimultaneousPass(first_enqueue_output,
second_enqueue_output);
test_error(error, "VerifySimultaneousPass failed");
}
else
{
error = VerifySerializedPass(first_enqueue_output,
second_enqueue_output);
test_error(error, "VerifySerializedPass failed");
}
return CL_SUCCESS;
@@ -440,22 +400,20 @@ struct SimultaneousMutableDispatchTest : public BasicMutableCommandBufferTest
clCommandQueueWrapper work_queue;
clCommandBufferWrapper work_command_buffer;
clEventWrapper user_event;
clEventWrapper single_event;
clEventWrapper wait_pass_event;
clKernelWrapper kernel_mul;
clProgramWrapper program_mul;
clKernelWrapper kernel_fill;
clProgramWrapper program_fill;
clMemWrapper new_out_mem, new_in_mem;
std::vector<clMemWrapper> retained_output_buffers;
const size_t test_global_work_size = 3 * sizeof(cl_int);
const cl_int pattern_pri = 42;
const cl_int pattern_sec = 0xACDC;
const cl_int pattern_fill = 0xA;
const cl_int pattern_fill_2 = -3;
const cl_int overwritten_pattern = 0xACDC;
cl_mutable_command_khr command;
};
template <bool simultaneous_use_request>
struct CrossQueueSimultaneousMutableDispatchTest
: public BasicMutableCommandBufferTest
{
@@ -463,9 +421,9 @@ struct CrossQueueSimultaneousMutableDispatchTest
cl_context context,
cl_command_queue queue)
: BasicMutableCommandBufferTest(device, context, queue),
queue_sec(nullptr), command(nullptr)
queue_sec(nullptr), new_out_mem(nullptr), command(nullptr)
{
simultaneous_use_requested = true;
simultaneous_use_requested = simultaneous_use_request;
}
cl_int SetUpKernel() override
@@ -488,6 +446,11 @@ struct CrossQueueSimultaneousMutableDispatchTest
kernel = clCreateKernel(program, "fill", &error);
test_error(error, "Failed to create copy kernel");
new_out_mem =
clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_int) * num_elements, nullptr, &error);
test_error(error, "clCreateBuffer failed");
return CL_SUCCESS;
}
@@ -530,24 +493,18 @@ struct CrossQueueSimultaneousMutableDispatchTest
sizeof(mutable_capabilities), &mutable_capabilities, nullptr)
&& mutable_capabilities & CL_MUTABLE_DISPATCH_ARGUMENTS_KHR;
return !simultaneous_use_support || !mutable_support;
return (simultaneous_use_requested && !simultaneous_use_support)
|| !mutable_support;
}
cl_int Run() override
{
// record command buffer
cl_int pattern = 0;
cl_int error = clCommandFillBufferKHR(
command_buffer, nullptr, nullptr, out_mem, &pattern, sizeof(cl_int),
0, data_size(), 0, nullptr, nullptr, nullptr);
test_error(error, "clCommandFillBufferKHR failed");
cl_command_properties_khr props[] = {
CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR,
CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, 0
};
error = clCommandNDRangeKernelKHR(
cl_int error = clCommandNDRangeKernelKHR(
command_buffer, nullptr, props, kernel, 1, nullptr, &num_elements,
nullptr, 0, nullptr, nullptr, &command);
test_error(error, "clCommandNDRangeKernelKHR failed");
@@ -555,16 +512,15 @@ struct CrossQueueSimultaneousMutableDispatchTest
error = clFinalizeCommandBufferKHR(command_buffer);
test_error(error, "clFinalizeCommandBufferKHR failed");
// enqueue command buffer to default queue
error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
nullptr, nullptr);
// If we are testing not using simultaneous-use then we need to use
// an event to serialize the execution order to the command-buffer
// submission to each queue.
clEventWrapper E;
error = clEnqueueCommandBufferKHR(
0, nullptr, command_buffer, 0, nullptr,
(simultaneous_use_requested ? nullptr : &E));
test_error(error, "clEnqueueCommandBufferKHR failed");
// update mutable parameters
clMemWrapper new_out_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
data_size(), nullptr, &error);
test_error(error, "clCreateBuffer failed");
cl_mutable_dispatch_arg_khr arg_0{ 0, sizeof(cl_int), &pattern_sec };
cl_mutable_dispatch_arg_khr arg_1{ 1, sizeof(new_out_mem),
&new_out_mem };
@@ -594,30 +550,35 @@ struct CrossQueueSimultaneousMutableDispatchTest
test_error(error, "clUpdateMutableCommandsKHR failed");
// enqueue command buffer to non-default queue
error = clEnqueueCommandBufferKHR(1, &queue_sec, command_buffer, 0,
nullptr, nullptr);
error = clEnqueueCommandBufferKHR(
1, &queue_sec, command_buffer, (simultaneous_use_requested ? 0 : 1),
(simultaneous_use_requested ? nullptr : &E), nullptr);
test_error(error, "clEnqueueCommandBufferKHR failed");
error = clFinish(queue_sec);
test_error(error, "clFinish failed");
// read result of command buffer execution
std::vector<cl_int> output_data(num_elements);
error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
output_data.data(), 0, nullptr, nullptr);
test_error(error, "clEnqueueReadBuffer failed");
std::vector<cl_int> sec_output_data(num_elements);
error =
clEnqueueReadBuffer(queue_sec, new_out_mem, CL_TRUE, 0, data_size(),
output_data.data(), 0, nullptr, nullptr);
sec_output_data.data(), 0, nullptr, nullptr);
test_error(error, "clEnqueueReadBuffer failed");
// verify the result
for (size_t i = 0; i < num_elements; i++)
{
CHECK_VERIFICATION_ERROR(pattern_sec, output_data[i], i);
CHECK_VERIFICATION_ERROR(pattern_pri, output_data[i], i);
CHECK_VERIFICATION_ERROR(pattern_sec, sec_output_data[i], i);
}
return CL_SUCCESS;
}
clCommandQueueWrapper queue_sec;
clMemWrapper new_out_mem;
const cl_int pattern_pri = 42;
const cl_int pattern_sec = 0xACDC;
cl_mutable_command_khr command;
@@ -637,14 +598,26 @@ REGISTER_TEST(mutable_dispatch_simultaneous_out_of_order)
device, context, queue, num_elements);
}
REGISTER_TEST(mutable_dispatch_in_order)
{
return MakeAndRunTest<SimultaneousMutableDispatchTest<false, false>>(
device, context, queue, num_elements);
}
REGISTER_TEST(mutable_dispatch_simultaneous_in_order)
{
return MakeAndRunTest<SimultaneousMutableDispatchTest<true, false>>(
device, context, queue, num_elements);
}
REGISTER_TEST(mutable_dispatch_simultaneous_cross_queue)
REGISTER_TEST(mutable_dispatch_cross_queue)
{
return MakeAndRunTest<CrossQueueSimultaneousMutableDispatchTest>(
return MakeAndRunTest<CrossQueueSimultaneousMutableDispatchTest<false>>(
device, context, queue, num_elements);
}
REGISTER_TEST(mutable_dispatch_simultaneous_cross_queue)
{
return MakeAndRunTest<CrossQueueSimultaneousMutableDispatchTest<true>>(
device, context, queue, num_elements);
}

View File

@@ -79,11 +79,7 @@ struct CommandBufferEventSync : public BasicCommandBufferTest
: BasicCommandBufferTest(device, context, queue),
command_buffer_sec(this), kernel_sec(nullptr), in_mem_sec(nullptr),
out_mem_sec(nullptr), off_mem_sec(nullptr), test_event(nullptr)
{
simultaneous_use_requested =
(event_mode == EventMode::RET_COMBUF_WAIT_FOR_COMBUF) ? true
: false;
}
{}
//--------------------------------------------------------------------------
cl_int SetUpKernel() override
@@ -159,9 +155,6 @@ struct CommandBufferEventSync : public BasicCommandBufferTest
{
if (BasicCommandBufferTest::Skip()) return true;
if (simultaneous_use_requested && !simultaneous_use_support)
return true;
if (out_of_order_requested && !out_of_order_support) return true;
return false;

View File

@@ -48,6 +48,39 @@ struct CommandBufferGetCommandBufferInfo : public BasicCommandBufferTest
: BasicCommandBufferTest(device, context, queue)
{}
bool Skip() override
{
if (BasicCommandBufferTest::Skip()) return true;
if (test_mode == CombufInfoTestMode::CITM_PROP_ARRAY)
{
return !simultaneous_use_support
|| !(is_extension_available(
device,
CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_NAME));
}
return false;
}
cl_int SetUp(int elements) override
{
cl_int error = BasicCommandBufferTest::SetUp(elements);
test_error(error, "BasicCommandBufferTest::SetUp() failed");
if (test_mode == CombufInfoTestMode::CITM_PROP_ARRAY)
{
cl_command_buffer_properties_khr properties[3] = {
CL_COMMAND_BUFFER_FLAGS_KHR,
CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR, 0
};
command_buffer =
clCreateCommandBufferKHR(1, &queue, properties, &error);
test_error(error, "clCreateCommandBufferKHR failed");
}
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int Run() override
{
@@ -237,33 +270,6 @@ struct CommandBufferGetCommandBufferInfo : public BasicCommandBufferTest
error = verify_state(CL_COMMAND_BUFFER_STATE_EXECUTABLE_KHR);
test_error(error, "verify_state failed");
error = clEnqueueFillBuffer(queue, out_mem, &pattern, sizeof(cl_int), 0,
data_size(), 0, nullptr, nullptr);
test_error(error, "clEnqueueFillBuffer failed");
clEventWrapper trigger_event = clCreateUserEvent(context, &error);
test_error(error, "clCreateUserEvent failed");
clEventWrapper execute_event;
// enqueued command buffer blocked on user event
error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 1,
&trigger_event, &execute_event);
test_error(error, "clEnqueueCommandBufferKHR failed");
// execute command buffer
cl_int signal_error = clSetUserEventStatus(trigger_event, CL_COMPLETE);
test_error(error, "verify_state failed");
test_error(signal_error, "clSetUserEventStatus failed");
error = clWaitForEvents(1, &execute_event);
test_error(error, "Unable to wait for execute event");
// verify executable state
error = verify_state(CL_COMMAND_BUFFER_STATE_EXECUTABLE_KHR);
test_error(error, "verify_state failed");
return CL_SUCCESS;
}

View File

@@ -21,11 +21,9 @@
namespace {
////////////////////////////////////////////////////////////////////////////////
// out-of-order tests for cl_khr_command_buffer which handles below cases:
// -test case for out-of-order command-buffer
// -test an out-of-order command-buffer with simultaneous use
// Tests for cl_khr_command_buffer which handles submitting a command-buffer to
// an out-of-order queue.
template <bool simultaneous_request>
struct OutOfOrderTest : public BasicCommandBufferTest
{
OutOfOrderTest(cl_device_id device, cl_context context,
@@ -35,18 +33,11 @@ struct OutOfOrderTest : public BasicCommandBufferTest
user_event(nullptr), wait_pass_event(nullptr), kernel_fill(nullptr),
program_fill(nullptr)
{
simultaneous_use_requested = simultaneous_request;
if (simultaneous_request) buffer_size_multiplier = 2;
buffer_size_multiplier = 2; // two enqueues of command-buffer
}
//--------------------------------------------------------------------------
cl_int SetUpKernel() override
{
// if device doesn't support simultaneous use which was requested
// we can skip creation of OCL resources
if (simultaneous_use_requested && !simultaneous_use_support)
return CL_SUCCESS;
cl_int error = BasicCommandBufferTest::SetUpKernel();
test_error(error, "BasicCommandBufferTest::SetUpKernel failed");
@@ -74,14 +65,8 @@ struct OutOfOrderTest : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int SetUpKernelArgs() override
{
// if device doesn't support simultaneous use which was requested
// we can skip creation of OCL resources
if (simultaneous_use_requested && !simultaneous_use_support)
return CL_SUCCESS;
cl_int error = BasicCommandBufferTest::SetUpKernelArgs();
test_error(error, "BasicCommandBufferTest::SetUpKernelArgs failed");
@@ -98,7 +83,6 @@ struct OutOfOrderTest : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int SetUp(int elements) override
{
cl_int error = BasicCommandBufferTest::SetUp(elements);
@@ -108,110 +92,23 @@ struct OutOfOrderTest : public BasicCommandBufferTest
context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &error);
test_error(error, "Unable to create command queue to test with");
cl_command_buffer_properties_khr properties[3] = {
CL_COMMAND_BUFFER_FLAGS_KHR, 0, 0
};
if (simultaneous_use_requested && simultaneous_use_support)
properties[1] = CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR;
out_of_order_command_buffer = clCreateCommandBufferKHR(
1, &out_of_order_queue, properties, &error);
out_of_order_command_buffer =
clCreateCommandBufferKHR(1, &out_of_order_queue, nullptr, &error);
test_error(error, "clCreateCommandBufferKHR failed");
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
bool Skip() override
{
if (BasicCommandBufferTest::Skip()) return true;
if (!out_of_order_support
|| (simultaneous_use_requested && !simultaneous_use_support))
return true;
return false;
return !out_of_order_support;
}
//--------------------------------------------------------------------------
cl_int Run() override
{
cl_int error = CL_SUCCESS;
if (simultaneous_use_support)
{
// enqueue simultaneous command-buffers with out-of-order calls
error = RunSimultaneous();
test_error(error, "RunSimultaneous failed");
}
else
{
// enqueue single command-buffer with out-of-order calls
error = RunSingle();
test_error(error, "RunSingle failed");
}
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RecordCommandBuffer()
cl_int RecordCommandBuffer() const
{
cl_sync_point_khr sync_points[2];
const cl_int pattern = pattern_pri;
cl_int error = clCommandFillBufferKHR(
out_of_order_command_buffer, nullptr, nullptr, in_mem, &pattern,
sizeof(cl_int), 0, data_size(), 0, nullptr, &sync_points[0],
nullptr);
test_error(error, "clCommandFillBufferKHR failed");
error = clCommandFillBufferKHR(out_of_order_command_buffer, nullptr,
nullptr, out_mem, &overwritten_pattern,
sizeof(cl_int), 0, data_size(), 0,
nullptr, &sync_points[1], nullptr);
test_error(error, "clCommandFillBufferKHR failed");
error = clCommandNDRangeKernelKHR(
out_of_order_command_buffer, nullptr, nullptr, kernel, 1, nullptr,
&num_elements, nullptr, 2, sync_points, nullptr, nullptr);
test_error(error, "clCommandNDRangeKernelKHR failed");
error = clFinalizeCommandBufferKHR(out_of_order_command_buffer);
test_error(error, "clFinalizeCommandBufferKHR failed");
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RunSingle()
{
cl_int error = RecordCommandBuffer();
test_error(error, "RecordCommandBuffer failed");
error = clEnqueueCommandBufferKHR(
0, nullptr, out_of_order_command_buffer, 0, nullptr, &user_event);
test_error(error, "clEnqueueCommandBufferKHR failed");
std::vector<cl_int> output_data(num_elements);
error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_TRUE, 0,
data_size(), output_data.data(), 1,
&user_event, nullptr);
test_error(error, "clEnqueueReadBuffer failed");
for (size_t i = 0; i < num_elements; i++)
{
CHECK_VERIFICATION_ERROR(pattern_pri, output_data[i], i);
}
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RecordSimultaneousCommandBuffer() const
{
cl_sync_point_khr sync_points[2];
// for both simultaneous passes this call will fill entire in_mem buffer
// fill entire in_mem buffer
cl_int error = clCommandFillBufferKHR(
out_of_order_command_buffer, nullptr, nullptr, in_mem, &pattern_pri,
sizeof(cl_int), 0, data_size() * buffer_size_multiplier, 0, nullptr,
@@ -236,79 +133,63 @@ struct OutOfOrderTest : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
struct SimulPassData
struct EnqueuePassData
{
cl_int offset;
std::vector<cl_int> output_buffer;
// 0:user event, 1:offset-buffer fill event, 2:kernel done event
clEventWrapper wait_events[3];
// 0: offset-buffer fill event, 1: kernel done event
clEventWrapper wait_events[2];
};
//--------------------------------------------------------------------------
cl_int EnqueueSimultaneousPass(SimulPassData& pd)
cl_int EnqueuePass(EnqueuePassData& pd)
{
cl_int error = CL_SUCCESS;
if (!user_event)
{
user_event = clCreateUserEvent(context, &error);
test_error(error, "clCreateUserEvent failed");
}
pd.wait_events[0] = user_event;
// filling offset buffer must wait for previous pass completeness
error = clEnqueueFillBuffer(
cl_int error = clEnqueueFillBuffer(
out_of_order_queue, off_mem, &pd.offset, sizeof(cl_int), 0,
sizeof(cl_int), (wait_pass_event != nullptr ? 1 : 0),
(wait_pass_event != nullptr ? &wait_pass_event : nullptr),
&pd.wait_events[1]);
&pd.wait_events[0]);
test_error(error, "clEnqueueFillBuffer failed");
// command buffer execution must wait for two wait-events
error = clEnqueueCommandBufferKHR(
0, nullptr, out_of_order_command_buffer, 2, &pd.wait_events[0],
&pd.wait_events[2]);
0, nullptr, out_of_order_command_buffer, 1, &pd.wait_events[0],
&pd.wait_events[1]);
test_error(error, "clEnqueueCommandBufferKHR failed");
error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_FALSE,
pd.offset * sizeof(cl_int), data_size(),
pd.output_buffer.data(), 1,
&pd.wait_events[2], nullptr);
&pd.wait_events[1], nullptr);
test_error(error, "clEnqueueReadBuffer failed");
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RunSimultaneous()
cl_int Run() override
{
cl_int error = RecordSimultaneousCommandBuffer();
test_error(error, "RecordSimultaneousCommandBuffer failed");
cl_int error = RecordCommandBuffer();
test_error(error, "RecordCommandBuffer failed");
cl_int offset = static_cast<cl_int>(num_elements);
std::vector<SimulPassData> simul_passes = {
std::vector<EnqueuePassData> enqueue_passes = {
{ 0, std::vector<cl_int>(num_elements) },
{ offset, std::vector<cl_int>(num_elements) }
};
for (auto&& pass : simul_passes)
for (auto&& pass : enqueue_passes)
{
error = EnqueueSimultaneousPass(pass);
test_error(error, "EnqueueSimultaneousPass failed");
error = EnqueuePass(pass);
test_error(error, "EnqueuePass failed");
wait_pass_event = pass.wait_events[2];
wait_pass_event = pass.wait_events[1];
}
error = clSetUserEventStatus(user_event, CL_COMPLETE);
test_error(error, "clSetUserEventStatus failed");
error = clFinish(out_of_order_queue);
test_error(error, "clFinish failed");
// verify the result buffers
for (auto&& pass : simul_passes)
for (auto&& pass : enqueue_passes)
{
auto& res_data = pass.output_buffer;
for (size_t i = 0; i < num_elements; i++)
@@ -320,7 +201,6 @@ struct OutOfOrderTest : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
clCommandQueueWrapper out_of_order_queue;
clCommandBufferWrapper out_of_order_command_buffer;
@@ -338,12 +218,5 @@ struct OutOfOrderTest : public BasicCommandBufferTest
REGISTER_TEST(out_of_order)
{
return MakeAndRunTest<OutOfOrderTest<false>>(device, context, queue,
num_elements);
}
REGISTER_TEST(simultaneous_out_of_order)
{
return MakeAndRunTest<OutOfOrderTest<true>>(device, context, queue,
num_elements);
return MakeAndRunTest<OutOfOrderTest>(device, context, queue, num_elements);
}

View File

@@ -0,0 +1,321 @@
//
// Copyright (c) 2025 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "basic_command_buffer.h"
#include <vector>
namespace {
////////////////////////////////////////////////////////////////////////////////
// Tests for multiple sequential submissions of a command-buffer without a
// blocking wait between them, but using the following mechanisms to serialize
// execution of the submissions.
// * In-order queue dependencies
// * Event dependencies in command-buffer submissions to an out-of-order queue
// * Barrier submissions between command-buffer submissions to an out-of-order
// queue
// Base class that individual test fixtures are derived from
// Base fixture shared by the pipelining tests below. Builds a program with
// two kernels ("mul_by_val" and "increment"), allocates the output buffer,
// and records a command-buffer containing a single increment dispatch.
// Derived fixtures submit that command-buffer twice with a mul_by_val
// dispatch in between, serialized by different mechanisms.
struct CommandBufferPipelined : public BasicCommandBufferTest
{
    CommandBufferPipelined(cl_device_id device, cl_context context,
                           cl_command_queue queue)
        : BasicCommandBufferTest(device, context, queue)
    {}

    // Compile the embedded program and create both kernel objects.
    cl_int SetUpKernel() override
    {
        const char* mul_kernel_str =
            R"(
            __kernel void mul_by_val(int in, __global int* data)
            {
                size_t id = get_global_id(0);
                data[id] *= in;
            }

            __kernel void increment(__global int* data)
            {
                size_t id = get_global_id(0);
                data[id]++;
            })";
        cl_int error = create_single_kernel_helper_create_program(
            context, &program, 1, &mul_kernel_str);
        test_error(error, "Failed to create program with source");

        error = clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);
        test_error(error, "Failed to build program");

        mul_kernel = clCreateKernel(program, "mul_by_val", &error);
        test_error(error, "Failed to create mul_by_val kernel");

        inc_kernel = clCreateKernel(program, "increment", &error);
        test_error(error, "Failed to create increment kernel");

        return CL_SUCCESS;
    }

    // Allocate out_mem and bind kernel arguments. mul_by_val's scalar
    // argument is fixed to `pattern` (42) for the whole test.
    cl_int SetUpKernelArgs() override
    {
        cl_int error = CL_SUCCESS;
        out_mem = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                 num_elements * buffer_size_multiplier
                                     * sizeof(cl_int),
                                 nullptr, &error);
        test_error(error, "clCreateBuffer failed");

        cl_int val_arg = pattern;
        error = clSetKernelArg(mul_kernel, 0, sizeof(cl_int), &val_arg);
        test_error(error, "clSetKernelArg failed");
        error = clSetKernelArg(mul_kernel, 1, sizeof(out_mem), &out_mem);
        test_error(error, "clSetKernelArg failed");
        error = clSetKernelArg(inc_kernel, 0, sizeof(out_mem), &out_mem);
        test_error(error, "clSetKernelArg failed");

        return CL_SUCCESS;
    }

    // Record a single increment dispatch into cmd_buf, finalize it, then
    // zero-fill data_size() bytes of out_mem so every run starts from a
    // known state. The clFinish makes the fill visible before Run() begins.
    // NOTE(review): assumes data_size() covers the portion of out_mem the
    // derived tests read back — confirm against BasicCommandBufferTest.
    cl_int RecordCommandBuffer(clCommandBufferWrapper& cmd_buf)
    {
        cl_int error = clCommandNDRangeKernelKHR(
            cmd_buf, nullptr, nullptr, inc_kernel, 1, nullptr, &num_elements,
            nullptr, 0, nullptr, nullptr, nullptr);
        test_error(error, "clCommandNDRangeKernelKHR failed");

        error = clFinalizeCommandBufferKHR(cmd_buf);
        test_error(error, "clFinalizeCommandBufferKHR failed");

        // Zero initialize buffer before starting test
        cl_int zero_pattern = 0;
        error =
            clEnqueueFillBuffer(queue, out_mem, &zero_pattern, sizeof(cl_int),
                                0, data_size(), 0, nullptr, nullptr);
        test_error(error, "clEnqueueFillBuffer failed");
        error = clFinish(queue);
        test_error(error, "clFinish failed");

        return CL_SUCCESS;
    }

    // Multiplier used by mul_by_val; tests verify out_mem == pattern + 1,
    // i.e. (0 + 1) * pattern + 1 after increment -> mul -> increment.
    const cl_int pattern = 42;
    clKernelWrapper inc_kernel = nullptr;
    clKernelWrapper mul_kernel = nullptr;
};
// Pipelines two submissions of the command-buffer around a mul_by_val
// dispatch on the default in-order queue. No events or barriers are used:
// the queue's implicit ordering alone serializes
// increment -> mul_by_val -> increment.
struct InOrderPipelined : public CommandBufferPipelined
{
    InOrderPipelined(cl_device_id device, cl_context context,
                     cl_command_queue queue)
        : CommandBufferPipelined(device, context, queue)
    {}

    cl_int Run() override
    {
        cl_int error = RecordCommandBuffer(command_buffer);
        test_error(error, "RecordCommandBuffer failed");

        // First submission: increments the zero-filled buffer to all 1s.
        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
                                          nullptr, nullptr);
        test_error(error, "clEnqueueCommandBufferKHR failed");

        // Multiply by `pattern`; in-order queue guarantees this runs after
        // the first command-buffer submission without an explicit event.
        error =
            clEnqueueNDRangeKernel(queue, mul_kernel, 1, nullptr, &num_elements,
                                   nullptr, 0, nullptr, nullptr);
        test_error(error, "clEnqueueNDRangeKernel failed");

        // Second submission of the same command-buffer, again with no
        // explicit synchronization between enqueues.
        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
                                          nullptr, nullptr);
        test_error(error, "clEnqueueCommandBufferKHR failed");

        // Blocking read doubles as the final synchronization point.
        std::vector<cl_int> output_data(num_elements);
        error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
                                    output_data.data(), 0, nullptr, nullptr);
        test_error(error, "clEnqueueReadBuffer failed");

        // Verify: (0 + 1) * pattern + 1 == pattern + 1 in every element.
        const cl_int ref = pattern + 1;
        for (size_t i = 0; i < num_elements; i++)
        {
            CHECK_VERIFICATION_ERROR(ref, output_data[i], i);
        }

        return CL_SUCCESS;
    }
};
// Pipelines command-buffer submissions on an out-of-order queue, using an
// explicit event chain (events[0] -> events[1] -> events[2]) to serialize
// increment -> mul_by_val -> increment -> read-back.
struct EventPipelined : public CommandBufferPipelined
{
    EventPipelined(cl_device_id device, cl_context context,
                   cl_command_queue queue)
        : CommandBufferPipelined(device, context, queue),
          out_of_order_queue(nullptr), out_of_order_command_buffer(this)
    {}

    // Requires out-of-order queue support in addition to the base checks.
    bool Skip() override
    {
        return CommandBufferPipelined::Skip() || !out_of_order_support;
    }

    // Create the out-of-order queue and a command-buffer targeting it.
    cl_int SetUp(int elements) override
    {
        cl_int error = CommandBufferPipelined::SetUp(elements);
        test_error(error, "EventPipelined::SetUp failed");

        out_of_order_queue = clCreateCommandQueue(
            context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &error);
        test_error(error, "Unable to create command queue to test with");

        out_of_order_command_buffer =
            clCreateCommandBufferKHR(1, &out_of_order_queue, nullptr, &error);
        test_error(error, "clCreateCommandBufferKHR failed");

        return CL_SUCCESS;
    }

    cl_int Run() override
    {
        cl_int error = RecordCommandBuffer(out_of_order_command_buffer);
        test_error(error, "RecordCommandBuffer failed");

        // First submission; completion signaled through events[0].
        error = clEnqueueCommandBufferKHR(
            0, nullptr, out_of_order_command_buffer, 0, nullptr, &events[0]);
        test_error(error, "clEnqueueCommandBufferKHR failed");

        // mul_by_val waits on events[0] and signals events[1].
        error = clEnqueueNDRangeKernel(out_of_order_queue, mul_kernel, 1,
                                       nullptr, &num_elements, nullptr, 1,
                                       &events[0], &events[1]);
        test_error(error, "clEnqueueNDRangeKernel failed");

        // Second submission waits on events[1] and signals events[2].
        error = clEnqueueCommandBufferKHR(
            0, nullptr, out_of_order_command_buffer, 1, &events[1], &events[2]);
        test_error(error, "clEnqueueCommandBufferKHR failed");

        // Blocking read waits on events[2] to complete the chain.
        std::vector<cl_int> output_data(num_elements);
        error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_TRUE, 0,
                                    data_size(), output_data.data(), 1,
                                    &events[2], nullptr);
        test_error(error, "clEnqueueReadBuffer failed");

        // Verify: (0 + 1) * pattern + 1 == pattern + 1 in every element.
        const cl_int ref = pattern + 1;
        for (size_t i = 0; i < num_elements; i++)
        {
            CHECK_VERIFICATION_ERROR(ref, output_data[i], i);
        }

        return CL_SUCCESS;
    }

    clCommandQueueWrapper out_of_order_queue;
    clCommandBufferWrapper out_of_order_command_buffer;
    // 0: first cmd-buf, 1: mul_by_val kernel, 2: second cmd-buf.
    clEventWrapper events[3] = { nullptr, nullptr, nullptr };
};
// Pipelines command-buffer submissions on an out-of-order queue, using
// clEnqueueBarrier between submissions (instead of events) to serialize
// increment -> barrier -> mul_by_val -> barrier -> increment -> barrier,
// then a blocking read verifies every element equals pattern + 1.
struct BarrierPipelined : public CommandBufferPipelined
{
    BarrierPipelined(cl_device_id device, cl_context context,
                     cl_command_queue queue)
        : CommandBufferPipelined(device, context, queue),
          out_of_order_queue(nullptr), out_of_order_command_buffer(this)
    {}

    // Requires out-of-order queue support in addition to the base checks.
    bool Skip() override
    {
        return CommandBufferPipelined::Skip() || !out_of_order_support;
    }

    // Create the out-of-order queue and a command-buffer targeting it.
    cl_int SetUp(int elements) override
    {
        cl_int error = CommandBufferPipelined::SetUp(elements);
        // Bug fix: this message previously read "EventPipelined::SetUp
        // failed" (copy/paste from the sibling fixture), misattributing
        // any failure here to the wrong test.
        test_error(error, "CommandBufferPipelined::SetUp failed");

        out_of_order_queue = clCreateCommandQueue(
            context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &error);
        test_error(error, "Unable to create command queue to test with");

        out_of_order_command_buffer =
            clCreateCommandBufferKHR(1, &out_of_order_queue, nullptr, &error);
        test_error(error, "clCreateCommandBufferKHR failed");

        return CL_SUCCESS;
    }

    cl_int Run() override
    {
        cl_int error = RecordCommandBuffer(out_of_order_command_buffer);
        test_error(error, "RecordCommandBuffer failed");

        // First submission: increments the zero-filled buffer to all 1s.
        error = clEnqueueCommandBufferKHR(
            0, nullptr, out_of_order_command_buffer, 0, nullptr, nullptr);
        test_error(error, "clEnqueueCommandBufferKHR failed");

        // Barrier orders the following commands after the submission above.
        error = clEnqueueBarrier(out_of_order_queue);
        test_error(error, "clEnqueueBarrier failed");

        error =
            clEnqueueNDRangeKernel(out_of_order_queue, mul_kernel, 1, nullptr,
                                   &num_elements, nullptr, 0, nullptr, nullptr);
        test_error(error, "clEnqueueNDRangeKernel failed");

        error = clEnqueueBarrier(out_of_order_queue);
        test_error(error, "clEnqueueBarrier failed");

        // Second submission of the same command-buffer.
        error = clEnqueueCommandBufferKHR(
            0, nullptr, out_of_order_command_buffer, 0, nullptr, nullptr);
        test_error(error, "clEnqueueCommandBufferKHR failed");

        // Final barrier orders the read-back after the second submission.
        error = clEnqueueBarrier(out_of_order_queue);
        test_error(error, "clEnqueueBarrier failed");

        std::vector<cl_int> output_data(num_elements);
        error = clEnqueueReadBuffer(out_of_order_queue, out_mem, CL_TRUE, 0,
                                    data_size(), output_data.data(), 0, nullptr,
                                    nullptr);
        test_error(error, "clEnqueueReadBuffer failed");

        // Verify: (0 + 1) * pattern + 1 == pattern + 1 in every element.
        const cl_int ref = pattern + 1;
        for (size_t i = 0; i < num_elements; i++)
        {
            CHECK_VERIFICATION_ERROR(ref, output_data[i], i);
        }

        return CL_SUCCESS;
    }

    clCommandQueueWrapper out_of_order_queue;
    clCommandBufferWrapper out_of_order_command_buffer;
};
} // anonymous namespace
// Entry point: serialization via implicit in-order queue dependencies.
REGISTER_TEST(pipeline_in_order_deps)
{
    return MakeAndRunTest<InOrderPipelined>(device, context, queue,
                                            num_elements);
}

// Entry point: serialization via event dependencies on an out-of-order queue.
REGISTER_TEST(pipeline_event_deps)
{
    return MakeAndRunTest<EventPipelined>(device, context, queue, num_elements);
}

// Entry point: serialization via barriers on an out-of-order queue.
REGISTER_TEST(pipeline_barrier_deps)
{
    return MakeAndRunTest<BarrierPipelined>(device, context, queue,
                                            num_elements);
}

View File

@@ -44,27 +44,18 @@
namespace {
////////////////////////////////////////////////////////////////////////////////
// printf tests for cl_khr_command_buffer which handles below cases:
// -test cases for device side printf
// -test cases for device side printf with a simultaneous use command-buffer
// Test for cl_khr_command_buffer which handles a command-buffer containing a
// printf kernel being repeatedly enqueued.
template <bool simul_use>
struct CommandBufferPrintfTest : public BasicCommandBufferTest
{
CommandBufferPrintfTest(cl_device_id device, cl_context context,
cl_command_queue queue)
: BasicCommandBufferTest(device, context, queue),
trigger_event(nullptr), wait_event(nullptr), file_descriptor(0),
printf_use_support(false)
: BasicCommandBufferTest(device, context, queue), file_descriptor(0)
{
simultaneous_use_requested = simul_use;
if (simul_use)
{
buffer_size_multiplier = num_test_iters;
}
buffer_size_multiplier = num_test_iters;
}
//--------------------------------------------------------------------------
void ReleaseOutputStream(int fd)
{
fflush(stdout);
@@ -72,7 +63,6 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
close(fd);
}
//--------------------------------------------------------------------------
int AcquireOutputStream(int* error)
{
int fd = streamDup(fileno(stdout));
@@ -85,7 +75,6 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
return fd;
}
//--------------------------------------------------------------------------
void GetAnalysisBuffer(std::stringstream& buffer)
{
std::ifstream fp(temp_filename, std::ios::in);
@@ -95,7 +84,6 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
}
}
//--------------------------------------------------------------------------
void PurgeTempFile()
{
std::ofstream ofs(temp_filename,
@@ -103,9 +91,10 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
ofs.close();
}
//--------------------------------------------------------------------------
bool Skip() override
{
if (BasicCommandBufferTest::Skip()) return true;
// Query if device supports kernel printf use
cl_device_command_buffer_capabilities_khr capabilities;
cl_int error =
@@ -114,16 +103,13 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
test_error(error,
"Unable to query CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR");
printf_use_support =
const bool printf_use_support =
(capabilities & CL_COMMAND_BUFFER_CAPABILITY_KERNEL_PRINTF_KHR)
!= 0;
if (!printf_use_support) return true;
return BasicCommandBufferTest::Skip()
|| (simultaneous_use_requested && !simultaneous_use_support);
return !printf_use_support;
}
//--------------------------------------------------------------------------
cl_int SetUpKernel() override
{
cl_int error = CL_SUCCESS;
@@ -153,14 +139,12 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
size_t data_size() const override
{
return sizeof(cl_char) * num_elements * buffer_size_multiplier
* max_pattern_length;
}
//--------------------------------------------------------------------------
cl_int SetUpKernelArgs() override
{
cl_int error = CL_SUCCESS;
@@ -192,7 +176,6 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int SetUp(int elements) override
{
auto pcFname = get_temp_filename();
@@ -209,39 +192,10 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
return BasicCommandBufferTest::SetUp(elements);
}
//--------------------------------------------------------------------------
cl_int Run() override
{
cl_int error = CL_SUCCESS;
// record command buffer with primary queue
error = RecordCommandBuffer();
test_error(error, "RecordCommandBuffer failed");
if (simultaneous_use_support)
{
// enqueue simultaneous command-buffers with printf calls
error = RunSimultaneous();
test_error(error, "RunSimultaneous failed");
}
else
{
// enqueue single command-buffer with printf calls
error = RunSingle();
test_error(error, "RunSingle failed");
}
std::remove(temp_filename.c_str());
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RecordCommandBuffer()
{
cl_int error = CL_SUCCESS;
error = clCommandNDRangeKernelKHR(
cl_int error = clCommandNDRangeKernelKHR(
command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements,
nullptr, 0, nullptr, nullptr, nullptr);
test_error(error, "clCommandNDRangeKernelKHR failed");
@@ -251,7 +205,6 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
#define test_error_release_stdout(errCode, msg) \
{ \
auto errCodeResult = errCode; \
@@ -263,96 +216,7 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
} \
}
//--------------------------------------------------------------------------
cl_int EnqueueSinglePass(const std::vector<cl_char>& pattern,
std::vector<cl_char>& output_data)
{
cl_int error = CL_SUCCESS;
auto in_mem_size = sizeof(cl_char) * pattern.size();
error = clEnqueueWriteBuffer(queue, in_mem, CL_TRUE, 0, in_mem_size,
&pattern[0], 0, nullptr, nullptr);
test_error(error, "clEnqueueWriteBuffer failed");
test_assert_error(pattern.size() - 1 <= CL_UINT_MAX,
"pattern.size() - 1 does not fit in a cl_uint");
cl_uint offset[] = { 0, static_cast<cl_uint>(pattern.size() - 1) };
error = clEnqueueWriteBuffer(queue, off_mem, CL_TRUE, 0, sizeof(offset),
offset, 0, nullptr, nullptr);
test_error(error, "clEnqueueWriteBuffer failed");
// redirect output stream to temporary file
file_descriptor = AcquireOutputStream(&error);
if (error != 0)
{
log_error("Error while redirection stdout to file");
return TEST_FAIL;
}
// enqueue command buffer with kernel containing printf command
error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
nullptr, &wait_event);
test_error_release_stdout(error, "clEnqueueCommandBufferKHR failed");
fflush(stdout);
// Wait until kernel finishes its execution and (thus) the output
// printed from the kernel is immediately printed
error = clWaitForEvents(1, &wait_event);
test_error(error, "clWaitForEvents failed");
// output buffer contains pattern to be compared with printout
error = clEnqueueReadBuffer(queue, out_mem, CL_FALSE, 0, data_size(),
output_data.data(), 0, nullptr, nullptr);
test_error_release_stdout(error, "clEnqueueReadBuffer failed");
error = clFinish(queue);
test_error_release_stdout(error, "clFinish failed");
ReleaseOutputStream(file_descriptor);
// copy content of temporary file into string stream
std::stringstream sstr;
GetAnalysisBuffer(sstr);
if (sstr.str().size() != num_elements * offset[1])
{
log_error("GetAnalysisBuffer failed\n");
return TEST_FAIL;
}
// verify the result - compare printout and output buffer
for (size_t i = 0; i < num_elements * offset[1]; i++)
{
CHECK_VERIFICATION_ERROR(sstr.str().at(i), output_data[i], i);
}
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RunSingle()
{
cl_int error = CL_SUCCESS;
std::vector<cl_char> output_data(num_elements * max_pattern_length);
for (unsigned i = 0; i < num_test_iters; i++)
{
unsigned pattern_length =
std::max(min_pattern_length, rand() % max_pattern_length);
char pattern_character = 'a' + rand() % 26;
std::vector<cl_char> pattern(pattern_length + 1, pattern_character);
pattern[pattern_length] = '\0';
error = EnqueueSinglePass(pattern, output_data);
test_error(error, "EnqueueSinglePass failed");
output_data.assign(output_data.size(), 0);
PurgeTempFile();
}
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
struct SimulPassData
struct EnqueuePassData
{
// null terminated character buffer
std::vector<cl_char> pattern;
@@ -361,8 +225,7 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
std::vector<cl_char> output_buffer;
};
//--------------------------------------------------------------------------
cl_int EnqueueSimultaneousPass(SimulPassData& pd)
cl_int EnqueuePass(EnqueuePassData& pd)
{
// write current pattern to device memory
auto in_mem_size = sizeof(cl_char) * pd.pattern.size();
@@ -377,15 +240,8 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
pd.offset, 0, nullptr, nullptr);
test_error_release_stdout(error, "clEnqueueWriteBuffer failed");
// create user event to block simultaneous command buffers
if (!trigger_event)
{
trigger_event = clCreateUserEvent(context, &error);
test_error_release_stdout(error, "clCreateUserEvent failed");
}
error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 1,
&trigger_event, nullptr);
error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
nullptr, nullptr);
test_error_release_stdout(error, "clEnqueueCommandBufferKHR failed");
// output buffer contains pattern to be compared with printout
@@ -398,14 +254,14 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RunSimultaneous()
cl_int Run() override
{
cl_int error = CL_SUCCESS;
cl_int error = RecordCommandBuffer();
test_error(error, "RecordCommandBuffer failed");
cl_int offset = static_cast<cl_int>(num_elements * max_pattern_length);
std::vector<SimulPassData> simul_passes(num_test_iters);
std::vector<EnqueuePassData> enqueue_passes(num_test_iters);
const int pattern_chars_range = 26;
std::list<cl_char> pattern_chars;
@@ -413,7 +269,7 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
pattern_chars.push_back(cl_char('a' + i));
test_assert_error(pattern_chars.size() >= num_test_iters,
"Number of simultaneous launches must be lower than "
"Number of launches must be lower than "
"size of characters container");
cl_int total_pattern_coverage = 0;
@@ -428,11 +284,12 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
std::vector<cl_char> pattern(pattern_length + 1, pattern_character);
pattern.back() = '\0';
simul_passes[i] = { pattern,
{ cl_int(i * offset), cl_int(pattern_length) },
std::vector<cl_char>(num_elements
* pattern_length) };
total_pattern_coverage += simul_passes[i].output_buffer.size();
enqueue_passes[i] = {
pattern,
{ cl_int(i * offset), cl_int(pattern_length) },
std::vector<cl_char>(num_elements * pattern_length)
};
total_pattern_coverage += enqueue_passes[i].output_buffer.size();
pattern_chars.erase(it);
};
@@ -444,17 +301,14 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
return TEST_FAIL;
}
// enqueue read/write and command buffer operations
for (auto&& pass : simul_passes)
// enqueue read/write and command buffer operations, serialized
// by in-order queue
for (auto&& pass : enqueue_passes)
{
error = EnqueueSimultaneousPass(pass);
test_error_release_stdout(error, "EnqueueSimultaneousPass failed");
error = EnqueuePass(pass);
test_error_release_stdout(error, "EnqueuePass failed");
}
// execute command buffers
error = clSetUserEventStatus(trigger_event, CL_COMPLETE);
test_error_release_stdout(error, "clSetUserEventStatus failed");
// flush streams
fflush(stdout);
@@ -477,13 +331,13 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
for (int i = 0; i < total_pattern_coverage; i++)
counters_map[sstr.str().at(i)]++;
if (counters_map.size() != simul_passes.size())
if (counters_map.size() != enqueue_passes.size())
{
log_error("printout inconsistent with input data\n");
return TEST_FAIL;
}
for (auto&& pass : simul_passes)
for (auto&& pass : enqueue_passes)
{
auto& res_data = pass.output_buffer;
@@ -501,18 +355,13 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
}
}
std::remove(temp_filename.c_str());
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
clEventWrapper trigger_event = nullptr;
clEventWrapper wait_event = nullptr;
std::string temp_filename;
int file_descriptor;
bool printf_use_support;
// specifies max test length for printf pattern
const unsigned max_pattern_length = 6;
// specifies min test length for printf pattern
@@ -523,14 +372,8 @@ struct CommandBufferPrintfTest : public BasicCommandBufferTest
} // anonymous namespace
REGISTER_TEST(basic_printf)
REGISTER_TEST(printf)
{
return MakeAndRunTest<CommandBufferPrintfTest<false>>(device, context,
queue, num_elements);
}
REGISTER_TEST(simultaneous_printf)
{
return MakeAndRunTest<CommandBufferPrintfTest<true>>(device, context, queue,
num_elements);
return MakeAndRunTest<CommandBufferPrintfTest>(device, context, queue,
num_elements);
}

View File

@@ -86,21 +86,17 @@ cl_int VerifyResult(const clEventWrapper& event)
}
////////////////////////////////////////////////////////////////////////////////
// Command-buffer profiling test cases:
// -all commands are recorded to a single command-queue
// -profiling a command-buffer with simultaneous use
template <bool simultaneous_request>
// Command-buffer profiling test for enqueuing command-buffer twice and checking
// the profiling counters on the events returned.
struct CommandBufferProfiling : public BasicCommandBufferTest
{
CommandBufferProfiling(cl_device_id device, cl_context context,
cl_command_queue queue)
: BasicCommandBufferTest(device, context, queue), wait_event(nullptr)
: BasicCommandBufferTest(device, context, queue)
{
simultaneous_use_requested = simultaneous_request;
if (simultaneous_request) buffer_size_multiplier = 2;
buffer_size_multiplier = 2; // Do two enqueues of command-buffer
}
//--------------------------------------------------------------------------
bool Skip() override
{
if (BasicCommandBufferTest::Skip()) return true;
@@ -127,10 +123,9 @@ struct CommandBufferProfiling : public BasicCommandBufferTest
"Queue property CL_QUEUE_PROFILING_ENABLE not supported \n");
return true;
}
return (simultaneous_use_requested && !simultaneous_use_support);
return false;
}
//--------------------------------------------------------------------------
cl_int SetUp(int elements) override
{
@@ -156,37 +151,45 @@ struct CommandBufferProfiling : public BasicCommandBufferTest
return BasicCommandBufferTest::SetUp(elements);
}
//--------------------------------------------------------------------------
struct EnqueuePassData
{
cl_int offset;
clEventWrapper query_event;
};
cl_int Run() override
{
cl_int error = CL_SUCCESS;
// record command buffer
error = RecordCommandBuffer();
cl_int error = RecordCommandBuffer();
test_error(error, "RecordCommandBuffer failed");
if (simultaneous_use_requested)
cl_int offset = static_cast<cl_int>(num_elements);
std::vector<EnqueuePassData> enqueue_passes = {
{ 0, clEventWrapper() }, { offset, clEventWrapper() }
};
// In-order queue serialized the command-buffer submissions
for (auto&& pass : enqueue_passes)
{
// enqueue simultaneous command-buffers with profiling command queue
error = RunSimultaneous();
test_error(error, "RunSimultaneous failed");
error = EnqueuePass(pass);
test_error(error, "EnqueueSerializedPass failed");
}
else
error = clFinish(queue);
test_error(error, "clFinish failed");
for (auto&& pass : enqueue_passes)
{
// enqueue single command-buffer with profiling command queue
error = RunSingle();
test_error(error, "RunSingle failed");
error = VerifyResult(pass.query_event);
test_error(error, "VerifyResult failed");
}
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RecordCommandBuffer()
{
cl_int error = CL_SUCCESS;
error = clCommandNDRangeKernelKHR(
cl_int error = clCommandNDRangeKernelKHR(
command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements,
nullptr, 0, nullptr, nullptr, nullptr);
test_error(error, "clCommandNDRangeKernelKHR failed");
@@ -196,41 +199,7 @@ struct CommandBufferProfiling : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RunSingle()
{
cl_int error = CL_SUCCESS;
std::vector<cl_int> output_data(num_elements);
error = clEnqueueFillBuffer(queue, in_mem, &pattern, sizeof(cl_int), 0,
data_size(), 0, nullptr, nullptr);
test_error(error, "clEnqueueFillBuffer failed");
clEventWrapper query_event;
error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
nullptr, &query_event);
test_error(error, "clEnqueueCommandBufferKHR failed");
error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
output_data.data(), 0, nullptr, nullptr);
test_error(error, "clEnqueueReadBuffer failed");
error = VerifyResult(query_event);
test_error(error, "VerifyResult failed");
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
struct SimulPassData
{
cl_int offset;
std::vector<cl_int> output_buffer;
clEventWrapper query_event;
};
//--------------------------------------------------------------------------
cl_int EnqueueSimultaneousPass(SimulPassData& pd)
cl_int EnqueuePass(EnqueuePassData& pd)
{
cl_int error = clEnqueueFillBuffer(
queue, out_mem, &pattern, sizeof(cl_int),
@@ -241,59 +210,13 @@ struct CommandBufferProfiling : public BasicCommandBufferTest
0, sizeof(cl_int), 0, nullptr, nullptr);
test_error(error, "clEnqueueFillBuffer failed");
if (!wait_event)
{
wait_event = clCreateUserEvent(context, &error);
test_error(error, "clCreateUserEvent failed");
}
error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 1,
&wait_event, &pd.query_event);
error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
nullptr, &pd.query_event);
test_error(error, "clEnqueueCommandBufferKHR failed");
error = clEnqueueReadBuffer(
queue, out_mem, CL_FALSE, pd.offset * sizeof(cl_int), data_size(),
pd.output_buffer.data(), 0, nullptr, nullptr);
test_error(error, "clEnqueueReadBuffer failed");
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RunSimultaneous()
{
cl_int error = CL_SUCCESS;
cl_int offset = static_cast<cl_int>(num_elements);
std::vector<SimulPassData> simul_passes = {
{ 0, std::vector<cl_int>(num_elements) },
{ offset, std::vector<cl_int>(num_elements) }
};
for (auto&& pass : simul_passes)
{
error = EnqueueSimultaneousPass(pass);
test_error(error, "EnqueueSimultaneousPass failed");
}
error = clSetUserEventStatus(wait_event, CL_COMPLETE);
test_error(error, "clSetUserEventStatus failed");
error = clFinish(queue);
test_error(error, "clFinish failed");
for (auto&& pass : simul_passes)
{
error = VerifyResult(pass.query_event);
test_error(error, "VerifyResult failed");
}
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
clEventWrapper wait_event;
const cl_int pattern = 0xA;
};
@@ -356,19 +279,13 @@ struct CommandBufferSubstituteQueueProfiling : public BasicCommandBufferTest
};
} // anonymous namespace
REGISTER_TEST(basic_profiling)
REGISTER_TEST(profiling)
{
return MakeAndRunTest<CommandBufferProfiling<false>>(device, context, queue,
num_elements);
return MakeAndRunTest<CommandBufferProfiling>(device, context, queue,
num_elements);
}
REGISTER_TEST(simultaneous_profiling)
{
return MakeAndRunTest<CommandBufferProfiling<true>>(device, context, queue,
num_elements);
}
REGISTER_TEST(substitute_queue_profiling)
REGISTER_TEST(profiling_substitute_queue)
{
return MakeAndRunTest<CommandBufferSubstituteQueueProfiling>(
device, context, queue, num_elements);

View File

@@ -23,21 +23,16 @@ namespace {
// Command-queue substitution tests which handles below cases:
// -substitution on queue without properties
// -substitution on queue with properties
// -simultaneous use queue substitution
template <bool prop_use, bool simul_use>
template <bool prop_use>
struct SubstituteQueueTest : public BasicCommandBufferTest
{
SubstituteQueueTest(cl_device_id device, cl_context context,
cl_command_queue queue)
: BasicCommandBufferTest(device, context, queue),
properties_use_requested(prop_use), user_event(nullptr)
{
simultaneous_use_requested = simul_use;
if (simul_use) buffer_size_multiplier = 2;
}
properties_use_requested(prop_use)
{}
//--------------------------------------------------------------------------
bool Skip() override
{
if (properties_use_requested)
@@ -57,11 +52,9 @@ struct SubstituteQueueTest : public BasicCommandBufferTest
return true;
}
return BasicCommandBufferTest::Skip()
|| (simultaneous_use_requested && !simultaneous_use_support);
return BasicCommandBufferTest::Skip();
}
//--------------------------------------------------------------------------
cl_int SetUp(int elements) override
{
// By default command queue is created without properties,
@@ -81,7 +74,6 @@ struct SubstituteQueueTest : public BasicCommandBufferTest
return BasicCommandBufferTest::SetUp(elements);
}
//--------------------------------------------------------------------------
cl_int Run() override
{
// record command buffer with primary queue
@@ -106,23 +98,14 @@ struct SubstituteQueueTest : public BasicCommandBufferTest
test_error(error, "clCreateCommandQueue failed");
}
if (simultaneous_use_support)
{
// enque simultaneous command-buffers with substitute queue
error = RunSimultaneous(new_queue);
test_error(error, "RunSimultaneous failed");
}
else
{
// enque single command-buffer with substitute queue
error = RunSingle(new_queue);
test_error(error, "RunSingle failed");
}
// enqueue single command-buffer with substitute queue
error = RunSingle(new_queue);
test_error(error, "RunSingle failed");
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RecordCommandBuffer()
{
cl_int error = clCommandNDRangeKernelKHR(
@@ -135,14 +118,13 @@ struct SubstituteQueueTest : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RunSingle(const cl_command_queue& q)
{
cl_int error = CL_SUCCESS;
std::vector<cl_int> output_data(num_elements);
error = clEnqueueFillBuffer(q, in_mem, &pattern_pri, sizeof(cl_int), 0,
data_size(), 0, nullptr, nullptr);
cl_int error =
clEnqueueFillBuffer(q, in_mem, &pattern_pri, sizeof(cl_int), 0,
data_size(), 0, nullptr, nullptr);
test_error(error, "clEnqueueFillBuffer failed");
cl_command_queue queues[] = { q };
@@ -165,90 +147,8 @@ struct SubstituteQueueTest : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
struct SimulPassData
{
cl_int pattern;
cl_int offset;
cl_command_queue queue;
std::vector<cl_int> output_buffer;
};
//--------------------------------------------------------------------------
cl_int EnqueueSimultaneousPass(SimulPassData& pd)
{
cl_int error = clEnqueueFillBuffer(
pd.queue, in_mem, &pd.pattern, sizeof(cl_int),
pd.offset * sizeof(cl_int), data_size(), 0, nullptr, nullptr);
test_error(error, "clEnqueueFillBuffer failed");
error =
clEnqueueFillBuffer(pd.queue, off_mem, &pd.offset, sizeof(cl_int),
0, sizeof(cl_int), 0, nullptr, nullptr);
test_error(error, "clEnqueueFillBuffer failed");
if (!user_event)
{
user_event = clCreateUserEvent(context, &error);
test_error(error, "clCreateUserEvent failed");
}
cl_command_queue queues[] = { pd.queue };
error = clEnqueueCommandBufferKHR(1, queues, command_buffer, 1,
&user_event, nullptr);
test_error(error, "clEnqueueCommandBufferKHR failed");
error = clEnqueueReadBuffer(
pd.queue, out_mem, CL_FALSE, pd.offset * sizeof(cl_int),
data_size(), pd.output_buffer.data(), 0, nullptr, nullptr);
test_error(error, "clEnqueueReadBuffer failed");
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RunSimultaneous(const cl_command_queue& q)
{
cl_int error = CL_SUCCESS;
cl_int offset = static_cast<cl_int>(num_elements);
std::vector<SimulPassData> simul_passes = {
{ pattern_pri, 0, q, std::vector<cl_int>(num_elements) },
{ pattern_sec, offset, q, std::vector<cl_int>(num_elements) }
};
for (auto&& pass : simul_passes)
{
error = EnqueueSimultaneousPass(pass);
test_error(error, "EnqueuePass failed");
}
error = clSetUserEventStatus(user_event, CL_COMPLETE);
test_error(error, "clSetUserEventStatus failed");
for (auto&& pass : simul_passes)
{
error = clFinish(pass.queue);
test_error(error, "clFinish failed");
auto& res_data = pass.output_buffer;
for (size_t i = 0; i < num_elements; i++)
{
CHECK_VERIFICATION_ERROR(pass.pattern, res_data[i], i);
}
}
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
const cl_int pattern_pri = 0xB;
const cl_int pattern_sec = 0xC;
bool properties_use_requested;
clEventWrapper user_event;
};
// Command-queue substitution tests which handles below cases:
@@ -397,20 +297,14 @@ struct QueueOrderTest : public BasicCommandBufferTest
REGISTER_TEST(queue_substitution)
{
return MakeAndRunTest<SubstituteQueueTest<false, false>>(
device, context, queue, num_elements);
return MakeAndRunTest<SubstituteQueueTest<false>>(device, context, queue,
num_elements);
}
REGISTER_TEST(properties_queue_substitution)
REGISTER_TEST(queue_substitution_properties)
{
return MakeAndRunTest<SubstituteQueueTest<true, false>>(
device, context, queue, num_elements);
}
REGISTER_TEST(simultaneous_queue_substitution)
{
return MakeAndRunTest<SubstituteQueueTest<false, true>>(
device, context, queue, num_elements);
return MakeAndRunTest<SubstituteQueueTest<true>>(device, context, queue,
num_elements);
}
REGISTER_TEST(queue_substitute_in_order)

View File

@@ -22,25 +22,22 @@ namespace {
////////////////////////////////////////////////////////////////////////////////
// clSetKernelArg tests for cl_khr_command_buffer which handles below cases:
// -test interactions of clSetKernelArg with command-buffers
// -test interactions of clSetKernelArg on a command-buffer pending execution
// -test interactions of clSetKernelArg after command-buffer finalize but
// before enqueue
// -test interactions of clSetKernelArg between command-buffer enqueue
template <bool simul_use>
template <bool enqueue_test>
struct CommandBufferSetKernelArg : public BasicCommandBufferTest
{
CommandBufferSetKernelArg(cl_device_id device, cl_context context,
cl_command_queue queue)
: BasicCommandBufferTest(device, context, queue), trigger_event(nullptr)
: BasicCommandBufferTest(device, context, queue)
{
simultaneous_use_requested = simul_use;
if (simul_use) buffer_size_multiplier = 2;
if (enqueue_test) buffer_size_multiplier = 2;
}
//--------------------------------------------------------------------------
cl_int SetUpKernel() override
{
cl_int error = CL_SUCCESS;
const char* kernel_str =
R"(
__kernel void copy(int in, __global int* out, __global int* offset)
@@ -50,8 +47,8 @@ struct CommandBufferSetKernelArg : public BasicCommandBufferTest
out[ind] = in;
})";
error = create_single_kernel_helper_create_program(context, &program, 1,
&kernel_str);
cl_int error = create_single_kernel_helper_create_program(
context, &program, 1, &kernel_str);
test_error(error, "Failed to create program with source");
error = clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);
@@ -63,7 +60,6 @@ struct CommandBufferSetKernelArg : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int SetUpKernelArgs() override
{
cl_int error = CL_SUCCESS;
@@ -99,15 +95,14 @@ struct CommandBufferSetKernelArg : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int Run() override
{
cl_int error = CL_SUCCESS;
if (simultaneous_use_requested)
if (enqueue_test)
{
// enqueue simultaneous command-buffers with clSetKernelArg calls
error = RunSimultaneous();
test_error(error, "RunSimultaneous failed");
// enqueue command-buffers with clSetKernelArg calls in between
error = RunMultipleEnqueue();
test_error(error, "RunMultipleEnqueue failed");
}
else
{
@@ -119,12 +114,9 @@ struct CommandBufferSetKernelArg : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RecordCommandBuffer()
{
cl_int error = CL_SUCCESS;
error = clCommandNDRangeKernelKHR(
cl_int error = clCommandNDRangeKernelKHR(
command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements,
nullptr, 0, nullptr, nullptr, nullptr);
test_error(error, "clCommandNDRangeKernelKHR failed");
@@ -148,14 +140,12 @@ struct CommandBufferSetKernelArg : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RunSingle()
{
cl_int error = CL_SUCCESS;
std::vector<cl_int> output_data(num_elements);
// record command buffer
error = RecordCommandBuffer();
cl_int error = RecordCommandBuffer();
test_error(error, "RecordCommandBuffer failed");
const cl_int pattern_base = 0;
@@ -187,20 +177,16 @@ struct CommandBufferSetKernelArg : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
struct SimulPassData
struct EnqueuePassData
{
cl_int pattern;
cl_int offset;
std::vector<cl_int> output_buffer;
};
//--------------------------------------------------------------------------
cl_int RecordSimultaneousCommandBuffer() const
cl_int RecordEnqueueCommandBuffer() const
{
cl_int error = CL_SUCCESS;
error = clCommandNDRangeKernelKHR(
cl_int error = clCommandNDRangeKernelKHR(
command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements,
nullptr, 0, nullptr, nullptr, nullptr);
test_error(error, "clCommandNDRangeKernelKHR failed");
@@ -210,8 +196,7 @@ struct CommandBufferSetKernelArg : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int EnqueueSimultaneousPass(SimulPassData& pd)
cl_int EnqueuePass(EnqueuePassData& pd)
{
cl_int error = clEnqueueFillBuffer(
queue, out_mem, &pd.pattern, sizeof(cl_int),
@@ -222,14 +207,8 @@ struct CommandBufferSetKernelArg : public BasicCommandBufferTest
0, sizeof(cl_int), 0, nullptr, nullptr);
test_error(error, "clEnqueueFillBuffer failed");
if (!trigger_event)
{
trigger_event = clCreateUserEvent(context, &error);
test_error(error, "clCreateUserEvent failed");
}
error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 1,
&trigger_event, nullptr);
error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
nullptr, nullptr);
test_error(error, "clEnqueueCommandBufferKHR failed");
error = clEnqueueReadBuffer(
@@ -240,49 +219,39 @@ struct CommandBufferSetKernelArg : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
cl_int RunSimultaneous()
cl_int RunMultipleEnqueue()
{
cl_int error = CL_SUCCESS;
// record command buffer with primary queue
error = RecordSimultaneousCommandBuffer();
test_error(error, "RecordSimultaneousCommandBuffer failed");
cl_int error = RecordEnqueueCommandBuffer();
test_error(error, "RecordEnqueeuCommandBuffer failed");
std::vector<SimulPassData> simul_passes = {
{ 0, 0, std::vector<cl_int>(num_elements) }
cl_int offset = static_cast<cl_int>(num_elements);
std::vector<EnqueuePassData> enqueue_passes = {
{ 0, 0, std::vector<cl_int>(num_elements) },
{ 1, offset, std::vector<cl_int>(num_elements) }
};
error = EnqueueSimultaneousPass(simul_passes.front());
test_error(error, "EnqueueSimultaneousPass 1 failed");
// changing kernel args at this point should have no effect,
// test will verify if clSetKernelArg didn't affect command-buffer
cl_int in_arg = pattern_sec;
error = clSetKernelArg(kernel, 0, sizeof(cl_int), &in_arg);
test_error(error, "clSetKernelArg failed");
error = clSetKernelArg(kernel, 1, sizeof(out_mem_k2), &out_mem_k2);
test_error(error, "clSetKernelArg failed");
if (simultaneous_use_support)
for (auto&& pass : enqueue_passes)
{
cl_int offset = static_cast<cl_int>(num_elements);
simul_passes.push_back(
{ 1, offset, std::vector<cl_int>(num_elements) });
// changing kernel args at this point should have no effect,
// test will verify if clSetKernelArg didn't affect command-buffer
cl_int in_arg = pattern_sec;
error = clSetKernelArg(kernel, 0, sizeof(cl_int), &in_arg);
test_error(error, "clSetKernelArg failed");
error = EnqueueSimultaneousPass(simul_passes.back());
test_error(error, "EnqueueSimultaneousPass 2 failed");
error = clSetKernelArg(kernel, 1, sizeof(out_mem_k2), &out_mem_k2);
test_error(error, "clSetKernelArg failed");
error = EnqueuePass(pass);
test_error(error, "EnqueuePass failed");
}
error = clSetUserEventStatus(trigger_event, CL_COMPLETE);
test_error(error, "clSetUserEventStatus failed");
error = clFinish(queue);
test_error(error, "clFinish failed");
// verify the result buffer
for (auto&& pass : simul_passes)
for (auto&& pass : enqueue_passes)
{
auto& res_data = pass.output_buffer;
for (size_t i = 0; i < num_elements; i++)
@@ -294,9 +263,6 @@ struct CommandBufferSetKernelArg : public BasicCommandBufferTest
return CL_SUCCESS;
}
//--------------------------------------------------------------------------
clEventWrapper trigger_event = nullptr;
const cl_int pattern_pri = 2;
const cl_int pattern_sec = 3;
@@ -305,13 +271,13 @@ struct CommandBufferSetKernelArg : public BasicCommandBufferTest
} // anonymous namespace
REGISTER_TEST(basic_set_kernel_arg)
REGISTER_TEST(set_kernel_arg_after_finalize)
{
return MakeAndRunTest<CommandBufferSetKernelArg<false>>(
device, context, queue, num_elements);
}
REGISTER_TEST(pending_set_kernel_arg)
REGISTER_TEST(set_kernel_arg_after_enqueue)
{
return MakeAndRunTest<CommandBufferSetKernelArg<true>>(device, context,
queue, num_elements);

View File

@@ -131,9 +131,10 @@ struct CreateCommandBufferRepeatedProperties : public BasicCommandBufferTest
if (BasicCommandBufferTest::Skip()) return true;
bool skip = true;
if (simultaneous_use_support)
if (is_extension_available(
device, CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_NAME))
{
rep_prop = CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR;
rep_prop = CL_COMMAND_BUFFER_MUTABLE_KHR;
skip = false;
}
else if (is_extension_available(
@@ -142,13 +143,6 @@ struct CreateCommandBufferRepeatedProperties : public BasicCommandBufferTest
rep_prop = CL_COMMAND_BUFFER_DEVICE_SIDE_SYNC_KHR;
skip = false;
}
else if (is_extension_available(
device,
CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_NAME))
{
rep_prop = CL_COMMAND_BUFFER_MUTABLE_KHR;
skip = false;
}
return skip;
}
@@ -185,7 +179,9 @@ struct CreateCommandBufferNotSupportedProperties : public BasicCommandBufferTest
if (BasicCommandBufferTest::Skip()) return true;
bool skip = true;
if (!simultaneous_use_support)
if (is_extension_available(
device, CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_NAME)
&& !simultaneous_use_support)
{
unsupported_prop = CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR;
skip = false;

View File

@@ -66,102 +66,6 @@ struct EnqueueCommandBufferNotFinalized : public BasicCommandBufferTest
}
};
// CL_INVALID_OPERATION if command_buffer was not created with the
// CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR flag and is in the Pending state.
struct EnqueueCommandBufferWithoutSimultaneousUseNotInPendingState
: public BasicCommandBufferTest
{
EnqueueCommandBufferWithoutSimultaneousUseNotInPendingState(
cl_device_id device, cl_context context, cl_command_queue queue)
: BasicCommandBufferTest(device, context, queue), user_event(nullptr)
{}
cl_int Run() override
{
cl_int error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
nullptr, nullptr);
test_failure_error_ret(error, CL_INVALID_OPERATION,
"clEnqueueCommandBufferKHR should return "
"CL_INVALID_OPERATION",
TEST_FAIL);
error = clSetUserEventStatus(user_event, CL_COMPLETE);
test_error(error, "clSetUserEventStatus failed");
clFinish(queue);
return CL_SUCCESS;
}
cl_int SetUp(int elements) override
{
auto verify_state = [&](const cl_command_buffer_state_khr &expected) {
cl_command_buffer_state_khr state = ~cl_command_buffer_state_khr(0);
cl_int error = clGetCommandBufferInfoKHR(
command_buffer, CL_COMMAND_BUFFER_STATE_KHR, sizeof(state),
&state, nullptr);
test_error_ret(error, "clGetCommandBufferInfoKHR failed",
TEST_FAIL);
test_assert_error(
state == expected,
"Unexpected result of CL_COMMAND_BUFFER_STATE_KHR query!");
return TEST_PASS;
};
cl_int error = BasicCommandBufferTest::SetUp(elements);
test_error(error, "BasicCommandBufferTest::SetUp failed");
command_buffer = clCreateCommandBufferKHR(1, &queue, nullptr, &error);
test_error(error, "clCreateCommandBufferKHR failed");
error = RecordCommandBuffer();
test_error(error, "RecordCommandBuffer failed");
error = verify_state(CL_COMMAND_BUFFER_STATE_EXECUTABLE_KHR);
test_error(error, "State is not Executable");
error = EnqueueCommandBuffer();
test_error(error, "EnqueueCommandBuffer failed");
return CL_SUCCESS;
}
cl_int RecordCommandBuffer()
{
cl_int error = clCommandNDRangeKernelKHR(
command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements,
nullptr, 0, nullptr, nullptr, nullptr);
test_error(error, "clCommandNDRangeKernelKHR failed");
error = clFinalizeCommandBufferKHR(command_buffer);
test_error(error, "clFinalizeCommandBufferKHR failed");
return CL_SUCCESS;
}
cl_int EnqueueCommandBuffer()
{
cl_int pattern = 0xE;
cl_int error =
clEnqueueFillBuffer(queue, out_mem, &pattern, sizeof(cl_int), 0,
data_size(), 0, nullptr, nullptr);
test_error(error, "clEnqueueFillBuffer failed");
user_event = clCreateUserEvent(context, &error);
test_error(error, "clCreateUserEvent failed");
error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 1,
&user_event, nullptr);
test_error(error, "clEnqueueCommandBufferKHR failed");
return CL_SUCCESS;
}
clEventWrapper user_event;
};
// CL_INVALID_VALUE if queues is NULL and num_queues is > 0, or queues is not
// NULL and num_queues is 0.
struct EnqueueCommandBufferNullQueuesNumQueues : public BasicCommandBufferTest
@@ -623,14 +527,6 @@ REGISTER_TEST(negative_enqueue_command_buffer_not_finalized)
device, context, queue, num_elements);
}
REGISTER_TEST(
negative_enqueue_command_buffer_without_simultaneous_no_pending_state)
{
return MakeAndRunTest<
EnqueueCommandBufferWithoutSimultaneousUseNotInPendingState>(
device, context, queue, num_elements);
}
REGISTER_TEST(negative_enqueue_command_buffer_null_queues_num_queues)
{
return MakeAndRunTest<EnqueueCommandBufferNullQueuesNumQueues>(

View File

@@ -44,20 +44,9 @@ struct FinalizeCommandBufferNotRecordingState : public BasicCommandBufferTest
FinalizeCommandBufferNotRecordingState(cl_device_id device,
cl_context context,
cl_command_queue queue)
: BasicCommandBufferTest(device, context, queue), user_event(nullptr)
: BasicCommandBufferTest(device, context, queue)
{}
cl_int SetUp(int elements) override
{
cl_int error = BasicCommandBufferTest::SetUp(elements);
test_error(error, "BasicCommandBufferTest::SetUp failed");
user_event = clCreateUserEvent(context, &error);
test_error(error, "clCreateUserEvent failed");
return CL_SUCCESS;
}
cl_int Run() override
{
auto verify_state = [&](const cl_command_buffer_state_khr &expected) {
@@ -87,18 +76,6 @@ struct FinalizeCommandBufferNotRecordingState : public BasicCommandBufferTest
"CL_INVALID_OPERATION",
TEST_FAIL);
error = EnqueueCommandBuffer();
test_error(error, "EnqueueCommandBuffer failed");
error = clFinalizeCommandBufferKHR(command_buffer);
test_failure_error_ret(error, CL_INVALID_OPERATION,
"clFinalizeCommandBufferKHR should return "
"CL_INVALID_OPERATION",
TEST_FAIL);
clSetUserEventStatus(user_event, CL_COMPLETE);
clFinish(queue);
return CL_SUCCESS;
}
@@ -114,22 +91,6 @@ struct FinalizeCommandBufferNotRecordingState : public BasicCommandBufferTest
return CL_SUCCESS;
}
cl_int EnqueueCommandBuffer()
{
cl_int pattern = 0xE;
cl_int error =
clEnqueueFillBuffer(queue, out_mem, &pattern, sizeof(cl_int), 0,
data_size(), 0, nullptr, nullptr);
test_error(error, "clEnqueueFillBuffer failed");
error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 1,
&user_event, nullptr);
test_error(error, "clEnqueueCommandBufferKHR failed");
return CL_SUCCESS;
}
clEventWrapper user_event;
};
};

View File

@@ -398,16 +398,17 @@ struct CommandNDRangeKernelWithKernelEnqueueCall : public BasicCommandBufferTest
const char* kernel_str =
R"(
__kernel void enqueue_call_func() {
}
__kernel void enqueue_call_func(__global int* out_mem) {
out_mem[get_global_id(0)] = 0x1234;
}
__kernel void enqueue_call_kernel() {
queue_t def_q = get_default_queue();
ndrange_t ndrange = ndrange_1D(1);
enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange,
^{enqueue_call_func();});
}
)";
__kernel void enqueue_call_kernel(__global int* out_mem) {
queue_t def_q = get_default_queue();
ndrange_t ndrange = ndrange_1D(1);
enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange,
^{enqueue_call_func(out_mem);});
}
)";
std::string build_options = std::string(" ") + cl_std;
error = create_single_kernel_helper(context, &program, &kernel, 1,
@@ -443,7 +444,17 @@ enqueue_kernel(def_q, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange,
cl_int Run() override
{
cl_int error = clCommandNDRangeKernelKHR(
cl_int error = CL_SUCCESS;
clMemWrapper out_mem =
clCreateBuffer(context, CL_MEM_WRITE_ONLY,
num_elements * sizeof(cl_int), nullptr, &error);
test_error(error, "clCreateBuffer failed");
error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &out_mem);
test_error(error, "clSetKernelArg failed");
error = clCommandNDRangeKernelKHR(
command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements,
nullptr, 0, nullptr, nullptr, nullptr);

View File

@@ -15,6 +15,7 @@
//
#include "debug_ahb.h"
#include "harness/errorHelpers.h"
constexpr AHardwareBuffer_UsageFlags flag_list[] = {
AHARDWAREBUFFER_USAGE_CPU_READ_RARELY,
@@ -191,3 +192,27 @@ std::string ahardwareBufferFormatToString(AHardwareBuffer_Format format)
}
return result;
}
AHardwareBuffer *create_AHB(AHardwareBuffer_Desc *desc)
{
AHardwareBuffer *buffer_ptr = nullptr;
int err = AHardwareBuffer_allocate(desc, &buffer_ptr);
if (err != 0)
{
throw std::runtime_error("AHardwareBuffer_allocate failed with code: "
+ std::to_string(err) + "\n");
}
return buffer_ptr;
}
void log_unsupported_ahb_format(AHardwareBuffer_Desc aHardwareBufferDesc)
{
std::string usage_string = ahardwareBufferDecodeUsageFlagsToString(
static_cast<AHardwareBuffer_UsageFlags>(aHardwareBufferDesc.usage));
log_info("Unsupported format %s:\n Usage flags %s\n Size (%u, %u, "
"layers = %u)\n",
ahardwareBufferFormatToString(static_cast<AHardwareBuffer_Format>(
aHardwareBufferDesc.format))
.c_str(),
usage_string.c_str(), aHardwareBufferDesc.width,
aHardwareBufferDesc.height, aHardwareBufferDesc.layers);
}

View File

@@ -19,6 +19,7 @@
#include <string>
#include <vector>
#include <numeric>
#include <CL/cl.h>
#define CHECK_AHARDWARE_BUFFER_SUPPORT(ahardwareBuffer_Desc, format) \
if (!AHardwareBuffer_isSupported(&ahardwareBuffer_Desc)) \
@@ -40,3 +41,90 @@ std::string ahardwareBufferFormatToString(AHardwareBuffer_Format format);
std::string ahardwareBufferUsageFlagToString(AHardwareBuffer_UsageFlags flag);
std::string
ahardwareBufferDecodeUsageFlagsToString(AHardwareBuffer_UsageFlags flags);
AHardwareBuffer* create_AHB(AHardwareBuffer_Desc* desc);
void log_unsupported_ahb_format(AHardwareBuffer_Desc desc);
struct AHardwareBufferWrapper
{
AHardwareBuffer* m_ahb;
AHardwareBufferWrapper(): m_ahb(nullptr) {}
AHardwareBufferWrapper(AHardwareBuffer* ahb) { m_ahb = ahb; }
AHardwareBufferWrapper(AHardwareBuffer_Desc* desc)
{
m_ahb = create_AHB(desc);
}
AHardwareBufferWrapper& operator=(AHardwareBuffer* rhs)
{
release();
m_ahb = rhs;
return *this;
}
~AHardwareBufferWrapper() { release(); }
// Copy constructor
AHardwareBufferWrapper(AHardwareBufferWrapper const& ahbw)
: m_ahb(ahbw.m_ahb)
{
retain();
}
// Copy assignment operator
AHardwareBufferWrapper& operator=(AHardwareBufferWrapper const& rhs)
{
release();
m_ahb = rhs.m_ahb;
retain();
return *this;
}
// Move constructor
AHardwareBufferWrapper(AHardwareBufferWrapper&& ahbw)
{
m_ahb = ahbw.m_ahb;
ahbw.m_ahb = nullptr;
}
// Move assignment operator
AHardwareBufferWrapper& operator=(AHardwareBufferWrapper&& rhs)
{
if (this != &rhs)
{
release(); // Giving up current reference
m_ahb = rhs.m_ahb;
rhs.m_ahb = nullptr;
}
return *this;
}
void retain()
{
if (nullptr != m_ahb)
{
AHardwareBuffer_acquire(m_ahb);
}
}
void release()
{
if (nullptr != m_ahb)
{
AHardwareBuffer_release(m_ahb);
}
}
// Usage operators
operator AHardwareBuffer*() { return m_ahb; }
cl_mem_properties get_props()
{
return reinterpret_cast<cl_mem_properties>(m_ahb);
}
};

View File

@@ -97,7 +97,7 @@ static const char *diff_images_kernel_source = {
};
// Checks that the inferred image format is correct
REGISTER_TEST(test_images)
REGISTER_TEST(images)
{
cl_int err = CL_SUCCESS;
@@ -134,19 +134,15 @@ REGISTER_TEST(test_images)
CHECK_AHARDWARE_BUFFER_SUPPORT(aHardwareBufferDesc, format);
AHardwareBuffer *aHardwareBuffer = nullptr;
int ahb_result = AHardwareBuffer_allocate(&aHardwareBufferDesc,
&aHardwareBuffer);
if (ahb_result != 0)
{
log_error("AHardwareBuffer_allocate failed with code %d\n",
ahb_result);
return TEST_FAIL;
}
AHardwareBufferWrapper aHardwareBuffer(&aHardwareBufferDesc);
log_info(
"Testing %s\n",
ahardwareBufferFormatToString(format.aHardwareBufferFormat)
.c_str());
const cl_mem_properties props[] = {
cl_mem_properties props[] = {
CL_EXTERNAL_MEMORY_HANDLE_ANDROID_HARDWARE_BUFFER_KHR,
reinterpret_cast<cl_mem_properties>(aHardwareBuffer), 0
aHardwareBuffer.get_props(), 0
};
cl_mem image = clCreateImageWithProperties(
@@ -181,8 +177,6 @@ REGISTER_TEST(test_images)
test_error(clReleaseMemObject(image),
"Failed to release image");
AHardwareBuffer_release(aHardwareBuffer);
aHardwareBuffer = nullptr;
}
}
}
@@ -190,7 +184,7 @@ REGISTER_TEST(test_images)
return TEST_PASS;
}
REGISTER_TEST(test_images_read)
REGISTER_TEST(images_read)
{
cl_int err = CL_SUCCESS;
RandomSeed seed(gRandomSeed);
@@ -238,15 +232,11 @@ REGISTER_TEST(test_images_read)
CHECK_AHARDWARE_BUFFER_SUPPORT(aHardwareBufferDesc, format);
AHardwareBuffer *aHardwareBuffer = nullptr;
int ahb_result = AHardwareBuffer_allocate(&aHardwareBufferDesc,
&aHardwareBuffer);
if (ahb_result != 0)
{
log_error("AHardwareBuffer_allocate failed with code %d\n",
ahb_result);
return TEST_FAIL;
}
AHardwareBufferWrapper aHardwareBuffer(&aHardwareBufferDesc);
log_info(
"Testing %s\n",
ahardwareBufferFormatToString(format.aHardwareBufferFormat)
.c_str());
// Determine AHB memory layout
AHardwareBuffer_Desc hardware_buffer_desc = {};
@@ -279,7 +269,7 @@ REGISTER_TEST(test_images_read)
generate_random_image_data(&imageInfo, srcData, seed);
void *hardware_buffer_data = nullptr;
ahb_result = AHardwareBuffer_lock(
int ahb_result = AHardwareBuffer_lock(
aHardwareBuffer, AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN, -1,
nullptr, &hardware_buffer_data);
if (ahb_result != 0)
@@ -301,7 +291,7 @@ REGISTER_TEST(test_images_read)
cl_mem_properties props[] = {
CL_EXTERNAL_MEMORY_HANDLE_ANDROID_HARDWARE_BUFFER_KHR,
reinterpret_cast<cl_mem_properties>(aHardwareBuffer), 0
aHardwareBuffer.get_props(), 0
};
clMemWrapper imported_image = clCreateImageWithProperties(
@@ -394,7 +384,7 @@ REGISTER_TEST(test_images_read)
test_error(err, "clEnqueueNDRangeKernel failed");
err = clEnqueueReleaseExternalMemObjectsKHR(
queue, 1, &opencl_image, 0, nullptr, nullptr);
queue, 1, &imported_image, 0, nullptr, nullptr);
test_error(err, "clEnqueueReleaseExternalMemObjectsKHR failed");
// Read buffer and verify
@@ -482,9 +472,6 @@ REGISTER_TEST(test_images_read)
}
}
}
AHardwareBuffer_release(aHardwareBuffer);
aHardwareBuffer = nullptr;
}
}
}
@@ -492,7 +479,7 @@ REGISTER_TEST(test_images_read)
return TEST_PASS;
}
REGISTER_TEST(test_enqueue_read_image)
REGISTER_TEST(enqueue_read_image)
{
cl_int err = CL_SUCCESS;
RandomSeed seed(gRandomSeed);
@@ -540,15 +527,12 @@ REGISTER_TEST(test_enqueue_read_image)
CHECK_AHARDWARE_BUFFER_SUPPORT(aHardwareBufferDesc, format);
AHardwareBuffer *aHardwareBuffer = nullptr;
int ahb_result = AHardwareBuffer_allocate(&aHardwareBufferDesc,
&aHardwareBuffer);
if (ahb_result != 0)
{
log_error("AHardwareBuffer_allocate failed with code %d\n",
ahb_result);
return TEST_FAIL;
}
AHardwareBufferWrapper aHardwareBuffer(&aHardwareBufferDesc);
log_info(
"Testing %s\n",
ahardwareBufferFormatToString(format.aHardwareBufferFormat)
.c_str());
// Determine AHB memory layout
AHardwareBuffer_Desc hardware_buffer_desc = {};
@@ -581,7 +565,7 @@ REGISTER_TEST(test_enqueue_read_image)
generate_random_image_data(&imageInfo, srcData, seed);
void *hardware_buffer_data = nullptr;
ahb_result = AHardwareBuffer_lock(
int ahb_result = AHardwareBuffer_lock(
aHardwareBuffer, AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN, -1,
nullptr, &hardware_buffer_data);
if (ahb_result != 0)
@@ -601,9 +585,9 @@ REGISTER_TEST(test_enqueue_read_image)
return TEST_FAIL;
}
const cl_mem_properties props[] = {
cl_mem_properties props[] = {
CL_EXTERNAL_MEMORY_HANDLE_ANDROID_HARDWARE_BUFFER_KHR,
reinterpret_cast<cl_mem_properties>(aHardwareBuffer), 0
aHardwareBuffer.get_props(), 0
};
clMemWrapper imported_image = clCreateImageWithProperties(
@@ -662,9 +646,6 @@ REGISTER_TEST(test_enqueue_read_image)
out_image_ptr += imageInfo.rowPitch;
}
AHardwareBuffer_release(aHardwareBuffer);
aHardwareBuffer = nullptr;
if (total_matched == 0)
{
test_fail("Zero bytes matched");
@@ -676,7 +657,7 @@ REGISTER_TEST(test_enqueue_read_image)
return TEST_PASS;
}
REGISTER_TEST(test_enqueue_copy_image)
REGISTER_TEST(enqueue_copy_image)
{
cl_int err = CL_SUCCESS;
RandomSeed seed(gRandomSeed);
@@ -724,15 +705,12 @@ REGISTER_TEST(test_enqueue_copy_image)
CHECK_AHARDWARE_BUFFER_SUPPORT(aHardwareBufferDesc, format);
AHardwareBuffer *aHardwareBuffer = nullptr;
int ahb_result = AHardwareBuffer_allocate(&aHardwareBufferDesc,
&aHardwareBuffer);
if (ahb_result != 0)
{
log_error("AHardwareBuffer_allocate failed with code %d\n",
ahb_result);
return TEST_FAIL;
}
AHardwareBufferWrapper aHardwareBuffer(&aHardwareBufferDesc);
log_info(
"Testing %s\n",
ahardwareBufferFormatToString(format.aHardwareBufferFormat)
.c_str());
// Determine AHB memory layout
AHardwareBuffer_Desc hardware_buffer_desc = {};
@@ -765,7 +743,7 @@ REGISTER_TEST(test_enqueue_copy_image)
generate_random_image_data(&imageInfo, srcData, seed);
void *hardware_buffer_data = nullptr;
ahb_result = AHardwareBuffer_lock(
int ahb_result = AHardwareBuffer_lock(
aHardwareBuffer, AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN, -1,
nullptr, &hardware_buffer_data);
if (ahb_result != 0)
@@ -787,7 +765,7 @@ REGISTER_TEST(test_enqueue_copy_image)
cl_mem_properties props[] = {
CL_EXTERNAL_MEMORY_HANDLE_ANDROID_HARDWARE_BUFFER_KHR,
reinterpret_cast<cl_mem_properties>(aHardwareBuffer), 0
aHardwareBuffer.get_props(), 0
};
clMemWrapper imported_image = clCreateImageWithProperties(
@@ -975,9 +953,6 @@ REGISTER_TEST(test_enqueue_copy_image)
}
}
}
AHardwareBuffer_release(aHardwareBuffer);
aHardwareBuffer = nullptr;
}
}
}
@@ -985,7 +960,7 @@ REGISTER_TEST(test_enqueue_copy_image)
return TEST_PASS;
}
REGISTER_TEST(test_enqueue_copy_image_to_buffer)
REGISTER_TEST(enqueue_copy_image_to_buffer)
{
cl_int err = CL_SUCCESS;
RandomSeed seed(gRandomSeed);
@@ -1033,15 +1008,12 @@ REGISTER_TEST(test_enqueue_copy_image_to_buffer)
CHECK_AHARDWARE_BUFFER_SUPPORT(aHardwareBufferDesc, format);
AHardwareBuffer *aHardwareBuffer = nullptr;
int ahb_result = AHardwareBuffer_allocate(&aHardwareBufferDesc,
&aHardwareBuffer);
if (ahb_result != 0)
{
log_error("AHardwareBuffer_allocate failed with code %d\n",
ahb_result);
return TEST_FAIL;
}
AHardwareBufferWrapper aHardwareBuffer(&aHardwareBufferDesc);
log_info(
"Testing %s\n",
ahardwareBufferFormatToString(format.aHardwareBufferFormat)
.c_str());
// Determine AHB memory layout
AHardwareBuffer_Desc hardware_buffer_desc = {};
@@ -1074,7 +1046,7 @@ REGISTER_TEST(test_enqueue_copy_image_to_buffer)
generate_random_image_data(&imageInfo, srcData, seed);
void *hardware_buffer_data = nullptr;
ahb_result = AHardwareBuffer_lock(
int ahb_result = AHardwareBuffer_lock(
aHardwareBuffer, AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN, -1,
nullptr, &hardware_buffer_data);
if (ahb_result != 0)
@@ -1096,7 +1068,7 @@ REGISTER_TEST(test_enqueue_copy_image_to_buffer)
cl_mem_properties props[] = {
CL_EXTERNAL_MEMORY_HANDLE_ANDROID_HARDWARE_BUFFER_KHR,
reinterpret_cast<cl_mem_properties>(aHardwareBuffer), 0
aHardwareBuffer.get_props(), 0
};
clMemWrapper imported_image = clCreateImageWithProperties(
@@ -1165,9 +1137,6 @@ REGISTER_TEST(test_enqueue_copy_image_to_buffer)
out_buffer_ptr += scanlineSize;
}
AHardwareBuffer_release(aHardwareBuffer);
aHardwareBuffer = nullptr;
if (total_matched == 0)
{
test_fail("Zero bytes matched");
@@ -1179,7 +1148,7 @@ REGISTER_TEST(test_enqueue_copy_image_to_buffer)
return TEST_PASS;
}
REGISTER_TEST(test_enqueue_copy_buffer_to_image)
REGISTER_TEST(enqueue_copy_buffer_to_image)
{
cl_int err = CL_SUCCESS;
RandomSeed seed(gRandomSeed);
@@ -1227,15 +1196,12 @@ REGISTER_TEST(test_enqueue_copy_buffer_to_image)
CHECK_AHARDWARE_BUFFER_SUPPORT(aHardwareBufferDesc, format);
AHardwareBuffer *aHardwareBuffer = nullptr;
int ahb_result = AHardwareBuffer_allocate(&aHardwareBufferDesc,
&aHardwareBuffer);
if (ahb_result != 0)
{
log_error("AHardwareBuffer_allocate failed with code %d\n",
ahb_result);
return TEST_FAIL;
}
AHardwareBufferWrapper aHardwareBuffer(&aHardwareBufferDesc);
log_info(
"Testing %s\n",
ahardwareBufferFormatToString(format.aHardwareBufferFormat)
.c_str());
// Determine AHB memory layout
AHardwareBuffer_Desc hardware_buffer_desc = {};
@@ -1275,7 +1241,7 @@ REGISTER_TEST(test_enqueue_copy_buffer_to_image)
cl_mem_properties props[] = {
CL_EXTERNAL_MEMORY_HANDLE_ANDROID_HARDWARE_BUFFER_KHR,
reinterpret_cast<cl_mem_properties>(aHardwareBuffer), 0
aHardwareBuffer.get_props(), 0
};
clMemWrapper imported_image = clCreateImageWithProperties(
@@ -1307,7 +1273,7 @@ REGISTER_TEST(test_enqueue_copy_buffer_to_image)
&hardware_buffer_desc);
void *hardware_buffer_data = nullptr;
ahb_result = AHardwareBuffer_lock(
int ahb_result = AHardwareBuffer_lock(
aHardwareBuffer, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN, -1,
nullptr, &hardware_buffer_data);
if (ahb_result != 0)
@@ -1366,9 +1332,6 @@ REGISTER_TEST(test_enqueue_copy_buffer_to_image)
return TEST_FAIL;
}
AHardwareBuffer_release(aHardwareBuffer);
aHardwareBuffer = nullptr;
if (total_matched == 0)
{
test_fail("Zero bytes matched");
@@ -1380,7 +1343,7 @@ REGISTER_TEST(test_enqueue_copy_buffer_to_image)
return TEST_PASS;
}
REGISTER_TEST(test_enqueue_write_image)
REGISTER_TEST(enqueue_write_image)
{
cl_int err = CL_SUCCESS;
RandomSeed seed(gRandomSeed);
@@ -1428,15 +1391,12 @@ REGISTER_TEST(test_enqueue_write_image)
CHECK_AHARDWARE_BUFFER_SUPPORT(aHardwareBufferDesc, format);
AHardwareBuffer *aHardwareBuffer = nullptr;
int ahb_result = AHardwareBuffer_allocate(&aHardwareBufferDesc,
&aHardwareBuffer);
if (ahb_result != 0)
{
log_error("AHardwareBuffer_allocate failed with code %d\n",
ahb_result);
return TEST_FAIL;
}
AHardwareBufferWrapper aHardwareBuffer(&aHardwareBufferDesc);
log_info(
"Testing %s\n",
ahardwareBufferFormatToString(format.aHardwareBufferFormat)
.c_str());
// Determine AHB memory layout
AHardwareBuffer_Desc hardware_buffer_desc = {};
@@ -1453,7 +1413,7 @@ REGISTER_TEST(test_enqueue_write_image)
cl_mem_properties props[] = {
CL_EXTERNAL_MEMORY_HANDLE_ANDROID_HARDWARE_BUFFER_KHR,
reinterpret_cast<cl_mem_properties>(aHardwareBuffer), 0
aHardwareBuffer.get_props(), 0
};
clMemWrapper imported_image = clCreateImageWithProperties(
@@ -1503,7 +1463,7 @@ REGISTER_TEST(test_enqueue_write_image)
&hardware_buffer_desc);
void *hardware_buffer_data = nullptr;
ahb_result = AHardwareBuffer_lock(
int ahb_result = AHardwareBuffer_lock(
aHardwareBuffer, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN, -1,
nullptr, &hardware_buffer_data);
if (ahb_result != 0)
@@ -1564,9 +1524,6 @@ REGISTER_TEST(test_enqueue_write_image)
return TEST_FAIL;
}
AHardwareBuffer_release(aHardwareBuffer);
aHardwareBuffer = nullptr;
if (total_matched == 0)
{
test_fail("Zero bytes matched");
@@ -1578,7 +1535,7 @@ REGISTER_TEST(test_enqueue_write_image)
return TEST_PASS;
}
REGISTER_TEST(test_enqueue_fill_image)
REGISTER_TEST(enqueue_fill_image)
{
cl_int err = CL_SUCCESS;
RandomSeed seed(gRandomSeed);
@@ -1626,15 +1583,12 @@ REGISTER_TEST(test_enqueue_fill_image)
CHECK_AHARDWARE_BUFFER_SUPPORT(aHardwareBufferDesc, format);
AHardwareBuffer *aHardwareBuffer = nullptr;
int ahb_result = AHardwareBuffer_allocate(&aHardwareBufferDesc,
&aHardwareBuffer);
if (ahb_result != 0)
{
log_error("AHardwareBuffer_allocate failed with code %d\n",
ahb_result);
return TEST_FAIL;
}
AHardwareBufferWrapper aHardwareBuffer(&aHardwareBufferDesc);
log_info(
"Testing %s\n",
ahardwareBufferFormatToString(format.aHardwareBufferFormat)
.c_str());
// Determine AHB memory layout
AHardwareBuffer_Desc hardware_buffer_desc = {};
@@ -1650,7 +1604,7 @@ REGISTER_TEST(test_enqueue_fill_image)
cl_mem_properties props[] = {
CL_EXTERNAL_MEMORY_HANDLE_ANDROID_HARDWARE_BUFFER_KHR,
reinterpret_cast<cl_mem_properties>(aHardwareBuffer), 0
aHardwareBuffer.get_props(), 0
};
clMemWrapper imported_image = clCreateImageWithProperties(
@@ -1739,7 +1693,7 @@ REGISTER_TEST(test_enqueue_fill_image)
&hardware_buffer_desc);
void *hardware_buffer_data = nullptr;
ahb_result = AHardwareBuffer_lock(
int ahb_result = AHardwareBuffer_lock(
aHardwareBuffer, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN, -1,
nullptr, &hardware_buffer_data);
if (ahb_result != 0)
@@ -1819,8 +1773,6 @@ REGISTER_TEST(test_enqueue_fill_image)
return TEST_FAIL;
}
AHardwareBuffer_release(aHardwareBuffer);
aHardwareBuffer = nullptr;
free(verificationLine);
if (total_matched == 0)
@@ -1834,7 +1786,7 @@ REGISTER_TEST(test_enqueue_fill_image)
return TEST_PASS;
}
REGISTER_TEST(test_blob)
REGISTER_TEST(blob)
{
cl_int err = CL_SUCCESS;
@@ -1883,19 +1835,17 @@ REGISTER_TEST(test_blob)
continue;
}
AHardwareBuffer *aHardwareBuffer = nullptr;
int ahb_result =
AHardwareBuffer_allocate(&aHardwareBufferDesc, &aHardwareBuffer);
if (ahb_result != 0)
{
log_error("AHardwareBuffer_allocate failed with code %d\n",
ahb_result);
return TEST_FAIL;
}
AHardwareBufferWrapper aHardwareBuffer(&aHardwareBufferDesc);
log_info(
"Testing %s\n",
ahardwareBufferFormatToString(
static_cast<AHardwareBuffer_Format>(aHardwareBufferDesc.format))
.c_str());
cl_mem_properties props[] = {
CL_EXTERNAL_MEMORY_HANDLE_ANDROID_HARDWARE_BUFFER_KHR,
reinterpret_cast<cl_mem_properties>(aHardwareBuffer), 0
aHardwareBuffer.get_props(), 0
};
cl_mem buffer = clCreateBufferWithProperties(
@@ -1903,8 +1853,6 @@ REGISTER_TEST(test_blob)
test_error(err, "Failed to create CL buffer from AHardwareBuffer");
test_error(clReleaseMemObject(buffer), "Failed to release buffer");
AHardwareBuffer_release(aHardwareBuffer);
aHardwareBuffer = nullptr;
}
return TEST_PASS;

View File

@@ -21,7 +21,7 @@
#include <android/hardware_buffer.h>
#include "debug_ahb.h"
REGISTER_TEST(test_buffer_format_negative)
REGISTER_TEST(buffer_format_negative)
{
cl_int err = CL_SUCCESS;
@@ -98,7 +98,7 @@ REGISTER_TEST(test_buffer_format_negative)
return TEST_PASS;
}
REGISTER_TEST(test_buffer_size_negative)
REGISTER_TEST(buffer_size_negative)
{
cl_int err = CL_SUCCESS;
constexpr size_t buffer_size = 64;
@@ -175,7 +175,7 @@ REGISTER_TEST(test_buffer_size_negative)
return TEST_PASS;
}
REGISTER_TEST(test_images_negative)
REGISTER_TEST(images_negative)
{
cl_int err = CL_SUCCESS;
@@ -244,3 +244,155 @@ REGISTER_TEST(test_images_negative)
return TEST_PASS;
}
// Negative tests for cl_khr_external_memory_android_hardware_buffer:
// verifies that clCreateBufferWithProperties / clCreateImageWithProperties
// return the error codes mandated by the extension for invalid argument
// combinations (oversized buffer, wrong AHB format, non-NULL host_ptr,
// non-NULL image format / descriptor).
// Fix: the image-path failure messages previously blamed
// clCreateBufferWithProperties although clCreateImageWithProperties is the
// API under test.
REGISTER_TEST(invalid_arguments)
{
    cl_int err;
    constexpr cl_uint buffer_size = 4096;

    if (!is_extension_available(
            device, "cl_khr_external_memory_android_hardware_buffer"))
    {
        log_info("cl_khr_external_memory_android_hardware_buffer is not "
                 "supported on this platform. Skipping test.\n");
        return TEST_SKIPPED_ITSELF;
    }

    // BLOB-format AHardwareBuffer: the only format importable as a CL buffer.
    AHardwareBuffer_Desc aHardwareBufferDesc = { 0 };
    aHardwareBufferDesc.width = buffer_size;
    aHardwareBufferDesc.height = 1;
    aHardwareBufferDesc.layers = 1;
    aHardwareBufferDesc.format = AHARDWAREBUFFER_FORMAT_BLOB;
    aHardwareBufferDesc.usage = AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN
        | AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN;
    if (!AHardwareBuffer_isSupported(&aHardwareBufferDesc))
    {
        log_unsupported_ahb_format(aHardwareBufferDesc);
        return TEST_SKIPPED_ITSELF;
    }
    AHardwareBufferWrapper ahb_buffer(&aHardwareBufferDesc);

    // RGBA AHardwareBuffer: importable as a CL image, not as a buffer.
    aHardwareBufferDesc.width = 64;
    aHardwareBufferDesc.height = 64;
    aHardwareBufferDesc.format = AHARDWAREBUFFER_FORMAT_R8G8B8A8_UNORM;
    if (!AHardwareBuffer_isSupported(&aHardwareBufferDesc))
    {
        log_unsupported_ahb_format(aHardwareBufferDesc);
        return TEST_SKIPPED_ITSELF;
    }
    AHardwareBufferWrapper ahb_image(&aHardwareBufferDesc);

    const cl_mem_properties props_buffer[] = {
        CL_EXTERNAL_MEMORY_HANDLE_ANDROID_HARDWARE_BUFFER_KHR,
        ahb_buffer.get_props(),
        0,
    };
    const cl_mem_properties props_image[] = {
        CL_EXTERNAL_MEMORY_HANDLE_ANDROID_HARDWARE_BUFFER_KHR,
        ahb_image.get_props(),
        0,
    };

    // stub values
    cl_image_format image_format = { 0 };
    cl_image_desc image_desc = { 0 };
    int host_data = 0;
    void *host_ptr = reinterpret_cast<void *>(&host_data);

    log_info("Testing buffer error conditions\n");
    // Buffer error conditions
    clMemWrapper mem =
        clCreateBufferWithProperties(context, props_buffer, CL_MEM_READ_WRITE,
                                     buffer_size + 1, nullptr, &err);
    if (CL_INVALID_BUFFER_SIZE != err)
    {
        log_error(
            "clCreateBufferWithProperties should return CL_INVALID_BUFFER_SIZE "
            "but returned %s: CL_INVALID_BUFFER_SIZE if size is non-zero and "
            "greater than the AHardwareBuffer when importing external memory "
            "using cl_khr_external_memory_android_hardware_buffer\n",
            IGetErrorString(err));
        return TEST_FAIL;
    }

    mem = clCreateBufferWithProperties(context, props_image, CL_MEM_READ_WRITE,
                                       0, nullptr, &err);
    if (CL_INVALID_OPERATION != err)
    {
        log_error(
            "clCreateBufferWithProperties should return CL_INVALID_OPERATION "
            "but returned %s: CL_INVALID_OPERATION if the AHardwareBuffer "
            "format is not AHARDWAREBUFFER_FORMAT_BLOB\n",
            IGetErrorString(err));
        return TEST_FAIL;
    }

    mem = clCreateBufferWithProperties(context, props_buffer, CL_MEM_READ_WRITE,
                                       0, host_ptr, &err);
    if (CL_INVALID_HOST_PTR != err)
    {
        log_error(
            "clCreateBufferWithProperties should return CL_INVALID_HOST_PTR "
            "but returned %s: CL_INVALID_HOST_PTR if host_ptr is not NULL\n",
            IGetErrorString(err));
        return TEST_FAIL;
    }

    log_info("Testing image error conditions\n");
    // Image error conditions
    mem = clCreateImageWithProperties(context, props_image, CL_MEM_READ_WRITE,
                                      &image_format, nullptr, nullptr, &err);
    if (CL_INVALID_IMAGE_FORMAT_DESCRIPTOR != err)
    {
        log_error(
            "clCreateImageWithProperties should return "
            "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR but returned %s: "
            "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR if image_format is not NULL "
            "when using cl_khr_external_memory_android_hardware_buffer.\n",
            IGetErrorString(err));
        return TEST_FAIL;
    }

    mem = clCreateImageWithProperties(context, props_image, CL_MEM_READ_WRITE,
                                      nullptr, &image_desc, nullptr, &err);
    if (CL_INVALID_IMAGE_DESCRIPTOR != err)
    {
        log_error("clCreateImageWithProperties should return "
                  "CL_INVALID_IMAGE_DESCRIPTOR but returned %s: "
                  "CL_INVALID_IMAGE_DESCRIPTOR if image_desc is not NULL when "
                  "using cl_khr_external_memory_android_hardware_buffer.\n",
                  IGetErrorString(err));
        return TEST_FAIL;
    }

    mem = clCreateImageWithProperties(context, props_buffer, CL_MEM_READ_WRITE,
                                      nullptr, nullptr, nullptr, &err);
    if (CL_IMAGE_FORMAT_NOT_SUPPORTED != err)
    {
        log_error("clCreateImageWithProperties should return "
                  "CL_IMAGE_FORMAT_NOT_SUPPORTED but returned %s: "
                  "CL_IMAGE_FORMAT_NOT_SUPPORTED if AHardwareBuffer's format "
                  "is not supported\n",
                  IGetErrorString(err));
        return TEST_FAIL;
    }

    mem = clCreateImageWithProperties(context, props_image, CL_MEM_READ_WRITE,
                                      nullptr, nullptr, host_ptr, &err);
    if (CL_INVALID_HOST_PTR != err)
    {
        log_error(
            "clCreateImageWithProperties should return CL_INVALID_HOST_PTR "
            "but returned %s: CL_INVALID_HOST_PTR if host_ptr is not NULL\n",
            IGetErrorString(err));
        return TEST_FAIL;
    }

    return TEST_PASS;
}

View File

@@ -0,0 +1,26 @@
# Build the cl_khr_external_semaphore_dx_fence conformance tests. These
# exercise D3D12 fence interop and therefore only build on Windows.
if (WIN32)
    include_directories(${CLConform_SOURCE_DIR}/test_common/harness
                        ${CLConform_INCLUDE_DIR})
    link_directories(${CL_LIB_DIR})
    list(APPEND CLConform_LIBRARIES directx_wrapper)

    set(MODULE_NAME CL_KHR_EXTERNAL_SEMAPHORE_DX_FENCE)
    set(${MODULE_NAME}_SOURCES
        main.cpp
        test_external_semaphore_dx_fence.cpp
        test_external_semaphore_dx_fence_negative_wait_signal.cpp
        test_external_semaphore_dx_fence_queries.cpp
        test_external_semaphore_dx_fence_export.cpp
    )

    # Double dereference: ${MODULE_NAME}_SOURCES is only the *name* of the
    # list variable; the properties must be applied to its contents.
    set_source_files_properties(
        ${${MODULE_NAME}_SOURCES}
        PROPERTIES LANGUAGE CXX)

    include_directories(${CMAKE_CURRENT_SOURCE_DIR})
    include_directories("../../common/directx_wrapper")

    include(../../CMakeCommon.txt)
endif (WIN32)

View File

@@ -0,0 +1,22 @@
//
// Copyright (c) 2025 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "harness/testHarness.h"
// Entry point: hands every test registered via REGISTER_TEST in this module
// to the shared conformance test harness (no image-format argument parsing,
// default device selection).
int main(int argc, const char *argv[])
{
    return runTestHarness(argc, argv, test_registry::getInstance().num_tests(),
                          test_registry::getInstance().definitions(), false, 0);
}

View File

@@ -0,0 +1,117 @@
//
// Copyright (c) 2025 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#pragma once
#include "harness/typeWrappers.h"
#include "harness/extensionHelpers.h"
#include "harness/errorHelpers.h"
#include "directx_wrapper.hpp"
// RAII helper that imports a D3D12 fence into OpenCL as a
// cl_khr_external_semaphore_dx_fence semaphore. The wrapper owns the shared
// fence HANDLE and the cl_semaphore_khr and releases both on destruction.
// Fixes: members are now null-initialized (the destructor previously read
// uninitialized values when createSemaphoreFromFence() was never called or
// failed early), copy operations are deleted to prevent double release, and
// an unused ID3D12Fence member was removed.
class CLDXSemaphoreWrapper {
public:
    CLDXSemaphoreWrapper(cl_device_id device, cl_context context,
                         ID3D12Device* dx_device)
        : device(device), context(context), dx_device(dx_device){};

    // Creates a shared handle for |fence| and imports it as an OpenCL
    // semaphore of type CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR.
    // Returns CL_SUCCESS on success; fails the test otherwise.
    int createSemaphoreFromFence(ID3D12Fence* fence)
    {
        cl_int errcode = CL_SUCCESS;
        GET_PFN(device, clCreateSemaphoreWithPropertiesKHR);

        const HRESULT hr = dx_device->CreateSharedHandle(
            fence, nullptr, GENERIC_ALL, nullptr, &fence_handle);
        test_error(FAILED(hr), "Failed to get shared handle from D3D12 fence");

        cl_semaphore_properties_khr sem_props[] = {
            static_cast<cl_semaphore_properties_khr>(CL_SEMAPHORE_TYPE_KHR),
            static_cast<cl_semaphore_properties_khr>(
                CL_SEMAPHORE_TYPE_BINARY_KHR),
            static_cast<cl_semaphore_properties_khr>(
                CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR),
            reinterpret_cast<cl_semaphore_properties_khr>(fence_handle), 0
        };

        semaphore =
            clCreateSemaphoreWithPropertiesKHR(context, sem_props, &errcode);
        test_error(errcode, "Could not create semaphore");
        return CL_SUCCESS;
    }

    ~CLDXSemaphoreWrapper()
    {
        releaseSemaphore();
        if (fence_handle)
        {
            CloseHandle(fence_handle);
        }
    };

    // Non-copyable: copying would double-release the semaphore and handle.
    CLDXSemaphoreWrapper(const CLDXSemaphoreWrapper&) = delete;
    CLDXSemaphoreWrapper& operator=(const CLDXSemaphoreWrapper&) = delete;

    const cl_semaphore_khr* operator&() const { return &semaphore; };
    cl_semaphore_khr operator*() const { return semaphore; };
    HANDLE getHandle() const { return fence_handle; };

private:
    // Null-initialized so the destructor is safe even when
    // createSemaphoreFromFence() was never called (or failed early).
    cl_semaphore_khr semaphore = nullptr;
    HANDLE fence_handle = nullptr;
    cl_device_id device;
    cl_context context;
    ComPtr<ID3D12Device> dx_device;

    // Releases the imported semaphore via the extension entry point.
    int releaseSemaphore() const
    {
        GET_PFN(device, clReleaseSemaphoreKHR);
        if (semaphore)
        {
            clReleaseSemaphoreKHR(semaphore);
        }
        return CL_SUCCESS;
    }
};
// Returns true if |handle_type| appears in the device's
// CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR list, i.e. the device can
// import external semaphores of that type.
// Fix: the parameter was declared cl_external_memory_handle_type_khr, but
// this query yields cl_external_semaphore_handle_type_khr values.
static bool
is_import_handle_available(cl_device_id device,
                           const cl_external_semaphore_handle_type_khr handle_type)
{
    int errcode = CL_SUCCESS;

    // First query the size of the list, then fetch its contents.
    size_t import_types_size = 0;
    errcode =
        clGetDeviceInfo(device, CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR, 0,
                        nullptr, &import_types_size);
    if (errcode != CL_SUCCESS)
    {
        log_error("Could not query import semaphore handle types");
        return false;
    }

    std::vector<cl_external_semaphore_handle_type_khr> import_types(
        import_types_size / sizeof(cl_external_semaphore_handle_type_khr));
    errcode =
        clGetDeviceInfo(device, CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR,
                        import_types_size, import_types.data(), nullptr);
    if (errcode != CL_SUCCESS)
    {
        log_error("Could not query import semaphore handle types");
        return false;
    }

    return std::find(import_types.begin(), import_types.end(), handle_type)
        != import_types.end();
}

View File

@@ -0,0 +1,324 @@
//
// Copyright (c) 2025 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "semaphore_dx_fence_base.h"
// Confirm that a signal followed by a wait in OpenCL will complete successfully
// Fix: drop the "test_" prefix from the REGISTER_TEST argument — the harness
// macro already prepends it, matching the convention used elsewhere in CTS.
REGISTER_TEST(external_semaphores_signal_wait)
{
    int errcode = CL_SUCCESS;
    const DirectXWrapper dx_wrapper;

    REQUIRE_EXTENSION("cl_khr_external_semaphore");
    REQUIRE_EXTENSION("cl_khr_external_semaphore_dx_fence");

    // Obtain pointers to semaphore's API. GET_PFN also acts as a capability
    // check: it fails the test when an entry point is unavailable.
    GET_PFN(device, clCreateSemaphoreWithPropertiesKHR);
    GET_PFN(device, clReleaseSemaphoreKHR);
    GET_PFN(device, clEnqueueSignalSemaphoresKHR);
    GET_PFN(device, clEnqueueWaitSemaphoresKHR);

    test_error(!is_import_handle_available(device,
                                           CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR),
               "Could not find CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR between the "
               "supported import types");

    // Import D3D12 fence into OpenCL
    const DirectXFenceWrapper fence(dx_wrapper.getDXDevice());
    CLDXSemaphoreWrapper semaphore(device, context, dx_wrapper.getDXDevice());
    test_error(semaphore.createSemaphoreFromFence(*fence),
               "Could not create semaphore");

    log_info("Calling clEnqueueSignalSemaphoresKHR\n");
    constexpr cl_semaphore_payload_khr semaphore_payload = 1;
    clEventWrapper signal_event;
    errcode = clEnqueueSignalSemaphoresKHR(
        queue, 1, &semaphore, &semaphore_payload, 0, nullptr, &signal_event);
    test_error(errcode, "Failed to signal semaphore");

    log_info("Calling clEnqueueWaitSemaphoresKHR\n");
    clEventWrapper wait_event;
    errcode = clEnqueueWaitSemaphoresKHR(
        queue, 1, &semaphore, &semaphore_payload, 0, nullptr, &wait_event);
    test_error(errcode, "Failed to wait semaphore");

    errcode = clFinish(queue);
    test_error(errcode, "Could not finish queue");

    // Verify that the events completed.
    test_assert_event_complete(signal_event);
    test_assert_event_complete(wait_event);

    return TEST_PASS;
}
// Confirm that a wait in OpenCL followed by a CPU signal in DX12 will complete
// successfully
// Fix: drop the "test_" prefix from the REGISTER_TEST argument — the harness
// macro already prepends it, matching the convention used elsewhere in CTS.
REGISTER_TEST(external_semaphores_signal_dx_cpu)
{
    int errcode = CL_SUCCESS;
    const DirectXWrapper dx_wrapper;

    REQUIRE_EXTENSION("cl_khr_external_semaphore");
    REQUIRE_EXTENSION("cl_khr_external_semaphore_dx_fence");

    // Obtain pointers to semaphore's API. GET_PFN also acts as a capability
    // check: it fails the test when an entry point is unavailable.
    GET_PFN(device, clCreateSemaphoreWithPropertiesKHR);
    GET_PFN(device, clReleaseSemaphoreKHR);
    GET_PFN(device, clEnqueueSignalSemaphoresKHR);
    GET_PFN(device, clEnqueueWaitSemaphoresKHR);

    test_error(!is_import_handle_available(device,
                                           CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR),
               "Could not find CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR between the "
               "supported import types");

    // Import D3D12 fence into OpenCL
    const DirectXFenceWrapper fence(dx_wrapper.getDXDevice());
    CLDXSemaphoreWrapper semaphore(device, context, dx_wrapper.getDXDevice());
    test_error(semaphore.createSemaphoreFromFence(*fence),
               "Could not create semaphore");

    log_info("Calling clEnqueueWaitSemaphoresKHR\n");
    constexpr cl_semaphore_payload_khr semaphore_payload = 1;
    clEventWrapper wait_event;
    errcode = clEnqueueWaitSemaphoresKHR(
        queue, 1, &semaphore, &semaphore_payload, 0, nullptr, &wait_event);
    test_error(errcode, "Failed to call clEnqueueWaitSemaphoresKHR");

    // Signal from the host CPU; this must unblock the enqueued CL wait.
    log_info("Calling d3d12_fence->Signal()\n");
    const HRESULT hr = (*fence)->Signal(semaphore_payload);
    test_error(FAILED(hr), "Failed to signal D3D12 fence");

    errcode = clFinish(queue);
    test_error(errcode, "Could not finish queue");

    test_assert_event_complete(wait_event);

    return TEST_PASS;
}
// Confirm that a wait in OpenCL followed by a GPU signal in DX12 will complete
// successfully
// Fix: drop the "test_" prefix from the REGISTER_TEST argument — the harness
// macro already prepends it, matching the convention used elsewhere in CTS.
REGISTER_TEST(external_semaphores_signal_dx_gpu)
{
    int errcode = CL_SUCCESS;
    const DirectXWrapper dx_wrapper;

    REQUIRE_EXTENSION("cl_khr_external_semaphore");
    REQUIRE_EXTENSION("cl_khr_external_semaphore_dx_fence");

    // Obtain pointers to semaphore's API. GET_PFN also acts as a capability
    // check: it fails the test when an entry point is unavailable.
    GET_PFN(device, clCreateSemaphoreWithPropertiesKHR);
    GET_PFN(device, clReleaseSemaphoreKHR);
    GET_PFN(device, clEnqueueSignalSemaphoresKHR);
    GET_PFN(device, clEnqueueWaitSemaphoresKHR);

    test_error(!is_import_handle_available(device,
                                           CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR),
               "Could not find CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR between the "
               "supported import types");

    // Import D3D12 fence into OpenCL
    const DirectXFenceWrapper fence(dx_wrapper.getDXDevice());
    CLDXSemaphoreWrapper semaphore(device, context, dx_wrapper.getDXDevice());
    test_error(semaphore.createSemaphoreFromFence(*fence),
               "Could not create semaphore");

    log_info("Calling clEnqueueWaitSemaphoresKHR\n");
    constexpr cl_semaphore_payload_khr semaphore_payload = 1;
    clEventWrapper wait_event;
    errcode = clEnqueueWaitSemaphoresKHR(
        queue, 1, &semaphore, &semaphore_payload, 0, nullptr, &wait_event);
    test_error(errcode, "Failed to call clEnqueueWaitSemaphoresKHR");

    // Signal from the D3D12 GPU command queue; must unblock the CL wait.
    log_info("Calling d3d12_command_queue->Signal()\n");
    const HRESULT hr =
        dx_wrapper.getDXCommandQueue()->Signal(*fence, semaphore_payload);
    test_error(FAILED(hr), "Failed to signal D3D12 fence");

    errcode = clFinish(queue);
    test_error(errcode, "Could not finish queue");

    test_assert_event_complete(wait_event);

    return TEST_PASS;
}
// Confirm that interlocking waits between OpenCL and DX12 will complete
// successfully
// Fix: drop the "test_" prefix from the REGISTER_TEST argument — the harness
// macro already prepends it, matching the convention used elsewhere in CTS.
REGISTER_TEST(external_semaphores_cl_dx_interlock)
{
    int errcode = CL_SUCCESS;
    const DirectXWrapper dx_wrapper;

    REQUIRE_EXTENSION("cl_khr_external_semaphore");
    REQUIRE_EXTENSION("cl_khr_external_semaphore_dx_fence");

    // Obtain pointers to semaphore's API. GET_PFN also acts as a capability
    // check: it fails the test when an entry point is unavailable.
    GET_PFN(device, clCreateSemaphoreWithPropertiesKHR);
    GET_PFN(device, clReleaseSemaphoreKHR);
    GET_PFN(device, clEnqueueSignalSemaphoresKHR);
    GET_PFN(device, clEnqueueWaitSemaphoresKHR);

    test_error(!is_import_handle_available(device,
                                           CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR),
               "Could not find CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR between the "
               "supported import types");

    // Import D3D12 fence into OpenCL
    const DirectXFenceWrapper fence(dx_wrapper.getDXDevice());
    CLDXSemaphoreWrapper semaphore(device, context, dx_wrapper.getDXDevice());
    test_error(semaphore.createSemaphoreFromFence(*fence),
               "Could not create semaphore");

    // DX waits for payload 1 (to be signaled by CL), then signals payload 2.
    log_info("Calling d3d12_command_queue->Wait(1)\n");
    cl_semaphore_payload_khr semaphore_payload = 1;
    HRESULT hr =
        dx_wrapper.getDXCommandQueue()->Wait(*fence, semaphore_payload);
    test_error(FAILED(hr), "Failed to wait on D3D12 fence");

    log_info("Calling d3d12_command_queue->Signal(2)\n");
    hr = dx_wrapper.getDXCommandQueue()->Signal(*fence, semaphore_payload + 1);
    test_error(FAILED(hr), "Failed to signal D3D12 fence");

    // CL signals payload 1 (releasing the DX wait), then waits for payload 2
    // (released by the DX signal above).
    log_info("Calling clEnqueueSignalSemaphoresKHR(1)\n");
    clEventWrapper signal_event;
    errcode = clEnqueueSignalSemaphoresKHR(
        queue, 1, &semaphore, &semaphore_payload, 0, nullptr, &signal_event);
    test_error(errcode, "Failed to call clEnqueueSignalSemaphoresKHR");

    log_info("Calling clEnqueueWaitSemaphoresKHR(2)\n");
    semaphore_payload += 1;
    clEventWrapper wait_event;
    errcode = clEnqueueWaitSemaphoresKHR(
        queue, 1, &semaphore, &semaphore_payload, 0, nullptr, &wait_event);
    test_error(errcode, "Failed to call clEnqueueWaitSemaphoresKHR");

    errcode = clFinish(queue);
    test_error(errcode, "Could not finish queue");

    test_assert_event_complete(wait_event);
    test_assert_event_complete(signal_event);

    return TEST_PASS;
}
// Confirm that multiple waits in OpenCL followed by signals in DX12 and waits
// in DX12 followed by signals in OpenCL complete successfully
REGISTER_TEST(test_external_semaphores_multiple_wait_signal)
{
    int errcode = CL_SUCCESS;
    const DirectXWrapper dx_wrapper;
    // Skip unless both the base external-semaphore extension and the D3D12
    // fence handle-type extension are supported.
    REQUIRE_EXTENSION("cl_khr_external_semaphore");
    REQUIRE_EXTENSION("cl_khr_external_semaphore_dx_fence");
    // Obtain pointers to semaphore's API
    GET_PFN(device, clCreateSemaphoreWithPropertiesKHR);
    GET_PFN(device, clReleaseSemaphoreKHR);
    GET_PFN(device, clEnqueueSignalSemaphoresKHR);
    GET_PFN(device, clEnqueueWaitSemaphoresKHR);
    // The device must be able to import D3D12 fences as OpenCL semaphores.
    test_error(!is_import_handle_available(device,
                                           CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR),
               "Could not find CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR between the "
               "supported import types");
    // Import D3D12 fence into OpenCL
    // Two independent fences, each wrapped by its own OpenCL semaphore, so a
    // single enqueue below can wait on / signal multiple semaphores at once.
    const DirectXFenceWrapper fence_1(dx_wrapper.getDXDevice());
    CLDXSemaphoreWrapper semaphore_1(device, context, dx_wrapper.getDXDevice());
    test_error(semaphore_1.createSemaphoreFromFence(*fence_1),
               "Could not create semaphore");
    const DirectXFenceWrapper fence_2(dx_wrapper.getDXDevice());
    CLDXSemaphoreWrapper semaphore_2(device, context, dx_wrapper.getDXDevice());
    test_error(semaphore_2.createSemaphoreFromFence(*fence_2),
               "Could not create semaphore");
    const cl_semaphore_khr semaphore_list[] = { *semaphore_1, *semaphore_2 };
    constexpr cl_semaphore_payload_khr semaphore_payload = 1;
    // Payloads the OpenCL wait blocks on: fence_1 must reach 1, fence_2
    // must reach 2.
    cl_semaphore_payload_khr semaphore_payload_list[] = {
        semaphore_payload, semaphore_payload + 1
    };
    log_info("Calling clEnqueueWaitSemaphoresKHR\n");
    // Enqueue the multi-semaphore wait first; it can only complete once the
    // D3D12 signals below raise both fences to the requested payloads.
    clEventWrapper wait_event;
    errcode = clEnqueueWaitSemaphoresKHR(queue, 2, semaphore_list,
                                         semaphore_payload_list, 0, nullptr,
                                         &wait_event);
    test_error(errcode, "Failed to call clEnqueueWaitSemaphoresKHR");
    log_info("Calling d3d12_command_queue->Signal()\n");
    // Signal from the D3D12 side, deliberately fence_2 before fence_1, to
    // exercise out-of-order satisfaction of the multi-wait.
    HRESULT hr =
        dx_wrapper.getDXCommandQueue()->Signal(*fence_2, semaphore_payload + 1);
    test_error(FAILED(hr), "Failed to signal D3D12 fence 2");
    hr = dx_wrapper.getDXCommandQueue()->Signal(*fence_1, semaphore_payload);
    test_error(FAILED(hr), "Failed to signal D3D12 fence 1");
    log_info("Calling d3d12_command_queue->Wait() with different payloads\n");
    // Queue D3D12-side waits on higher payload values that only the OpenCL
    // signal enqueued later will produce.
    hr = dx_wrapper.getDXCommandQueue()->Wait(*fence_1, semaphore_payload + 3);
    test_error(FAILED(hr), "Failed to wait on D3D12 fence 1");
    hr = dx_wrapper.getDXCommandQueue()->Wait(*fence_2, semaphore_payload + 2);
    test_error(FAILED(hr), "Failed to wait on D3D12 fence 2");
    errcode = clFinish(queue);
    test_error(errcode, "Could not finish queue");
    test_assert_event_complete(wait_event);
    // Re-use the payload list for the OpenCL-side signal that releases the
    // two D3D12 waits queued above.
    semaphore_payload_list[0] = semaphore_payload + 3;
    semaphore_payload_list[1] = semaphore_payload + 2;
    log_info("Calling clEnqueueSignalSemaphoresKHR\n");
    clEventWrapper signal_event;
    errcode = clEnqueueSignalSemaphoresKHR(queue, 2, semaphore_list,
                                           semaphore_payload_list, 0, nullptr,
                                           &signal_event);
    test_error(errcode, "Could not call clEnqueueSignalSemaphoresKHR");
    // Wait until the GPU has completed commands up to this fence point.
    log_info("Waiting for D3D12 command queue completion\n");
    if ((*fence_1)->GetCompletedValue() < semaphore_payload_list[0])
    {
        // Block on a Win32 event until fence_1 reaches the signaled payload.
        const HANDLE event_handle =
            CreateEventEx(nullptr, false, false, EVENT_ALL_ACCESS);
        hr = (*fence_1)->SetEventOnCompletion(semaphore_payload_list[0],
                                              event_handle);
        test_error(FAILED(hr),
                   "Failed to set D3D12 fence 1 event on completion");
        WaitForSingleObject(event_handle, INFINITE);
        CloseHandle(event_handle);
    }
    if ((*fence_2)->GetCompletedValue() < semaphore_payload_list[1])
    {
        // Same completion wait for fence_2.
        const HANDLE event_handle =
            CreateEventEx(nullptr, false, false, EVENT_ALL_ACCESS);
        hr = (*fence_2)->SetEventOnCompletion(semaphore_payload_list[1],
                                              event_handle);
        test_error(FAILED(hr),
                   "Failed to set D3D12 fence 2 event on completion");
        WaitForSingleObject(event_handle, INFINITE);
        CloseHandle(event_handle);
    }
    errcode = clFinish(queue);
    test_error(errcode, "Could not finish queue");
    test_assert_event_complete(signal_event);
    return TEST_PASS;
}

View File

@@ -0,0 +1,220 @@
//
// Copyright (c) 2025 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "semaphore_dx_fence_base.h"
// Confirm that a wait followed by a signal in DirectX 12 using an exported
// semaphore will complete successfully
// Exports an OpenCL binary semaphore as a D3D12 fence, enqueues an OpenCL
// wait on it, then signals the fence from the D3D12 side and verifies the
// wait completes.
REGISTER_TEST(test_external_semaphores_export_dx_signal)
{
    int errcode = CL_SUCCESS;
    const DirectXWrapper dx_wrapper;
    REQUIRE_EXTENSION("cl_khr_external_semaphore");
    REQUIRE_EXTENSION("cl_khr_external_semaphore_dx_fence");
    // Obtain pointers to semaphore's API
    GET_PFN(device, clCreateSemaphoreWithPropertiesKHR);
    GET_PFN(device, clReleaseSemaphoreKHR);
    GET_PFN(device, clEnqueueSignalSemaphoresKHR);
    GET_PFN(device, clEnqueueWaitSemaphoresKHR);
    GET_PFN(device, clGetSemaphoreInfoKHR);
    GET_PFN(device, clGetSemaphoreHandleForTypeKHR);
    // Query the device's exportable semaphore handle types; this test needs
    // CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR to be among them.
    size_t export_types_size = 0;
    errcode =
        clGetDeviceInfo(device, CL_DEVICE_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR, 0,
                        nullptr, &export_types_size);
    test_error(errcode, "Could not query export semaphore handle types");
    std::vector<cl_external_semaphore_handle_type_khr> export_types(
        export_types_size / sizeof(cl_external_semaphore_handle_type_khr));
    errcode =
        clGetDeviceInfo(device, CL_DEVICE_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR,
                        export_types_size, export_types.data(), nullptr);
    test_error(errcode, "Could not query export semaphore handle types");
    if (std::find(export_types.begin(), export_types.end(),
                  CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR)
        == export_types.end())
    {
        log_info("Could not find CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR between "
                 "the supported export types\n");
        return TEST_FAIL;
    }
    // Create a binary semaphore declared exportable as a D3D12 fence.
    constexpr cl_semaphore_properties_khr sem_props[] = {
        static_cast<cl_semaphore_properties_khr>(CL_SEMAPHORE_TYPE_KHR),
        static_cast<cl_semaphore_properties_khr>(CL_SEMAPHORE_TYPE_BINARY_KHR),
        static_cast<cl_semaphore_properties_khr>(
            CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR),
        static_cast<cl_semaphore_properties_khr>(
            CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR),
        static_cast<cl_semaphore_properties_khr>(
            CL_SEMAPHORE_EXPORT_HANDLE_TYPES_LIST_END_KHR),
        0
    };
    cl_semaphore_khr semaphore =
        clCreateSemaphoreWithPropertiesKHR(context, sem_props, &errcode);
    test_error(errcode, "Could not create semaphore");
    cl_bool is_exportable = CL_FALSE;
    errcode =
        clGetSemaphoreInfoKHR(semaphore, CL_SEMAPHORE_EXPORTABLE_KHR,
                              sizeof(is_exportable), &is_exportable, nullptr);
    test_error(errcode, "Could not get semaphore info");
    test_error(!is_exportable, "Semaphore is not exportable");
    log_info("Calling clEnqueueWaitSemaphoresKHR\n");
    // Enqueue the OpenCL wait first; it can only complete once the D3D12
    // side signals the exported fence to the requested payload.
    constexpr cl_semaphore_payload_khr semaphore_payload = 1;
    clEventWrapper wait_event;
    errcode = clEnqueueWaitSemaphoresKHR(
        queue, 1, &semaphore, &semaphore_payload, 0, nullptr, &wait_event);
    test_error(errcode, "Failed to wait semaphore");
    // Export the semaphore as a shared handle and open it as a D3D12 fence.
    HANDLE semaphore_handle = nullptr;
    errcode = clGetSemaphoreHandleForTypeKHR(
        semaphore, device, CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR,
        sizeof(semaphore_handle), &semaphore_handle, nullptr);
    test_error(errcode, "Could not get semaphore handle");
    ID3D12Fence *fence = nullptr;
    // OpenSharedHandle returns an HRESULT, not a cl_int; check it with
    // FAILED() like every other D3D12 call in this file instead of routing
    // it through the OpenCL errcode path.
    HRESULT hr = dx_wrapper.getDXDevice()->OpenSharedHandle(
        semaphore_handle, IID_PPV_ARGS(&fence));
    test_error(FAILED(hr), "Could not open semaphore handle");
    log_info("Calling fence->Signal()\n");
    hr = fence->Signal(semaphore_payload);
    test_error(FAILED(hr), "Failed to signal D3D12 fence");
    errcode = clFinish(queue);
    test_error(errcode, "Could not finish queue");
    test_assert_event_complete(wait_event);
    // Release resources
    CloseHandle(semaphore_handle);
    test_error(clReleaseSemaphoreKHR(semaphore), "Could not release semaphore");
    fence->Release();
    return TEST_PASS;
}
// Confirm that a signal in OpenCL followed by a wait in DirectX 12 using an
// exported semaphore will complete successfully
// Exports an OpenCL binary semaphore as a D3D12 fence, signals it from
// OpenCL, then waits on the fence from the D3D12 side and verifies the
// signal completes.
REGISTER_TEST(test_external_semaphores_export_dx_wait)
{
    int errcode = CL_SUCCESS;
    const DirectXWrapper dx_wrapper;
    REQUIRE_EXTENSION("cl_khr_external_semaphore");
    REQUIRE_EXTENSION("cl_khr_external_semaphore_dx_fence");
    // Obtain pointers to semaphore's API
    GET_PFN(device, clCreateSemaphoreWithPropertiesKHR);
    GET_PFN(device, clReleaseSemaphoreKHR);
    GET_PFN(device, clEnqueueSignalSemaphoresKHR);
    GET_PFN(device, clEnqueueWaitSemaphoresKHR);
    GET_PFN(device, clGetSemaphoreInfoKHR);
    GET_PFN(device, clGetSemaphoreHandleForTypeKHR);
    // Query the device's exportable semaphore handle types; this test needs
    // CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR to be among them.
    size_t export_types_size = 0;
    errcode =
        clGetDeviceInfo(device, CL_DEVICE_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR, 0,
                        nullptr, &export_types_size);
    test_error(errcode, "Could not query export semaphore handle types");
    std::vector<cl_external_semaphore_handle_type_khr> export_types(
        export_types_size / sizeof(cl_external_semaphore_handle_type_khr));
    errcode =
        clGetDeviceInfo(device, CL_DEVICE_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR,
                        export_types_size, export_types.data(), nullptr);
    test_error(errcode, "Could not query export semaphore handle types");
    if (std::find(export_types.begin(), export_types.end(),
                  CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR)
        == export_types.end())
    {
        log_info("Could not find CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR between "
                 "the supported export types\n");
        return TEST_FAIL;
    }
    // Create a binary semaphore declared exportable as a D3D12 fence.
    constexpr cl_semaphore_properties_khr sem_props[] = {
        static_cast<cl_semaphore_properties_khr>(CL_SEMAPHORE_TYPE_KHR),
        static_cast<cl_semaphore_properties_khr>(CL_SEMAPHORE_TYPE_BINARY_KHR),
        static_cast<cl_semaphore_properties_khr>(
            CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR),
        static_cast<cl_semaphore_properties_khr>(
            CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR),
        static_cast<cl_semaphore_properties_khr>(
            CL_SEMAPHORE_EXPORT_HANDLE_TYPES_LIST_END_KHR),
        0
    };
    cl_semaphore_khr semaphore =
        clCreateSemaphoreWithPropertiesKHR(context, sem_props, &errcode);
    test_error(errcode, "Could not create semaphore");
    cl_bool is_exportable = CL_FALSE;
    errcode =
        clGetSemaphoreInfoKHR(semaphore, CL_SEMAPHORE_EXPORTABLE_KHR,
                              sizeof(is_exportable), &is_exportable, nullptr);
    test_error(errcode, "Could not get semaphore info");
    test_error(!is_exportable, "Semaphore is not exportable");
    log_info("Calling clEnqueueSignalSemaphoresKHR\n");
    // Signal from the OpenCL side; the D3D12 wait below consumes it.
    constexpr cl_semaphore_payload_khr semaphore_payload = 1;
    clEventWrapper signal_event;
    errcode = clEnqueueSignalSemaphoresKHR(
        queue, 1, &semaphore, &semaphore_payload, 0, nullptr, &signal_event);
    test_error(errcode, "Failed to signal semaphore");
    // Export the semaphore as a shared handle and open it as a D3D12 fence.
    HANDLE semaphore_handle = nullptr;
    errcode = clGetSemaphoreHandleForTypeKHR(
        semaphore, device, CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR,
        sizeof(semaphore_handle), &semaphore_handle, nullptr);
    test_error(errcode, "Could not get semaphore handle");
    ID3D12Fence *fence = nullptr;
    // OpenSharedHandle returns an HRESULT, not a cl_int; check it with
    // FAILED() like every other D3D12 call in this file instead of routing
    // it through the OpenCL errcode path.
    HRESULT hr = dx_wrapper.getDXDevice()->OpenSharedHandle(
        semaphore_handle, IID_PPV_ARGS(&fence));
    test_error(FAILED(hr), "Could not open semaphore handle");
    log_info("Calling dx_wrapper.get_d3d12_command_queue()->Wait()\n");
    hr = dx_wrapper.getDXCommandQueue()->Wait(fence, semaphore_payload);
    test_error(FAILED(hr), "Failed to wait on D3D12 fence");
    log_info("Calling WaitForSingleObject\n");
    // Block on a Win32 event until the fence reaches the signaled payload.
    if (fence->GetCompletedValue() < semaphore_payload)
    {
        const HANDLE event =
            CreateEventEx(nullptr, false, false, EVENT_ALL_ACCESS);
        hr = fence->SetEventOnCompletion(semaphore_payload, event);
        test_error(FAILED(hr), "Failed to set event on completion");
        WaitForSingleObject(event, INFINITE);
        CloseHandle(event);
    }
    errcode = clFinish(queue);
    test_error(errcode, "Could not finish queue");
    test_assert_event_complete(signal_event);
    // Release resources
    CloseHandle(semaphore_handle);
    test_error(clReleaseSemaphoreKHR(semaphore), "Could not release semaphore");
    fence->Release();
    return TEST_PASS;
}

View File

@@ -0,0 +1,89 @@
//
// Copyright (c) 2025 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "semaphore_dx_fence_base.h"
// Confirm that a wait without a semaphore payload list will return
// CL_INVALID_VALUE
REGISTER_TEST(test_external_semaphores_dx_fence_negative_wait)
{
    int errcode = CL_SUCCESS;
    const DirectXWrapper dx_wrapper;
    REQUIRE_EXTENSION("cl_khr_external_semaphore");
    REQUIRE_EXTENSION("cl_khr_external_semaphore_dx_fence");
    // Obtain pointers to semaphore's API
    GET_PFN(device, clCreateSemaphoreWithPropertiesKHR);
    GET_PFN(device, clReleaseSemaphoreKHR);
    GET_PFN(device, clEnqueueWaitSemaphoresKHR);
    // The device must be able to import D3D12 fences as OpenCL semaphores.
    test_error(!is_import_handle_available(device,
                                           CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR),
               "Could not find CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR between the "
               "supported import types");
    // Import D3D12 fence into OpenCL
    const DirectXFenceWrapper fence(dx_wrapper.getDXDevice());
    CLDXSemaphoreWrapper semaphore(device, context, dx_wrapper.getDXDevice());
    test_error(semaphore.createSemaphoreFromFence(*fence),
               "Could not create semaphore");
    log_info("Calling clEnqueueWaitSemaphoresKHR\n");
    // Passing a null payload list for an imported D3D12 fence semaphore is
    // invalid; the implementation must reject it with CL_INVALID_VALUE.
    errcode = clEnqueueWaitSemaphoresKHR(queue, 1, &semaphore, nullptr, 0,
                                         nullptr, nullptr);
    test_assert_error(
        errcode == CL_INVALID_VALUE,
        "Unexpected error code returned from clEnqueueWaitSemaphores");
    return TEST_PASS;
}
// Confirm that a signal without a semaphore payload list will return
// CL_INVALID_VALUE
// Negative test: signaling an imported D3D12-fence semaphore without a
// payload list must fail with CL_INVALID_VALUE.
REGISTER_TEST(test_external_semaphores_dx_fence_negative_signal)
{
    int errcode = CL_SUCCESS;
    const DirectXWrapper dx_wrapper;
    REQUIRE_EXTENSION("cl_khr_external_semaphore");
    REQUIRE_EXTENSION("cl_khr_external_semaphore_dx_fence");
    // Obtain pointers to semaphore's API
    GET_PFN(device, clCreateSemaphoreWithPropertiesKHR);
    GET_PFN(device, clReleaseSemaphoreKHR);
    GET_PFN(device, clEnqueueSignalSemaphoresKHR);
    // The device must be able to import D3D12 fences as OpenCL semaphores.
    test_error(!is_import_handle_available(device,
                                           CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR),
               "Could not find CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR between the "
               "supported import types");
    // Import D3D12 fence into OpenCL
    const DirectXFenceWrapper fence(dx_wrapper.getDXDevice());
    CLDXSemaphoreWrapper semaphore(device, context, dx_wrapper.getDXDevice());
    test_error(semaphore.createSemaphoreFromFence(*fence),
               "Could not create semaphore");
    // Fixed copy-paste in the log message: this test signals, it does not
    // wait.
    log_info("Calling clEnqueueSignalSemaphoresKHR\n");
    // Passing a null payload list is invalid for this semaphore type; the
    // implementation must reject it with CL_INVALID_VALUE.
    errcode = clEnqueueSignalSemaphoresKHR(queue, 1, &semaphore, nullptr, 0,
                                           nullptr, nullptr);
    test_assert_error(
        errcode == CL_INVALID_VALUE,
        "Unexpected error code returned from clEnqueueSignalSemaphores");
    return TEST_PASS;
}

View File

@@ -0,0 +1,69 @@
//
// Copyright (c) 2025 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "semaphore_dx_fence_base.h"
// Confirm that the CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR property is in the
// properties returned by clGetSemaphoreInfo
// Confirms that CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR and the imported fence
// handle appear as an adjacent key/value pair in the properties returned by
// clGetSemaphoreInfoKHR(CL_SEMAPHORE_PROPERTIES_KHR).
REGISTER_TEST(test_external_semaphores_dx_fence_query_properties)
{
    int errcode = CL_SUCCESS;
    const DirectXWrapper dx_wrapper;
    REQUIRE_EXTENSION("cl_khr_external_semaphore");
    REQUIRE_EXTENSION("cl_khr_external_semaphore_dx_fence");
    // Obtain pointers to semaphore's API
    GET_PFN(device, clCreateSemaphoreWithPropertiesKHR);
    GET_PFN(device, clReleaseSemaphoreKHR);
    GET_PFN(device, clGetSemaphoreInfoKHR);
    // The device must be able to import D3D12 fences as OpenCL semaphores.
    test_error(!is_import_handle_available(device,
                                           CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR),
               "Could not find CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR between the "
               "supported import types");
    // Import D3D12 fence into OpenCL
    const DirectXFenceWrapper fence(dx_wrapper.getDXDevice());
    CLDXSemaphoreWrapper semaphore(device, context, dx_wrapper.getDXDevice());
    test_error(semaphore.createSemaphoreFromFence(*fence),
               "Could not create semaphore");
    // Two-step query: first the size in bytes, then the property list.
    size_t properties_size_bytes = 0;
    errcode = clGetSemaphoreInfoKHR(*semaphore, CL_SEMAPHORE_PROPERTIES_KHR, 0,
                                    nullptr, &properties_size_bytes);
    test_error(errcode, "Could not get semaphore info");
    std::vector<cl_semaphore_properties_khr> semaphore_properties(
        properties_size_bytes / sizeof(cl_semaphore_properties_khr));
    errcode = clGetSemaphoreInfoKHR(*semaphore, CL_SEMAPHORE_PROPERTIES_KHR,
                                    properties_size_bytes,
                                    semaphore_properties.data(), nullptr);
    test_error(errcode, "Could not get semaphore info");
    // Scan adjacent (key, value) pairs. Guard with `i + 1 < size()` rather
    // than `i < size() - 1`: size() is unsigned, so `size() - 1` wraps
    // around when the returned property list is empty, which would read out
    // of bounds.
    for (size_t i = 0; i + 1 < semaphore_properties.size(); i++)
    {
        if (semaphore_properties[i] == CL_SEMAPHORE_HANDLE_D3D12_FENCE_KHR
            && semaphore_properties[i + 1]
                == reinterpret_cast<cl_semaphore_properties_khr>(
                       semaphore.getHandle()))
        {
            return TEST_PASS;
        }
    }
    log_error(
        "Failed to find the dx fence handle type in the semaphore properties");
    return TEST_FAIL;
}

View File

@@ -35,7 +35,7 @@ kernel void testKernel(global atomic_int* globalPtr, local atomic_int* localPtr)
int wgid = get_group_id(0);
int wgsize = get_local_size(0);
if (tid == 0) atomic_store(localPtr, 0);
if (tid == 0) atomic_store_explicit(localPtr, 0, memory_order_relaxed, memory_scope_work_group);
barrier(CLK_LOCAL_MEM_FENCE);
@@ -47,12 +47,12 @@ kernel void testKernel(global atomic_int* globalPtr, local atomic_int* localPtr)
if ((wgid % 2) == 0)
ptr = localPtr;
int inc = atomic_fetch_add(ptr, 1);
int inc = atomic_fetch_add_explicit(ptr, 1, memory_order_relaxed, memory_scope_work_group);
// In the cases where the local memory ptr was used,
// save off the final value.
if ((wgid % 2) == 0 && inc == (wgsize-1))
atomic_store(&globalPtr[wgid], inc);
atomic_store_explicit(&globalPtr[wgid], inc, memory_order_relaxed, memory_scope_work_group);
}
)OpenCLC";
@@ -67,7 +67,7 @@ kernel void testKernel(global atomic_int* globalPtr, local atomic_int* localPtr)
int wgid = get_group_id(0);
int wgsize = get_local_size(0);
if (tid == 0) atomic_store(localPtr, 0);
if (tid == 0) atomic_store_explicit(localPtr, 0, memory_order_relaxed, memory_scope_work_group);
barrier(CLK_LOCAL_MEM_FENCE);
@@ -79,14 +79,17 @@ kernel void testKernel(global atomic_int* globalPtr, local atomic_int* localPtr)
if ((tid % 2) == 0)
ptr = localPtr;
atomic_fetch_add(ptr, 1);
atomic_fetch_add_explicit(ptr, 1, memory_order_relaxed, memory_scope_work_group);
barrier(CLK_LOCAL_MEM_FENCE);
// In the cases where the local memory ptr was used,
// save off the final value.
if (tid == 0)
atomic_store(&globalPtr[(wgid * 2) + 1], atomic_load(localPtr));
atomic_store_explicit(&globalPtr[(wgid * 2) + 1],
atomic_load_explicit(localPtr, memory_order_relaxed, memory_scope_work_group),
memory_order_relaxed,
memory_scope_work_group);
}
)OpenCLC";
}

View File

@@ -200,7 +200,7 @@ int main(int argc, const char *argv[])
}
}
if (argc > 1 && strcmp(argv[1], "-list") == 0)
if (gListTests)
{
log_info("Available 2.x tests:\n");
for (int i = 0; i < test_num; i++)

BIN
test_conformance/spir/half.zip Normal file → Executable file

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@@ -43,35 +43,37 @@
//
// Task
//
Task::Task(cl_device_id device, const char* options):
m_devid(device) {
if (options)
m_options = options;
Task::Task(cl_device_id device, const char* options): m_devid(device)
{
if (options) m_options = options;
}
Task::~Task() {}
const char* Task::getErrorLog() const {
return m_log.c_str();
}
const char* Task::getErrorLog() const { return m_log.c_str(); }
void Task::setErrorLog(cl_program prog) {
void Task::setErrorLog(cl_program prog)
{
size_t len = 0;
std::vector<char> log;
cl_int err_code = clGetProgramBuildInfo(prog, m_devid, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
if(err_code != CL_SUCCESS)
cl_int err_code = clGetProgramBuildInfo(prog, m_devid, CL_PROGRAM_BUILD_LOG,
0, NULL, &len);
if (err_code != CL_SUCCESS)
{
m_log = "Error: clGetProgramBuildInfo(CL_PROGRAM_BUILD_LOG, &len) failed.\n";
m_log = "Error: clGetProgramBuildInfo(CL_PROGRAM_BUILD_LOG, &len) "
"failed.\n";
return;
}
log.resize(len, 0);
err_code = clGetProgramBuildInfo(prog, m_devid, CL_PROGRAM_BUILD_LOG, len, &log[0], NULL);
if(err_code != CL_SUCCESS)
err_code = clGetProgramBuildInfo(prog, m_devid, CL_PROGRAM_BUILD_LOG, len,
&log[0], NULL);
if (err_code != CL_SUCCESS)
{
m_log = "Error: clGetProgramBuildInfo(CL_PROGRAM_BUILD_LOG, &log) failed.\n";
m_log = "Error: clGetProgramBuildInfo(CL_PROGRAM_BUILD_LOG, &log) "
"failed.\n";
return;
}
m_log.append(&log[0]);
@@ -84,10 +86,11 @@ BuildTask::BuildTask(cl_program prog, cl_device_id dev, const char* options)
: Task(dev, options), m_program(prog)
{}
bool BuildTask::execute() {
cl_int err_code = clBuildProgram(m_program, 0, NULL, m_options.c_str(), NULL, NULL);
if(CL_SUCCESS == err_code)
return true;
bool BuildTask::execute()
{
cl_int err_code =
clBuildProgram(m_program, 0, NULL, m_options.c_str(), NULL, NULL);
if (CL_SUCCESS == err_code) return true;
setErrorLog(m_program);
return false;
@@ -96,8 +99,10 @@ bool BuildTask::execute() {
//
// SpirBuildTask
//
SpirBuildTask::SpirBuildTask(cl_program prog, cl_device_id dev, const char* options) :
BuildTask(prog, dev, options) {}
SpirBuildTask::SpirBuildTask(cl_program prog, cl_device_id dev,
const char* options)
: BuildTask(prog, dev, options)
{}
//
// CompileTask
@@ -107,47 +112,43 @@ CompileTask::CompileTask(cl_program prog, cl_device_id dev, const char* options)
: Task(dev, options), m_program(prog)
{}
void CompileTask::addHeader(const char* hname, cl_program hprog) {
void CompileTask::addHeader(const char* hname, cl_program hprog)
{
m_headers.push_back(std::make_pair(hname, hprog));
}
const char* first(std::pair<const char*,cl_program>& p) {
return p.first;
}
const char* first(std::pair<const char*, cl_program>& p) { return p.first; }
cl_program second(const std::pair<const char*, cl_program>& p) {
cl_program second(const std::pair<const char*, cl_program>& p)
{
return p.second;
}
bool CompileTask::execute() {
bool CompileTask::execute()
{
// Generating the header names vector.
std::vector<const char*> names;
std::transform(m_headers.begin(), m_headers.end(), names.begin(), first);
// Generating the header programs vector.
std::vector<cl_program> programs;
std::transform(m_headers.begin(), m_headers.end(), programs.begin(), second);
std::transform(m_headers.begin(), m_headers.end(), programs.begin(),
second);
const char** h_names = NULL;
const cl_program* h_programs = NULL;
if (!m_headers.empty())
{
h_programs = &programs[0];
h_names = &names[0];
h_names = &names[0];
}
// Compiling with the headers.
cl_int err_code = clCompileProgram(
m_program,
1U,
&m_devid,
m_options.c_str(),
m_headers.size(), // # of headers
h_programs,
h_names,
NULL, NULL);
if (CL_SUCCESS == err_code)
return true;
cl_int err_code =
clCompileProgram(m_program, 1U, &m_devid, m_options.c_str(),
m_headers.size(), // # of headers
h_programs, h_names, NULL, NULL);
if (CL_SUCCESS == err_code) return true;
setErrorLog(m_program);
return false;
@@ -156,8 +157,10 @@ bool CompileTask::execute() {
//
// SpirCompileTask
//
SpirCompileTask::SpirCompileTask(cl_program prog, cl_device_id dev, const char* options) :
CompileTask(prog, dev, options) {}
SpirCompileTask::SpirCompileTask(cl_program prog, cl_device_id dev,
const char* options)
: CompileTask(prog, dev, options)
{}
//
@@ -169,13 +172,16 @@ LinkTask::LinkTask(cl_program* programs, int num_programs, cl_context ctxt,
m_numPrograms(num_programs), m_context(ctxt)
{}
bool LinkTask::execute() {
bool LinkTask::execute()
{
cl_int err_code;
int i;
for(i = 0; i < m_numPrograms; ++i)
for (i = 0; i < m_numPrograms; ++i)
{
err_code = clCompileProgram(m_programs[i], 1, &m_devid, "-x spir -spir-std=1.2 -cl-kernel-arg-info", 0, NULL, NULL, NULL, NULL);
err_code = clCompileProgram(m_programs[i], 1, &m_devid,
"-x spir -spir-std=1.2 -cl-kernel-arg-info",
0, NULL, NULL, NULL, NULL);
if (CL_SUCCESS != err_code)
{
setErrorLog(m_programs[i]);
@@ -183,91 +189,78 @@ bool LinkTask::execute() {
}
}
m_executable = clLinkProgram(m_context, 1, &m_devid, m_options.c_str(), m_numPrograms, m_programs, NULL, NULL, &err_code);
if (CL_SUCCESS == err_code)
return true;
m_executable =
clLinkProgram(m_context, 1, &m_devid, m_options.c_str(), m_numPrograms,
m_programs, NULL, NULL, &err_code);
if (CL_SUCCESS == err_code) return true;
if(m_executable) setErrorLog(m_executable);
if (m_executable) setErrorLog(m_executable);
return false;
}
cl_program LinkTask::getExecutable() const {
return m_executable;
}
cl_program LinkTask::getExecutable() const { return m_executable; }
LinkTask::~LinkTask() {
if(m_executable) clReleaseProgram(m_executable);
LinkTask::~LinkTask()
{
if (m_executable) clReleaseProgram(m_executable);
}
//
// KernelEnumerator
//
void KernelEnumerator::process(cl_program prog) {
void KernelEnumerator::process(cl_program prog)
{
const size_t MAX_KERNEL_NAME = 64;
size_t num_kernels;
cl_int err_code = clGetProgramInfo(
prog,
CL_PROGRAM_NUM_KERNELS,
sizeof(size_t),
&num_kernels,
NULL
);
if (CL_SUCCESS != err_code)
return;
cl_int err_code = clGetProgramInfo(prog, CL_PROGRAM_NUM_KERNELS,
sizeof(size_t), &num_kernels, NULL);
if (CL_SUCCESS != err_code) return;
// Querying for the number of kernels.
size_t buffer_len = sizeof(char)*num_kernels*MAX_KERNEL_NAME;
size_t buffer_len = sizeof(char) * num_kernels * MAX_KERNEL_NAME;
char* kernel_names = new char[buffer_len];
memset(kernel_names, '\0', buffer_len);
size_t str_len = 0;
err_code = clGetProgramInfo(
prog,
CL_PROGRAM_KERNEL_NAMES,
buffer_len,
(void *)kernel_names,
&str_len
);
if (CL_SUCCESS != err_code)
return;
err_code = clGetProgramInfo(prog, CL_PROGRAM_KERNEL_NAMES, buffer_len,
(void*)kernel_names, &str_len);
if (CL_SUCCESS != err_code) return;
//parsing the names and inserting them to the list
// parsing the names and inserting them to the list
std::string names(kernel_names);
assert (str_len == 1+names.size() && "incompatible string lengths");
assert(str_len == 1 + names.size() && "incompatible string lengths");
size_t offset = 0;
for(size_t i=0 ; i<names.size() ; ++i){
//kernel names are separated by semi colons
if (names[i] == ';'){
m_kernels.push_back(names.substr(offset, i-offset));
offset = i+1;
for (size_t i = 0; i < names.size(); ++i)
{
// kernel names are separated by semi colons
if (names[i] == ';')
{
m_kernels.push_back(names.substr(offset, i - offset));
offset = i + 1;
}
}
m_kernels.push_back(names.substr(offset, names.size()-offset));
m_kernels.push_back(names.substr(offset, names.size() - offset));
delete[] kernel_names;
}
KernelEnumerator::KernelEnumerator(cl_program prog) {
process(prog);
}
KernelEnumerator::KernelEnumerator(cl_program prog) { process(prog); }
KernelEnumerator::iterator KernelEnumerator::begin(){
KernelEnumerator::iterator KernelEnumerator::begin()
{
return m_kernels.begin();
}
KernelEnumerator::iterator KernelEnumerator::end(){
return m_kernels.end();
}
KernelEnumerator::iterator KernelEnumerator::end() { return m_kernels.end(); }
size_t KernelEnumerator::size() const {
return m_kernels.size();
}
size_t KernelEnumerator::size() const { return m_kernels.size(); }
/**
Run the single test - run the test for both CL and SPIR versions of the kernel
*/
static bool run_test(cl_context context, cl_command_queue queue, cl_program clprog,
cl_program bcprog, const std::string& kernel_name, std::string& err, const cl_device_id device,
float ulps)
static bool run_test(cl_context context, cl_command_queue queue,
cl_program clprog, cl_program bcprog,
const std::string& kernel_name, std::string& err,
const cl_device_id device, float ulps)
{
WorkSizeInfo ws;
TestResult cl_result;
@@ -276,28 +269,29 @@ static bool run_test(cl_context context, cl_command_queue queue, cl_program clpr
{
// make sure that the kernel will be released before the program
clKernelWrapper kernel = create_kernel_helper(clprog, kernel_name);
// based on the kernel characteristics, we are generating and initializing the arguments for both phases (cl and bc executions)
// based on the kernel characteristics, we are generating and
// initializing the arguments for both phases (cl and bc executions)
generate_kernel_data(context, kernel, ws, cl_result);
bc_result.reset(cl_result.clone(context, ws, kernel, device));
assert (compare_results(cl_result, *bc_result, ulps) && "not equal?");
run_kernel( kernel, queue, ws, cl_result );
assert(compare_results(cl_result, *bc_result, ulps) && "not equal?");
run_kernel(kernel, queue, ws, cl_result);
}
// now, run the single BC test
{
// make sure that the kernel will be released before the program
clKernelWrapper kernel = create_kernel_helper(bcprog, kernel_name);
run_kernel( kernel, queue, ws, *bc_result );
run_kernel(kernel, queue, ws, *bc_result);
}
int error = clFinish(queue);
if( CL_SUCCESS != error)
if (CL_SUCCESS != error)
{
err = "clFinish failed\n";
return false;
}
// compare the results
if( !compare_results(cl_result, *bc_result, ulps) )
if (!compare_results(cl_result, *bc_result, ulps))
{
err = " (result diff in kernel '" + kernel_name + "').";
return false;
@@ -308,16 +302,16 @@ static bool run_test(cl_context context, cl_command_queue queue, cl_program clpr
/**
Get the maximum relative error defined as ULP of floating-point math functions
*/
static float get_max_ulps(const char *test_name)
static float get_max_ulps(const char* test_name)
{
float ulps = 0.f;
// Get ULP values from math_brute_force functionList
if (strstr(test_name, "math_kernel"))
{
for( size_t i = 0; i < functionListCount; i++ )
for (size_t i = 0; i < functionListCount; i++)
{
char name[64];
const Func *func = &functionList[ i ];
const Func* func = &functionList[i];
sprintf(name, ".%s_float", func->name);
if (strstr(test_name, name))
{
@@ -336,16 +330,17 @@ static float get_max_ulps(const char *test_name)
return ulps;
}
TestRunner::TestRunner(EventHandler *success, EventHandler *failure,
const OclExtensions& devExt):
m_successHandler(success), m_failureHandler(failure), m_devExt(&devExt) {}
TestRunner::TestRunner(EventHandler* success, EventHandler* failure,
const OclExtensions& devExt)
: m_successHandler(success), m_failureHandler(failure), m_devExt(&devExt)
{}
/**
Based on the test name build the cl file name, the bc file name and execute
the kernel for both modes (cl and bc).
*/
bool TestRunner::runBuildTest(cl_device_id device, const char *folder,
const char *test_name, cl_uint size_t_width)
bool TestRunner::runBuildTest(cl_device_id device, const char* folder,
const char* test_name, cl_uint size_t_width)
{
int failures = 0;
// Composing the name of the CSV file.
@@ -365,28 +360,35 @@ bool TestRunner::runBuildTest(cl_device_id device, const char *folder,
cl_bool images3D = khrDb.isImages3DRequired(folder, test_name);
char deviceProfile[64];
clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(deviceProfile), &deviceProfile, NULL);
clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(deviceProfile),
&deviceProfile, NULL);
std::string device_profile(deviceProfile, 64);
if(images == CL_TRUE && checkForImageSupport(device) != 0)
if (images == CL_TRUE && checkForImageSupport(device) != 0)
{
(*m_successHandler)(test_name, "");
std::cout << "Skipped. (Cannot run on device due to Images is not supported)." << std::endl;
std::cout
<< "Skipped. (Cannot run on device due to Images is not supported)."
<< std::endl;
return true;
}
if(images3D == CL_TRUE && checkFor3DImageSupport(device) != 0)
if (images3D == CL_TRUE && checkFor3DImageSupport(device) != 0)
{
(*m_successHandler)(test_name, "");
std::cout << "Skipped. (Cannot run on device as 3D images are not supported)." << std::endl;
std::cout
<< "Skipped. (Cannot run on device as 3D images are not supported)."
<< std::endl;
return true;
}
OclExtensions requiredExt = khrDb.getRequiredExtensions(folder, test_name);
if(!m_devExt->supports(requiredExt))
if (!m_devExt->supports(requiredExt))
{
(*m_successHandler)(test_name, "");
std::cout << "Skipped. (Cannot run on device due to missing extensions: " << m_devExt->get_missing(requiredExt) << " )." << std::endl;
std::cout
<< "Skipped. (Cannot run on device due to missing extensions: "
<< m_devExt->get_missing(requiredExt) << " )." << std::endl;
return true;
}
@@ -409,17 +411,26 @@ bool TestRunner::runBuildTest(cl_device_id device, const char *folder,
cl_device_fp_config gFloatCapabilities = 0;
cl_int err;
if ((err = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(gFloatCapabilities), &gFloatCapabilities, NULL)))
if ((err = clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG,
sizeof(gFloatCapabilities), &gFloatCapabilities,
NULL)))
{
log_info("Unable to get device CL_DEVICE_SINGLE_FP_CONFIG. (%d)\n", err);
log_info("Unable to get device CL_DEVICE_SINGLE_FP_CONFIG. (%d)\n",
err);
}
if (strstr(test_name, "div_cr") || strstr(test_name, "sqrt_cr")) {
if ((gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT) == 0) {
if (strstr(test_name, "div_cr") || strstr(test_name, "sqrt_cr"))
{
if ((gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT) == 0)
{
(*m_successHandler)(test_name, "");
std::cout << "Skipped. (Cannot run on device due to missing CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT property.)" << std::endl;
std::cout << "Skipped. (Cannot run on device due to missing "
"CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT property.)"
<< std::endl;
return true;
} else {
}
else
{
bcoptions += " -cl-fp32-correctly-rounded-divide-sqrt";
cloptions += " -cl-fp32-correctly-rounded-divide-sqrt";
}
@@ -427,33 +438,39 @@ bool TestRunner::runBuildTest(cl_device_id device, const char *folder,
// Building the programs.
BuildTask clBuild(clprog, device, cloptions.c_str());
if (!clBuild.execute()) {
if (!clBuild.execute())
{
std::cerr << clBuild.getErrorLog() << std::endl;
(*m_failureHandler)(test_name, "");
return false;
}
SpirBuildTask bcBuild(bcprog, device, bcoptions.c_str());
if (!bcBuild.execute()) {
if (!bcBuild.execute())
{
std::cerr << bcBuild.getErrorLog() << std::endl;
(*m_failureHandler)(test_name, "");
return false;
}
KernelEnumerator clkernel_enumerator(clprog),
bckernel_enumerator(bcprog);
if (clkernel_enumerator.size() != bckernel_enumerator.size()) {
KernelEnumerator clkernel_enumerator(clprog), bckernel_enumerator(bcprog);
if (clkernel_enumerator.size() != bckernel_enumerator.size())
{
std::cerr << "number of kernels in test" << test_name
<< " doesn't match in bc and cl files" << std::endl;
(*m_failureHandler)(test_name, "");
return false;
}
KernelEnumerator::iterator it = clkernel_enumerator.begin(),
e = clkernel_enumerator.end();
e = clkernel_enumerator.end();
while (it != e)
{
std::string kernel_name = *it++;
std::string err;
try
{
bool success = run_test(context, queue, clprog, bcprog, kernel_name, err, device, ulps);
bool success = run_test(context, queue, clprog, bcprog, kernel_name,
err, device, ulps);
if (success)
{
log_info("kernel '%s' passed.\n", kernel_name.c_str());
@@ -468,7 +485,8 @@ bool TestRunner::runBuildTest(cl_device_id device, const char *folder,
} catch (const std::runtime_error& err)
{
++failures;
log_info("kernel '%s' failed: %s\n", kernel_name.c_str(), err.what());
log_info("kernel '%s' failed: %s\n", kernel_name.c_str(),
err.what());
(*m_failureHandler)(test_name, kernel_name);
}
}
@@ -476,4 +494,3 @@ bool TestRunner::runBuildTest(cl_device_id device, const char *folder,
log_info("%s %s\n", test_name, failures ? "FAILED" : "passed.");
return failures == 0;
}

View File

@@ -17,7 +17,7 @@ else()
add_custom_command(
OUTPUT ${VULKAN_TEST_RESOURCES}/buffer.spv
COMMAND ${Vulkan_glslang_binary}
--target-env vulkan1.0
--target-env vulkan1.1
-o ${VULKAN_TEST_RESOURCES}/buffer.spv
${CMAKE_CURRENT_SOURCE_DIR}/buffer.comp
DEPENDS buffer.comp
@@ -35,7 +35,7 @@ else()
add_custom_command(
OUTPUT ${VULKAN_TEST_RESOURCES}/image2D_${GLSL_FORMAT}.spv
COMMAND ${Vulkan_glslang_binary}
--target-env vulkan1.0
--target-env vulkan1.1
-o ${VULKAN_TEST_RESOURCES}/image2D_${GLSL_FORMAT}.spv
${CMAKE_CURRENT_BINARY_DIR}/image2D_${GLSL_FORMAT}.comp
DEPENDS image2D_${GLSL_FORMAT}.comp