Merge branch 'main' into cl_khr_unified_svm

2026-03-21 06:49:02 +00:00 · 2025-02-11 13:14:23 -08:00
parent ca082ed48e 3618402c3a
commit d08eea3b1e
101 changed files with 2896 additions and 1306 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.5.1)
+cmake_minimum_required(VERSION 3.12.0)

 set( CONFORMANCE_SUFFIX "" )
 set(CLConform_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
@@ -107,6 +107,7 @@ if(CMAKE_COMPILER_IS_GNUCC OR "${CMAKE_CXX_COMPILER_ID}" MATCHES "(Apple)?Clang"
    add_cxx_flag_if_supported(-Wno-error=cpp) # Allow #warning directive
    add_cxx_flag_if_supported(-Wno-unknown-pragmas) # Issue #785
    add_cxx_flag_if_supported(-Wno-error=asm-operand-widths) # Issue #784
+    add_cxx_flag_if_supported(-Wno-strict-aliasing)  # Issue 2234

    # -msse -mfpmath=sse to force gcc to use sse for float math,
    # avoiding excess precision problems that cause tests like int2float
--- a/test_common/harness/imageHelpers.cpp
+++ b/test_common/harness/imageHelpers.cpp
@@ -2415,6 +2415,12 @@ int debug_find_vector_in_image(void *imagePtr, image_descriptor *imageInfo,
                (imageInfo->height >> lod) ? (imageInfo->height >> lod) : 1;
            depth = (imageInfo->depth >> lod) ? (imageInfo->depth >> lod) : 1;
            break;
+        default:
+            log_error("ERROR: Invalid imageInfo->type = %d\n", imageInfo->type);
+            width = 0;
+            depth = 0;
+            height = 0;
+            break;
    }

    row_pitch = width * get_pixel_size(imageInfo->format);
@@ -3661,6 +3667,11 @@ void copy_image_data(image_descriptor *srcImageInfo,
                    ? (srcImageInfo->height >> src_lod)
                    : 1;
                break;
+            default:
+                log_error("ERROR: Invalid srcImageInfo->type = %d\n",
+                          srcImageInfo->type);
+                src_lod = 0;
+                break;
        }
        src_mip_level_offset = compute_mip_level_offset(srcImageInfo, src_lod);
        src_row_pitch_lod =
@@ -3707,6 +3718,11 @@ void copy_image_data(image_descriptor *srcImageInfo,
                    ? (dstImageInfo->height >> dst_lod)
                    : 1;
                break;
+            default:
+                log_error("ERROR: Invalid dstImageInfo->type = %d\n",
+                          dstImageInfo->type);
+                dst_lod = 0; // keep the mip-level offset computation defined
+                break;
        }
        dst_mip_level_offset = compute_mip_level_offset(dstImageInfo, dst_lod);
        dst_row_pitch_lod =
--- a/test_common/harness/os_helpers.cpp
+++ b/test_common/harness/os_helpers.cpp
@@ -577,7 +577,7 @@ char* get_temp_filename()
    close(fd);
 #elif defined(_WIN32)
    UINT ret = GetTempFileName(".", "tmp", 0, gFileName);
-    if (ret == 0) return gFileName;
+    if (ret == 0) return strdup(gFileName);
 #else
    MTdata d = init_genrand((cl_uint)time(NULL));
    sprintf(gFileName, "tmpfile.%u", genrand_int32(d));
--- a/test_common/harness/typeWrappers.cpp
+++ b/test_common/harness/typeWrappers.cpp
@@ -348,7 +348,7 @@ cl_int clProtectedImage::Create(cl_context context,
                                const cl_image_format *fmt, size_t width,
                                size_t height, size_t depth, size_t arraySize)
 {
-    cl_int error;
+    cl_int error = 0;
 #if defined(__APPLE__)
    int protect_pages = 1;
    cl_device_id devices[16];
--- a/test_conformance/SVM/test_enqueue_api.cpp
+++ b/test_conformance/SVM/test_enqueue_api.cpp
@@ -160,8 +160,8 @@ REGISTER_TEST(svm_enqueue_api)
            error = clSetUserEventStatus(userEvent, CL_COMPLETE);
            test_error(error, "clSetUserEventStatus failed");

-            cl_uchar *src_ptr;
-            cl_uchar *dst_ptr;
+            cl_uchar *src_ptr = nullptr;
+            cl_uchar *dst_ptr = nullptr;
            if (test_case.srcAlloc == host)
            {
                src_ptr = srcHostData.data();
--- a/test_conformance/allocations/allocation_execute.cpp
+++ b/test_conformance/allocations/allocation_execute.cpp
@@ -26,13 +26,18 @@ const char *buffer_kernel_pattern = {
    "\tint tid = get_global_id(0);\n"
    "\tuint r = 0;\n"
    "\t%s i;\n"
-    "\tfor(i=(%s)tid*(%s)per_item; i<(%s)(1+tid)*(%s)per_item; i++) {\n"
    "%s"
-    "\t}\n"
    "\tresult[tid] = r;\n"
    "}\n"
 };

+const char *accumulate_pattern = { // sprintf format, one loop per buffer
+    "\t%s end%d = min((%s)(1+tid)*(%s)per_item, array_sizes[%d]);\n"
+    "\tfor(i=(%s)tid*(%s)per_item; i<end%d; i++) {\n"
+    "\t\tr += buffer%d[i];\n"
+    "\t}\n"
+};
+
 const char *image_kernel_pattern = {
    "__kernel void sample_test(%s __global uint *result)\n"
    "{\n"
@@ -159,7 +164,7 @@ int check_image(cl_command_queue queue, cl_mem mem)
 int execute_kernel(cl_context context, cl_command_queue *queue,
                   cl_device_id device_id, int test, cl_mem mems[],
                   int number_of_mems_used, int verify_checksum,
-                   unsigned int number_of_work_itmes)
+                   unsigned int number_of_work_items)
 {

    char *argument_string;
@@ -174,24 +179,38 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
    cl_uint per_item;
    cl_uint per_item_uint;
    cl_uint final_result;
-    std::vector<cl_uint> returned_results(number_of_work_itmes);
+    std::vector<cl_uint> returned_results(number_of_work_items);
    clEventWrapper event;
    cl_int event_status;

    // Allocate memory for the kernel source
+    char *used_pattern = nullptr;
+    if (test == BUFFER || test == BUFFER_NON_BLOCKING)
+    {
+        used_pattern = (char *)accumulate_pattern;
+    }
+    else if (test == IMAGE_READ || test == IMAGE_READ_NON_BLOCKING)
+    {
+        used_pattern = (char *)read_pattern;
+    }
+    else if (test == IMAGE_WRITE || test == IMAGE_WRITE_NON_BLOCKING)
+    {
+        used_pattern = (char *)write_pattern;
+    }
    argument_string =
        (char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE * 64);
    access_string = (char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE
-                                   * (strlen(read_pattern) + 10));
+                                   * (strlen(used_pattern) + 10));
    kernel_string = (char *)malloc(sizeof(char) * MAX_NUMBER_TO_ALLOCATE
-                                       * (strlen(read_pattern) + 10 + 64)
+                                       * (strlen(used_pattern) + 10 + 64)
                                   + 1024);
+
    argument_string[0] = '\0';
    access_string[0] = '\0';
    kernel_string[0] = '\0';

    // Zero the results.
-    for (i = 0; i < number_of_work_itmes; i++) returned_results[i] = 0;
+    for (i = 0; i < number_of_work_items; i++) returned_results[i] = 0;

    // detect if device supports ulong/int64
    // detect whether profile of the device is embedded
@@ -209,13 +228,6 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
    // Build the kernel source
    if (test == BUFFER || test == BUFFER_NON_BLOCKING)
    {
-        for (i = 0; i < number_of_mems_used; i++)
-        {
-            sprintf(argument_string + strlen(argument_string),
-                    " __global uint *buffer%d, ", i);
-            sprintf(access_string + strlen(access_string),
-                    "\t\tif (i<array_sizes[%d]) r += buffer%d[i];\n", i, i);
-        }
        char type[10];
        if (support64)
        {
@@ -225,8 +237,15 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
        {
            sprintf(type, "uint");
        }
+        for (i = 0; i < number_of_mems_used; i++)
+        {
+            sprintf(argument_string + strlen(argument_string),
+                    " __global uint *buffer%d, ", i);
+            sprintf(access_string + strlen(access_string), accumulate_pattern,
+                    type, i, type, type, i, type, type, i, i);
+        }
        sprintf(kernel_string, buffer_kernel_pattern, argument_string, type,
-                type, type, type, type, type, access_string);
+                type, access_string);
    }
    else if (test == IMAGE_READ || test == IMAGE_READ_NON_BLOCKING)
    {
@@ -282,14 +301,14 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
    // Set the result
    result_mem =
        clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
-                       sizeof(cl_uint) * number_of_work_itmes,
+                       sizeof(cl_uint) * number_of_work_items,
                       returned_results.data(), &error);
    test_error(error, "clCreateBuffer failed");
    error = clSetKernelArg(kernel, i, sizeof(result_mem), &result_mem);
    test_error(error, "clSetKernelArg failed");

    // Thread dimensions for execution
-    global_dims[0] = number_of_work_itmes;
+    global_dims[0] = number_of_work_items;
    global_dims[1] = 1;
    global_dims[2] = 1;

@@ -427,7 +446,7 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
    // Verify the checksum.
    // Read back the result
    error = clEnqueueReadBuffer(*queue, result_mem, CL_TRUE, 0,
-                                sizeof(cl_uint) * number_of_work_itmes,
+                                sizeof(cl_uint) * number_of_work_items,
                                returned_results.data(), 0, NULL, NULL);
    test_error_abort(error, "clEnqueueReadBuffer failed");
    final_result = 0;
@@ -436,7 +455,7 @@ int execute_kernel(cl_context context, cl_command_queue *queue,
    {
        // For buffers or read images we are just looking at the sum of what
        // each thread summed up
-        for (i = 0; i < number_of_work_itmes; i++)
+        for (i = 0; i < number_of_work_items; i++)
        {
            final_result += returned_results[i];
        }
--- a/test_conformance/allocations/main.cpp
+++ b/test_conformance/allocations/main.cpp
@@ -125,7 +125,7 @@ int doTest(cl_device_id device, cl_context context, cl_command_queue queue,
    int number_of_mems_used;
    cl_ulong max_individual_allocation_size = g_max_individual_allocation_size;
    cl_ulong global_mem_size = g_global_mem_size;
-    unsigned int number_of_work_itmes = 8192 * 32;
+    unsigned int number_of_work_items = 8192 * 32;
    const bool allocate_image =
        (alloc_type != BUFFER) && (alloc_type != BUFFER_NON_BLOCKING);

@@ -183,7 +183,7 @@ int doTest(cl_device_id device, cl_context context, cl_command_queue queue,
                 g_reduction_percentage);
        g_max_size = (size_t)((double)g_max_size
                              * (double)g_reduction_percentage / 100.0);
-        number_of_work_itmes = 8192 * 2;
+        number_of_work_items = 8192 * 2;
    }

    // Round to nearest MB.
@@ -220,7 +220,7 @@ int doTest(cl_device_id device, cl_context context, cl_command_queue queue,
                error =
                    execute_kernel(context, &queue, device, alloc_type, mems,
                                   number_of_mems_used, g_write_allocations,
-                                   number_of_work_itmes);
+                                   number_of_work_items);
            }

            // If we failed to allocate more than 1/8th of the requested amount
--- a/test_conformance/api/test_wg_suggested_local_work_size.cpp
+++ b/test_conformance/api/test_wg_suggested_local_work_size.cpp
@@ -208,7 +208,7 @@ int do_test_work_group_suggested_local_size(
    bool (*skip_cond)(size_t), size_t start, size_t end, size_t incr,
    cl_ulong max_local_mem_size, size_t global_work_offset[], num_dims dim)
 {
-    int err;
+    int err = 0;
    size_t test_values[] = { 1, 1, 1 };
    std::string kernel_names[6] = {
        "test_wg_scan_local_work_group_size",
--- a/test_conformance/basic/test_imagereadwrite.cpp
+++ b/test_conformance/basic/test_imagereadwrite.cpp
@@ -314,6 +314,12 @@ test_imagereadwrite(cl_device_id device, cl_context context, cl_command_queue qu
                }
                outp = (void *)rgbafp_outptr;
                break;
+            default:
+                log_error("ERROR Invalid j = %d\n", j);
+                elem_size = 0;
+                p = nullptr;
+                outp = nullptr;
+                break;
        }

        const char* update_packed_pitch_name = "";
--- a/test_conformance/basic/test_imagereadwrite3d.cpp
+++ b/test_conformance/basic/test_imagereadwrite3d.cpp
@@ -320,6 +320,12 @@ test_imagereadwrite3d(cl_device_id device, cl_context context, cl_command_queue
                }
                outp = (void *)rgbafp_outptr;
                break;
+            default:
+                log_error("ERROR Invalid j = %d\n", j);
+                elem_size = 0;
+                p = nullptr;
+                outp = nullptr;
+                break;
        }

        const char* update_packed_pitch_name = "";
--- a/test_conformance/buffers/test_buffer_write.cpp
+++ b/test_conformance/buffers/test_buffer_write.cpp
@@ -852,8 +852,8 @@ int test_buffer_write_struct( cl_device_id deviceID, cl_context context, cl_comm
                    buffers[0] =
                        clCreateBuffer(context, flag_set[src_flag_id],
                                       ptrSizes[i] * num_elements, NULL, &err);
-                if ( err ){
-                    align_free( outptr[i] );
+                if (err)
+                {
                    print_error(err, " clCreateBuffer failed\n" );
                    free_mtdata(d);
                    return -1;
--- a/test_conformance/contractions/contractions.cpp
+++ b/test_conformance/contractions/contractions.cpp
@@ -365,16 +365,18 @@ static int ParseArgs( int argc, const char **argv )
    int length_of_seed = 0;

    { // Extract the app name
-        strncpy( appName, argv[0], MAXPATHLEN );
+        strncpy(appName, argv[0], MAXPATHLEN - 1);
+        appName[MAXPATHLEN - 1] = '\0';

 #if (defined( __APPLE__ ) || defined(__linux__) || defined(__MINGW32__))
        char baseName[MAXPATHLEN];
        char *base = NULL;
-        strncpy( baseName, argv[0], MAXPATHLEN );
+        strncpy(baseName, argv[0], MAXPATHLEN - 1);
+        baseName[MAXPATHLEN - 1] = '\0';
        base = basename( baseName );
        if( NULL != base )
        {
-            strncpy( appName, base, sizeof( appName )  );
+            strncpy(appName, base, sizeof(appName) - 1);
            appName[ sizeof( appName ) -1 ] = '\0';
        }
 #elif defined (_WIN32)
@@ -385,7 +387,7 @@ static int ParseArgs( int argc, const char **argv )
                                   fname, _MAX_FNAME, ext, _MAX_EXT );
        if (err == 0) { // no error
            strcat (fname, ext); //just cat them, size of frame can keep both
-            strncpy (appName, fname, sizeof(appName));
+            strncpy(appName, fname, sizeof(appName) - 1);
            appName[ sizeof( appName ) -1 ] = '\0';
        }
 #endif
--- a/test_conformance/conversions/basic_test_conversions.cpp
+++ b/test_conformance/conversions/basic_test_conversions.cpp
@@ -1448,7 +1448,9 @@ cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
        char inName[32];
        char outName[32];
        strncpy(inName, gTypeNames[inType], sizeof(inName));
+        inName[sizeof(inName) - 1] = '\0';
        strncpy(outName, gTypeNames[outType], sizeof(outName));
+        outName[sizeof(outName) - 1] = '\0';
        sprintf(testName, "test_implicit_%s_%s", outName, inName);

        source << "__kernel void " << testName << "( __global " << inName
@@ -1473,8 +1475,10 @@ cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
        switch (vectorSizetmp)
        {
            case 1:
-                strncpy(inName, gTypeNames[inType], sizeof(inName));
-                strncpy(outName, gTypeNames[outType], sizeof(outName));
+                strncpy(inName, gTypeNames[inType], sizeof(inName) - 1);
+                inName[sizeof(inName) - 1] = '\0';
+                strncpy(outName, gTypeNames[outType], sizeof(outName) - 1);
+                outName[sizeof(outName) - 1] = '\0';
                snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
                         outName, gSaturationNames[sat],
                         gRoundingModeNames[round]);
@@ -1482,8 +1486,10 @@ cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
                vlog("Building %s( %s ) test\n", convertString, inName);
                break;
            case 3:
-                strncpy(inName, gTypeNames[inType], sizeof(inName));
-                strncpy(outName, gTypeNames[outType], sizeof(outName));
+                strncpy(inName, gTypeNames[inType], sizeof(inName) - 1);
+                inName[sizeof(inName) - 1] = '\0';
+                strncpy(outName, gTypeNames[outType], sizeof(outName) - 1);
+                outName[sizeof(outName) - 1] = '\0';
                snprintf(convertString, sizeof(convertString),
                         "convert_%s3%s%s", outName, gSaturationNames[sat],
                         gRoundingModeNames[round]);
--- a/test_conformance/conversions/test_conversions.cpp
+++ b/test_conformance/conversions/test_conversions.cpp
@@ -182,11 +182,12 @@ static int ParseArgs(int argc, const char **argv)
 #if (defined(__APPLE__) || defined(__linux__) || defined(__MINGW32__))
    { // Extract the app name
        char baseName[MAXPATHLEN];
-        strncpy(baseName, argv[0], MAXPATHLEN);
+        strncpy(baseName, argv[0], MAXPATHLEN - 1);
+        baseName[sizeof(baseName) - 1] = '\0';
        char *base = basename(baseName);
        if (NULL != base)
        {
-            strncpy(appName, base, sizeof(appName));
+            strncpy(appName, base, sizeof(appName) - 1);
            appName[sizeof(appName) - 1] = '\0';
        }
    }
@@ -200,7 +201,7 @@ static int ParseArgs(int argc, const char **argv)
        if (err == 0)
        { // no error
            strcat(fname, ext); // just cat them, size of frame can keep both
-            strncpy(appName, fname, sizeof(appName));
+            strncpy(appName, fname, sizeof(appName) - 1);
            appName[sizeof(appName) - 1] = '\0';
        }
    }
--- a/test_conformance/events/test_event_dependencies.cpp
+++ b/test_conformance/events/test_event_dependencies.cpp
@@ -89,7 +89,7 @@ int test_event_enqueue_wait_for_events_run_test(

    // If we are to use two devices, then get them and create a context with
    // both.
-    cl_device_id *two_device_ids;
+    cl_device_id *two_device_ids = nullptr;
    if (two_devices)
    {
        two_device_ids = (cl_device_id *)malloc(sizeof(cl_device_id) * 2);
--- a/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.cpp
@@ -61,10 +61,9 @@ bool BasicCommandBufferTest::Skip()
               "CL_DEVICE_COMMAND_BUFFER_SUPPORTED_QUEUE_PROPERTIES_KHR");

    cl_command_queue_properties queue_properties;
-    error = clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES,
-                                  sizeof(queue_properties), &queue_properties,
-                                  NULL);
-    test_error(error, "Unable to query CL_QUEUE_PROPERTIES");
+    error = clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES,
+                            sizeof(queue_properties), &queue_properties, NULL);
+    test_error(error, "Unable to query CL_DEVICE_QUEUE_PROPERTIES");
    queue_out_of_order_support =
        queue_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;

@@ -198,8 +197,6 @@ struct MultiFlagCreationTest : public BasicCommandBufferTest
    cl_int Run() override
    {
        cl_command_buffer_properties_khr flags = 0;
-        size_t num_flags_set = 0;
-        bool multi_flags_supported = true;
        cl_int error = CL_SUCCESS;

        // First try to find multiple flags that are supported by the driver and
@@ -207,30 +204,18 @@ struct MultiFlagCreationTest : public BasicCommandBufferTest
        if (simultaneous_use_support)
        {
            flags |= CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR;
-            num_flags_set++;
        }

-        if (device_side_enqueue_support)
+        if (is_extension_available(
+                device, CL_KHR_COMMAND_BUFFER_MULTI_DEVICE_EXTENSION_NAME))
        {
            flags |= CL_COMMAND_BUFFER_DEVICE_SIDE_SYNC_KHR;
-            num_flags_set++;
        }

        if (is_extension_available(
                device, CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_NAME))
        {
            flags |= CL_COMMAND_BUFFER_MUTABLE_KHR;
-            num_flags_set++;
-        }
-
-        // If we can't find multiple supported flags, still set a bitfield but
-        // expect CL_INVALID_PROPERTY to be returned on creation.
-        if (num_flags_set < 2)
-        {
-            flags = CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR
-                | CL_COMMAND_BUFFER_DEVICE_SIDE_SYNC_KHR;
-
-            multi_flags_supported = false;
        }

        cl_command_buffer_properties_khr props[] = {
@@ -238,17 +223,7 @@ struct MultiFlagCreationTest : public BasicCommandBufferTest
        };

        command_buffer = clCreateCommandBufferKHR(1, &queue, props, &error);
-        if (multi_flags_supported)
-        {
-            test_error(error, "clCreateCommandBufferKHR failed");
-        }
-        else
-        {
-            test_failure_error_ret(
-                error, CL_INVALID_PROPERTY,
-                "clCreateCommandBufferKHR should return CL_INVALID_PROPERTY",
-                TEST_FAIL);
-        }
+        test_error(error, "clCreateCommandBufferKHR failed");

        return CL_SUCCESS;
    }
--- a/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/basic_command_buffer.h
@@ -104,9 +104,9 @@ int MakeAndRunTest(cl_device_id device, cl_context context,
        cl_version extension_version =
            get_extension_version(device, "cl_khr_command_buffer");

-        if (extension_version != CL_MAKE_VERSION(0, 9, 6))
+        if (extension_version != CL_MAKE_VERSION(0, 9, 7))
        {
-            log_info("cl_khr_command_buffer version 0.9.6 is required to run "
+            log_info("cl_khr_command_buffer version 0.9.7 is required to run "
                     "the test, skipping.\n ");
            return TEST_SKIPPED_ITSELF;
        }
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_profiling.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_profiling.cpp
@@ -21,11 +21,75 @@

 namespace {

+#define ADD_PROF_PARAM(prop)                                                   \
+    {                                                                          \
+        prop, #prop, 0                                                         \
+    }
+
+struct ProfilingParam
+{
+    cl_profiling_info param; // query id passed to clGetEventProfilingInfo
+    std::string name; // stringified enum name, printed in error logs
+    cl_ulong value; // timestamp written by the query
+};
+
+// Verifies that `event` completed with CL_SUCCESS and that its QUEUED, SUBMIT,
+// START and END profiling timestamps are non-decreasing and not all zero.
+cl_int VerifyResult(const clEventWrapper& event)
+{
+    cl_int status;
+    cl_int error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
+                                  sizeof(status), &status, NULL);
+    test_error(error, "clGetEventInfo() failed");
+
+    if (status != CL_SUCCESS)
+        test_fail("Kernel execution status %d! (%s:%d)\n", status, __FILE__,
+                  __LINE__);
+
+    std::vector<ProfilingParam> prof_params = {
+        ADD_PROF_PARAM(CL_PROFILING_COMMAND_QUEUED),
+        ADD_PROF_PARAM(CL_PROFILING_COMMAND_SUBMIT),
+        ADD_PROF_PARAM(CL_PROFILING_COMMAND_START),
+        ADD_PROF_PARAM(CL_PROFILING_COMMAND_END),
+    };
+
+    // gather profiling timestamps
+    for (auto&& p : prof_params)
+    {
+        error = clGetEventProfilingInfo(event, p.param, sizeof(p.value),
+                                        &p.value, NULL);
+        test_error(error, "clGetEventProfilingInfo() failed");
+    }
+
+    // verify ordering; all_vals_0 stays true only if every timestamp is 0
+    bool all_vals_0 = prof_params.front().value == 0;
+    for (size_t i = 1; i < prof_params.size(); i++)
+    {
+        all_vals_0 = all_vals_0 && (prof_params[i].value == 0);
+        if (prof_params[i - 1].value > prof_params[i].value)
+        {
+            log_error("Profiling %s=0x%x should be smaller than or equal "
+                      "to %s=0x%x for "
+                      "kernels that use the on-device queue\n",
+                      prof_params[i - 1].name.c_str(), prof_params[i - 1].param,
+                      prof_params[i].name.c_str(), prof_params[i].param);
+            return TEST_FAIL;
+        }
+    }
+    if (all_vals_0)
+    {
+        log_error("All values are 0. This is exceedingly unlikely.\n");
+        return TEST_FAIL;
+    }
+
+    log_info("Profiling info for command-buffer kernel succeeded.\n");
+    return TEST_PASS;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Command-buffer profiling test cases:
 // -all commands are recorded to a single command-queue
 // -profiling a command-buffer with simultaneous use
-
 template <bool simultaneous_request>
 struct CommandBufferProfiling : public BasicCommandBufferTest
 {
@@ -133,73 +197,6 @@ struct CommandBufferProfiling : public BasicCommandBufferTest
        return CL_SUCCESS;
    }

-    //--------------------------------------------------------------------------
-#define ADD_PROF_PARAM(prop)                                                   \
-    {                                                                          \
-        prop, #prop, 0                                                         \
-    }
-    struct ProfilingParam
-    {
-        cl_profiling_info param;
-        std::string name;
-        cl_ulong value;
-    };
-
-    //--------------------------------------------------------------------------
-    cl_int VerifyResult(const clEventWrapper& event)
-    {
-        cl_int error = CL_SUCCESS;
-        cl_int status;
-        error = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS,
-                               sizeof(status), &status, NULL);
-        test_error(error, "clGetEventInfo() failed");
-
-        if (status != CL_SUCCESS)
-            test_fail("Kernel execution status %d! (%s:%d)\n", status, __FILE__,
-                      __LINE__);
-
-        std::vector<ProfilingParam> prof_params = {
-            ADD_PROF_PARAM(CL_PROFILING_COMMAND_QUEUED),
-            ADD_PROF_PARAM(CL_PROFILING_COMMAND_SUBMIT),
-            ADD_PROF_PARAM(CL_PROFILING_COMMAND_START),
-            ADD_PROF_PARAM(CL_PROFILING_COMMAND_END),
-        };
-
-        // gather profiling timestamps
-        for (auto&& p : prof_params)
-        {
-            error = clGetEventProfilingInfo(event, p.param, sizeof(p.value),
-                                            &p.value, NULL);
-            test_error(error, "clGetEventProfilingInfo() failed");
-        }
-
-        // verify the results by comparing timestamps
-        bool all_vals_0 = prof_params.front().value != 0;
-        for (size_t i = 1; i < prof_params.size(); i++)
-        {
-            all_vals_0 = (prof_params[i].value != 0) ? false : all_vals_0;
-            if (prof_params[i - 1].value > prof_params[i].value)
-            {
-                log_error("Profiling %s=0x%x should be smaller than or equal "
-                          "to %s=0x%x for "
-                          "kernels that use the on-device queue",
-                          prof_params[i - 1].name.c_str(),
-                          prof_params[i - 1].param, prof_params[i].name.c_str(),
-                          prof_params[i].param);
-                return TEST_FAIL;
-            }
-        }
-
-        if (all_vals_0)
-        {
-            log_error("All values are 0. This is exceedingly unlikely.\n");
-            return TEST_FAIL;
-        }
-
-        log_info("Profiling info for command-buffer kernel succeeded.\n");
-        return TEST_PASS;
-    }
-
    //--------------------------------------------------------------------------
    cl_int RunSingle()
    {
@@ -301,6 +298,63 @@ struct CommandBufferProfiling : public BasicCommandBufferTest
    const cl_int pattern = 0xA;
 };

+// Test that a command-buffer created on a queue without the profiling property
+// can be enqueued to a queue that has the profiling property, and that the
+// returned event can be queried for profiling info.
+struct CommandBufferSubstituteQueueProfiling : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+
+    cl_int Run() override
+    {
+        cl_int error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements,
+            nullptr, 0, nullptr, nullptr, nullptr);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        clEventWrapper event; // completion event, checked by VerifyResult
+        error = clEnqueueCommandBufferKHR(1, &profiling_queue, command_buffer,
+                                          0, nullptr, &event);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(profiling_queue);
+        test_error(error, "clFinish failed");
+
+        error = VerifyResult(event);
+        test_error(error, "VerifyResult failed");
+
+        return CL_SUCCESS;
+    }
+
+    cl_int SetUp(int elements) override
+    {
+        cl_command_queue_properties supported_properties;
+        cl_int error = clGetDeviceInfo(
+            device, CL_DEVICE_COMMAND_BUFFER_SUPPORTED_QUEUE_PROPERTIES_KHR,
+            sizeof(supported_properties), &supported_properties, NULL);
+        test_error(error,
+                   "Unable to query "
+                   "CL_DEVICE_COMMAND_BUFFER_SUPPORTED_QUEUE_PROPERTIES_KHR");
+
+        // CL_QUEUE_PROFILING_ENABLE is a mandated minimum property returned
+        // by CL_DEVICE_COMMAND_BUFFER_SUPPORTED_QUEUE_PROPERTIES_KHR
+        if (!(supported_properties & CL_QUEUE_PROFILING_ENABLE))
+        {
+            return TEST_FAIL;
+        }
+
+        profiling_queue = clCreateCommandQueue(
+            context, device, CL_QUEUE_PROFILING_ENABLE, &error);
+        test_error(error, "clCreateCommandQueue failed");
+
+        return BasicCommandBufferTest::SetUp(elements);
+    }
+
+    clCommandQueueWrapper profiling_queue = nullptr; // created in SetUp()
+};
 } // anonymous namespace

 int test_basic_profiling(cl_device_id device, cl_context context,
@@ -316,3 +370,10 @@ int test_simultaneous_profiling(cl_device_id device, cl_context context,
    return MakeAndRunTest<CommandBufferProfiling<true>>(device, context, queue,
                                                        num_elements);
 }
+
+int test_substitute_queue_profiling(cl_device_id device, cl_context context,
+                                    cl_command_queue queue, int num_elements)
+{
+    using Test = CommandBufferSubstituteQueueProfiling; // harness test type
+    return MakeAndRunTest<Test>(device, context, queue, num_elements);
+}
--- a/test_conformance/extensions/cl_khr_command_buffer/command_buffer_queue_substitution.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/command_buffer_queue_substitution.cpp
@@ -252,6 +252,148 @@ struct SubstituteQueueTest : public BasicCommandBufferTest
    clEventWrapper user_event;
 };

+// Command-queue substitution test which handles the cases below:
+// * Template param is true - Create a command-buffer with an in-order queue,
+//   and enqueue the command-buffer to an out-of-order queue.
+// * Template param is false - Create a command-buffer with an out-of-order
+//   queue, and enqueue the command-buffer to an in-order queue.
+template <bool is_ooo_test>
+struct QueueOrderTest : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+
+    QueueOrderTest(cl_device_id device, cl_context context,
+                   cl_command_queue queue)
+        : BasicCommandBufferTest(device, context, queue), ooo_queue(nullptr),
+          ooo_command_buffer(this)
+    {}
+
+    cl_int RecordOutOfOrderCommandBuffer()
+    {
+        cl_sync_point_khr sync_points[2];
+        const cl_int pattern = pattern_pri;
+        cl_int error =
+            clCommandFillBufferKHR(ooo_command_buffer, nullptr, nullptr, in_mem,
+                                   &pattern, sizeof(cl_int), 0, data_size(), 0,
+                                   nullptr, &sync_points[0], nullptr);
+        test_error(error, "clCommandFillBufferKHR failed");
+
+        error = clCommandFillBufferKHR(ooo_command_buffer, nullptr, nullptr,
+                                       out_mem, &overwritten_pattern,
+                                       sizeof(cl_int), 0, data_size(), 0,
+                                       nullptr, &sync_points[1], nullptr);
+        test_error(error, "clCommandFillBufferKHR failed");
+
+        error = clCommandNDRangeKernelKHR(
+            ooo_command_buffer, nullptr, nullptr, kernel, 1, nullptr,
+            &num_elements, nullptr, 2, sync_points, nullptr, nullptr);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        return CL_SUCCESS;
+    }
+
+    cl_int RecordInOrderCommandBuffer()
+    {
+        const cl_int pattern = pattern_pri;
+        cl_int error = clCommandFillBufferKHR(
+            command_buffer, nullptr, nullptr, in_mem, &pattern, sizeof(cl_int),
+            0, data_size(), 0, nullptr, nullptr, nullptr);
+        test_error(error, "clCommandFillBufferKHR failed");
+
+        error = clCommandFillBufferKHR(
+            command_buffer, nullptr, nullptr, out_mem, &overwritten_pattern,
+            sizeof(cl_int), 0, data_size(), 0, nullptr, nullptr, nullptr);
+        test_error(error, "clCommandFillBufferKHR failed");
+
+        error = clCommandNDRangeKernelKHR(
+            command_buffer, nullptr, nullptr, kernel, 1, nullptr, &num_elements,
+            nullptr, 0, nullptr, nullptr, nullptr);
+        test_error(error, "clCommandNDRangeKernelKHR failed");
+
+        return CL_SUCCESS;
+    }
+
+    cl_int Run() override
+    {
+        cl_int error = CL_SUCCESS;
+        if (is_ooo_test)
+        {
+            // command-buffer created in-order, but executed on the ooo queue
+            error = RecordInOrderCommandBuffer();
+            test_error(error, "RecordInOrderCommandBuffer failed");
+        }
+        else
+        {
+            // command-buffer created ooo with sync-point dependencies, but
+            // executed on the in-order queue
+            error = RecordOutOfOrderCommandBuffer();
+            test_error(error, "RecordOutOfOrderCommandBuffer failed");
+        }
+
+        clCommandBufferWrapper& test_command_buffer =
+            is_ooo_test ? command_buffer : ooo_command_buffer;
+        error = clFinalizeCommandBufferKHR(test_command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        clCommandQueueWrapper& test_queue = is_ooo_test ? ooo_queue : queue;
+        error = clEnqueueCommandBufferKHR(1, &test_queue, test_command_buffer,
+                                          0, nullptr, nullptr);
+        test_error(error, "clEnqueueCommandBufferKHR failed");
+
+        error = clFinish(test_queue);
+        test_error(error, "clFinish failed");
+
+        // Check every element read back from out_mem against pattern_pri
+        std::vector<cl_int> output_buffer(num_elements);
+        error = clEnqueueReadBuffer(queue, out_mem, CL_TRUE, 0, data_size(),
+                                    output_buffer.data(), 0, nullptr, nullptr);
+        test_error(error, "clEnqueueReadBuffer failed");
+
+        for (size_t i = 0; i < num_elements; i++)
+        {
+            CHECK_VERIFICATION_ERROR(pattern_pri, output_buffer[i], i);
+        }
+
+        return CL_SUCCESS;
+    }
+
+    cl_int SetUp(int elements) override
+    {
+        cl_int error = BasicCommandBufferTest::SetUp(elements);
+        test_error(error, "BasicCommandBufferTest::SetUp failed");
+
+        ooo_queue = clCreateCommandQueue(
+            context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &error);
+        test_error(error,
+                   "clCreateCommandQueue with "
+                   "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE failed");
+
+        ooo_command_buffer =
+            clCreateCommandBufferKHR(1, &ooo_queue, nullptr, &error);
+        test_error(error, "clCreateCommandBufferKHR failed");
+
+        return CL_SUCCESS;
+    }
+
+    bool Skip() override
+    {
+        if (BasicCommandBufferTest::Skip()) return true;
+
+        // Skip if we want to enqueue to an out-of-order command-queue,
+        // and this isn't supported.
+        bool skip = is_ooo_test ? !out_of_order_support : false;
+
+        // Skip if the device doesn't support out-of-order queues, since we
+        // need to create one for both instantiations of the test.
+        return skip || !queue_out_of_order_support;
+    }
+
+    clCommandQueueWrapper ooo_queue;
+    clCommandBufferWrapper ooo_command_buffer;
+
+    const cl_int overwritten_pattern = 0xACDC;
+    const cl_int pattern_pri = 42;
+};
 } // anonymous namespace

 int test_queue_substitution(cl_device_id device, cl_context context,
@@ -276,3 +418,17 @@ int test_simultaneous_queue_substitution(cl_device_id device,
    return MakeAndRunTest<SubstituteQueueTest<false, true>>(
        device, context, queue, num_elements);
 }
+
+int test_queue_substitute_in_order(cl_device_id device, cl_context context,
+                                   cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<QueueOrderTest<false>>(device, context, queue,
+                                                 num_elements);
+}
+
+int test_queue_substitute_out_of_order(cl_device_id device, cl_context context,
+                                       cl_command_queue queue, int num_elements)
+{
+    return MakeAndRunTest<QueueOrderTest<true>>(device, context, queue,
+                                                num_elements);
+}
--- a/test_conformance/extensions/cl_khr_command_buffer/main.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/main.cpp
@@ -30,6 +30,7 @@ test_definition test_list[] = {
    ADD_TEST(info_context),
    ADD_TEST(basic_profiling),
    ADD_TEST(simultaneous_profiling),
+    ADD_TEST(substitute_queue_profiling),
    ADD_TEST(regular_wait_for_command_buffer),
    ADD_TEST(command_buffer_wait_for_command_buffer),
    ADD_TEST(command_buffer_wait_for_sec_command_buffer),
@@ -44,6 +45,8 @@ test_definition test_list[] = {
    ADD_TEST(queue_substitution),
    ADD_TEST(properties_queue_substitution),
    ADD_TEST(simultaneous_queue_substitution),
+    ADD_TEST(queue_substitute_in_order),
+    ADD_TEST(queue_substitute_out_of_order),
    ADD_TEST(fill_image),
    ADD_TEST(fill_buffer),
    ADD_TEST(fill_svm_buffer),
@@ -93,9 +96,6 @@ test_definition test_list[] = {
    ADD_TEST(negative_create_command_buffer_null_queues),
    ADD_TEST(negative_create_command_buffer_repeated_properties),
    ADD_TEST(negative_create_command_buffer_not_supported_properties),
-    ADD_TEST(negative_create_command_buffer_queue_without_min_properties),
-    ADD_TEST(
-        negative_create_command_buffer_device_does_not_support_out_of_order_queue),
    ADD_TEST(negative_command_ndrange_queue_not_null),
    ADD_TEST(negative_command_ndrange_kernel_with_different_context),
    ADD_TEST(negative_command_ndrange_kernel_sync_points_null_or_num_zero),
@@ -155,10 +155,12 @@ test_definition test_list[] = {
    ADD_TEST(
        negative_enqueue_command_buffer_num_queues_not_zero_different_while_buffer_creation),
    ADD_TEST(negative_enqueue_command_buffer_not_valid_queue_in_queues),
-    ADD_TEST(negative_enqueue_queue_not_compatible),
    ADD_TEST(negative_enqueue_queue_with_different_context),
    ADD_TEST(negative_enqueue_command_buffer_different_context_than_event),
    ADD_TEST(negative_enqueue_event_wait_list_null_or_events_null),
+    ADD_TEST(negative_enqueue_queue_without_reqd_properties),
+    ADD_TEST(negative_enqueue_with_unsupported_queue_property),
+    ADD_TEST(negative_enqueue_inconsistent_device),
 };

 int main(int argc, const char *argv[])
--- a/test_conformance/extensions/cl_khr_command_buffer/negative_command_buffer_create.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/negative_command_buffer_create.cpp
@@ -137,7 +137,8 @@ struct CreateCommandBufferRepeatedProperties : public BasicCommandBufferTest
            rep_prop = CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR;
            skip = false;
        }
-        else if (device_side_enqueue_support)
+        else if (is_extension_available(
+                     device, CL_KHR_COMMAND_BUFFER_MULTI_DEVICE_EXTENSION_NAME))
        {
            rep_prop = CL_COMMAND_BUFFER_DEVICE_SIDE_SYNC_KHR;
            skip = false;
@@ -190,116 +191,12 @@ struct CreateCommandBufferNotSupportedProperties : public BasicCommandBufferTest
            unsupported_prop = CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR;
            skip = false;
        }
-        else if (!device_side_enqueue_support)
-        {
-            unsupported_prop = CL_COMMAND_BUFFER_DEVICE_SIDE_SYNC_KHR;
-            skip = false;
-        }

        return skip;
    }

    cl_command_buffer_properties_khr unsupported_prop = 0;
 };
-
-// CL_INCOMPATIBLE_COMMAND_QUEUE_KHR if the properties of any command-queue in
-// queues does not contain the minimum properties specified by
-// CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR.
-struct CreateCommandBufferQueueWithoutMinProperties
-    : public BasicCommandBufferTest
-{
-    using BasicCommandBufferTest::BasicCommandBufferTest;
-
-    cl_int Run() override
-    {
-        cl_int error = CL_SUCCESS;
-
-        command_buffer = clCreateCommandBufferKHR(1, &queue, nullptr, &error);
-        test_failure_error_ret(error, CL_INCOMPATIBLE_COMMAND_QUEUE_KHR,
-                               "clCreateCommandBufferKHR should return "
-                               "CL_INCOMPATIBLE_COMMAND_QUEUE_KHR",
-                               TEST_FAIL);
-
-        return CL_SUCCESS;
-    }
-
-    bool Skip() override
-    {
-        if (BasicCommandBufferTest::Skip()) return true;
-
-        cl_command_queue_properties required_properties;
-        cl_int error = clGetDeviceInfo(
-            device, CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR,
-            sizeof(required_properties), &required_properties, NULL);
-        test_error(error,
-                   "Unable to query "
-                   "CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR");
-
-        cl_command_queue_properties queue_properties;
-        error = clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES,
-                                      sizeof(queue_properties),
-                                      &queue_properties, NULL);
-        test_error(error, "Unable to query CL_QUEUE_PROPERTIES");
-
-        // Skip if queue properties contains those required
-        return required_properties == (required_properties & queue_properties);
-    }
-};
-
-// CL_INCOMPATIBLE_COMMAND_QUEUE_KHR if any command-queue in queues is an
-// out-of-order command-queue and the device associated with the command-queue
-// does not return CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE from
-// CL_DEVICE_COMMAND_BUFFER_SUPPORTED_QUEUE_PROPERTIES_KHR
-struct CreateCommandBufferDeviceDoesNotSupportOutOfOderQueue
-    : public BasicCommandBufferTest
-{
-    CreateCommandBufferDeviceDoesNotSupportOutOfOderQueue(
-        cl_device_id device, cl_context context, cl_command_queue queue)
-        : BasicCommandBufferTest(device, context, queue),
-          out_of_order_queue(nullptr)
-    {}
-
-    cl_int Run() override
-    {
-        cl_int error = CL_SUCCESS;
-
-        command_buffer =
-            clCreateCommandBufferKHR(1, &out_of_order_queue, nullptr, &error);
-        test_failure_error_ret(error, CL_INCOMPATIBLE_COMMAND_QUEUE_KHR,
-                               "clCreateCommandBufferKHR should return "
-                               "CL_INCOMPATIBLE_COMMAND_QUEUE_KHR",
-                               TEST_FAIL);
-
-        return CL_SUCCESS;
-    }
-
-    cl_int SetUp(int elements) override
-    {
-        cl_int error = CL_SUCCESS;
-
-        error = BasicCommandBufferTest::SetUp(elements);
-        test_error(error, "BasicCommandBufferTest::SetUp failed");
-
-        out_of_order_queue = clCreateCommandQueue(
-            context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &error);
-        test_error(error,
-                   "clCreateCommandQueue with "
-                   "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE failed");
-
-        return CL_SUCCESS;
-    }
-
-    bool Skip() override
-    {
-        if (BasicCommandBufferTest::Skip()) return true;
-
-        // If device does not support out of order queue or if device supports
-        // out of order command buffer test should be skipped
-        return !queue_out_of_order_support || out_of_order_support;
-    }
-
-    clCommandQueueWrapper out_of_order_queue;
-};
 };

 int test_negative_create_command_buffer_num_queues(cl_device_id device,
@@ -335,20 +232,3 @@ int test_negative_create_command_buffer_not_supported_properties(
    return MakeAndRunTest<CreateCommandBufferNotSupportedProperties>(
        device, context, queue, num_elements);
 }
-
-int test_negative_create_command_buffer_queue_without_min_properties(
-    cl_device_id device, cl_context context, cl_command_queue queue,
-    int num_elements)
-{
-    return MakeAndRunTest<CreateCommandBufferQueueWithoutMinProperties>(
-        device, context, queue, num_elements);
-}
-
-int test_negative_create_command_buffer_device_does_not_support_out_of_order_queue(
-    cl_device_id device, cl_context context, cl_command_queue queue,
-    int num_elements)
-{
-    return MakeAndRunTest<
-        CreateCommandBufferDeviceDoesNotSupportOutOfOderQueue>(
-        device, context, queue, num_elements);
-}
--- a/test_conformance/extensions/cl_khr_command_buffer/negative_command_buffer_enqueue.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/negative_command_buffer_enqueue.cpp
@@ -16,7 +16,6 @@
 #include "basic_command_buffer.h"
 #include "procs.h"

-
 //--------------------------------------------------------------------------
 namespace {

@@ -293,63 +292,6 @@ struct EnqueueCommandBufferNotValidQueueInQueues : public BasicCommandBufferTest
    }
 };

-// CL_INCOMPATIBLE_COMMAND_QUEUE_KHR if any element of queues is not compatible
-// with the command-queue set on command_buffer creation at the same list index.
-struct EnqueueCommandBufferQueueNotCompatible : public BasicCommandBufferTest
-{
-    EnqueueCommandBufferQueueNotCompatible(cl_device_id device,
-                                           cl_context context,
-                                           cl_command_queue queue)
-        : BasicCommandBufferTest(device, context, queue),
-          queue_not_compatible(nullptr)
-    {}
-
-    cl_int Run() override
-    {
-        cl_int error = clFinalizeCommandBufferKHR(command_buffer);
-        test_error(error, "clFinalizeCommandBufferKHR failed");
-
-        error = clEnqueueCommandBufferKHR(1, &queue_not_compatible,
-                                          command_buffer, 0, nullptr, nullptr);
-
-        test_failure_error_ret(error, CL_INCOMPATIBLE_COMMAND_QUEUE_KHR,
-                               "clEnqueueCommandBufferKHR should return "
-                               "CL_INCOMPATIBLE_COMMAND_QUEUE_KHR",
-                               TEST_FAIL);
-
-        return CL_SUCCESS;
-    }
-
-    cl_int SetUp(int elements) override
-    {
-        cl_int error = BasicCommandBufferTest::SetUp(elements);
-        test_error(error, "BasicCommandBufferTest::SetUp failed");
-
-        queue_not_compatible = clCreateCommandQueue(
-            context, device, CL_QUEUE_PROFILING_ENABLE, &error);
-        test_error(error, "clCreateCommandQueue failed");
-
-        cl_command_queue_properties queue_properties;
-        error = clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES,
-                                      sizeof(queue_properties),
-                                      &queue_properties, NULL);
-        test_error(error, "Unable to query CL_QUEUE_PROPERTIES");
-
-        cl_command_queue_properties queue_not_compatible_properties;
-        error = clGetCommandQueueInfo(queue_not_compatible, CL_QUEUE_PROPERTIES,
-                                      sizeof(queue_not_compatible_properties),
-                                      &queue_not_compatible_properties, NULL);
-        test_error(error, "Unable to query CL_QUEUE_PROPERTIES");
-
-        test_assert_error(queue_properties != queue_not_compatible_properties,
-                          "Queues properties must be different");
-
-        return CL_SUCCESS;
-    }
-
-    clCommandQueueWrapper queue_not_compatible;
-};
-
 // CL_INVALID_CONTEXT if any element of queues does not have the same context as
 // the command-queue set on command_buffer creation at the same list index.
 struct EnqueueCommandBufferQueueWithDifferentContext
@@ -491,6 +433,185 @@ struct EnqueueCommandBufferEventWaitListNullOrEventsNull
        return CL_SUCCESS;
    }
 };
+
+// CL_INCOMPATIBLE_COMMAND_QUEUE_KHR if the properties of any command-queue in
+// queues do not contain the minimum properties specified by
+// CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR.
+struct EnqueueCommandBufferQueueWithoutReqdProperties
+    : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+
+    cl_int Run() override
+    {
+        cl_int error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(0, nullptr, command_buffer, 0,
+                                          nullptr, nullptr);
+        test_failure_error_ret(error, CL_INCOMPATIBLE_COMMAND_QUEUE_KHR,
+                               "clEnqueueCommandBufferKHR should return "
+                               "CL_INCOMPATIBLE_COMMAND_QUEUE_KHR",
+                               TEST_FAIL);
+
+        error = clEnqueueCommandBufferKHR(1, &queue, command_buffer, 0, nullptr,
+                                          nullptr);
+        test_failure_error_ret(error, CL_INCOMPATIBLE_COMMAND_QUEUE_KHR,
+                               "clEnqueueCommandBufferKHR should return "
+                               "CL_INCOMPATIBLE_COMMAND_QUEUE_KHR",
+                               TEST_FAIL);
+
+        return CL_SUCCESS;
+    }
+
+    bool Skip() override
+    {
+        // Deliberately omit BasicCommandBufferTest::Skip(): it skips when
+        // the queue lacks the required properties, which is exactly the
+        // situation this test needs in order to provoke the error.
+
+        cl_command_queue_properties required_properties;
+        cl_int error = clGetDeviceInfo(
+            device, CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR,
+            sizeof(required_properties), &required_properties, NULL);
+        test_error(error,
+                   "Unable to query "
+                   "CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR");
+
+        cl_command_queue_properties queue_properties;
+        error = clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES,
+                                      sizeof(queue_properties),
+                                      &queue_properties, NULL);
+        test_error(error, "Unable to query CL_QUEUE_PROPERTIES");
+
+        // Skip if the queue properties already contain all those required
+        return required_properties == (required_properties & queue_properties);
+    }
+};
+
+// CL_INCOMPATIBLE_COMMAND_QUEUE_KHR if any command-queue in queues is an
+// out-of-order command-queue and the device associated with the command-queue
+// does not return CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE from
+// CL_DEVICE_COMMAND_BUFFER_SUPPORTED_QUEUE_PROPERTIES_KHR.
+struct EnqueueCommandBufferWithUnsupportedQueueProperty
+    : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+
+    cl_int Run() override
+    {
+        cl_int error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(1, &out_of_order_queue,
+                                          command_buffer, 0, nullptr, nullptr);
+        test_failure_error_ret(error, CL_INCOMPATIBLE_COMMAND_QUEUE_KHR,
+                               "clEnqueueCommandBufferKHR should return "
+                               "CL_INCOMPATIBLE_COMMAND_QUEUE_KHR",
+                               TEST_FAIL);
+        return CL_SUCCESS;
+    }
+
+    cl_int SetUp(int elements) override
+    {
+        cl_int error = BasicCommandBufferTest::SetUp(elements);
+        test_error(error, "BasicCommandBufferTest::SetUp failed");
+
+        out_of_order_queue = clCreateCommandQueue(
+            context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &error);
+        test_error(error,
+                   "clCreateCommandQueue with "
+                   "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE failed");
+
+        return CL_SUCCESS;
+    }
+
+    bool Skip() override
+    {
+        if (BasicCommandBufferTest::Skip()) return true;
+
+        // Skip if the device cannot create an out-of-order queue, or if it
+        // supports out-of-order command-buffers (no error would be raised)
+        return !queue_out_of_order_support || out_of_order_support;
+    }
+
+    clCommandQueueWrapper out_of_order_queue = nullptr;
+};
+
+// CL_INVALID_DEVICE if any element of queues does not have the same device
+// as the command-queue set on command_buffer creation at the same list
+// index. Needs a context with at least two devices, otherwise it is skipped.
+struct EnqueueCommandBufferInconsistentDevice : public BasicCommandBufferTest
+{
+    using BasicCommandBufferTest::BasicCommandBufferTest;
+
+    cl_int Run() override
+    {
+        cl_int error = clFinalizeCommandBufferKHR(command_buffer);
+        test_error(error, "clFinalizeCommandBufferKHR failed");
+
+        error = clEnqueueCommandBufferKHR(1, &second_device_queue,
+                                          command_buffer, 0, nullptr, nullptr);
+        test_failure_error_ret(error, CL_INVALID_DEVICE,
+                               "clEnqueueCommandBufferKHR should return "
+                               "CL_INVALID_DEVICE",
+                               TEST_FAIL);
+        return CL_SUCCESS;
+    }
+
+    cl_int SetUp(int elements) override
+    {
+        cl_int error = BasicCommandBufferTest::SetUp(elements);
+        test_error(error, "BasicCommandBufferTest::SetUp failed");
+
+        cl_device_id second_device = nullptr;
+        for (auto query_device : devices)
+        {
+            if (query_device != device)
+            {
+                second_device = query_device;
+                break;
+            }
+        }
+
+        test_assert_error(second_device != nullptr,
+                          "Second device not found for testing");
+
+        second_device_queue =
+            clCreateCommandQueue(context, second_device, 0, &error);
+        test_error(error, "clCreateCommandQueue failed");
+
+        return CL_SUCCESS;
+    }
+
+    bool Skip() override
+    {
+        if (BasicCommandBufferTest::Skip()) return true;
+
+        size_t context_devices_size;
+        cl_int error = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL,
+                                        &context_devices_size);
+        test_error(error, "clGetContextInfo failed");
+
+        size_t num_devices = context_devices_size / sizeof(cl_device_id);
+
+        if (num_devices < 2)
+        {
+            // We need a second device in the context for this test
+            return true;
+        }
+        // param_value_size is in bytes, so reuse the size queried above
+        devices.resize(num_devices);
+        error = clGetContextInfo(context, CL_CONTEXT_DEVICES,
+                                 context_devices_size, devices.data(), nullptr);
+        test_error(error, "clGetContextInfo failed");
+
+        return false;
+    }
+
+    std::vector<cl_device_id> devices;
+    clCommandQueueWrapper second_device_queue = nullptr;
+};
 };

 int test_negative_enqueue_command_buffer_invalid_command_buffer(
@@ -544,15 +665,6 @@ int test_negative_enqueue_command_buffer_not_valid_queue_in_queues(
        device, context, queue, num_elements);
 }

-int test_negative_enqueue_queue_not_compatible(cl_device_id device,
-                                               cl_context context,
-                                               cl_command_queue queue,
-                                               int num_elements)
-{
-    return MakeAndRunTest<EnqueueCommandBufferQueueNotCompatible>(
-        device, context, queue, num_elements);
-}
-
 int test_negative_enqueue_queue_with_different_context(cl_device_id device,
                                                       cl_context context,
                                                       cl_command_queue queue,
@@ -577,3 +689,29 @@ int test_negative_enqueue_event_wait_list_null_or_events_null(
    return MakeAndRunTest<EnqueueCommandBufferEventWaitListNullOrEventsNull>(
        device, context, queue, num_elements);
 }
+
+int test_negative_enqueue_queue_without_reqd_properties(cl_device_id device,
+                                                        cl_context context,
+                                                        cl_command_queue queue,
+                                                        int num_elements)
+{
+    return MakeAndRunTest<EnqueueCommandBufferQueueWithoutReqdProperties>(
+        device, context, queue, num_elements);
+}
+
+int test_negative_enqueue_with_unsupported_queue_property(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    int num_elements)
+{
+    return MakeAndRunTest<EnqueueCommandBufferWithUnsupportedQueueProperty>(
+        device, context, queue, num_elements);
+}
+
+int test_negative_enqueue_inconsistent_device(cl_device_id device,
+                                              cl_context context,
+                                              cl_command_queue queue,
+                                              int num_elements)
+{
+    return MakeAndRunTest<EnqueueCommandBufferInconsistentDevice>(
+        device, context, queue, num_elements);
+}
--- a/test_conformance/extensions/cl_khr_command_buffer/procs.h
+++ b/test_conformance/extensions/cl_khr_command_buffer/procs.h
@@ -91,6 +91,10 @@ extern int test_basic_profiling(cl_device_id device, cl_context context,
 extern int test_simultaneous_profiling(cl_device_id device, cl_context context,
                                       cl_command_queue queue,
                                       int num_elements);
+extern int test_substitute_queue_profiling(cl_device_id device,
+                                           cl_context context,
+                                           cl_command_queue queue,
+                                           int num_elements);
 extern int test_queue_substitution(cl_device_id device, cl_context context,
                                   cl_command_queue queue, int num_elements);
 extern int test_properties_queue_substitution(cl_device_id device,
@@ -101,6 +105,14 @@ extern int test_simultaneous_queue_substitution(cl_device_id device,
                                                cl_context context,
                                                cl_command_queue queue,
                                                int num_elements);
+extern int test_queue_substitute_in_order(cl_device_id device,
+                                          cl_context context,
+                                          cl_command_queue queue,
+                                          int num_elements);
+extern int test_queue_substitute_out_of_order(cl_device_id device,
+                                              cl_context context,
+                                              cl_command_queue queue,
+                                              int num_elements);
 extern int test_fill_image(cl_device_id device, cl_context context,
                           cl_command_queue queue, int num_elements);
 extern int test_fill_buffer(cl_device_id device, cl_context context,
@@ -211,13 +223,6 @@ extern int test_negative_create_command_buffer_repeated_properties(
 extern int test_negative_create_command_buffer_not_supported_properties(
    cl_device_id device, cl_context context, cl_command_queue queue,
    int num_elements);
-extern int test_negative_create_command_buffer_queue_without_min_properties(
-    cl_device_id device, cl_context context, cl_command_queue queue,
-    int num_elements);
-extern int
-test_negative_create_command_buffer_device_does_not_support_out_of_order_queue(
-    cl_device_id device, cl_context context, cl_command_queue queue,
-    int num_elements);
 extern int test_negative_command_ndrange_queue_not_null(cl_device_id device,
                                                        cl_context context,
                                                        cl_command_queue queue,
@@ -383,10 +388,6 @@ extern int test_negative_command_buffer_copy_image_mutable_handle_not_null(
 extern int test_negative_enqueue_command_buffer_not_valid_queue_in_queues(
    cl_device_id device, cl_context context, cl_command_queue queue,
    int num_elements);
-extern int test_negative_enqueue_queue_not_compatible(cl_device_id device,
-                                                      cl_context context,
-                                                      cl_command_queue queue,
-                                                      int num_elements);
 extern int test_negative_enqueue_queue_with_different_context(
    cl_device_id device, cl_context context, cl_command_queue queue,
    int num_elements);
@@ -396,6 +397,14 @@ extern int test_negative_enqueue_command_buffer_different_context_than_event(
 extern int test_negative_enqueue_event_wait_list_null_or_events_null(
    cl_device_id device, cl_context context, cl_command_queue queue,
    int num_elements);
-
-
+extern int test_negative_enqueue_queue_without_reqd_properties(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_negative_enqueue_with_unsupported_queue_property(
+    cl_device_id device, cl_context context, cl_command_queue queue,
+    int num_elements);
+extern int test_negative_enqueue_inconsistent_device(cl_device_id device,
+                                                     cl_context context,
+                                                     cl_command_queue queue,
+                                                     int num_elements);
 #endif // CL_KHR_COMMAND_BUFFER_PROCS_H
--- a/test_conformance/half/Test_vStoreHalf.cpp
+++ b/test_conformance/half/Test_vStoreHalf.cpp
@@ -341,8 +341,8 @@ int Test_vStoreHalf_private(cl_device_id device, f2h referenceFunc,
    int vectorSize, error;
    cl_program programs[kVectorSizeCount + kStrangeVectorSizeCount][3];
    cl_kernel kernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
-    cl_program resetProgram;
-    cl_kernel resetKernel;
+    cl_program resetProgram = nullptr;
+    cl_kernel resetKernel = nullptr;

    uint64_t time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
    uint64_t min_time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
@@ -1225,8 +1225,8 @@ int Test_vStoreaHalf_private(cl_device_id device, f2h referenceFunc,
    int vectorSize, error;
    cl_program programs[kVectorSizeCount + kStrangeVectorSizeCount][3];
    cl_kernel kernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
-    cl_program resetProgram;
-    cl_kernel resetKernel;
+    cl_program resetProgram = nullptr;
+    cl_kernel resetKernel = nullptr;

    uint64_t time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
    uint64_t min_time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
--- a/test_conformance/half/main.cpp
+++ b/test_conformance/half/main.cpp
@@ -144,11 +144,12 @@ static int ParseArgs( int argc, const char **argv )
 #if (defined( __APPLE__ ) || defined(__linux__) || defined(__MINGW32__))
    { // Extract the app name
        char baseName[ MAXPATHLEN ];
-        strncpy( baseName, argv[0], MAXPATHLEN );
+        strncpy(baseName, argv[0], MAXPATHLEN - 1);
+        baseName[MAXPATHLEN - 1] = '\0';
        char *base = basename( baseName );
        if( NULL != base )
        {
-            strncpy( appName, base, sizeof( appName )  );
+            strncpy(appName, base, sizeof(appName) - 1);
            appName[ sizeof( appName ) -1 ] = '\0';
        }
    }
--- a/test_conformance/images/clCopyImage/test_copy_1D.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_1D.cpp
@@ -25,7 +25,7 @@ int test_copy_image_size_1D( cl_context context, cl_command_queue queue, image_d
    size_t src_lod = 0, src_width_lod = imageInfo->width, src_row_pitch_lod;
    size_t dst_lod = 0, dst_width_lod = imageInfo->width, dst_row_pitch_lod;
    size_t width_lod = imageInfo->width;
-    size_t max_mip_level;
+    size_t max_mip_level = 0;

    if( gTestMipmaps )
    {
--- a/test_conformance/images/clCopyImage/test_copy_1D_array.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_1D_array.cpp
@@ -25,7 +25,7 @@ int test_copy_image_size_1D_array( cl_context context, cl_command_queue queue, i
    size_t src_lod = 0, src_width_lod = imageInfo->width, src_row_pitch_lod;
    size_t dst_lod = 0, dst_width_lod = imageInfo->width, dst_row_pitch_lod;
    size_t width_lod = imageInfo->width;
-    size_t max_mip_level;
+    size_t max_mip_level = 0;

    if( gTestMipmaps )
    {
--- a/test_conformance/images/clCopyImage/test_copy_2D.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_2D.cpp
@@ -27,7 +27,7 @@ int test_copy_image_size_2D( cl_context context, cl_command_queue queue, image_d
    size_t dst_lod = 0, dst_width_lod = imageInfo->width, dst_row_pitch_lod;
    size_t dst_height_lod = imageInfo->height;
    size_t width_lod = imageInfo->width, height_lod = imageInfo->height;
-    size_t max_mip_level;
+    size_t max_mip_level = 0;

    if( gTestMipmaps )
    {
--- a/test_conformance/images/clCopyImage/test_copy_2D_2D_array.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_2D_2D_array.cpp
@@ -72,7 +72,7 @@ int test_copy_image_size_2D_2D_array( cl_context context, cl_command_queue queue
    size_t threeImage_lod = 0, threeImage_width_lod = threeImage->width, threeImage_row_pitch_lod, threeImage_slice_pitch_lod;
    size_t threeImage_height_lod = threeImage->height;
    size_t width_lod, height_lod;
-    size_t twoImage_max_mip_level,threeImage_max_mip_level;
+    size_t twoImage_max_mip_level = 0, threeImage_max_mip_level = 0;

    if( gTestMipmaps )
    {
--- a/test_conformance/images/clCopyImage/test_copy_2D_3D.cpp
+++ b/test_conformance/images/clCopyImage/test_copy_2D_3D.cpp
@@ -68,7 +68,7 @@ int test_copy_image_size_2D_3D( cl_context context, cl_command_queue queue, imag
    size_t threeImage_lod = 0, threeImage_width_lod = threeImage->width, threeImage_row_pitch_lod, threeImage_slice_pitch_lod;
    size_t threeImage_height_lod = threeImage->height, depth_lod = threeImage->depth;
    size_t width_lod, height_lod;
-    size_t twoImage_max_mip_level,threeImage_max_mip_level;
+    size_t twoImage_max_mip_level = 0, threeImage_max_mip_level = 0;

    if( gTestMipmaps )
    {
--- a/test_conformance/images/clCopyImage/test_loops.cpp
+++ b/test_conformance/images/clCopyImage/test_loops.cpp
@@ -39,8 +39,8 @@ extern int test_copy_image_set_1D_buffer_1D(cl_device_id device,

 int test_image_type( cl_device_id device, cl_context context, cl_command_queue queue, MethodsToTest testMethod, cl_mem_flags flags )
 {
-    const char *name;
-    cl_mem_object_type imageType;
+    const char *name = nullptr;
+    cl_mem_object_type imageType = 0;

    if ( gTestMipmaps )
    {
--- a/test_conformance/images/clFillImage/test_fill_generic.cpp
+++ b/test_conformance/images/clFillImage/test_fill_generic.cpp
@@ -277,6 +277,11 @@ cl_mem create_image( cl_context context, cl_command_queue queue, BufferOwningPtr
            depth = imageInfo->depth;
            imageSize = imageInfo->slicePitch * imageInfo->depth;
            break;
+        default:
+            log_error("ERROR Invalid imageInfo->type = %d\n", imageInfo->type);
+            height = 0;
+            depth = 0;
+            break;
    }

    size_t origin[ 3 ] = { 0, 0, 0 };
--- a/test_conformance/images/clGetInfo/test_2D.cpp
+++ b/test_conformance/images/clGetInfo/test_2D.cpp
@@ -181,6 +181,10 @@ int test_get_image_info_single( cl_context context, image_descriptor *imageInfo,
    case CL_MEM_OBJECT_IMAGE3D:
      required_height = imageInfo->height;
      break;
+    default:
+        log_error("ERROR: Invalid imageInfo->type = %d\n", imageInfo->type);
+        required_height = 0;
+        break;
  }

    size_t outHeight;
@@ -204,6 +208,10 @@ int test_get_image_info_single( cl_context context, image_descriptor *imageInfo,
    case CL_MEM_OBJECT_IMAGE3D:
      required_depth = imageInfo->depth;
      break;
+    default:
+        log_error("ERROR: Invalid imageInfo->type = %d\n", imageInfo->type);
+        required_depth = 0;
+        break;
  }

  size_t outDepth;
@@ -227,6 +235,10 @@ int test_get_image_info_single( cl_context context, image_descriptor *imageInfo,
    case CL_MEM_OBJECT_IMAGE2D_ARRAY:
      required_array_size = imageInfo->arraySize;
      break;
+    default:
+        log_error("ERROR: Invalid imageInfo->type = %d\n", imageInfo->type);
+        required_array_size = 0;
+        break;
  }

  size_t outArraySize;
--- a/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp
+++ b/test_conformance/images/kernel_read_write/test_cl_ext_image_buffer.hpp
@@ -69,7 +69,7 @@ static inline size_t get_format_size(cl_context context,
    }

    cl_int error = 0;
-    cl_mem buffer;
+    cl_mem buffer = nullptr;
    if (imageType == CL_MEM_OBJECT_IMAGE1D_BUFFER)
    {
        buffer = clCreateBuffer(context, flags,
--- a/test_conformance/images/kernel_read_write/test_cl_ext_image_from_buffer.cpp
+++ b/test_conformance/images/kernel_read_write/test_cl_ext_image_from_buffer.cpp
@@ -769,7 +769,7 @@ int image_from_buffer_fill_positive(cl_device_id device, cl_context context,
                err = clFinish(queue);
                test_error(err, "Error clFinish");

-                cl_mem image1d_buffer;
+                cl_mem image1d_buffer = nullptr;
                if (imageType == CL_MEM_OBJECT_IMAGE1D_BUFFER)
                {
                    image1d_buffer = clCreateBuffer(context, flag, buffer_size,
--- a/test_conformance/images/kernel_read_write/test_cl_ext_image_raw10_raw12.cpp
+++ b/test_conformance/images/kernel_read_write/test_cl_ext_image_raw10_raw12.cpp
@@ -20,6 +20,7 @@

 extern int gTypesToTest;
 extern int gtestTypesToRun;
+extern int gNormalizedModeToUse;
 extern bool gTestImage2DFromBuffer;
 extern cl_mem_flags gMemFlagsToUse;

@@ -38,16 +39,20 @@ static int test_image_set(cl_device_id device, cl_context context,
    log_info("---- Supported %s %s formats for this device for "
             "cl_ext_image_raw10_raw12---- \n",
             convert_image_type_to_string(imageType), "read");
-    log_info("  %-7s %-24s %d\n", "CL_R", "CL_UNSIGNED_INT_RAW10_EXT", 0);
-    log_info("  %-7s %-24s %d\n", "CL_R", "CL_UNSIGNED_INT_RAW12_EXT", 0);
+    log_info("  %-7s %-24s %d\n", "CL_R", "CL_UNSIGNED_INT_RAW10_EXT", 1);
+    log_info("  %-7s %-24s %d\n", "CL_R", "CL_UNSIGNED_INT_RAW12_EXT", 1);
    log_info("------------------------------------------- \n");

    image_sampler_data imageSampler;
    ImageTestTypes test{ kTestUInt, kUInt, uintFormats, "uint" };
+
    if (gTypesToTest & test.type)
    {
        std::vector<bool> filterFlags(formatList.size(), false);
        imageSampler.filter_mode = CL_FILTER_NEAREST;
+        // `CL_UNSIGNED_INT_RAW10_EXT` and `CL_UNSIGNED_INT_RAW12_EXT` image
+        // channel data types are unnormalised
+        imageSampler.normalized_coords = false;
        ret = test_read_image_formats(device, context, queue, formatList,
                                      filterFlags, &imageSampler,
                                      test.explicitType, imageType);
@@ -60,19 +65,34 @@ int ext_image_raw10_raw12(cl_device_id device, cl_context context,
 {
    int ret = 0;

-    if (0 == is_extension_available(device, "cl_ext_image_raw10_raw12"))
+    if (true != gNormalizedModeToUse)
    {
-        log_info("-----------------------------------------------------\n");
-        log_info("This device does not support "
-                 "cl_ext_image_raw10_raw12.\n");
-        log_info("Skipping cl_ext_image_raw10_raw12 "
-                 "image test.\n");
-        log_info("-----------------------------------------------------\n\n");
-        return 0;
+        if (0 == is_extension_available(device, "cl_ext_image_raw10_raw12"))
+        {
+            log_info("-----------------------------------------------------\n");
+            log_info("This device does not support "
+                     "cl_ext_image_raw10_raw12.\n");
+            log_info("Skipping cl_ext_image_raw10_raw12 "
+                     "image test.\n");
+            log_info(
+                "-----------------------------------------------------\n\n");
+            ret = TEST_SKIPPED_ITSELF;
+        }
+        else
+        {
+            gtestTypesToRun = kReadTests;
+            ret +=
+                test_image_set(device, context, queue, CL_MEM_OBJECT_IMAGE2D);
+        }
+    }
+    else
+    {
+        // skip the test if it is forced to be NORMALIZED from the command line
+        // argument i.e. gNormalizedModeToUse is true
+        log_info("cl_ext_image_raw10_raw12 does not support normalized channel "
+                 "components. Skipping the test.\n");
+        ret = TEST_SKIPPED_ITSELF;
    }
-    gtestTypesToRun = kReadTests;
-
-    ret += test_image_set(device, context, queue, CL_MEM_OBJECT_IMAGE2D);

    return ret;
 }
--- a/test_conformance/images/kernel_read_write/test_iterations.cpp
+++ b/test_conformance/images/kernel_read_write/test_iterations.cpp
@@ -1191,7 +1191,7 @@ int test_read_image_2D( cl_context context, cl_command_queue queue, cl_kernel ke
 {
    int error;
    static int initHalf = 0;
-    cl_mem imageBuffer;
+    cl_mem imageBuffer = nullptr;
    cl_mem_flags    image_read_write_flags = CL_MEM_READ_ONLY;
    size_t threads[2];

--- a/test_conformance/images/kernel_read_write/test_write_image.cpp
+++ b/test_conformance/images/kernel_read_write/test_write_image.cpp
@@ -223,7 +223,7 @@ int test_write_image( cl_device_id device, cl_context context, cl_command_queue
        clProtectedImage protImage;
        clMemWrapper unprotImage;
        cl_mem image;
-        cl_mem imageBuffer;
+        cl_mem imageBuffer = nullptr;

        if( gMemFlagsToUse == CL_MEM_USE_HOST_PTR )
        {
@@ -910,7 +910,7 @@ int test_write_image_formats(cl_device_id device, cl_context context,
        gTestCount++;

        print_write_header( &imageFormat, false );
-        int retCode;
+        int retCode = 0;
        switch (imageType)
        {
            case CL_MEM_OBJECT_IMAGE1D:
--- a/test_conformance/integer_ops/test_unary_ops.cpp
+++ b/test_conformance/integer_ops/test_unary_ops.cpp
@@ -97,7 +97,7 @@ int test_unary_op( cl_command_queue queue, cl_context context, OpKonstants which
        get_explicit_type_size(vecType) * vecSize * TEST_SIZE, inData, &error);
    test_error( error, "Creating input data array failed" );

-    cl_uint bits;
+    cl_uint bits = 0;
    for( i = 0; i < TEST_SIZE; i++ )
    {
        size_t which = i & 7;
--- a/test_conformance/math_brute_force/binary_double.cpp
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -344,26 +344,15 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");
+        error =
+            clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), &tinfo->inBuf2);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/binary_float.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -349,26 +349,15 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");
+        error =
+            clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), &tinfo->inBuf2);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/binary_half.cpp
+++ b/test_conformance/math_brute_force/binary_half.cpp
@@ -229,26 +229,15 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");
+        error =
+            clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), &tinfo->inBuf2);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -346,26 +346,15 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");
+        error =
+            clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), &tinfo->inBuf2);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/binary_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -337,26 +337,15 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");
+        error =
+            clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), &tinfo->inBuf2);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/binary_i_half.cpp
+++ b/test_conformance/math_brute_force/binary_i_half.cpp
@@ -225,26 +225,15 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");
+        error =
+            clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), &tinfo->inBuf2);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -214,6 +214,12 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
    cl_double *s;
    cl_double *s2;

+    bool reciprocal = strcmp(name, "reciprocal") == 0;
+    const double reciprocalArrayX[] = { 1.0 };
+    const double *specialValuesX =
+        reciprocal ? reciprocalArrayX : specialValues;
+    size_t specialValuesCountX = reciprocal ? 1 : specialValuesCount;
+
    Force64BitFPUPrecision();

    cl_event e[VECTOR_SIZE_COUNT];
@@ -242,7 +248,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
    cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
    cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
    cl_uint idx = 0;
-    int totalSpecialValueCount = specialValuesCount * specialValuesCount;
+    int totalSpecialValueCount = specialValuesCountX * specialValuesCount;
    int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;

    // Test edge cases
@@ -252,14 +258,15 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
        cl_double *fp2 = (cl_double *)p2;
        uint32_t x, y;

-        x = (job_id * buffer_elements) % specialValuesCount;
+        x = (job_id * buffer_elements) % specialValuesCountX;
        y = (job_id * buffer_elements) / specialValuesCount;

        for (; idx < buffer_elements; idx++)
        {
-            fp[idx] = specialValues[x];
+            fp[idx] = specialValuesX[x];
            fp2[idx] = specialValues[y];
-            if (++x >= specialValuesCount)
+            ++x;
+            if (x >= specialValuesCountX)
            {
                x = 0;
                y++;
@@ -271,7 +278,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
    // Init any remaining values
    for (; idx < buffer_elements; idx++)
    {
-        p[idx] = genrand_int64(d);
+        p[idx] =
+            reciprocal ? ((cl_ulong *)specialValuesX)[0] : genrand_int64(d);
        p2[idx] = genrand_int64(d);
    }

@@ -337,26 +345,15 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");
+        error =
+            clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), &tinfo->inBuf2);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
@@ -375,8 +372,13 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
    s = (cl_double *)gIn + thread_id * buffer_elements;
    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
-    for (size_t j = 0; j < buffer_elements; j++)
-        r[j] = (cl_double)func.f_ff(s[j], s2[j]);
+
+    if (reciprocal)
+        for (size_t j = 0; j < buffer_elements; j++)
+            r[j] = (cl_double)func.f_f(s2[j]); // keep double precision
+    else
+        for (size_t j = 0; j < buffer_elements; j++)
+            r[j] = (cl_double)func.f_ff(s[j], s2[j]);

    // Read the data back -- no need to wait for the first N-1 buffers but wait
    // for the last buffer. This is an in order queue.
@@ -406,7 +408,9 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            if (t[j] != q[j])
            {
                cl_double test = ((cl_double *)q)[j];
-                long double correct = func.f_ff(s[j], s2[j]);
+                long double correct =
+                    reciprocal ? func.f_f(s2[j]) : func.f_ff(s[j], s2[j]);
+
                float err = Bruteforce_Ulp_Error_Double(test, correct);
                int fail = !(fabsf(err) <= ulps);

@@ -479,8 +483,11 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
                    }
                    else if (IsDoubleSubnormal(s2[j]))
                    {
-                        long double correct2 = func.f_ff(s[j], 0.0);
-                        long double correct3 = func.f_ff(s[j], -0.0);
+                        long double correct2 =
+                            reciprocal ? func.f_f(0.0) : func.f_ff(s[j], 0.0);
+                        long double correct3 =
+                            reciprocal ? func.f_f(-0.0) : func.f_ff(s[j], -0.0);
+
                        float err2 =
                            Bruteforce_Ulp_Error_Double(test, correct2);
                        float err3 =
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -208,6 +208,11 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
    cl_float *s2 = 0;
    RoundingMode oldRoundMode;

+    bool reciprocal = strcmp(name, "reciprocal") == 0;
+    const float reciprocalArrayX[] = { 1.f };
+    const float *specialValuesX = reciprocal ? reciprocalArrayX : specialValues;
+    size_t specialValuesCountX = reciprocal ? 1 : specialValuesCount;
+
    if (relaxedMode)
    {
        func = job->f->rfunc;
@@ -239,7 +244,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
    cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
    cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
    cl_uint idx = 0;
-    int totalSpecialValueCount = specialValuesCount * specialValuesCount;
+    int totalSpecialValueCount = specialValuesCountX * specialValuesCount;
    int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;

    if (job_id <= (cl_uint)lastSpecialJobIndex)
@@ -247,15 +252,15 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
        // Insert special values
        uint32_t x, y;

-        x = (job_id * buffer_elements) % specialValuesCount;
+        x = (job_id * buffer_elements) % specialValuesCountX;
        y = (job_id * buffer_elements) / specialValuesCount;

        for (; idx < buffer_elements; idx++)
        {
-            p[idx] = ((cl_uint *)specialValues)[x];
+            p[idx] = ((cl_uint *)specialValuesX)[x];
            p2[idx] = ((cl_uint *)specialValues)[y];
            ++x;
-            if (x >= specialValuesCount)
+            if (x >= specialValuesCountX)
            {
                x = 0;
                y++;
@@ -269,13 +274,19 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
                if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
                if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
            }
+            else if (relaxedMode && reciprocal)
+            {
+                cl_uint p2j = p2[idx] & 0x7fffffff;
+                // Replace values outside [2^-126, 2^126] with QNaN
+                if (p2j < 0x00807d99 || p2j > 0x7e800000) p2[idx] = 0x7fc00000;
+            }
        }
    }

    // Init any remaining values
    for (; idx < buffer_elements; idx++)
    {
-        p[idx] = genrand_int32(d);
+        p[idx] = reciprocal ? ((cl_uint *)specialValuesX)[0] : genrand_int32(d);
        p2[idx] = genrand_int32(d);

        if (relaxedMode && strcmp(name, "divide") == 0)
@@ -286,6 +297,12 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
            if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
        }
+        else if (relaxedMode && reciprocal)
+        {
+            cl_uint p2j = p2[idx] & 0x7fffffff;
+            // Replace values outside [2^-126, 2^126] with QNaN
+            if (p2j < 0x00807d99 || p2j > 0x7e800000) p2[idx] = 0x7fc00000;
+        }
    }

    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
@@ -350,26 +367,15 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");
+        error =
+            clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), &tinfo->inBuf2);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
@@ -402,18 +408,31 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
    s2 = (float *)gIn2 + thread_id * buffer_elements;
    if (gInfNanSupport)
    {
-        for (size_t j = 0; j < buffer_elements; j++)
-            r[j] = (float)func.f_ff(s[j], s2[j]);
+        if (reciprocal)
+            for (size_t j = 0; j < buffer_elements; j++)
+                r[j] = (float)func.f_f(s2[j]);
+        else
+            for (size_t j = 0; j < buffer_elements; j++)
+                r[j] = (float)func.f_ff(s[j], s2[j]);
    }
    else
    {
-        for (size_t j = 0; j < buffer_elements; j++)
-        {
-            feclearexcept(FE_OVERFLOW);
-            r[j] = (float)func.f_ff(s[j], s2[j]);
-            overflow[j] =
-                FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
-        }
+        if (reciprocal)
+            for (size_t j = 0; j < buffer_elements; j++)
+            {
+                feclearexcept(FE_OVERFLOW);
+                r[j] = (float)func.f_f(s2[j]);
+                overflow[j] =
+                    FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+            }
+        else
+            for (size_t j = 0; j < buffer_elements; j++)
+            {
+                feclearexcept(FE_OVERFLOW);
+                r[j] = (float)func.f_ff(s[j], s2[j]);
+                overflow[j] =
+                    FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+            }
    }

    if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
@@ -448,7 +467,8 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            if (t[j] != q[j])
            {
                float test = ((float *)q)[j];
-                double correct = func.f_ff(s[j], s2[j]);
+                double correct =
+                    reciprocal ? func.f_f(s2[j]) : func.f_ff(s[j], s2[j]);

                // Per section 10 paragraph 6, accept any result if an input or
                // output is a infinity or NaN or overflow
@@ -485,7 +505,7 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
                    }

                    // retry per section 6.5.3.3
-                    if (IsFloatSubnormal(s[j]))
+                    if (!reciprocal && IsFloatSubnormal(s[j]))
                    {
                        double correct2, correct3;
                        float err2, err3;
@@ -591,8 +611,10 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)

                        if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);

-                        correct2 = func.f_ff(s[j], 0.0);
-                        correct3 = func.f_ff(s[j], -0.0);
+                        correct2 =
+                            reciprocal ? func.f_f(0.0) : func.f_ff(s[j], 0.0);
+                        correct3 =
+                            reciprocal ? func.f_f(-0.0) : func.f_ff(s[j], -0.0);

                        // Per section 10 paragraph 6, accept any result if an
                        // input or output is a infinity or NaN or overflow
@@ -625,7 +647,6 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
                    }
                }

-
                if (fabsf(err) > tinfo->maxError)
                {
                    tinfo->maxError = fabsf(err);
--- a/test_conformance/math_brute_force/binary_operator_half.cpp
+++ b/test_conformance/math_brute_force/binary_operator_half.cpp
@@ -120,6 +120,12 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
    std::vector<float> s(0), s2(0);
    RoundingMode oldRoundMode;

+    bool reciprocal = strcmp(name, "reciprocal") == 0;
+    const cl_half reciprocalArrayHalfX[] = { 0x3c00 };
+    const cl_half *specialValuesHalfX =
+        reciprocal ? reciprocalArrayHalfX : specialValuesHalf;
+    size_t specialValuesHalfCountX = reciprocal ? 1 : specialValuesHalfCount;
+
    cl_event e[VECTOR_SIZE_COUNT];
    cl_half *out[VECTOR_SIZE_COUNT];

@@ -148,7 +154,7 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
    cl_half *p2 = (cl_half *)gIn2 + thread_id * buffer_elements;
    cl_uint idx = 0;
    int totalSpecialValueCount =
-        specialValuesHalfCount * specialValuesHalfCount;
+        specialValuesHalfCountX * specialValuesHalfCount;
    int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;

    if (job_id <= (cl_uint)lastSpecialJobIndex)
@@ -156,14 +162,15 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
        // Insert special values
        uint32_t x, y;

-        x = (job_id * buffer_elements) % specialValuesHalfCount;
+        x = (job_id * buffer_elements) % specialValuesHalfCountX;
        y = (job_id * buffer_elements) / specialValuesHalfCount;

        for (; idx < buffer_elements; idx++)
        {
-            p[idx] = specialValuesHalf[x];
+            p[idx] = specialValuesHalfX[x];
            p2[idx] = specialValuesHalf[y];
-            if (++x >= specialValuesHalfCount)
+            ++x;
+            if (x >= specialValuesHalfCountX)
            {
                x = 0;
                y++;
@@ -175,7 +182,8 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
    // Init any remaining values
    for (; idx < buffer_elements; idx++)
    {
-        p[idx] = (cl_half)genrand_int32(d);
+        p[idx] = reciprocal ? ((cl_half *)specialValuesHalfX)[0]
+                            : (cl_half)genrand_int32(d);
        p2[idx] = (cl_half)genrand_int32(d);
    }
    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
@@ -232,26 +240,15 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");
+        error =
+            clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), &tinfo->inBuf2);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
@@ -283,11 +280,23 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
    s.resize(buffer_elements);
    s2.resize(buffer_elements);

-    for (size_t j = 0; j < buffer_elements; j++)
+    if (reciprocal)
    {
-        s[j] = HTF(p[j]);
-        s2[j] = HTF(p2[j]);
-        r[j] = HFF(func.f_ff(s[j], s2[j]));
+        for (size_t j = 0; j < buffer_elements; j++)
+        {
+            s[j] = HTF(p[j]);
+            s2[j] = HTF(p2[j]);
+            r[j] = HFF(func.f_f(s2[j]));
+        }
+    }
+    else
+    {
+        for (size_t j = 0; j < buffer_elements; j++)
+        {
+            s[j] = HTF(p[j]);
+            s2[j] = HTF(p2[j]);
+            r[j] = HFF(func.f_ff(s[j], s2[j]));
+        }
    }

    if (ftz) RestoreFPState(&oldMode);
@@ -320,7 +329,8 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
            if (r[j] != q[j])
            {
                float test = HTF(q[j]);
-                float correct = func.f_ff(s[j], s2[j]);
+                float correct =
+                    reciprocal ? func.f_f(s2[j]) : func.f_ff(s[j], s2[j]);

                // Per section 10 paragraph 6, accept any result if an input or
                // output is a infinity or NaN or overflow
@@ -446,9 +456,10 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                        double correct2, correct3;
                        float err2, err3;

-                        correct2 = func.f_ff(s[j], 0.0);
-                        correct3 = func.f_ff(s[j], -0.0);
-
+                        correct2 =
+                            reciprocal ? func.f_f(0.0) : func.f_ff(s[j], 0.0);
+                        correct3 =
+                            reciprocal ? func.f_f(-0.0) : func.f_ff(s[j], -0.0);

                        // Per section 10 paragraph 6, accept any result if an
                        // input or output is a infinity or NaN or overflow
--- a/test_conformance/math_brute_force/binary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
@@ -185,31 +185,18 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error =
-                     clSetKernelArg(kernels[j][thread_id], 1,
-                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1,
+                                   sizeof(gOutBuffer2[j]), &gOutBuffer2[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 3, sizeof(gInBuffer2),
+                                   &gInBuffer2);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/binary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
@@ -187,31 +187,18 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error =
-                     clSetKernelArg(kernels[j][thread_id], 1,
-                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1,
+                                   sizeof(gOutBuffer2[j]), &gOutBuffer2[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 3, sizeof(gInBuffer2),
+                                   &gInBuffer2);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/binary_two_results_i_half.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_half.cpp
@@ -179,31 +179,18 @@ int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode)
            // align working group size with the bigger output type
            size_t vectorSize = sizeValues[j] * sizeof(int32_t);
            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error =
-                     clSetKernelArg(kernels[j][thread_id], 1,
-                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1,
+                                   sizeof(gOutBuffer2[j]), &gOutBuffer2[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 3, sizeof(gInBuffer2),
+                                   &gInBuffer2);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/function_list.cpp
+++ b/test_conformance/math_brute_force/function_list.cpp
@@ -29,31 +29,32 @@
 // Only use ulps information in spir test
 #ifdef FUNCTION_LIST_ULPS_ONLY

-#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type)            \
+#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _half_embedded_ulp,       \
+              _rmode, _type)                                                   \
    {                                                                          \
        STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL },      \
-            _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,  \
-            RELAXED_OFF, _type                                                 \
+            _ulp, _ulp, _half_ulp, _half_embedded_ulp, _embedded_ulp,          \
+            INFINITY, INFINITY, _rmode, RELAXED_OFF, _type                     \
    }
-#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, \
-                  _type, _relaxed_embedded_ulp)                                \
+#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _half_embedded_ulp,   \
+                  _relaxed_ulp, _rmode, _type, _relaxed_embedded_ulp)          \
    {                                                                          \
        STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL },      \
-            _ulp, _ulp, _half_ulp, _embedded_ulp, _relaxed_ulp,                \
-            _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type                   \
+            _ulp, _ulp, _half_ulp, _half_embedded_ulp, _embedded_ulp,          \
+            _relaxed_ulp, _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type     \
    }
 #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                  \
    {                                                                          \
        "half_" STRINGIFY(_name), "half_" STRINGIFY(_name), { NULL },          \
-            { NULL }, { NULL }, _ulp, _ulp, _ulp, _embedded_ulp, INFINITY,     \
-            INFINITY, _rmode, RELAXED_OFF, _type                               \
+            { NULL }, { NULL }, _ulp, _ulp, _ulp, _ulp, _embedded_ulp,         \
+            INFINITY, INFINITY, _rmode, RELAXED_OFF, _type                     \
    }
 #define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp,       \
-                       _rmode, _type)                                          \
+                       _half_embedded_ulp, _rmode, _type)                      \
    {                                                                          \
        STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \
-            _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \
-            _type                                                              \
+            _half_ulp, _half_embedded_ulp, _embedded_ulp, INFINITY, INFINITY,  \
+            _rmode, RELAXED_OFF, _type                                         \
    }

 #define unaryF NULL
@@ -78,41 +79,47 @@
 #define reference_copysign NULL
 #define reference_sqrt NULL
 #define reference_sqrtl NULL
+#define reference_reciprocal NULL
+#define reference_reciprocall NULL
+#define reference_relaxed_reciprocal NULL
+
 #define reference_divide NULL
 #define reference_dividel NULL
 #define reference_relaxed_divide NULL

 #else // FUNCTION_LIST_ULPS_ONLY

-#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type)            \
+#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _half_embedded_ulp,       \
+              _rmode, _type)                                                   \
    {                                                                          \
        STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name },      \
            { (void*)reference_##_name##l }, { (void*)reference_##_name },     \
-            _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,  \
-            RELAXED_OFF, _type                                                 \
+            _ulp, _ulp, _half_ulp, _half_embedded_ulp, _embedded_ulp,          \
+            INFINITY, INFINITY, _rmode, RELAXED_OFF, _type                     \
    }
-#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, \
-                  _type, _relaxed_embedded_ulp)                                \
+#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _half_embedded_ulp,   \
+                  _relaxed_ulp, _rmode, _type, _relaxed_embedded_ulp)          \
    {                                                                          \
        STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name },      \
            { (void*)reference_##_name##l },                                   \
            { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _half_ulp,     \
-            _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, _rmode,        \
-            RELAXED_ON, _type                                                  \
+            _half_embedded_ulp, _embedded_ulp, _relaxed_ulp,                   \
+            _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type                   \
    }
 #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                  \
    {                                                                          \
        "half_" STRINGIFY(_name), "half_" STRINGIFY(_name),                    \
            { (void*)reference_##_name }, { NULL }, { NULL }, _ulp, _ulp,      \
-            _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF,      \
-            _type                                                              \
+            _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,             \
+            RELAXED_OFF, _type                                                 \
    }
 #define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp,       \
-                       _rmode, _type)                                          \
+                       _half_embedded_ulp, _rmode, _type)                      \
    {                                                                          \
        STRINGIFY(_name), _operator, { (void*)reference_##_name },             \
            { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, _half_ulp,  \
-            _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
+            _half_embedded_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,     \
+            RELAXED_OFF, _type                                                 \
    }

 static constexpr vtbl _unary = {
@@ -248,19 +255,19 @@ static constexpr vtbl _mad_tbl = {

 // clang-format off
 const Func functionList[] = {
-    ENTRY_EXT(acos, 4.0f, 4.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
-    ENTRY(acosh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF),
-    ENTRY(acospi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
-    ENTRY_EXT(asin, 4.0f, 4.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
-    ENTRY(asinh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF),
-    ENTRY(asinpi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
-    ENTRY_EXT(atan, 5.0f, 5.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
-    ENTRY(atanh, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
-    ENTRY(atanpi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
-    ENTRY(atan2, 6.0f, 6.0f, 2.0f, FTZ_OFF, binaryF),
-    ENTRY(atan2pi, 6.0f, 6.0f, 2.0f, FTZ_OFF, binaryF),
-    ENTRY(cbrt, 2.0f, 4.0f, 2.f, FTZ_OFF, unaryF),
-    ENTRY(ceil, 0.0f, 0.0f, 0.f, FTZ_OFF, unaryF),
+    ENTRY_EXT(acos, 4.0f, 4.0f, 2.0f, 3.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
+    ENTRY(acosh, 4.0f, 4.0f, 2.0f, 3.0f, FTZ_OFF, unaryF),
+    ENTRY(acospi, 5.0f, 5.0f, 2.0f, 3.0f, FTZ_OFF, unaryF),
+    ENTRY_EXT(asin, 4.0f, 4.0f, 2.0f, 3.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
+    ENTRY(asinh, 4.0f, 4.0f, 2.0f, 3.0f, FTZ_OFF, unaryF),
+    ENTRY(asinpi, 5.0f, 5.0f, 2.0f, 3.0f, FTZ_OFF, unaryF),
+    ENTRY_EXT(atan, 5.0f, 5.0f, 2.0f, 3.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
+    ENTRY(atanh, 5.0f, 5.0f, 2.0f, 3.0f, FTZ_OFF, unaryF),
+    ENTRY(atanpi, 5.0f, 5.0f, 2.0f, 3.0f, FTZ_OFF, unaryF),
+    ENTRY(atan2, 6.0f, 6.0f, 2.0f, 3.0f, FTZ_OFF, binaryF),
+    ENTRY(atan2pi, 6.0f, 6.0f, 2.0f, 3.0f, FTZ_OFF, binaryF),
+    ENTRY(cbrt, 2.0f, 4.0f, 2.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY(ceil, 0.0f, 0.0f, 0.f, 0.f, FTZ_OFF, unaryF),
    { "copysign",
      "copysign",
      { (void*)reference_copysignf },
@@ -270,97 +277,97 @@ const Func functionList[] = {
      0.0f,
      0.0f,
      0.0f,
+      0.0f,
      INFINITY,
      INFINITY,
      FTZ_OFF,
      RELAXED_OFF,
      binaryF },
-    ENTRY_EXT(cos, 4.0f, 4.0f, 2.f, 0.00048828125f, FTZ_OFF, unaryF,
+    ENTRY_EXT(cos, 4.0f, 4.0f, 2.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF,
              0.00048828125f), // relaxed ulp 2^-11
-    ENTRY(cosh, 4.0f, 4.0f, 2.f, FTZ_OFF, unaryF),
-    ENTRY_EXT(cospi, 4.0f, 4.0f, 2.f, 0.00048828125f, FTZ_OFF, unaryF,
+    ENTRY(cosh, 4.0f, 4.0f, 2.0f, 3.0f, FTZ_OFF, unaryF),
+    ENTRY_EXT(cospi, 4.0f, 4.0f, 2.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF,
              0.00048828125f), // relaxed ulp 2^-11
-    ENTRY(erfc, 16.0f, 16.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY(erf,  16.0f, 16.0f, 4.0f, FTZ_OFF, unaryF),
+    ENTRY(erfc, 16.0f, 16.0f, 4.0f, 4.0f, FTZ_OFF, unaryF),
+    ENTRY(erf,  16.0f, 16.0f, 4.0f, 4.0f, FTZ_OFF, unaryF),

    // relaxed error is overwritten in unary.c as it is 3+floor(fabs(2*x))
-    ENTRY_EXT(exp, 3.0f, 4.0f, 2.f, 3.0f, FTZ_OFF, unaryF, 4.0f),
+    ENTRY_EXT(exp, 3.0f, 4.0f, 2.0f, 3.0f, 3.0f, FTZ_OFF, unaryF, 4.0f),

    // relaxed error is overwritten in unary.c as it is 3+floor(fabs(2*x))
-    ENTRY_EXT(exp2, 3.0f, 4.0f, 2.f, 3.0f, FTZ_OFF, unaryF, 4.0f),
+    ENTRY_EXT(exp2, 3.0f, 4.0f, 2.0f, 3.0f, 3.0f, FTZ_OFF, unaryF, 4.0f),

    // relaxed error is overwritten in unary.c as it is 3+floor(fabs(2*x)) in derived mode;
    // in non-derived mode it uses the ulp error for half_exp10.
-    ENTRY_EXT(exp10, 3.0f, 4.0f, 2.f, 8192.0f, FTZ_OFF, unaryF, 8192.0f),
+    ENTRY_EXT(exp10, 3.0f, 4.0f, 2.0f, 3.0f, 8192.0f, FTZ_OFF, unaryF, 8192.0f),

-    ENTRY(expm1, 3.0f, 4.0f, 2.f, FTZ_OFF, unaryF),
-    ENTRY(fabs, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY(fdim, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(floor, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY(fma, 0.0f, 0.0f, 0.0f, FTZ_OFF, ternaryF),
-    ENTRY(fmax, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(fmin, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(fmod, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(fract, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
-    ENTRY(frexp, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results_i),
-    ENTRY(hypot, 4.0f, 4.0f, 2.0f, FTZ_OFF, binaryF),
-    ENTRY(ilogb, 0.0f, 0.0f, 0.0f, FTZ_OFF, i_unaryF),
-    ENTRY(isequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isfinite, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY(isgreater, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isgreaterequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isinf, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY(isless, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(islessequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(islessgreater, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isnan, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY(isnormal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY(isnotequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isordered, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isunordered, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(ldexp, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_i),
-    ENTRY(lgamma, INFINITY, INFINITY, INFINITY, FTZ_OFF, unaryF),
-    ENTRY(lgamma_r, INFINITY, INFINITY, INFINITY, FTZ_OFF,
+    ENTRY(expm1, 3.0f, 4.0f, 2.0f, 3.0f, FTZ_OFF, unaryF),
+    ENTRY(fabs, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(fdim, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(floor, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(fma, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, ternaryF),
+    ENTRY(fmax, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(fmin, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(fmod, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(fract, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
+    ENTRY(frexp, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results_i),
+    ENTRY(hypot, 4.0f, 4.0f,  2.0f, 3.0f, FTZ_OFF, binaryF),
+    ENTRY(ilogb, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, i_unaryF),
+    ENTRY(isequal, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isfinite, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY(isgreater, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isgreaterequal, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isinf, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY(isless, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(islessequal, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(islessgreater, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isnan, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY(isnormal, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY(isnotequal, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isordered, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isunordered, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(ldexp, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_i),
+    ENTRY(lgamma, INFINITY, INFINITY, INFINITY, INFINITY, FTZ_OFF, unaryF),
+    ENTRY(lgamma_r, INFINITY, INFINITY, INFINITY, INFINITY, FTZ_OFF,
          unaryF_two_results_i),
-    ENTRY_EXT(log, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
+    ENTRY_EXT(log, 3.0f, 4.0f, 2.0f, 3.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
              4.76837158203125e-7f), // relaxed ulp 2^-21
-    ENTRY_EXT(log2, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
+    ENTRY_EXT(log2, 3.0f, 4.0f, 2.0f, 3.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
              4.76837158203125e-7f), // relaxed ulp 2^-21
-    ENTRY_EXT(log10, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
+    ENTRY_EXT(log10, 3.0f, 4.0f, 2.0f, 3.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
              4.76837158203125e-7f), // relaxed ulp 2^-21
-    ENTRY(log1p, 2.0f, 4.0f, 2.0f, FTZ_OFF, unaryF),
-    ENTRY(logb, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(log1p, 2.0f, 4.0f, 2.0f, 3.0f, FTZ_OFF, unaryF),
+    ENTRY(logb, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),

    // In fast-relaxed-math mode it has to be either exactly rounded fma or exactly rounded a*b+c
-    ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, INFINITY, FTZ_OFF, mad_function, INFINITY),
+    ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, INFINITY, INFINITY, FTZ_OFF, mad_function, INFINITY),

-    ENTRY(maxmag, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(minmag, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(modf, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
-    ENTRY(nan, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_u),
-    ENTRY(nextafter, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_nextafter),
+    ENTRY(maxmag, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(minmag, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(modf, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
+    ENTRY(nan, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_u),
+    ENTRY(nextafter, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_nextafter),

    // In derived mode the ulp error is calculated as exp2(y*log2(x)).
    // In non-derived it is the same as half_pow.
-    ENTRY_EXT(pow, 16.0f, 16.0f, 4.0f, 8192.0f, FTZ_OFF, binaryF, 8192.0f),
+    ENTRY_EXT(pow, 16.0f, 16.0f, 4.0f, 5.0f, 8192.0f, FTZ_OFF, binaryF, 8192.0f),

-    ENTRY(pown, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF_i),
-    ENTRY(powr, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF),
-    //ENTRY(reciprocal, 1.0f, 1.0f, FTZ_OFF, unaryF),
-    ENTRY(remainder, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(remquo, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_two_results_i),
-    ENTRY(rint, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY(rootn, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF_i),
-    ENTRY(round, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY(rsqrt, 2.0f, 4.0f, 1.0f, FTZ_OFF, unaryF),
-    ENTRY(signbit, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY_EXT(sin, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF,
+    ENTRY(pown, 16.0f, 16.0f, 4.0f, 5.0f, FTZ_OFF, binaryF_i),
+    ENTRY(powr, 16.0f, 16.0f, 4.0f, 5.0f, FTZ_OFF, binaryF),
+    ENTRY(remainder, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(remquo, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_two_results_i),
+    ENTRY(rint, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(rootn, 16.0f, 16.0f, 4.0f, 5.0f, FTZ_OFF, binaryF_i),
+    ENTRY(round, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(rsqrt, 2.0f, 4.0f, 1.0f, 1.0f, FTZ_OFF, unaryF),
+    ENTRY(signbit, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY_EXT(sin, 4.0f, 4.0f, 2.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF,
              0.00048828125f), // relaxed ulp 2^-11
-    ENTRY_EXT(sincos, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF,
+    ENTRY_EXT(sincos, 4.0f, 4.0f, 2.0f, 2.0f, 0.00048828125f, FTZ_OFF,
              unaryF_two_results,
              0.00048828125f), // relaxed ulp 2^-11
-    ENTRY(sinh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF),
-    ENTRY_EXT(sinpi, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF,
+    ENTRY(sinh, 4.0f, 4.0f, 2.0f, 3.0f, FTZ_OFF, unaryF),
+    ENTRY_EXT(sinpi, 4.0f, 4.0f, 2.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF,
              0.00048828125f), // relaxed ulp 2^-11
    { "sqrt",
      "sqrt",
@@ -370,6 +377,7 @@ const Func functionList[] = {
      3.0f,
      0.0f,
      0.0f,
+      1.0f,
      4.0f,
      INFINITY,
      INFINITY,
@@ -387,18 +395,19 @@ const Func functionList[] = {
      INFINITY,
      INFINITY,
      INFINITY,
+      INFINITY,
      FTZ_OFF,
      RELAXED_OFF,
      unaryOF /* only for single precision */ },

    // In derived mode it the ulp error is calculated as sin/cos.
    // In non-derived mode it is the same as half_tan.
-    ENTRY_EXT(tan, 5.0f, 5.0f, 2.0f, 8192.0f, FTZ_OFF, unaryF, 8192.0f),
+    ENTRY_EXT(tan, 5.0f, 5.0f, 2.0f, 3.0f, 8192.0f, FTZ_OFF, unaryF, 8192.0f),

-    ENTRY(tanh, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
-    ENTRY(tanpi, 6.0f, 6.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY(tanh, 5.0f, 5.0f, 2.0f, 3.0f, FTZ_OFF, unaryF),
+    ENTRY(tanpi, 6.0f, 6.0f, 2.0f, 3.0f, FTZ_OFF, unaryF),
    //ENTRY(tgamma, 16.0f, 16.0f, FTZ_OFF, unaryF), Commented this out until we can be sure this requirement is realistic
-    ENTRY(trunc, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(trunc, 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),

    HALF_ENTRY(cos, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
    HALF_ENTRY(divide, 8192.0f, 8192.0f, FTZ_ON, binaryOF),
@@ -416,8 +425,24 @@ const Func functionList[] = {
    HALF_ENTRY(tan, 8192.0f, 8192.0f, FTZ_ON, unaryOF),

    // basic operations
-    OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
-    OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
+    OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
+    OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
+    //ENTRY(reciprocal, 1.0f, 1.0f, FTZ_OFF, unaryF),
+    { "reciprocal", // new functionList row; the float labels below follow the struct Func field order shown in function_list.h (assumed positional mapping — TODO confirm)
+      "/", // same operator string as the "divide" row that follows
+      { (void*)reference_reciprocal }, // double reference: 1.0 / x
+      { (void*)reference_reciprocall }, // long double signature; its body divides two doubles
+      { (void*)reference_relaxed_reciprocal }, // relaxed reference: 1.0f / (float)x
+      2.5f, // float_ulps
+      0.0f, // double_ulps
+      0.0f, // half_ulps — NOTE(review): the "divide" row has 1.0f in this slot; confirm the difference is intended
+      1.0f, // half_embedded_ulps (the field introduced by this change)
+      3.0f, // float_embedded_ulps
+      2.5f, // relaxed_error
+      INFINITY, // relaxed_embedded_error
+      FTZ_OFF,
+      RELAXED_ON, // the sqrt_cr / divide_cr style rows visible nearby use RELAXED_OFF instead
+      binaryOperatorF }, // NOTE(review): same handler name as add/subtract/multiply although the references above take one argument — confirm
    { "divide",
      "/",
      { (void*)reference_divide },
@@ -426,6 +451,7 @@ const Func functionList[] = {
      2.5f,
      0.0f,
      1.0f,
+      1.0f,
      3.0f,
      2.5f,
      INFINITY,
@@ -443,13 +469,14 @@ const Func functionList[] = {
      INFINITY,
      INFINITY,
      INFINITY,
+      INFINITY,
      FTZ_OFF,
      RELAXED_OFF,
      binaryOperatorOF /* only for single precision */ },
-    OPERATOR_ENTRY(multiply, "*", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
-    OPERATOR_ENTRY(assignment, "", 0.0f, 0.0f, 0.0f, FTZ_OFF,
+    OPERATOR_ENTRY(multiply, "*", 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
+    OPERATOR_ENTRY(assignment, "", 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF,
                   unaryF), // A simple copy operation
-    OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
 };
 // clang-format on

--- a/test_conformance/math_brute_force/function_list.h
+++ b/test_conformance/math_brute_force/function_list.h
@@ -87,6 +87,7 @@ struct Func
    float float_ulps;
    float double_ulps;
    float half_ulps;
+    float half_embedded_ulps;
    float float_embedded_ulps;
    float relaxed_error;
    float relaxed_embedded_error;
--- a/test_conformance/math_brute_force/i_unary_double.cpp
+++ b/test_conformance/math_brute_force/i_unary_double.cpp
@@ -122,18 +122,12 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/i_unary_float.cpp
+++ b/test_conformance/math_brute_force/i_unary_float.cpp
@@ -121,18 +121,12 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/i_unary_half.cpp
+++ b/test_conformance/math_brute_force/i_unary_half.cpp
@@ -118,18 +118,12 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_int);
            size_t localCount = (bufferSizeOut + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -327,26 +327,15 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");
+        error =
+            clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), &tinfo->inBuf2);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -320,26 +320,15 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");
+        error =
+            clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), &tinfo->inBuf2);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/macro_binary_half.cpp
+++ b/test_conformance/math_brute_force/macro_binary_half.cpp
@@ -210,26 +210,15 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");
+        error =
+            clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), &tinfo->inBuf2);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/macro_unary_double.cpp
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -168,20 +168,12 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/macro_unary_float.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -169,20 +169,12 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/macro_unary_half.cpp
+++ b/test_conformance/math_brute_force/macro_unary_half.cpp
@@ -151,20 +151,12 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/mad_double.cpp
+++ b/test_conformance/math_brute_force/mad_double.cpp
@@ -131,30 +131,18 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
-                                        sizeof(gInBuffer3), &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer2),
+                                   &gInBuffer2);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 3, sizeof(gInBuffer3),
+                                   &gInBuffer3);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/mad_float.cpp
+++ b/test_conformance/math_brute_force/mad_float.cpp
@@ -132,30 +132,18 @@ int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
-                                        sizeof(gInBuffer3), &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer2),
+                                   &gInBuffer2);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 3, sizeof(gInBuffer3),
+                                   &gInBuffer3);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/mad_half.cpp
+++ b/test_conformance/math_brute_force/mad_half.cpp
@@ -126,30 +126,18 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
            size_t vectorSize = sizeof(cl_half) * sizeValues[j];
            size_t localCount = (bufferSize + vectorSize - 1)
                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
-                                        sizeof(gInBuffer3), &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer2),
+                                   &gInBuffer2);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 3, sizeof(gInBuffer3),
+                                   &gInBuffer3);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -154,7 +154,7 @@ static int doTest(const char *name)
        exit(EXIT_FAILURE);
    }

-    if (func_data->func.p == NULL)
+    if (func_data->func.p == NULL && func_data->rfunc.p == NULL)
    {
        vlog("'%s' is missing implementation, skipping function.\n",
             func_data->name);
@@ -308,9 +308,10 @@ static test_definition test_list[] = {
    ADD_TEST(half_log),      ADD_TEST(half_log2),  ADD_TEST(half_log10),
    ADD_TEST(half_powr),     ADD_TEST(half_recip), ADD_TEST(half_rsqrt),
    ADD_TEST(half_sin),      ADD_TEST(half_sqrt),  ADD_TEST(half_tan),
-    ADD_TEST(add),           ADD_TEST(subtract),   ADD_TEST(divide),
-    ADD_TEST(divide_cr),     ADD_TEST(multiply),   ADD_TEST(assignment),
-    ADD_TEST(not ),          ADD_TEST(erf),        ADD_TEST(erfc),
+    ADD_TEST(add),           ADD_TEST(subtract),   ADD_TEST(reciprocal),
+    ADD_TEST(divide),        ADD_TEST(divide_cr),  ADD_TEST(multiply),
+    ADD_TEST(assignment),    ADD_TEST(not ),       ADD_TEST(erf),
+    ADD_TEST(erfc),
 };

 #undef ADD_TEST
@@ -980,19 +981,6 @@ static void ReleaseCL(void)
    }
 }

-void _LogBuildError(cl_program p, int line, const char *file)
-{
-    char the_log[2048] = "";
-
-    vlog_error("%s:%d: Build Log:\n", file, line);
-    if (0
-        == clGetProgramBuildInfo(p, gDevice, CL_PROGRAM_BUILD_LOG,
-                                 sizeof(the_log), the_log, NULL))
-        vlog_error("%s", the_log);
-    else
-        vlog_error("*** Error getting build log for program %p\n", p);
-}
-
 int InitILogbConstants(void)
 {
    int error;
--- a/test_conformance/math_brute_force/reference_math.cpp
+++ b/test_conformance/math_brute_force/reference_math.cpp
@@ -1856,6 +1856,13 @@ double reference_logb(double x)

 double reference_relaxed_reciprocal(double x) { return 1.0f / ((float)x); }

+long double reference_reciprocall(long double y) // reference for 1 / y
+{ // NOTE(review): the definition removed later in this diff used 1.0L / x
+    double dx = 1.0; // numerator held as a double
+    double dy = y; // narrows the long double argument to double
+    return dx / dy; // both operands are double; result converts on return
+}
+
 double reference_reciprocal(double x) { return 1.0 / x; }

 double reference_remainder(double x, double y)
@@ -3740,9 +3747,6 @@ long double reference_nanl(cl_ulong x)
    return (long double)u.f;
 }

-
-long double reference_reciprocall(long double x) { return 1.0L / x; }
-
 long double reference_remainderl(long double x, long double y)
 {
    int i;
@@ -5771,4 +5775,4 @@ long double reference_erfcl(long double x) { return erfc(x); }
 long double reference_erfl(long double x) { return erf(x); }

 double reference_erfc(double x) { return erfc(x); }
-double reference_erf(double x) { return erf(x); }
+double reference_erf(double x) { return erf(x); }
--- a/test_conformance/math_brute_force/ternary_double.cpp
+++ b/test_conformance/math_brute_force/ternary_double.cpp
@@ -236,30 +236,18 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
-                                        sizeof(gInBuffer3), &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer2),
+                                   &gInBuffer2);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 3, sizeof(gInBuffer3),
+                                   &gInBuffer3);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/ternary_float.cpp
+++ b/test_conformance/math_brute_force/ternary_float.cpp
@@ -258,30 +258,18 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
-                                        sizeof(gInBuffer3), &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer2),
+                                   &gInBuffer2);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 3, sizeof(gInBuffer3),
+                                   &gInBuffer3);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/ternary_half.cpp
+++ b/test_conformance/math_brute_force/ternary_half.cpp
@@ -191,30 +191,18 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
            size_t vectorSize = sizeof(cl_half) * sizeValues[j];
            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
-                                        sizeof(gInBuffer3), &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer2),
+                                   &gInBuffer2);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 3, sizeof(gInBuffer3),
+                                   &gInBuffer3);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/unary_double.cpp
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -176,20 +176,12 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/unary_float.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -205,20 +205,12 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument 0");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument 1");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/unary_half.cpp
+++ b/test_conformance/math_brute_force/unary_half.cpp
@@ -154,20 +154,12 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];

-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
+        error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                               &tinfo->outBuf[j]);
+        test_error(error, "Failed to set kernel argument");
+        error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), &tinfo->inBuf);
+        test_error(error, "Failed to set kernel argument");

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
--- a/test_conformance/math_brute_force/unary_two_results_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_double.cpp
@@ -143,25 +143,15 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error =
-                     clSetKernelArg(kernels[j][thread_id], 1,
-                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1,
+                                   sizeof(gOutBuffer2[j]), &gOutBuffer2[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/unary_two_results_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_float.cpp
@@ -159,25 +159,15 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error =
-                     clSetKernelArg(kernels[j][thread_id], 1,
-                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1,
+                                   sizeof(gOutBuffer2[j]), &gOutBuffer2[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/unary_two_results_half.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_half.cpp
@@ -132,25 +132,15 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_half);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error =
-                     clSetKernelArg(kernels[j][thread_id], 1,
-                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1,
+                                   sizeof(gOutBuffer2[j]), &gOutBuffer2[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/unary_two_results_i_double.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
@@ -151,25 +151,15 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error =
-                     clSetKernelArg(kernels[j][thread_id], 1,
-                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1,
+                                   sizeof(gOutBuffer2[j]), &gOutBuffer2[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/unary_two_results_i_float.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
@@ -156,25 +156,15 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error =
-                     clSetKernelArg(kernels[j][thread_id], 1,
-                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1,
+                                   sizeof(gOutBuffer2[j]), &gOutBuffer2[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/unary_two_results_i_half.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
@@ -145,25 +145,15 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
            // align working group size with the bigger output type
            size_t vectorSize = sizeValues[j] * sizeof(cl_int);
            size_t localCount = (bufferSizeHi + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error =
-                     clSetKernelArg(kernels[j][thread_id], 1,
-                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1,
+                                   sizeof(gOutBuffer2[j]), &gOutBuffer2[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 2, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/unary_u_double.cpp
+++ b/test_conformance/math_brute_force/unary_u_double.cpp
@@ -116,18 +116,12 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/unary_u_float.cpp
+++ b/test_conformance/math_brute_force/unary_u_float.cpp
@@ -123,18 +123,12 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/unary_u_half.cpp
+++ b/test_conformance/math_brute_force/unary_u_half.cpp
@@ -111,18 +111,12 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_half);
            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
-            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
-                                        sizeof(gInBuffer), &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                return error;
-            }
+            error = clSetKernelArg(kernels[j][thread_id], 0,
+                                   sizeof(gOutBuffer[j]), &gOutBuffer[j]);
+            test_error(error, "Failed to set kernel argument");
+            error = clSetKernelArg(kernels[j][thread_id], 1, sizeof(gInBuffer),
+                                   &gInBuffer);
+            test_error(error, "Failed to set kernel argument");

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
--- a/test_conformance/math_brute_force/utility.cpp
+++ b/test_conformance/math_brute_force/utility.cpp
@@ -195,8 +195,14 @@ float getAllowedUlpError(const Func *f, Type t, const bool relaxed)
            // TODO: distinguish between embedded and full profile.
            return f->double_ulps;
        case khalf:
-            // TODO: distinguish between embedded and full profile.
-            return f->half_ulps;
+            if (gIsEmbedded)
+            {
+                return f->half_embedded_ulps;
+            }
+            else
+            {
+                return f->half_ulps;
+            }
        default:
            assert(false && "unsupported type in getAllowedUlpError");
            // Return a negative value which will make any test fail.
--- a/test_conformance/math_brute_force/utility.h
+++ b/test_conformance/math_brute_force/utility.h
@@ -114,9 +114,6 @@ inline double DoubleFromUInt32(uint32_t bits)
    return u.d;
 }

-void _LogBuildError(cl_program p, int line, const char *file);
-#define LogBuildError(program) _LogBuildError(program, __LINE__, __FILE__)
-
 // The spec is fairly clear that we may enforce a hard cutoff to prevent
 // premature flushing to zero.
 // However, to avoid conflict for 1.0, we are letting results at TYPE_MIN +
--- a/test_conformance/printf/test_printf.cpp
+++ b/test_conformance/printf/test_printf.cpp
@@ -317,8 +317,10 @@ cl_program makeMixedFormatPrintfProgram(cl_kernel* kernel_ptr,
        {
            const float max_range = 100000.f;
            float arg = get_random_float(-max_range, max_range, gMTdata);
-            args_str << str_sprintf("%f", arg) << "f, ";
-            ref_str << str_sprintf(format, arg) << ", ";
+            std::string arg_str = str_sprintf("%f", arg);
+            args_str << arg_str << "f, ";
+            float arg_deviceRound = std::stof(arg_str);
+            ref_str << str_sprintf(format, arg_deviceRound) << ", ";
        }
    }
    // Restore the original CPU rounding mode
@@ -1151,7 +1153,8 @@ int main(int argc, const char* argv[])
    char* pcTempFname = get_temp_filename();
    if (pcTempFname != nullptr)
    {
-        strncpy(gFileName, pcTempFname, sizeof(gFileName));
+        strncpy(gFileName, pcTempFname, sizeof(gFileName) - 1);
+        gFileName[sizeof(gFileName) - 1] = '\0';
    }

    free(pcTempFname);
--- a/test_conformance/printf/util_printf.cpp
+++ b/test_conformance/printf/util_printf.cpp
@@ -1336,6 +1336,33 @@ std::vector<std::string> correctBufferVector = {
    "00512,01024,262144,1048576"
 };

+std::vector<std::string> correctBufferVectorRTZ = {
+    // Used by generateRef() for TYPE_VECTOR under CL_FP_ROUND_TO_ZERO.
+    "1.00,2.00,3.00,4.00",
+
+    "0xfa,0xfb",
+
+    "0x1234,0x8765",
+
+    "0x12345678,0x87654321",
+
+    "12345678,98765432",
+
+    "1.00,2.00,3.00,4.00",
+
+    "1.23e+03,9.87e+05,4.99e-04",
+
+    "0x1p-2,0x1p-1,0x1p+0,0x1.8p+0",
+
+    "1,2,3,4,1.5,3.13999,2.5,3.5",
+
+    "1,2,3,4,5,6,7,10,11,0,40,100,200,400,1000,2000",
+
+    "+1,-2,+3,-4,+5,-6,+7,-8",
+
+    "00512,01024,262144,1048576"
+};
+
 //-----------------------------------------------------------

 //Test case for vector                                      |
@@ -1822,7 +1849,14 @@ void generateRef(const cl_device_id device)
            as they're constant and hard-coded
        */
        if (caseToTest->printFN == NULL)
+        {
+            if (caseToTest->_type == TYPE_VECTOR
+                && fpConfigSingle == CL_FP_ROUND_TO_ZERO)
+            {
+                caseToTest->_correctBuffer = correctBufferVectorRTZ;
+            }
            continue;
+        }

        // Make sure the reference result is empty
        assert(caseToTest->_correctBuffer.size() == 0);
--- a/test_conformance/select/test_select.cpp
+++ b/test_conformance/select/test_select.cpp
@@ -261,14 +261,18 @@ static cl_program makeSelectProgram(cl_kernel *kernel_ptr,
    switch( vec_len )
    {
        case 1:
-            strncpy(stypename, type_name[srctype], sizeof(stypename));
-            strncpy(ctypename, type_name[cmptype], sizeof(ctypename));
+            strncpy(stypename, type_name[srctype], sizeof(stypename) - 1);
+            stypename[sizeof(stypename) - 1] = '\0';
+            strncpy(ctypename, type_name[cmptype], sizeof(ctypename) - 1);
+            ctypename[sizeof(ctypename) - 1] = '\0';
            snprintf(testname, sizeof(testname), "select_%s_%s", stypename, ctypename );
            log_info("Building %s(%s, %s, %s)\n", testname, stypename, stypename, ctypename);
            break;
        case 3:
-            strncpy(stypename, type_name[srctype], sizeof(stypename));
-            strncpy(ctypename, type_name[cmptype], sizeof(ctypename));
+            strncpy(stypename, type_name[srctype], sizeof(stypename) - 1);
+            stypename[sizeof(stypename) - 1] = '\0';
+            strncpy(ctypename, type_name[cmptype], sizeof(ctypename) - 1);
+            ctypename[sizeof(ctypename) - 1] = '\0';
            snprintf(testname, sizeof(testname), "select_%s3_%s3", stypename, ctypename );
            log_info("Building %s(%s3, %s3, %s3)\n", testname, stypename, stypename, ctypename);
            break;
--- a/test_conformance/spirv_new/CMakeLists.txt
+++ b/test_conformance/spirv_new/CMakeLists.txt
@@ -5,6 +5,7 @@ set(${MODULE_NAME}_SOURCES
  test_basic_versions.cpp
  test_cl_khr_expect_assume.cpp
  test_decorate.cpp
+  test_extinst_printf.cpp
  test_get_program_il.cpp
  test_linkage.cpp
  test_no_integer_wrap_decoration.cpp
@@ -40,6 +41,9 @@ if(CMAKE_COMPILER_IS_GNUCC OR "${CMAKE_CXX_COMPILER_ID}" MATCHES "(Apple)?Clang"
    add_cxx_flag_if_supported(-Wno-narrowing)
 endif()

-set(${MODULE_NAME}_SOURCES ${SPIRV_NEW_SOURCES} ${TEST_HARNESS_SOURCES})
+set(${MODULE_NAME}_SOURCES ${${MODULE_NAME}_SOURCES} ${TEST_HARNESS_SOURCES})

 include(../CMakeCommon.txt)
+
+add_subdirectory(spirv_asm)
+add_dependencies(${${MODULE_NAME}_OUT} spirv_new_binaries)
--- a/test_conformance/spirv_new/spirv_asm/CMakeLists.txt
+++ b/test_conformance/spirv_new/spirv_asm/CMakeLists.txt
@@ -0,0 +1,502 @@
+set(SPIRV_TOOLS_DIR ""  # name must match the HINTS used by the find_program calls
+    CACHE
+    PATH
+    "Absolute path to the directory containing the SPIR-V tools to use")
+
+find_package(Python3 COMPONENTS Interpreter QUIET)  # interpreter for assemble_spirv.py
+
+find_program(SPIRV_ASSEMBLER
+    NAMES spirv-as
+    HINTS ${SPIRV_TOOLS_DIR})
+
+find_program(SPIRV_VALIDATOR
+    NAMES spirv-val
+    HINTS ${SPIRV_TOOLS_DIR})
+
+if (Python3_FOUND AND
+    SPIRV_ASSEMBLER AND
+    SPIRV_VALIDATOR)  # a failed find_program leaves <VAR>-NOTFOUND, which if() treats as false
+    message(STATUS "Using python3: ${Python3_EXECUTABLE}")
+    message(STATUS "Using spirv-as: ${SPIRV_ASSEMBLER}")
+    message(STATUS "Using spirv-val: ${SPIRV_VALIDATOR}")
+else()
+    message(STATUS "Skipping automatic build of SPIR-V files for spirv_new")
+    # Empty custom target, so the spirv_new_binaries target still exists without the tools
+    add_custom_target(spirv_new_binaries)
+    return()
+endif()
+
+set(spirv_sources
+    assume.spvasm32
+    assume.spvasm64
+    atomic_dec_global.spvasm32
+    atomic_dec_global.spvasm64
+    atomic_inc_global.spvasm32
+    atomic_inc_global.spvasm64
+    basic.spvasm32
+    basic.spvasm64
+    branch_conditional.spvasm32
+    branch_conditional.spvasm64
+    branch_conditional_weighted.spvasm32
+    branch_conditional_weighted.spvasm64
+    branch_simple.spvasm32
+    branch_simple.spvasm64
+    composite_construct_int4.spvasm32
+    composite_construct_int4.spvasm64
+    composite_construct_struct.spvasm32
+    composite_construct_struct.spvasm64
+    constant_char_simple.spvasm32
+    constant_char_simple.spvasm64
+    constant_double_simple.spvasm32
+    constant_double_simple.spvasm64
+    constant_false_simple.spvasm32
+    constant_false_simple.spvasm64
+    constant_float_simple.spvasm32
+    constant_float_simple.spvasm64
+    constant_half_simple.spvasm32
+    constant_half_simple.spvasm64
+    constant_int3_simple.spvasm32
+    constant_int3_simple.spvasm64
+    constant_int4_simple.spvasm32
+    constant_int4_simple.spvasm64
+    constant_int_simple.spvasm32
+    constant_int_simple.spvasm64
+    constant_long_simple.spvasm32
+    constant_long_simple.spvasm64
+    constant_short_simple.spvasm32
+    constant_short_simple.spvasm64
+    constant_struct_int_char_simple.spvasm32
+    constant_struct_int_char_simple.spvasm64
+    constant_struct_int_float_simple.spvasm32
+    constant_struct_int_float_simple.spvasm64
+    constant_struct_struct_simple.spvasm32
+    constant_struct_struct_simple.spvasm64
+    constant_true_simple.spvasm32
+    constant_true_simple.spvasm64
+    constant_uchar_simple.spvasm32
+    constant_uchar_simple.spvasm64
+    constant_uint_simple.spvasm32
+    constant_uint_simple.spvasm64
+    constant_ulong_simple.spvasm32
+    constant_ulong_simple.spvasm64
+    constant_ushort_simple.spvasm32
+    constant_ushort_simple.spvasm64
+    copy_char_simple.spvasm32
+    copy_char_simple.spvasm64
+    copy_double_simple.spvasm32
+    copy_double_simple.spvasm64
+    copy_float_simple.spvasm32
+    copy_float_simple.spvasm64
+    copy_half_simple.spvasm32
+    copy_half_simple.spvasm64
+    copy_int3_simple.spvasm32
+    copy_int3_simple.spvasm64
+    copy_int4_simple.spvasm32
+    copy_int4_simple.spvasm64
+    copy_int_simple.spvasm32
+    copy_int_simple.spvasm64
+    copy_long_simple.spvasm32
+    copy_long_simple.spvasm64
+    copy_short_simple.spvasm32
+    copy_short_simple.spvasm64
+    copy_struct_int_char_simple.spvasm32
+    copy_struct_int_char_simple.spvasm64
+    copy_struct_int_float_simple.spvasm32
+    copy_struct_int_float_simple.spvasm64
+    copy_struct_struct_simple.spvasm32
+    copy_struct_struct_simple.spvasm64
+    copy_uchar_simple.spvasm32
+    copy_uchar_simple.spvasm64
+    copy_uint_simple.spvasm32
+    copy_uint_simple.spvasm64
+    copy_ulong_simple.spvasm32
+    copy_ulong_simple.spvasm64
+    copy_ushort_simple.spvasm32
+    copy_ushort_simple.spvasm64
+    decorate_aliased.spvasm32
+    decorate_aliased.spvasm64
+    decorate_alignment.spvasm32
+    decorate_alignment.spvasm64
+    decorate_constant.spvasm32
+    decorate_constant.spvasm64
+    decorate_constant_fail.spvasm32
+    decorate_constant_fail.spvasm64
+    decorate_cpacked.spvasm32
+    decorate_cpacked.spvasm64
+    decorate_restrict.spvasm32
+    decorate_restrict.spvasm64
+    decorate_rounding_rte_double_long.spvasm32
+    decorate_rounding_rte_double_long.spvasm64
+    decorate_rounding_rte_float_int.spvasm32
+    decorate_rounding_rte_float_int.spvasm64
+    decorate_rounding_rte_half_short.spvasm32
+    decorate_rounding_rte_half_short.spvasm64
+    decorate_rounding_rtn_double_long.spvasm32
+    decorate_rounding_rtn_double_long.spvasm64
+    decorate_rounding_rtn_float_int.spvasm32
+    decorate_rounding_rtn_float_int.spvasm64
+    decorate_rounding_rtn_half_short.spvasm32
+    decorate_rounding_rtn_half_short.spvasm64
+    decorate_rounding_rtp_double_long.spvasm32
+    decorate_rounding_rtp_double_long.spvasm64
+    decorate_rounding_rtp_float_int.spvasm32
+    decorate_rounding_rtp_float_int.spvasm64
+    decorate_rounding_rtp_half_short.spvasm32
+    decorate_rounding_rtp_half_short.spvasm64
+    decorate_rounding_rtz_double_long.spvasm32
+    decorate_rounding_rtz_double_long.spvasm64
+    decorate_rounding_rtz_float_int.spvasm32
+    decorate_rounding_rtz_float_int.spvasm64
+    decorate_rounding_rtz_half_short.spvasm32
+    decorate_rounding_rtz_half_short.spvasm64
+    decorate_saturated_conversion_double_to_int.spvasm32
+    decorate_saturated_conversion_double_to_int.spvasm64
+    decorate_saturated_conversion_double_to_uint.spvasm32
+    decorate_saturated_conversion_double_to_uint.spvasm64
+    decorate_saturated_conversion_float_to_char.spvasm32
+    decorate_saturated_conversion_float_to_char.spvasm64
+    decorate_saturated_conversion_float_to_short.spvasm32
+    decorate_saturated_conversion_float_to_short.spvasm64
+    decorate_saturated_conversion_float_to_uchar.spvasm32
+    decorate_saturated_conversion_float_to_uchar.spvasm64
+    decorate_saturated_conversion_float_to_ushort.spvasm32
+    decorate_saturated_conversion_float_to_ushort.spvasm64
+    decorate_saturated_conversion_half_to_char.spvasm32
+    decorate_saturated_conversion_half_to_char.spvasm64
+    decorate_saturated_conversion_half_to_uchar.spvasm32
+    decorate_saturated_conversion_half_to_uchar.spvasm64
+    expect_bool.spvasm32
+    expect_bool.spvasm64
+    expect_char.spvasm32
+    expect_char.spvasm64
+    expect_int.spvasm32
+    expect_int.spvasm64
+    expect_long.spvasm32
+    expect_long.spvasm64
+    expect_short.spvasm32
+    expect_short.spvasm64
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fadd_int.spvasm32
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fadd_int.spvasm64
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fadd_uint.spvasm32
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fadd_uint.spvasm64
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fmul_int.spvasm32
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fmul_int.spvasm64
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fmul_uint.spvasm32
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fmul_uint.spvasm64
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fnegate_int.spvasm32
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fnegate_int.spvasm64
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fshiftleft_int.spvasm32
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fshiftleft_int.spvasm64
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fshiftleft_uint.spvasm32
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fshiftleft_uint.spvasm64
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fsub_int.spvasm32
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fsub_int.spvasm64
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fsub_uint.spvasm32
+    ext_cl_khr_spirv_no_integer_wrap_decoration_fsub_uint.spvasm64
+    fadd_double.spvasm32
+    fadd_double.spvasm64
+    fadd_double2.spvasm32
+    fadd_double2.spvasm64
+    fadd_float.spvasm32
+    fadd_float.spvasm64
+    fadd_float4.spvasm32
+    fadd_float4.spvasm64
+    fadd_half.spvasm32
+    fadd_half.spvasm64
+    fdiv_double.spvasm32
+    fdiv_double.spvasm64
+    fdiv_double2.spvasm32
+    fdiv_double2.spvasm64
+    fdiv_float.spvasm32
+    fdiv_float.spvasm64
+    fdiv_float4.spvasm32
+    fdiv_float4.spvasm64
+    fdiv_half.spvasm32
+    fdiv_half.spvasm64
+    fmod_double.spvasm32
+    fmod_double.spvasm64
+    fmod_double2.spvasm32
+    fmod_double2.spvasm64
+    fmod_float.spvasm32
+    fmod_float.spvasm64
+    fmod_float4.spvasm32
+    fmod_float4.spvasm64
+    fmod_half.spvasm32
+    fmod_half.spvasm64
+    fmul_double.spvasm32
+    fmul_double.spvasm64
+    fmul_double2.spvasm32
+    fmul_double2.spvasm64
+    fmul_float.spvasm32
+    fmul_float.spvasm64
+    fmul_float4.spvasm32
+    fmul_float4.spvasm64
+    fmul_half.spvasm32
+    fmul_half.spvasm64
+    frem_double.spvasm32
+    frem_double.spvasm64
+    frem_double2.spvasm32
+    frem_double2.spvasm64
+    frem_float.spvasm32
+    frem_float.spvasm64
+    frem_float4.spvasm32
+    frem_float4.spvasm64
+    frem_half.spvasm32
+    frem_half.spvasm64
+    fsub_double.spvasm32
+    fsub_double.spvasm64
+    fsub_double2.spvasm32
+    fsub_double2.spvasm64
+    fsub_float.spvasm32
+    fsub_float.spvasm64
+    fsub_float4.spvasm32
+    fsub_float4.spvasm64
+    fsub_half.spvasm32
+    fsub_half.spvasm64
+    label_simple.spvasm32
+    label_simple.spvasm64
+    lifetime_simple.spvasm32
+    lifetime_simple.spvasm64
+    linkage_export.spvasm32
+    linkage_export.spvasm64
+    linkage_import.spvasm32
+    linkage_import.spvasm64
+    linkage_linkonce_odr_main.spvasm32
+    linkage_linkonce_odr_main.spvasm64
+    linkage_linkonce_odr_noa_main.spvasm32
+    linkage_linkonce_odr_noa_main.spvasm64
+    linkage_linkonce_odr_obj.spvasm32
+    linkage_linkonce_odr_obj.spvasm64
+    loop_merge_branch_conditional_dont_unroll.spvasm32
+    loop_merge_branch_conditional_dont_unroll.spvasm64
+    loop_merge_branch_conditional_none.spvasm32
+    loop_merge_branch_conditional_none.spvasm64
+    loop_merge_branch_conditional_unroll.spvasm32
+    loop_merge_branch_conditional_unroll.spvasm64
+    loop_merge_branch_dont_unroll.spvasm32
+    loop_merge_branch_dont_unroll.spvasm64
+    loop_merge_branch_none.spvasm32
+    loop_merge_branch_none.spvasm64
+    loop_merge_branch_unroll.spvasm32
+    loop_merge_branch_unroll.spvasm64
+    op_function_const.spvasm32
+    op_function_const.spvasm64
+    op_function_inline.spvasm32
+    op_function_inline.spvasm64
+    op_function_noinline.spvasm32
+    op_function_noinline.spvasm64
+    op_function_none.spvasm32
+    op_function_none.spvasm64
+    op_function_pure.spvasm32
+    op_function_pure.spvasm64
+    op_function_pure_ptr.spvasm32
+    op_function_pure_ptr.spvasm64
+    op_neg_double.spvasm32
+    op_neg_double.spvasm64
+    op_neg_float.spvasm32
+    op_neg_float.spvasm64
+    op_neg_float4.spvasm32
+    op_neg_float4.spvasm64
+    op_neg_half.spvasm32
+    op_neg_half.spvasm64
+    op_neg_int.spvasm32
+    op_neg_int.spvasm64
+    op_neg_int4.spvasm32
+    op_neg_int4.spvasm64
+    op_neg_long.spvasm32
+    op_neg_long.spvasm64
+    op_neg_short.spvasm32
+    op_neg_short.spvasm64
+    op_not_int.spvasm32
+    op_not_int.spvasm64
+    op_not_int4.spvasm32
+    op_not_int4.spvasm64
+    op_not_long.spvasm32
+    op_not_long.spvasm64
+    op_not_short.spvasm32
+    op_not_short.spvasm64
+    op_spec_constant_double_simple.spvasm32
+    op_spec_constant_double_simple.spvasm64
+    op_spec_constant_false_simple.spvasm32
+    op_spec_constant_false_simple.spvasm64
+    op_spec_constant_float_simple.spvasm32
+    op_spec_constant_float_simple.spvasm64
+    op_spec_constant_half_simple.spvasm32
+    op_spec_constant_half_simple.spvasm64
+    op_spec_constant_true_simple.spvasm32
+    op_spec_constant_true_simple.spvasm64
+    op_spec_constant_uchar_simple.spvasm32
+    op_spec_constant_uchar_simple.spvasm64
+    op_spec_constant_uint_simple.spvasm32
+    op_spec_constant_uint_simple.spvasm64
+    op_spec_constant_ulong_simple.spvasm32
+    op_spec_constant_ulong_simple.spvasm64
+    op_spec_constant_ushort_simple.spvasm32
+    op_spec_constant_ushort_simple.spvasm64
+    opaque.spvasm32
+    opaque.spvasm64
+    phi_2.spvasm32
+    phi_2.spvasm64
+    phi_3.spvasm32
+    phi_3.spvasm64
+    phi_4.spvasm32
+    phi_4.spvasm64
+    select_if_dont_flatten.spvasm32
+    select_if_dont_flatten.spvasm64
+    select_if_flatten.spvasm32
+    select_if_flatten.spvasm64
+    select_if_none.spvasm32
+    select_if_none.spvasm64
+    select_switch_dont_flatten.spvasm32
+    select_switch_dont_flatten.spvasm64
+    select_switch_flatten.spvasm32
+    select_switch_flatten.spvasm64
+    select_switch_none.spvasm32
+    select_switch_none.spvasm64
+    spv1.1/basic.spvasm32
+    spv1.1/basic.spvasm64
+    spv1.2/basic.spvasm32
+    spv1.2/basic.spvasm64
+    spv1.3/basic.spvasm32
+    spv1.3/basic.spvasm64
+    spv1.4/basic.spvasm32
+    spv1.4/basic.spvasm64
+    spv1.4/copylogical_struct.spvasm32
+    spv1.4/copylogical_struct.spvasm64
+    spv1.4/copymemory_memory_operands.spvasm32
+    spv1.4/copymemory_memory_operands.spvasm64
+    spv1.4/image_operand_signextend.spvasm32
+    spv1.4/image_operand_signextend.spvasm64
+    spv1.4/image_operand_zeroextend.spvasm32
+    spv1.4/image_operand_zeroextend.spvasm64
+    spv1.4/loop_control_iterationmultiple.spvasm32
+    spv1.4/loop_control_iterationmultiple.spvasm64
+    spv1.4/loop_control_maxiterations.spvasm32
+    spv1.4/loop_control_maxiterations.spvasm64
+    spv1.4/loop_control_miniterations.spvasm32
+    spv1.4/loop_control_miniterations.spvasm64
+    spv1.4/loop_control_partialcount.spvasm32
+    spv1.4/loop_control_partialcount.spvasm64
+    spv1.4/loop_control_peelcount.spvasm32
+    spv1.4/loop_control_peelcount.spvasm64
+    spv1.4/no_integer_wrap_decoration_fadd_int.spvasm32
+    spv1.4/no_integer_wrap_decoration_fadd_int.spvasm64
+    spv1.4/no_integer_wrap_decoration_fadd_uint.spvasm32
+    spv1.4/no_integer_wrap_decoration_fadd_uint.spvasm64
+    spv1.4/no_integer_wrap_decoration_fmul_int.spvasm32
+    spv1.4/no_integer_wrap_decoration_fmul_int.spvasm64
+    spv1.4/no_integer_wrap_decoration_fmul_uint.spvasm32
+    spv1.4/no_integer_wrap_decoration_fmul_uint.spvasm64
+    spv1.4/no_integer_wrap_decoration_fnegate_int.spvasm32
+    spv1.4/no_integer_wrap_decoration_fnegate_int.spvasm64
+    spv1.4/no_integer_wrap_decoration_fshiftleft_int.spvasm32
+    spv1.4/no_integer_wrap_decoration_fshiftleft_int.spvasm64
+    spv1.4/no_integer_wrap_decoration_fshiftleft_uint.spvasm32
+    spv1.4/no_integer_wrap_decoration_fshiftleft_uint.spvasm64
+    spv1.4/no_integer_wrap_decoration_fsub_int.spvasm32
+    spv1.4/no_integer_wrap_decoration_fsub_int.spvasm64
+    spv1.4/no_integer_wrap_decoration_fsub_uint.spvasm32
+    spv1.4/no_integer_wrap_decoration_fsub_uint.spvasm64
+    spv1.4/nonwriteable_decoration_function_storage_class.spvasm32
+    spv1.4/nonwriteable_decoration_function_storage_class.spvasm64
+    spv1.4/ptrops.spvasm32
+    spv1.4/ptrops.spvasm64
+    spv1.4/select_struct.spvasm32
+    spv1.4/select_struct.spvasm64
+    spv1.4/usersemantic_decoratestring.spvasm32
+    spv1.4/usersemantic_decoratestring.spvasm64
+    spv1.4/usersemantic_memberdecoratestring.spvasm32
+    spv1.4/usersemantic_memberdecoratestring.spvasm64
+    spv1.5/basic.spvasm32
+    spv1.5/basic.spvasm64
+    spv1.6/basic.spvasm32
+    spv1.6/basic.spvasm64
+    undef_char_simple.spvasm32
+    undef_char_simple.spvasm64
+    undef_double_simple.spvasm32
+    undef_double_simple.spvasm64
+    undef_false_simple.spvasm32
+    undef_false_simple.spvasm64
+    undef_float_simple.spvasm32
+    undef_float_simple.spvasm64
+    undef_half_simple.spvasm32
+    undef_half_simple.spvasm64
+    undef_int3_simple.spvasm32
+    undef_int3_simple.spvasm64
+    undef_int4_simple.spvasm32
+    undef_int4_simple.spvasm64
+    undef_int_simple.spvasm32
+    undef_int_simple.spvasm64
+    undef_long_simple.spvasm32
+    undef_long_simple.spvasm64
+    undef_short_simple.spvasm32
+    undef_short_simple.spvasm64
+    undef_struct_int_char_simple.spvasm32
+    undef_struct_int_char_simple.spvasm64
+    undef_struct_int_float_simple.spvasm32
+    undef_struct_int_float_simple.spvasm64
+    undef_struct_struct_simple.spvasm32
+    undef_struct_struct_simple.spvasm64
+    undef_true_simple.spvasm32
+    undef_true_simple.spvasm64
+    undef_uchar_simple.spvasm32
+    undef_uchar_simple.spvasm64
+    undef_uint_simple.spvasm32
+    undef_uint_simple.spvasm64
+    undef_ulong_simple.spvasm32
+    undef_ulong_simple.spvasm64
+    undef_ushort_simple.spvasm32
+    undef_ushort_simple.spvasm64
+    unreachable_simple.spvasm32
+    unreachable_simple.spvasm64
+    vector_char16_extract.spvasm32
+    vector_char16_extract.spvasm64
+    vector_char16_insert.spvasm32
+    vector_char16_insert.spvasm64
+    vector_double2_extract.spvasm32
+    vector_double2_extract.spvasm64
+    vector_double2_insert.spvasm32
+    vector_double2_insert.spvasm64
+    vector_float4_extract.spvasm32
+    vector_float4_extract.spvasm64
+    vector_float4_insert.spvasm32
+    vector_float4_insert.spvasm64
+    vector_half8_extract.spvasm32
+    vector_half8_extract.spvasm64
+    vector_half8_insert.spvasm32
+    vector_half8_insert.spvasm64
+    vector_int4_extract.spvasm32
+    vector_int4_extract.spvasm64
+    vector_int4_insert.spvasm32
+    vector_int4_insert.spvasm64
+    vector_long2_extract.spvasm32
+    vector_long2_extract.spvasm64
+    vector_long2_insert.spvasm32
+    vector_long2_insert.spvasm64
+    vector_times_scalar_double.spvasm32
+    vector_times_scalar_double.spvasm64
+    vector_times_scalar_float.spvasm32
+    vector_times_scalar_float.spvasm64
+    vector_times_scalar_half.spvasm32
+    vector_times_scalar_half.spvasm64)
+
+set(assembled_spirv_binaries "")
+foreach(asm_file IN LISTS spirv_sources)  # foo.spvasmNN -> <binary dir>/../spirv_bin/foo.spvNN
+    string(REPLACE ".spvasm" ".spv" bin_name "${asm_file}")
+    set(bin_path "${CMAKE_CURRENT_BINARY_DIR}/../spirv_bin/${bin_name}")
+    list(APPEND assembled_spirv_binaries "${bin_path}")
+endforeach()
+
+add_custom_command(  # a single script invocation is declared as producing every listed binary
+    OUTPUT ${assembled_spirv_binaries}
+    COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/assemble_spirv.py
+            --source-dir "${CMAKE_CURRENT_SOURCE_DIR}"
+            --output-dir "${CMAKE_CURRENT_BINARY_DIR}/../spirv_bin"
+            --assembler "${SPIRV_ASSEMBLER}"
+            --validator "${SPIRV_VALIDATOR}"
+    DEPENDS assemble_spirv.py ${spirv_sources}
+    COMMENT "Generating SPIR-V binaries..."
+    USES_TERMINAL
+    VERBATIM)
+
+add_custom_target(spirv_new_binaries DEPENDS ${assembled_spirv_binaries})
--- a/test_conformance/spirv_new/spirv_asm/assemble_spirv.py
+++ b/test_conformance/spirv_new/spirv_asm/assemble_spirv.py
--- a/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_fp32.spvasm32
+++ b/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_fp32.spvasm32
@@ -0,0 +1,85 @@
+; kernel void printf_operands_scalar_fp32(float f)
+; {
+;     printf("a = %.1a\n", f);
+;     printf("A = %.1A\n", f);
+;     printf("e = %.1e\n", f);
+;     printf("E = %.1E\n", f);
+;     printf("f = %.1f\n", f);
+;     printf("F = %.1F\n", f);
+;     printf("g = %.1g\n", f);
+;     printf("G = %.1G\n", f);
+; }
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int8 ; for the 8-bit %uchar type declared below
+        %clext = OpExtInstImport "OpenCL.std" ; supplies the printf extended instruction used in %test
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %test "printf_operands_scalar_fp32"
+      %uchar = OpTypeInt 8 0
+       %uint = OpTypeInt 32 0
+     %uint_8 = OpConstant %uint 8 ; not referenced elsewhere in this module
+    %uint_10 = OpConstant %uint 10 ; element count of %string_10
+    %uint_12 = OpConstant %uint 12 ; not referenced elsewhere in this module
+  %uchar_nul = OpConstant %uchar 0 ; string terminator
+   %uchar_nl = OpConstant %uchar 10 ; '\n'
+   %uchar_sp = OpConstant %uchar 32 ; ' '
+  %uchar_pct = OpConstant %uchar 37 ; '%'
+  %uchar_dot = OpConstant %uchar 46 ; '.'
+    %uchar_1 = OpConstant %uchar 49 ; '1'
+   %uchar_eq = OpConstant %uchar 61 ; '='
+    %uchar_A = OpConstant %uchar 65 ; 'A'
+    %uchar_E = OpConstant %uchar 69 ; 'E'
+    %uchar_F = OpConstant %uchar 70 ; 'F'
+    %uchar_G = OpConstant %uchar 71 ; 'G'
+    %uchar_a = OpConstant %uchar 97 ; 'a'
+    %uchar_e = OpConstant %uchar 101 ; 'e'
+    %uchar_f = OpConstant %uchar 102 ; 'f'
+    %uchar_g = OpConstant %uchar 103 ; 'g'
+  %string_10 = OpTypeArray %uchar %uint_10 ; nine format characters plus the terminating NUL
+%cptr_string_10 = OpTypePointer UniformConstant %string_10
+       %void = OpTypeVoid
+      %float = OpTypeFloat 32
+ %kernel_sig = OpTypeFunction %void %float ; void(float)
+  %cptr_char = OpTypePointer UniformConstant %uchar ; type the format-string variables are bitcast to
+
+    %array_a = OpConstantComposite %string_10 %uchar_a %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_a %uchar_nl %uchar_nul ; "a = %.1a\n"
+   %string_a = OpVariable %cptr_string_10 UniformConstant %array_a
+    %array_A = OpConstantComposite %string_10 %uchar_A %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_A %uchar_nl %uchar_nul ; "A = %.1A\n"
+   %string_A = OpVariable %cptr_string_10 UniformConstant %array_A
+    %array_e = OpConstantComposite %string_10 %uchar_e %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_e %uchar_nl %uchar_nul ; "e = %.1e\n"
+   %string_e = OpVariable %cptr_string_10 UniformConstant %array_e
+    %array_E = OpConstantComposite %string_10 %uchar_E %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_E %uchar_nl %uchar_nul ; "E = %.1E\n"
+   %string_E = OpVariable %cptr_string_10 UniformConstant %array_E
+    %array_f = OpConstantComposite %string_10 %uchar_f %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_f %uchar_nl %uchar_nul ; "f = %.1f\n"
+   %string_f = OpVariable %cptr_string_10 UniformConstant %array_f
+    %array_F = OpConstantComposite %string_10 %uchar_F %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_F %uchar_nl %uchar_nul ; "F = %.1F\n"
+   %string_F = OpVariable %cptr_string_10 UniformConstant %array_F
+    %array_g = OpConstantComposite %string_10 %uchar_g %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_g %uchar_nl %uchar_nul ; "g = %.1g\n"
+   %string_g = OpVariable %cptr_string_10 UniformConstant %array_g
+    %array_G = OpConstantComposite %string_10 %uchar_G %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_G %uchar_nl %uchar_nul ; "G = %.1G\n"
+   %string_G = OpVariable %cptr_string_10 UniformConstant %array_G
+
+       %test = OpFunction %void None %kernel_sig ; kernel entry point: one printf per conversion specifier
+          %f = OpFunctionParameter %float ; the single value passed to every printf below
+      %entry = OpLabel
+
+      %fmt_a = OpBitcast %cptr_char %string_a ; view the uchar[10] variable as a pointer to uchar
+   %printf_a = OpExtInst %uint %clext printf %fmt_a %f ; the %uint result id is not used afterwards
+      %fmt_A = OpBitcast %cptr_char %string_A
+   %printf_A = OpExtInst %uint %clext printf %fmt_A %f
+      %fmt_e = OpBitcast %cptr_char %string_e
+   %printf_e = OpExtInst %uint %clext printf %fmt_e %f
+      %fmt_E = OpBitcast %cptr_char %string_E
+   %printf_E = OpExtInst %uint %clext printf %fmt_E %f
+      %fmt_f = OpBitcast %cptr_char %string_f
+   %printf_f = OpExtInst %uint %clext printf %fmt_f %f
+      %fmt_F = OpBitcast %cptr_char %string_F
+   %printf_F = OpExtInst %uint %clext printf %fmt_F %f
+      %fmt_g = OpBitcast %cptr_char %string_g
+   %printf_g = OpExtInst %uint %clext printf %fmt_g %f
+      %fmt_G = OpBitcast %cptr_char %string_G
+   %printf_G = OpExtInst %uint %clext printf %fmt_G %f
+
+               OpReturn
+               OpFunctionEnd
--- a/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_fp32.spvasm64
+++ b/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_fp32.spvasm64
@@ -0,0 +1,85 @@
+; kernel void printf_operands_scalar_fp32(float f)
+; {
+;     printf("a = %.1a\n", f);
+;     printf("A = %.1A\n", f);
+;     printf("e = %.1e\n", f);
+;     printf("E = %.1E\n", f);
+;     printf("f = %.1f\n", f);
+;     printf("F = %.1F\n", f);
+;     printf("g = %.1g\n", f);
+;     printf("G = %.1G\n", f);
+; }
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int8 ; for the 8-bit %uchar type declared below
+        %clext = OpExtInstImport "OpenCL.std" ; supplies the printf extended instruction used in %test
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %test "printf_operands_scalar_fp32"
+      %uchar = OpTypeInt 8 0
+       %uint = OpTypeInt 32 0
+     %uint_8 = OpConstant %uint 8 ; not referenced elsewhere in this module
+    %uint_10 = OpConstant %uint 10 ; element count of %string_10
+    %uint_12 = OpConstant %uint 12 ; not referenced elsewhere in this module
+  %uchar_nul = OpConstant %uchar 0 ; string terminator
+   %uchar_nl = OpConstant %uchar 10 ; '\n'
+   %uchar_sp = OpConstant %uchar 32 ; ' '
+  %uchar_pct = OpConstant %uchar 37 ; '%'
+  %uchar_dot = OpConstant %uchar 46 ; '.'
+    %uchar_1 = OpConstant %uchar 49 ; '1'
+   %uchar_eq = OpConstant %uchar 61 ; '='
+    %uchar_A = OpConstant %uchar 65 ; 'A'
+    %uchar_E = OpConstant %uchar 69 ; 'E'
+    %uchar_F = OpConstant %uchar 70 ; 'F'
+    %uchar_G = OpConstant %uchar 71 ; 'G'
+    %uchar_a = OpConstant %uchar 97 ; 'a'
+    %uchar_e = OpConstant %uchar 101 ; 'e'
+    %uchar_f = OpConstant %uchar 102 ; 'f'
+    %uchar_g = OpConstant %uchar 103 ; 'g'
+  %string_10 = OpTypeArray %uchar %uint_10 ; nine format characters plus the terminating NUL
+%cptr_string_10 = OpTypePointer UniformConstant %string_10
+       %void = OpTypeVoid
+      %float = OpTypeFloat 32
+ %kernel_sig = OpTypeFunction %void %float ; void(float)
+  %cptr_char = OpTypePointer UniformConstant %uchar ; type the format-string variables are bitcast to
+
+    %array_a = OpConstantComposite %string_10 %uchar_a %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_a %uchar_nl %uchar_nul ; "a = %.1a\n"
+   %string_a = OpVariable %cptr_string_10 UniformConstant %array_a
+    %array_A = OpConstantComposite %string_10 %uchar_A %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_A %uchar_nl %uchar_nul ; "A = %.1A\n"
+   %string_A = OpVariable %cptr_string_10 UniformConstant %array_A
+    %array_e = OpConstantComposite %string_10 %uchar_e %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_e %uchar_nl %uchar_nul ; "e = %.1e\n"
+   %string_e = OpVariable %cptr_string_10 UniformConstant %array_e
+    %array_E = OpConstantComposite %string_10 %uchar_E %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_E %uchar_nl %uchar_nul ; "E = %.1E\n"
+   %string_E = OpVariable %cptr_string_10 UniformConstant %array_E
+    %array_f = OpConstantComposite %string_10 %uchar_f %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_f %uchar_nl %uchar_nul ; "f = %.1f\n"
+   %string_f = OpVariable %cptr_string_10 UniformConstant %array_f
+    %array_F = OpConstantComposite %string_10 %uchar_F %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_F %uchar_nl %uchar_nul ; "F = %.1F\n"
+   %string_F = OpVariable %cptr_string_10 UniformConstant %array_F
+    %array_g = OpConstantComposite %string_10 %uchar_g %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_g %uchar_nl %uchar_nul ; "g = %.1g\n"
+   %string_g = OpVariable %cptr_string_10 UniformConstant %array_g
+    %array_G = OpConstantComposite %string_10 %uchar_G %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_G %uchar_nl %uchar_nul ; "G = %.1G\n"
+   %string_G = OpVariable %cptr_string_10 UniformConstant %array_G
+
+       %test = OpFunction %void None %kernel_sig ; kernel entry point: one printf per conversion specifier
+          %f = OpFunctionParameter %float ; the single value passed to every printf below
+      %entry = OpLabel
+
+      %fmt_a = OpBitcast %cptr_char %string_a ; view the uchar[10] variable as a pointer to uchar
+   %printf_a = OpExtInst %uint %clext printf %fmt_a %f ; the %uint result id is not used afterwards
+      %fmt_A = OpBitcast %cptr_char %string_A
+   %printf_A = OpExtInst %uint %clext printf %fmt_A %f
+      %fmt_e = OpBitcast %cptr_char %string_e
+   %printf_e = OpExtInst %uint %clext printf %fmt_e %f
+      %fmt_E = OpBitcast %cptr_char %string_E
+   %printf_E = OpExtInst %uint %clext printf %fmt_E %f
+      %fmt_f = OpBitcast %cptr_char %string_f
+   %printf_f = OpExtInst %uint %clext printf %fmt_f %f
+      %fmt_F = OpBitcast %cptr_char %string_F
+   %printf_F = OpExtInst %uint %clext printf %fmt_F %f
+      %fmt_g = OpBitcast %cptr_char %string_g
+   %printf_g = OpExtInst %uint %clext printf %fmt_g %f
+      %fmt_G = OpBitcast %cptr_char %string_G
+   %printf_G = OpExtInst %uint %clext printf %fmt_G %f
+
+               OpReturn
+               OpFunctionEnd
--- a/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_fp64.spvasm32
+++ b/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_fp64.spvasm32
@@ -0,0 +1,93 @@
+; kernel void printf_operands_scalar_fp64(double d)
+; {
+;     printf("a = %.1a\n", d);
+;     printf("A = %.1A\n", d);
+;     printf("e = %.1e\n", d);
+;     printf("E = %.1E\n", d);
+;     printf("f = %.1f\n", d);
+;     printf("F = %.1F\n", d);
+;     printf("g = %.1g\n", d);
+;     printf("G = %.1G\n", d);
+; }
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Float64 ; for the 64-bit %double type declared below
+               OpCapability Int8 ; for the 8-bit %uchar type declared below
+        %clext = OpExtInstImport "OpenCL.std" ; supplies the printf extended instruction used in %test
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %test "printf_operands_scalar_fp64"
+      %uchar = OpTypeInt 8 0
+       %uint = OpTypeInt 32 0
+     %uint_8 = OpConstant %uint 8 ; not referenced elsewhere in this module
+    %uint_10 = OpConstant %uint 10 ; element count of %string_10
+    %uint_12 = OpConstant %uint 12 ; not referenced elsewhere in this module
+  %uchar_nul = OpConstant %uchar 0 ; string terminator
+   %uchar_nl = OpConstant %uchar 10 ; '\n'
+   %uchar_sp = OpConstant %uchar 32 ; ' '
+  %uchar_pct = OpConstant %uchar 37 ; '%'
+  %uchar_dot = OpConstant %uchar 46 ; '.'
+    %uchar_1 = OpConstant %uchar 49 ; '1'
+   %uchar_eq = OpConstant %uchar 61 ; '='
+    %uchar_A = OpConstant %uchar 65 ; 'A'
+    %uchar_E = OpConstant %uchar 69 ; 'E'
+    %uchar_F = OpConstant %uchar 70 ; 'F'
+    %uchar_G = OpConstant %uchar 71 ; 'G'
+    %uchar_X = OpConstant %uchar 88 ; 'X' - not used by any format string in this module
+    %uchar_a = OpConstant %uchar 97 ; 'a'
+    %uchar_d = OpConstant %uchar 100 ; 'd' - not used by any format string in this module
+    %uchar_e = OpConstant %uchar 101 ; 'e'
+    %uchar_f = OpConstant %uchar 102 ; 'f'
+    %uchar_g = OpConstant %uchar 103 ; 'g'
+    %uchar_h = OpConstant %uchar 104 ; 'h' - not used by any format string in this module
+    %uchar_i = OpConstant %uchar 105 ; 'i' - not used by any format string in this module
+    %uchar_o = OpConstant %uchar 111 ; 'o' - not used by any format string in this module
+    %uchar_u = OpConstant %uchar 117 ; 'u' - not used by any format string in this module
+    %uchar_x = OpConstant %uchar 120 ; 'x' - not used by any format string in this module
+  %string_10 = OpTypeArray %uchar %uint_10 ; nine format characters plus the terminating NUL
+%cptr_string_10 = OpTypePointer UniformConstant %string_10
+       %void = OpTypeVoid
+     %double = OpTypeFloat 64
+ %kernel_sig = OpTypeFunction %void %double ; void(double)
+  %cptr_char = OpTypePointer UniformConstant %uchar ; type the format-string variables are bitcast to
+
+    %array_a = OpConstantComposite %string_10 %uchar_a %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_a %uchar_nl %uchar_nul ; "a = %.1a\n"
+   %string_a = OpVariable %cptr_string_10 UniformConstant %array_a
+    %array_A = OpConstantComposite %string_10 %uchar_A %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_A %uchar_nl %uchar_nul ; "A = %.1A\n"
+   %string_A = OpVariable %cptr_string_10 UniformConstant %array_A
+    %array_e = OpConstantComposite %string_10 %uchar_e %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_e %uchar_nl %uchar_nul ; "e = %.1e\n"
+   %string_e = OpVariable %cptr_string_10 UniformConstant %array_e
+    %array_E = OpConstantComposite %string_10 %uchar_E %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_E %uchar_nl %uchar_nul ; "E = %.1E\n"
+   %string_E = OpVariable %cptr_string_10 UniformConstant %array_E
+    %array_f = OpConstantComposite %string_10 %uchar_f %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_f %uchar_nl %uchar_nul ; "f = %.1f\n"
+   %string_f = OpVariable %cptr_string_10 UniformConstant %array_f
+    %array_F = OpConstantComposite %string_10 %uchar_F %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_F %uchar_nl %uchar_nul ; "F = %.1F\n"
+   %string_F = OpVariable %cptr_string_10 UniformConstant %array_F
+    %array_g = OpConstantComposite %string_10 %uchar_g %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_g %uchar_nl %uchar_nul ; "g = %.1g\n"
+   %string_g = OpVariable %cptr_string_10 UniformConstant %array_g
+    %array_G = OpConstantComposite %string_10 %uchar_G %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_G %uchar_nl %uchar_nul ; "G = %.1G\n"
+   %string_G = OpVariable %cptr_string_10 UniformConstant %array_G
+
+       %test = OpFunction %void None %kernel_sig ; kernel entry point: one printf per conversion specifier
+          %d = OpFunctionParameter %double ; the single value passed to every printf below
+      %entry = OpLabel
+
+      %fmt_a = OpBitcast %cptr_char %string_a ; view the uchar[10] variable as a pointer to uchar
+   %printf_a = OpExtInst %uint %clext printf %fmt_a %d ; the %uint result id is not used afterwards
+      %fmt_A = OpBitcast %cptr_char %string_A
+   %printf_A = OpExtInst %uint %clext printf %fmt_A %d
+      %fmt_e = OpBitcast %cptr_char %string_e
+   %printf_e = OpExtInst %uint %clext printf %fmt_e %d
+      %fmt_E = OpBitcast %cptr_char %string_E
+   %printf_E = OpExtInst %uint %clext printf %fmt_E %d
+      %fmt_f = OpBitcast %cptr_char %string_f
+   %printf_f = OpExtInst %uint %clext printf %fmt_f %d
+      %fmt_F = OpBitcast %cptr_char %string_F
+   %printf_F = OpExtInst %uint %clext printf %fmt_F %d
+      %fmt_g = OpBitcast %cptr_char %string_g
+   %printf_g = OpExtInst %uint %clext printf %fmt_g %d
+      %fmt_G = OpBitcast %cptr_char %string_G
+   %printf_G = OpExtInst %uint %clext printf %fmt_G %d
+
+               OpReturn
+               OpFunctionEnd
--- a/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_fp64.spvasm64
+++ b/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_fp64.spvasm64
@@ -0,0 +1,93 @@
+; kernel void printf_operands_scalar_fp64(double d)
+; {
+;     printf("a = %.1a\n", d);
+;     printf("A = %.1A\n", d);
+;     printf("e = %.1e\n", d);
+;     printf("E = %.1E\n", d);
+;     printf("f = %.1f\n", d);
+;     printf("F = %.1F\n", d);
+;     printf("g = %.1g\n", d);
+;     printf("G = %.1G\n", d);
+; }
+               OpCapability Addresses ; OpMemoryModel below selects the Physical64 addressing model
+               OpCapability Linkage ; NOTE(review): no linkage decoration is visible in this module - confirm this is needed
+               OpCapability Kernel ; the entry point below uses the Kernel execution model
+               OpCapability Float64 ; for %double
+               OpCapability Int8 ; for %uchar, the element type of the format strings
+        %clext = OpExtInstImport "OpenCL.std"
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %test "printf_operands_scalar_fp64"
+      %uchar = OpTypeInt 8 0
+       %uint = OpTypeInt 32 0 ; result type of every printf call below
+     %uint_8 = OpConstant %uint 8 ; NOTE(review): not referenced in this module
+    %uint_10 = OpConstant %uint 10 ; element count of %string_10
+    %uint_12 = OpConstant %uint 12 ; NOTE(review): not referenced in this module
+  %uchar_nul = OpConstant %uchar 0 ; '\0', terminates every format string
+   %uchar_nl = OpConstant %uchar 10 ; '\n'
+   %uchar_sp = OpConstant %uchar 32 ; ' '
+  %uchar_pct = OpConstant %uchar 37 ; '%'
+  %uchar_dot = OpConstant %uchar 46 ; '.'
+    %uchar_1 = OpConstant %uchar 49 ; '1'
+   %uchar_eq = OpConstant %uchar 61 ; '='
+    %uchar_A = OpConstant %uchar 65 ; 'A'
+    %uchar_E = OpConstant %uchar 69 ; 'E'
+    %uchar_F = OpConstant %uchar 70 ; 'F'
+    %uchar_G = OpConstant %uchar 71 ; 'G'
+    %uchar_X = OpConstant %uchar 88 ; 'X' - NOTE(review): not referenced in this module
+    %uchar_a = OpConstant %uchar 97 ; 'a'
+    %uchar_d = OpConstant %uchar 100 ; 'd' - NOTE(review): not referenced in this module
+    %uchar_e = OpConstant %uchar 101 ; 'e'
+    %uchar_f = OpConstant %uchar 102 ; 'f'
+    %uchar_g = OpConstant %uchar 103 ; 'g'
+    %uchar_h = OpConstant %uchar 104 ; 'h' - NOTE(review): not referenced in this module
+    %uchar_i = OpConstant %uchar 105 ; 'i' - NOTE(review): not referenced in this module
+    %uchar_o = OpConstant %uchar 111 ; 'o' - NOTE(review): not referenced in this module
+    %uchar_u = OpConstant %uchar 117 ; 'u' - NOTE(review): not referenced in this module
+    %uchar_x = OpConstant %uchar 120 ; 'x' - NOTE(review): not referenced in this module
+  %string_10 = OpTypeArray %uchar %uint_10 ; uchar[10], the type of every format string in this module
+%cptr_string_10 = OpTypePointer UniformConstant %string_10
+       %void = OpTypeVoid
+     %double = OpTypeFloat 64
+ %kernel_sig = OpTypeFunction %void %double ; one double parameter, no return value
+  %cptr_char = OpTypePointer UniformConstant %uchar ; pointer type the string variables are bitcast to before each call
+; Format strings: per conversion, one constant uchar[10] array and one UniformConstant variable initialized from it.
+    %array_a = OpConstantComposite %string_10 %uchar_a %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_a %uchar_nl %uchar_nul ; "a = %.1a\n"
+   %string_a = OpVariable %cptr_string_10 UniformConstant %array_a
+    %array_A = OpConstantComposite %string_10 %uchar_A %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_A %uchar_nl %uchar_nul ; "A = %.1A\n"
+   %string_A = OpVariable %cptr_string_10 UniformConstant %array_A
+    %array_e = OpConstantComposite %string_10 %uchar_e %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_e %uchar_nl %uchar_nul ; "e = %.1e\n"
+   %string_e = OpVariable %cptr_string_10 UniformConstant %array_e
+    %array_E = OpConstantComposite %string_10 %uchar_E %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_E %uchar_nl %uchar_nul ; "E = %.1E\n"
+   %string_E = OpVariable %cptr_string_10 UniformConstant %array_E
+    %array_f = OpConstantComposite %string_10 %uchar_f %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_f %uchar_nl %uchar_nul ; "f = %.1f\n"
+   %string_f = OpVariable %cptr_string_10 UniformConstant %array_f
+    %array_F = OpConstantComposite %string_10 %uchar_F %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_F %uchar_nl %uchar_nul ; "F = %.1F\n"
+   %string_F = OpVariable %cptr_string_10 UniformConstant %array_F
+    %array_g = OpConstantComposite %string_10 %uchar_g %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_g %uchar_nl %uchar_nul ; "g = %.1g\n"
+   %string_g = OpVariable %cptr_string_10 UniformConstant %array_g
+    %array_G = OpConstantComposite %string_10 %uchar_G %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_dot %uchar_1 %uchar_G %uchar_nl %uchar_nul ; "G = %.1G\n"
+   %string_G = OpVariable %cptr_string_10 UniformConstant %array_G
+; Kernel: for each format string, bitcast its variable to %cptr_char and call printf from the %clext import with it and %d.
+       %test = OpFunction %void None %kernel_sig
+          %d = OpFunctionParameter %double ; the single argument passed to all eight printf calls
+      %entry = OpLabel
+; The %uint result of each printf call below is not used.
+      %fmt_a = OpBitcast %cptr_char %string_a
+   %printf_a = OpExtInst %uint %clext printf %fmt_a %d
+      %fmt_A = OpBitcast %cptr_char %string_A
+   %printf_A = OpExtInst %uint %clext printf %fmt_A %d
+      %fmt_e = OpBitcast %cptr_char %string_e
+   %printf_e = OpExtInst %uint %clext printf %fmt_e %d
+      %fmt_E = OpBitcast %cptr_char %string_E
+   %printf_E = OpExtInst %uint %clext printf %fmt_E %d
+      %fmt_f = OpBitcast %cptr_char %string_f
+   %printf_f = OpExtInst %uint %clext printf %fmt_f %d
+      %fmt_F = OpBitcast %cptr_char %string_F
+   %printf_F = OpExtInst %uint %clext printf %fmt_F %d
+      %fmt_g = OpBitcast %cptr_char %string_g
+   %printf_g = OpExtInst %uint %clext printf %fmt_g %d
+      %fmt_G = OpBitcast %cptr_char %string_G
+   %printf_G = OpExtInst %uint %clext printf %fmt_G %d
+
+               OpReturn
+               OpFunctionEnd
--- a/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_int32.spvasm32
+++ b/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_int32.spvasm32
@@ -0,0 +1,140 @@
+; kernel void printf_operands_scalar_int32(int i)
+; {
+;     printf("d = %d\n", i);
+;     printf("i = %i\n", i);
+;     printf("o = %o\n", i);
+;     printf("u = %u\n", i);
+;     printf("x = %x\n", i);
+;     printf("X = %X\n", i);
+;
+;     printf("hd = %hd\n", i);
+;     printf("hi = %hi\n", i);
+;     printf("ho = %ho\n", i);
+;     printf("hu = %hu\n", i);
+;     printf("hx = %hx\n", i);
+;     printf("hX = %hX\n", i);
+;
+;     printf("hhd = %hhd\n", i);
+;     printf("hhi = %hhi\n", i);
+;     printf("hho = %hho\n", i);
+;     printf("hhu = %hhu\n", i);
+;     printf("hhx = %hhx\n", i);
+;     printf("hhX = %hhX\n", i);
+; }
+               OpCapability Addresses ; OpMemoryModel below selects the Physical32 addressing model
+               OpCapability Linkage ; NOTE(review): no linkage decoration is visible in this module - confirm this is needed
+               OpCapability Kernel ; the entry point below uses the Kernel execution model
+               OpCapability Int8 ; for %uchar, the element type of the format strings
+        %clext = OpExtInstImport "OpenCL.std"
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %test "printf_operands_scalar_int32"
+      %uchar = OpTypeInt 8 0
+       %uint = OpTypeInt 32 0 ; type of the kernel argument %i and result type of every printf call
+     %uint_8 = OpConstant %uint 8 ; element count of %string_8
+    %uint_10 = OpConstant %uint 10 ; element count of %string_10
+    %uint_12 = OpConstant %uint 12 ; element count of %string_12
+  %uchar_nul = OpConstant %uchar 0 ; '\0', terminates every format string
+   %uchar_nl = OpConstant %uchar 10 ; '\n'
+   %uchar_sp = OpConstant %uchar 32 ; ' '
+  %uchar_pct = OpConstant %uchar 37 ; '%'
+   %uchar_eq = OpConstant %uchar 61 ; '='
+    %uchar_X = OpConstant %uchar 88 ; 'X'
+    %uchar_d = OpConstant %uchar 100 ; 'd'
+    %uchar_h = OpConstant %uchar 104 ; 'h'
+    %uchar_i = OpConstant %uchar 105 ; 'i'
+    %uchar_o = OpConstant %uchar 111 ; 'o'
+    %uchar_u = OpConstant %uchar 117 ; 'u'
+    %uchar_x = OpConstant %uchar 120 ; 'x'
+   %string_8 = OpTypeArray %uchar %uint_8 ; uchar[8], for the strings without a length modifier
+%cptr_string_8 = OpTypePointer UniformConstant %string_8
+  %string_10 = OpTypeArray %uchar %uint_10 ; uchar[10], for the strings with the h length modifier
+%cptr_string_10 = OpTypePointer UniformConstant %string_10
+  %string_12 = OpTypeArray %uchar %uint_12 ; uchar[12], for the strings with the hh length modifier
+%cptr_string_12 = OpTypePointer UniformConstant %string_12
+       %void = OpTypeVoid
+ %kernel_sig = OpTypeFunction %void %uint ; one 32-bit integer parameter, no return value
+  %cptr_char = OpTypePointer UniformConstant %uchar ; pointer type the string variables are bitcast to before each call
+; Format strings without a length modifier: a constant uchar[8] array and a UniformConstant variable initialized from it.
+    %array_d = OpConstantComposite %string_8 %uchar_d %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_d %uchar_nl %uchar_nul ; "d = %d\n"
+   %string_d = OpVariable %cptr_string_8 UniformConstant %array_d
+    %array_i = OpConstantComposite %string_8 %uchar_i %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_i %uchar_nl %uchar_nul ; "i = %i\n"
+   %string_i = OpVariable %cptr_string_8 UniformConstant %array_i
+    %array_o = OpConstantComposite %string_8 %uchar_o %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_o %uchar_nl %uchar_nul ; "o = %o\n"
+   %string_o = OpVariable %cptr_string_8 UniformConstant %array_o
+    %array_u = OpConstantComposite %string_8 %uchar_u %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_u %uchar_nl %uchar_nul ; "u = %u\n"
+   %string_u = OpVariable %cptr_string_8 UniformConstant %array_u
+    %array_x = OpConstantComposite %string_8 %uchar_x %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_x %uchar_nl %uchar_nul ; "x = %x\n"
+   %string_x = OpVariable %cptr_string_8 UniformConstant %array_x
+    %array_X = OpConstantComposite %string_8 %uchar_X %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_X %uchar_nl %uchar_nul ; "X = %X\n"
+   %string_X = OpVariable %cptr_string_8 UniformConstant %array_X
+; Format strings with the h length modifier: uchar[10] each.
+   %array_hd = OpConstantComposite %string_10 %uchar_h %uchar_d %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_d %uchar_nl %uchar_nul ; "hd = %hd\n"
+  %string_hd = OpVariable %cptr_string_10 UniformConstant %array_hd
+   %array_hi = OpConstantComposite %string_10 %uchar_h %uchar_i %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_i %uchar_nl %uchar_nul ; "hi = %hi\n"
+  %string_hi = OpVariable %cptr_string_10 UniformConstant %array_hi
+   %array_ho = OpConstantComposite %string_10 %uchar_h %uchar_o %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_o %uchar_nl %uchar_nul ; "ho = %ho\n"
+  %string_ho = OpVariable %cptr_string_10 UniformConstant %array_ho
+   %array_hu = OpConstantComposite %string_10 %uchar_h %uchar_u %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_u %uchar_nl %uchar_nul ; "hu = %hu\n"
+  %string_hu = OpVariable %cptr_string_10 UniformConstant %array_hu
+   %array_hx = OpConstantComposite %string_10 %uchar_h %uchar_x %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_x %uchar_nl %uchar_nul ; "hx = %hx\n"
+  %string_hx = OpVariable %cptr_string_10 UniformConstant %array_hx
+   %array_hX = OpConstantComposite %string_10 %uchar_h %uchar_X %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_X %uchar_nl %uchar_nul ; "hX = %hX\n"
+  %string_hX = OpVariable %cptr_string_10 UniformConstant %array_hX
+; Format strings with the hh length modifier: uchar[12] each.
+  %array_hhd = OpConstantComposite %string_12 %uchar_h %uchar_h %uchar_d %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_h %uchar_d %uchar_nl %uchar_nul ; "hhd = %hhd\n"
+ %string_hhd = OpVariable %cptr_string_12 UniformConstant %array_hhd
+  %array_hhi = OpConstantComposite %string_12 %uchar_h %uchar_h %uchar_i %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_h %uchar_i %uchar_nl %uchar_nul ; "hhi = %hhi\n"
+ %string_hhi = OpVariable %cptr_string_12 UniformConstant %array_hhi
+  %array_hho = OpConstantComposite %string_12 %uchar_h %uchar_h %uchar_o %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_h %uchar_o %uchar_nl %uchar_nul ; "hho = %hho\n"
+ %string_hho = OpVariable %cptr_string_12 UniformConstant %array_hho
+  %array_hhu = OpConstantComposite %string_12 %uchar_h %uchar_h %uchar_u %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_h %uchar_u %uchar_nl %uchar_nul ; "hhu = %hhu\n"
+ %string_hhu = OpVariable %cptr_string_12 UniformConstant %array_hhu
+  %array_hhx = OpConstantComposite %string_12 %uchar_h %uchar_h %uchar_x %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_h %uchar_x %uchar_nl %uchar_nul ; "hhx = %hhx\n"
+ %string_hhx = OpVariable %cptr_string_12 UniformConstant %array_hhx
+  %array_hhX = OpConstantComposite %string_12 %uchar_h %uchar_h %uchar_X %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_h %uchar_X %uchar_nl %uchar_nul ; "hhX = %hhX\n"
+ %string_hhX = OpVariable %cptr_string_12 UniformConstant %array_hhX
+; Kernel: bitcast each string variable to %cptr_char and call printf with it; the same 32-bit %i is passed for every conversion, h and hh included.
+       %test = OpFunction %void None %kernel_sig
+          %i = OpFunctionParameter %uint
+      %entry = OpLabel
+      %fmt_d = OpBitcast %cptr_char %string_d
+   %printf_d = OpExtInst %uint %clext printf %fmt_d %i
+      %fmt_i = OpBitcast %cptr_char %string_i
+   %printf_i = OpExtInst %uint %clext printf %fmt_i %i
+      %fmt_o = OpBitcast %cptr_char %string_o
+   %printf_o = OpExtInst %uint %clext printf %fmt_o %i
+      %fmt_u = OpBitcast %cptr_char %string_u
+   %printf_u = OpExtInst %uint %clext printf %fmt_u %i
+      %fmt_x = OpBitcast %cptr_char %string_x
+   %printf_x = OpExtInst %uint %clext printf %fmt_x %i
+      %fmt_X = OpBitcast %cptr_char %string_X
+   %printf_X = OpExtInst %uint %clext printf %fmt_X %i
+
+     %fmt_hd = OpBitcast %cptr_char %string_hd
+  %printf_hd = OpExtInst %uint %clext printf %fmt_hd %i
+     %fmt_hi = OpBitcast %cptr_char %string_hi
+  %printf_hi = OpExtInst %uint %clext printf %fmt_hi %i
+     %fmt_ho = OpBitcast %cptr_char %string_ho
+  %printf_ho = OpExtInst %uint %clext printf %fmt_ho %i
+     %fmt_hu = OpBitcast %cptr_char %string_hu
+  %printf_hu = OpExtInst %uint %clext printf %fmt_hu %i
+     %fmt_hx = OpBitcast %cptr_char %string_hx
+  %printf_hx = OpExtInst %uint %clext printf %fmt_hx %i
+     %fmt_hX = OpBitcast %cptr_char %string_hX
+  %printf_hX = OpExtInst %uint %clext printf %fmt_hX %i
+
+    %fmt_hhd = OpBitcast %cptr_char %string_hhd
+ %printf_hhd = OpExtInst %uint %clext printf %fmt_hhd %i
+    %fmt_hhi = OpBitcast %cptr_char %string_hhi
+ %printf_hhi = OpExtInst %uint %clext printf %fmt_hhi %i
+    %fmt_hho = OpBitcast %cptr_char %string_hho
+ %printf_hho = OpExtInst %uint %clext printf %fmt_hho %i
+    %fmt_hhu = OpBitcast %cptr_char %string_hhu
+ %printf_hhu = OpExtInst %uint %clext printf %fmt_hhu %i
+    %fmt_hhx = OpBitcast %cptr_char %string_hhx
+ %printf_hhx = OpExtInst %uint %clext printf %fmt_hhx %i
+    %fmt_hhX = OpBitcast %cptr_char %string_hhX
+ %printf_hhX = OpExtInst %uint %clext printf %fmt_hhX %i
+
+               OpReturn
+               OpFunctionEnd
--- a/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_int32.spvasm64
+++ b/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_int32.spvasm64
@@ -0,0 +1,140 @@
+; kernel void printf_operands_scalar_int32(int i)
+; {
+;     printf("d = %d\n", i);
+;     printf("i = %i\n", i);
+;     printf("o = %o\n", i);
+;     printf("u = %u\n", i);
+;     printf("x = %x\n", i);
+;     printf("X = %X\n", i);
+;
+;     printf("hd = %hd\n", i);
+;     printf("hi = %hi\n", i);
+;     printf("ho = %ho\n", i);
+;     printf("hu = %hu\n", i);
+;     printf("hx = %hx\n", i);
+;     printf("hX = %hX\n", i);
+;
+;     printf("hhd = %hhd\n", i);
+;     printf("hhi = %hhi\n", i);
+;     printf("hho = %hho\n", i);
+;     printf("hhu = %hhu\n", i);
+;     printf("hhx = %hhx\n", i);
+;     printf("hhX = %hhX\n", i);
+; }
+               OpCapability Addresses ; OpMemoryModel below selects the Physical64 addressing model
+               OpCapability Linkage ; NOTE(review): no linkage decoration is visible in this module - confirm this is needed
+               OpCapability Kernel ; the entry point below uses the Kernel execution model
+               OpCapability Int8 ; for %uchar, the element type of the format strings
+        %clext = OpExtInstImport "OpenCL.std"
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %test "printf_operands_scalar_int32"
+      %uchar = OpTypeInt 8 0
+       %uint = OpTypeInt 32 0 ; type of the kernel argument %i and result type of every printf call
+     %uint_8 = OpConstant %uint 8 ; element count of %string_8
+    %uint_10 = OpConstant %uint 10 ; element count of %string_10
+    %uint_12 = OpConstant %uint 12 ; element count of %string_12
+  %uchar_nul = OpConstant %uchar 0 ; '\0', terminates every format string
+   %uchar_nl = OpConstant %uchar 10 ; '\n'
+   %uchar_sp = OpConstant %uchar 32 ; ' '
+  %uchar_pct = OpConstant %uchar 37 ; '%'
+   %uchar_eq = OpConstant %uchar 61 ; '='
+    %uchar_X = OpConstant %uchar 88 ; 'X'
+    %uchar_d = OpConstant %uchar 100 ; 'd'
+    %uchar_h = OpConstant %uchar 104 ; 'h'
+    %uchar_i = OpConstant %uchar 105 ; 'i'
+    %uchar_o = OpConstant %uchar 111 ; 'o'
+    %uchar_u = OpConstant %uchar 117 ; 'u'
+    %uchar_x = OpConstant %uchar 120 ; 'x'
+   %string_8 = OpTypeArray %uchar %uint_8 ; uchar[8], for the strings without a length modifier
+%cptr_string_8 = OpTypePointer UniformConstant %string_8
+  %string_10 = OpTypeArray %uchar %uint_10 ; uchar[10], for the strings with the h length modifier
+%cptr_string_10 = OpTypePointer UniformConstant %string_10
+  %string_12 = OpTypeArray %uchar %uint_12 ; uchar[12], for the strings with the hh length modifier
+%cptr_string_12 = OpTypePointer UniformConstant %string_12
+       %void = OpTypeVoid
+ %kernel_sig = OpTypeFunction %void %uint ; one 32-bit integer parameter, no return value
+  %cptr_char = OpTypePointer UniformConstant %uchar ; pointer type the string variables are bitcast to before each call
+; Format strings without a length modifier: a constant uchar[8] array and a UniformConstant variable initialized from it.
+    %array_d = OpConstantComposite %string_8 %uchar_d %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_d %uchar_nl %uchar_nul ; "d = %d\n"
+   %string_d = OpVariable %cptr_string_8 UniformConstant %array_d
+    %array_i = OpConstantComposite %string_8 %uchar_i %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_i %uchar_nl %uchar_nul ; "i = %i\n"
+   %string_i = OpVariable %cptr_string_8 UniformConstant %array_i
+    %array_o = OpConstantComposite %string_8 %uchar_o %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_o %uchar_nl %uchar_nul ; "o = %o\n"
+   %string_o = OpVariable %cptr_string_8 UniformConstant %array_o
+    %array_u = OpConstantComposite %string_8 %uchar_u %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_u %uchar_nl %uchar_nul ; "u = %u\n"
+   %string_u = OpVariable %cptr_string_8 UniformConstant %array_u
+    %array_x = OpConstantComposite %string_8 %uchar_x %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_x %uchar_nl %uchar_nul ; "x = %x\n"
+   %string_x = OpVariable %cptr_string_8 UniformConstant %array_x
+    %array_X = OpConstantComposite %string_8 %uchar_X %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_X %uchar_nl %uchar_nul ; "X = %X\n"
+   %string_X = OpVariable %cptr_string_8 UniformConstant %array_X
+; Format strings with the h length modifier: uchar[10] each.
+   %array_hd = OpConstantComposite %string_10 %uchar_h %uchar_d %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_d %uchar_nl %uchar_nul ; "hd = %hd\n"
+  %string_hd = OpVariable %cptr_string_10 UniformConstant %array_hd
+   %array_hi = OpConstantComposite %string_10 %uchar_h %uchar_i %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_i %uchar_nl %uchar_nul ; "hi = %hi\n"
+  %string_hi = OpVariable %cptr_string_10 UniformConstant %array_hi
+   %array_ho = OpConstantComposite %string_10 %uchar_h %uchar_o %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_o %uchar_nl %uchar_nul ; "ho = %ho\n"
+  %string_ho = OpVariable %cptr_string_10 UniformConstant %array_ho
+   %array_hu = OpConstantComposite %string_10 %uchar_h %uchar_u %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_u %uchar_nl %uchar_nul ; "hu = %hu\n"
+  %string_hu = OpVariable %cptr_string_10 UniformConstant %array_hu
+   %array_hx = OpConstantComposite %string_10 %uchar_h %uchar_x %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_x %uchar_nl %uchar_nul ; "hx = %hx\n"
+  %string_hx = OpVariable %cptr_string_10 UniformConstant %array_hx
+   %array_hX = OpConstantComposite %string_10 %uchar_h %uchar_X %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_X %uchar_nl %uchar_nul ; "hX = %hX\n"
+  %string_hX = OpVariable %cptr_string_10 UniformConstant %array_hX
+; Format strings with the hh length modifier: uchar[12] each.
+  %array_hhd = OpConstantComposite %string_12 %uchar_h %uchar_h %uchar_d %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_h %uchar_d %uchar_nl %uchar_nul ; "hhd = %hhd\n"
+ %string_hhd = OpVariable %cptr_string_12 UniformConstant %array_hhd
+  %array_hhi = OpConstantComposite %string_12 %uchar_h %uchar_h %uchar_i %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_h %uchar_i %uchar_nl %uchar_nul ; "hhi = %hhi\n"
+ %string_hhi = OpVariable %cptr_string_12 UniformConstant %array_hhi
+  %array_hho = OpConstantComposite %string_12 %uchar_h %uchar_h %uchar_o %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_h %uchar_o %uchar_nl %uchar_nul ; "hho = %hho\n"
+ %string_hho = OpVariable %cptr_string_12 UniformConstant %array_hho
+  %array_hhu = OpConstantComposite %string_12 %uchar_h %uchar_h %uchar_u %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_h %uchar_u %uchar_nl %uchar_nul ; "hhu = %hhu\n"
+ %string_hhu = OpVariable %cptr_string_12 UniformConstant %array_hhu
+  %array_hhx = OpConstantComposite %string_12 %uchar_h %uchar_h %uchar_x %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_h %uchar_x %uchar_nl %uchar_nul ; "hhx = %hhx\n"
+ %string_hhx = OpVariable %cptr_string_12 UniformConstant %array_hhx
+  %array_hhX = OpConstantComposite %string_12 %uchar_h %uchar_h %uchar_X %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_h %uchar_h %uchar_X %uchar_nl %uchar_nul ; "hhX = %hhX\n"
+ %string_hhX = OpVariable %cptr_string_12 UniformConstant %array_hhX
+; Kernel: bitcast each string variable to %cptr_char and call printf with it; the same 32-bit %i is passed for every conversion, h and hh included.
+       %test = OpFunction %void None %kernel_sig
+          %i = OpFunctionParameter %uint
+      %entry = OpLabel
+      %fmt_d = OpBitcast %cptr_char %string_d
+   %printf_d = OpExtInst %uint %clext printf %fmt_d %i
+      %fmt_i = OpBitcast %cptr_char %string_i
+   %printf_i = OpExtInst %uint %clext printf %fmt_i %i
+      %fmt_o = OpBitcast %cptr_char %string_o
+   %printf_o = OpExtInst %uint %clext printf %fmt_o %i
+      %fmt_u = OpBitcast %cptr_char %string_u
+   %printf_u = OpExtInst %uint %clext printf %fmt_u %i
+      %fmt_x = OpBitcast %cptr_char %string_x
+   %printf_x = OpExtInst %uint %clext printf %fmt_x %i
+      %fmt_X = OpBitcast %cptr_char %string_X
+   %printf_X = OpExtInst %uint %clext printf %fmt_X %i
+
+     %fmt_hd = OpBitcast %cptr_char %string_hd
+  %printf_hd = OpExtInst %uint %clext printf %fmt_hd %i
+     %fmt_hi = OpBitcast %cptr_char %string_hi
+  %printf_hi = OpExtInst %uint %clext printf %fmt_hi %i
+     %fmt_ho = OpBitcast %cptr_char %string_ho
+  %printf_ho = OpExtInst %uint %clext printf %fmt_ho %i
+     %fmt_hu = OpBitcast %cptr_char %string_hu
+  %printf_hu = OpExtInst %uint %clext printf %fmt_hu %i
+     %fmt_hx = OpBitcast %cptr_char %string_hx
+  %printf_hx = OpExtInst %uint %clext printf %fmt_hx %i
+     %fmt_hX = OpBitcast %cptr_char %string_hX
+  %printf_hX = OpExtInst %uint %clext printf %fmt_hX %i
+
+    %fmt_hhd = OpBitcast %cptr_char %string_hhd
+ %printf_hhd = OpExtInst %uint %clext printf %fmt_hhd %i
+    %fmt_hhi = OpBitcast %cptr_char %string_hhi
+ %printf_hhi = OpExtInst %uint %clext printf %fmt_hhi %i
+    %fmt_hho = OpBitcast %cptr_char %string_hho
+ %printf_hho = OpExtInst %uint %clext printf %fmt_hho %i
+    %fmt_hhu = OpBitcast %cptr_char %string_hhu
+ %printf_hhu = OpExtInst %uint %clext printf %fmt_hhu %i
+    %fmt_hhx = OpBitcast %cptr_char %string_hhx
+ %printf_hhx = OpExtInst %uint %clext printf %fmt_hhx %i
+    %fmt_hhX = OpBitcast %cptr_char %string_hhX
+ %printf_hhX = OpExtInst %uint %clext printf %fmt_hhX %i
+
+               OpReturn
+               OpFunctionEnd
--- a/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_int64.spvasm32
+++ b/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_int64.spvasm32
@@ -0,0 +1,77 @@
+; kernel void printf_operands_scalar_int64(long l)
+; {
+;     printf("ld = %ld\n", l);
+;     printf("li = %li\n", l);
+;     printf("lo = %lo\n", l);
+;     printf("lu = %lu\n", l);
+;     printf("lx = %lx\n", l);
+;     printf("lX = %lX\n", l);
+; }
+               OpCapability Addresses ; OpMemoryModel below selects the Physical32 addressing model
+               OpCapability Linkage ; NOTE(review): no linkage decoration is visible in this module - confirm this is needed
+               OpCapability Kernel ; the entry point below uses the Kernel execution model
+               OpCapability Int64 ; for %ulong, the type of the kernel argument
+               OpCapability Int8 ; for %uchar, the element type of the format strings
+        %clext = OpExtInstImport "OpenCL.std"
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %test "printf_operands_scalar_int64"
+      %uchar = OpTypeInt 8 0
+      %ulong = OpTypeInt 64 0 ; type of the kernel argument %l
+       %uint = OpTypeInt 32 0 ; result type of every printf call below
+     %uint_8 = OpConstant %uint 8 ; element count of %string_8
+    %uint_10 = OpConstant %uint 10 ; element count of %string_10
+    %uint_12 = OpConstant %uint 12 ; element count of %string_12
+    %uchar_nul = OpConstant %uchar 0 ; '\0', terminates every format string
+   %uchar_nl = OpConstant %uchar 10 ; '\n'
+   %uchar_sp = OpConstant %uchar 32 ; ' '
+  %uchar_pct = OpConstant %uchar 37 ; '%'
+   %uchar_eq = OpConstant %uchar 61 ; '='
+    %uchar_X = OpConstant %uchar 88 ; 'X'
+    %uchar_d = OpConstant %uchar 100 ; 'd'
+    %uchar_i = OpConstant %uchar 105 ; 'i'
+    %uchar_l = OpConstant %uchar 108 ; 'l'
+    %uchar_o = OpConstant %uchar 111 ; 'o'
+    %uchar_u = OpConstant %uchar 117 ; 'u'
+    %uchar_x = OpConstant %uchar 120 ; 'x'
+   %string_8 = OpTypeArray %uchar %uint_8
+%cptr_string_8 = OpTypePointer UniformConstant %string_8 ; NOTE(review): no variable of this type is declared in this module
+  %string_10 = OpTypeArray %uchar %uint_10 ; uchar[10], the type of every format string in this module
+%cptr_string_10 = OpTypePointer UniformConstant %string_10
+  %string_12 = OpTypeArray %uchar %uint_12
+%cptr_string_12 = OpTypePointer UniformConstant %string_12 ; NOTE(review): no variable of this type is declared in this module
+       %void = OpTypeVoid
+      %float = OpTypeFloat 32 ; NOTE(review): not referenced in this module
+ %kernel_sig = OpTypeFunction %void %ulong ; one 64-bit integer parameter, no return value
+  %cptr_char = OpTypePointer UniformConstant %uchar ; pointer type the string variables are bitcast to before each call
+; Format strings, all with the l length modifier: a constant uchar[10] array and a UniformConstant variable initialized from it.
+   %array_ld = OpConstantComposite %string_10 %uchar_l %uchar_d %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_l %uchar_d %uchar_nl %uchar_nul ; "ld = %ld\n"
+  %string_ld = OpVariable %cptr_string_10 UniformConstant %array_ld
+   %array_li = OpConstantComposite %string_10 %uchar_l %uchar_i %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_l %uchar_i %uchar_nl %uchar_nul ; "li = %li\n"
+  %string_li = OpVariable %cptr_string_10 UniformConstant %array_li
+   %array_lo = OpConstantComposite %string_10 %uchar_l %uchar_o %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_l %uchar_o %uchar_nl %uchar_nul ; "lo = %lo\n"
+  %string_lo = OpVariable %cptr_string_10 UniformConstant %array_lo
+   %array_lu = OpConstantComposite %string_10 %uchar_l %uchar_u %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_l %uchar_u %uchar_nl %uchar_nul ; "lu = %lu\n"
+  %string_lu = OpVariable %cptr_string_10 UniformConstant %array_lu
+   %array_lx = OpConstantComposite %string_10 %uchar_l %uchar_x %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_l %uchar_x %uchar_nl %uchar_nul ; "lx = %lx\n"
+  %string_lx = OpVariable %cptr_string_10 UniformConstant %array_lx
+   %array_lX = OpConstantComposite %string_10 %uchar_l %uchar_X %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_l %uchar_X %uchar_nl %uchar_nul ; "lX = %lX\n"
+  %string_lX = OpVariable %cptr_string_10 UniformConstant %array_lX
+; Kernel: for each format string, bitcast its variable to %cptr_char and call printf from the %clext import with it and %l.
+       %test = OpFunction %void None %kernel_sig
+          %l = OpFunctionParameter %ulong ; the single argument passed to all six printf calls
+      %entry = OpLabel
+; The %uint result of each printf call below is not used.
+     %fmt_ld = OpBitcast %cptr_char %string_ld
+  %printf_ld = OpExtInst %uint %clext printf %fmt_ld %l
+     %fmt_li = OpBitcast %cptr_char %string_li
+  %printf_li = OpExtInst %uint %clext printf %fmt_li %l
+     %fmt_lo = OpBitcast %cptr_char %string_lo
+  %printf_lo = OpExtInst %uint %clext printf %fmt_lo %l
+     %fmt_lu = OpBitcast %cptr_char %string_lu
+  %printf_lu = OpExtInst %uint %clext printf %fmt_lu %l
+     %fmt_lx = OpBitcast %cptr_char %string_lx
+  %printf_lx = OpExtInst %uint %clext printf %fmt_lx %l
+     %fmt_lX = OpBitcast %cptr_char %string_lX
+  %printf_lX = OpExtInst %uint %clext printf %fmt_lX %l
+               OpReturn
+               OpFunctionEnd
--- a/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_int64.spvasm64
+++ b/test_conformance/spirv_new/spirv_asm/printf_operands_scalar_int64.spvasm64
@@ -0,0 +1,77 @@
+; kernel void printf_operands_scalar_int64(long l)
+; {
+;     printf("ld = %ld\n", l);
+;     printf("li = %li\n", l);
+;     printf("lo = %lo\n", l);
+;     printf("lu = %lu\n", l);
+;     printf("lx = %lx\n", l);
+;     printf("lX = %lX\n", l);
+; }
+               OpCapability Addresses ; OpMemoryModel below selects the Physical64 addressing model
+               OpCapability Linkage ; NOTE(review): no linkage decoration is visible in this module - confirm this is needed
+               OpCapability Kernel ; the entry point below uses the Kernel execution model
+               OpCapability Int64 ; for %ulong, the type of the kernel argument
+               OpCapability Int8 ; for %uchar, the element type of the format strings
+        %clext = OpExtInstImport "OpenCL.std"
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %test "printf_operands_scalar_int64"
+      %uchar = OpTypeInt 8 0
+      %ulong = OpTypeInt 64 0 ; type of the kernel argument %l
+       %uint = OpTypeInt 32 0 ; result type of every printf call below
+     %uint_8 = OpConstant %uint 8 ; element count of %string_8
+    %uint_10 = OpConstant %uint 10 ; element count of %string_10
+    %uint_12 = OpConstant %uint 12 ; element count of %string_12
+    %uchar_nul = OpConstant %uchar 0 ; '\0', terminates every format string
+   %uchar_nl = OpConstant %uchar 10 ; '\n'
+   %uchar_sp = OpConstant %uchar 32 ; ' '
+  %uchar_pct = OpConstant %uchar 37 ; '%'
+   %uchar_eq = OpConstant %uchar 61 ; '='
+    %uchar_X = OpConstant %uchar 88 ; 'X'
+    %uchar_d = OpConstant %uchar 100 ; 'd'
+    %uchar_i = OpConstant %uchar 105 ; 'i'
+    %uchar_l = OpConstant %uchar 108 ; 'l'
+    %uchar_o = OpConstant %uchar 111 ; 'o'
+    %uchar_u = OpConstant %uchar 117 ; 'u'
+    %uchar_x = OpConstant %uchar 120 ; 'x'
+   %string_8 = OpTypeArray %uchar %uint_8
+%cptr_string_8 = OpTypePointer UniformConstant %string_8 ; NOTE(review): no variable of this type is declared in this module
+  %string_10 = OpTypeArray %uchar %uint_10 ; uchar[10], the type of every format string in this module
+%cptr_string_10 = OpTypePointer UniformConstant %string_10
+  %string_12 = OpTypeArray %uchar %uint_12
+%cptr_string_12 = OpTypePointer UniformConstant %string_12 ; NOTE(review): no variable of this type is declared in this module
+       %void = OpTypeVoid
+      %float = OpTypeFloat 32 ; NOTE(review): not referenced in this module
+ %kernel_sig = OpTypeFunction %void %ulong ; one 64-bit integer parameter, no return value
+  %cptr_char = OpTypePointer UniformConstant %uchar ; pointer type the string variables are bitcast to before each call
+; Format strings, all with the l length modifier: a constant uchar[10] array and a UniformConstant variable initialized from it.
+   %array_ld = OpConstantComposite %string_10 %uchar_l %uchar_d %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_l %uchar_d %uchar_nl %uchar_nul ; "ld = %ld\n"
+  %string_ld = OpVariable %cptr_string_10 UniformConstant %array_ld
+   %array_li = OpConstantComposite %string_10 %uchar_l %uchar_i %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_l %uchar_i %uchar_nl %uchar_nul ; "li = %li\n"
+  %string_li = OpVariable %cptr_string_10 UniformConstant %array_li
+   %array_lo = OpConstantComposite %string_10 %uchar_l %uchar_o %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_l %uchar_o %uchar_nl %uchar_nul ; "lo = %lo\n"
+  %string_lo = OpVariable %cptr_string_10 UniformConstant %array_lo
+   %array_lu = OpConstantComposite %string_10 %uchar_l %uchar_u %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_l %uchar_u %uchar_nl %uchar_nul ; "lu = %lu\n"
+  %string_lu = OpVariable %cptr_string_10 UniformConstant %array_lu
+   %array_lx = OpConstantComposite %string_10 %uchar_l %uchar_x %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_l %uchar_x %uchar_nl %uchar_nul ; "lx = %lx\n"
+  %string_lx = OpVariable %cptr_string_10 UniformConstant %array_lx
+   %array_lX = OpConstantComposite %string_10 %uchar_l %uchar_X %uchar_sp %uchar_eq %uchar_sp %uchar_pct %uchar_l %uchar_X %uchar_nl %uchar_nul ; "lX = %lX\n"
+  %string_lX = OpVariable %cptr_string_10 UniformConstant %array_lX
+; Kernel: for each format string, bitcast its variable to %cptr_char and call printf from the %clext import with it and %l.
+       %test = OpFunction %void None %kernel_sig
+          %l = OpFunctionParameter %ulong ; the single argument passed to all six printf calls
+      %entry = OpLabel
+; The %uint result of each printf call below is not used.
+     %fmt_ld = OpBitcast %cptr_char %string_ld
+  %printf_ld = OpExtInst %uint %clext printf %fmt_ld %l
+     %fmt_li = OpBitcast %cptr_char %string_li
+  %printf_li = OpExtInst %uint %clext printf %fmt_li %l
+     %fmt_lo = OpBitcast %cptr_char %string_lo
+  %printf_lo = OpExtInst %uint %clext printf %fmt_lo %l
+     %fmt_lu = OpBitcast %cptr_char %string_lu
+  %printf_lu = OpExtInst %uint %clext printf %fmt_lu %l
+     %fmt_lx = OpBitcast %cptr_char %string_lx
+  %printf_lx = OpExtInst %uint %clext printf %fmt_lx %l
+     %fmt_lX = OpBitcast %cptr_char %string_lX
+  %printf_lX = OpExtInst %uint %clext printf %fmt_lX %l
+               OpReturn
+               OpFunctionEnd
--- a/test_conformance/spirv_new/test_extinst_printf.cpp
+++ b/test_conformance/spirv_new/test_extinst_printf.cpp
@@ -0,0 +1,259 @@
+//
+// Copyright (c) 2025 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "harness/os_helpers.h"
+#include "testBase.h"
+
+#if defined(_WIN32)
+#include <io.h>
+#define streamDup(fd1) _dup(fd1)
+#define streamDup2(fd1, fd2) _dup2(fd1, fd2)
+#else
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+#include <unistd.h>
+#define streamDup(fd1) dup(fd1)
+#define streamDup2(fd1, fd2) dup2(fd1, fd2)
+#endif
+
+#include <fstream>
+#include <vector>
+
+// TODO: Unify with test_printf.
+//
+// Redirects the process-wide stdout stream into a temporary file so that text
+// printed while a kernel runs can be read back and compared.
+//
+// Typical use: acquire() to start capturing, release() to restore stdout, then
+// get_results() to fetch what was written in between. The destructor restores
+// stdout if the grabber is still acquired.
+struct StreamGrabber
+{
+    StreamGrabber()
+    {
+        // get_temp_filename() hands back a heap-allocated C string that the
+        // caller must free(); keep a copy and release the original.
+        char* tmp = get_temp_filename();
+        if (tmp != nullptr)
+        {
+            tempFileName = tmp;
+            free(tmp);
+        }
+    }
+    ~StreamGrabber()
+    {
+        if (acquired)
+        {
+            release();
+        }
+    }
+
+    // The grabber owns a duplicated file descriptor; copying it would close
+    // that descriptor twice.
+    StreamGrabber(const StreamGrabber&) = delete;
+    StreamGrabber& operator=(const StreamGrabber&) = delete;
+
+    // Starts capturing stdout into the temporary file.
+    // Returns 0 on success (or if already capturing), -1 on failure.
+    int acquire(void)
+    {
+        if (acquired)
+        {
+            return 0;
+        }
+
+        // Keep a duplicate of the current stdout descriptor so release() can
+        // put it back later.
+        old_fd = streamDup(fileno(stdout));
+        if (old_fd < 0)
+        {
+            return -1;
+        }
+
+        if (!freopen(tempFileName.c_str(), "w", stdout))
+        {
+            // `acquired` is still false, so release() would do nothing here;
+            // close the duplicate directly so the descriptor is not leaked.
+            close(old_fd);
+            old_fd = -1;
+            return -1;
+        }
+
+        acquired = true;
+        return 0;
+    }
+
+    // Stops capturing and points stdout back at its original descriptor.
+    // Returns 0 on success (or if not capturing), -1 if stdout could not be
+    // restored.
+    int release(void)
+    {
+        if (!acquired)
+        {
+            return 0;
+        }
+
+        // Push buffered output into the capture file before switching back.
+        fflush(stdout);
+        const int restored = streamDup2(old_fd, fileno(stdout));
+        close(old_fd);
+        old_fd = -1;
+        acquired = false;
+        return (restored < 0) ? -1 : 0;
+    }
+
+    // Reads the whole capture file into `results`.
+    // Only valid after release(); returns 0 on success and -1 if capturing is
+    // still active or the file cannot be opened or measured. `results` is
+    // left untouched on failure.
+    int get_results(std::string& results)
+    {
+        if (acquired)
+        {
+            return -1;
+        }
+
+        std::ifstream is(tempFileName, std::ios::binary);
+        if (!is.good())
+        {
+            return -1;
+        }
+
+        is.seekg(0, std::ios::end);
+        const std::streamoff endPos = is.tellg();
+        if (endPos < 0)
+        {
+            // tellg() reports failure as -1; converting that to size_t would
+            // request an enormous allocation.
+            return -1;
+        }
+        is.seekg(0, std::ios::beg);
+
+        results.clear();
+        results.resize(static_cast<size_t>(endPos));
+        if (!results.empty())
+        {
+            is.read(&results[0], static_cast<std::streamsize>(results.size()));
+        }
+
+        return 0;
+    }
+
+    std::string tempFileName;
+    int old_fd = -1; // duplicate of the original stdout descriptor, or -1
+    bool acquired = false; // true while stdout is redirected to the file
+};
+
+// Callback passed as CL_PRINTF_CALLBACK_ARM (cl_arm_printf): writes the `len`
+// bytes at `printf_data` to stdout. `final` and `user_data` are not used.
+void CL_CALLBACK printfCallBack(const char* printf_data, size_t len,
+                                size_t final, void* user_data)
+{
+    (void)final;
+    (void)user_data;
+    fwrite(printf_data, sizeof(char), len, stdout);
+}
+
+// Builds the SPIR-V module `spirvFileName`, runs `kernelName` with a global
+// size of one and `value` as its only argument, and compares everything
+// written to stdout while the kernel ran against `expectedResults`.
+//
+// `value` is passed to clSetKernelArg with sizeof(T), so T must have exactly
+// the size of the kernel parameter.
+//
+// Returns TEST_PASS when the captured text equals `expectedResults`, and
+// TEST_FAIL when it differs or when stdout could not be captured or read
+// back. Failing OpenCL calls are reported through test_error().
+template <typename T>
+static int printf_operands_helper(cl_device_id device,
+                                  const char* spirvFileName,
+                                  const char* kernelName,
+                                  const char* expectedResults, T value)
+{
+    StreamGrabber grabber;
+    cl_int error;
+
+    // Create a context and a queue to test with.
+    // We cannot use the context and queue from the harness because some
+    // implementations require a printf callback to be set at context creation.
+
+    cl_context_properties printf_properties[] = {
+        CL_PRINTF_CALLBACK_ARM, (cl_context_properties)printfCallBack,
+        CL_PRINTF_BUFFERSIZE_ARM, 256, 0
+    };
+
+    // The ARM printf properties are only supplied when the device reports
+    // cl_arm_printf; otherwise the context is created without properties.
+    cl_context_properties* props =
+        is_extension_available(device, "cl_arm_printf") ? printf_properties
+                                                        : nullptr;
+
+    clContextWrapper context =
+        clCreateContext(props, 1, &device, notify_callback, nullptr, &error);
+    test_error(error, "Unable to create printf context");
+
+    clCommandQueueWrapper queue =
+        clCreateCommandQueue(context, device, 0, &error);
+    test_error(error, "Unable to create printf queue");
+
+    clProgramWrapper program;
+    error = get_program_with_il(program, device, context, spirvFileName);
+    test_error(error, "Unable to build SPIR-V program");
+
+    clKernelWrapper kernel = clCreateKernel(program, kernelName, &error);
+    test_error(error, "Unable to create SPIR-V kernel");
+
+    error = clSetKernelArg(kernel, 0, sizeof(value), &value);
+    test_error(error, "Unable to set kernel arguments");
+
+    // Without a working redirect there is nothing to compare against, so
+    // stop here instead of running the kernel.
+    if (grabber.acquire() != 0)
+    {
+        log_error("Unable to redirect stdout to capture printf output.\n");
+        return TEST_FAIL;
+    }
+
+    // Keep the two status codes separate (OR-ing them can produce a value
+    // that is not a real error code) and restore stdout before reporting.
+    size_t global = 1;
+    const cl_int enqueueError = clEnqueueNDRangeKernel(
+        queue, kernel, 1, nullptr, &global, nullptr, 0, nullptr, nullptr);
+    const cl_int finishError = clFinish(queue);
+    grabber.release();
+    test_error(enqueueError, "unable to enqueue kernel");
+    test_error(finishError, "unable to finish queue");
+
+    std::string results;
+    if (grabber.get_results(results) != 0)
+    {
+        log_error("Unable to read back captured printf output.\n");
+        return TEST_FAIL;
+    }
+
+    if (results != expectedResults)
+    {
+        log_error("Results do not match.\n");
+        log_error("Expected: \n---\n%s---\n", expectedResults);
+        log_error("Got: \n---\n%s---\n", results.c_str());
+        return TEST_FAIL;
+    }
+
+    return TEST_PASS;
+}
+
+// Runs the printf_operands_scalar_int32 SPIR-V kernel with the integer
+// argument 1 and compares the captured printf output with the expected text.
+REGISTER_TEST(extinst_printf_operands_scalar_int32)
+{
+    // The SPIR-V file name and the kernel name passed to the helper are the
+    // same string.
+    const char* const name = "printf_operands_scalar_int32";
+    const char* const expectedOutput = R"(d = 1
+i = 1
+o = 1
+u = 1
+x = 1
+X = 1
+hd = 1
+hi = 1
+ho = 1
+hu = 1
+hx = 1
+hX = 1
+hhd = 1
+hhi = 1
+hho = 1
+hhu = 1
+hhx = 1
+hhX = 1
+)";
+
+    return printf_operands_helper(device, name, name, expectedOutput, 1);
+}
+
+// Runs the printf_operands_scalar_fp32 SPIR-V kernel with the float argument
+// 2.0f and compares the captured printf output with the expected text.
+REGISTER_TEST(extinst_printf_operands_scalar_fp32)
+{
+    // The SPIR-V file name and the kernel name passed to the helper are the
+    // same string.
+    const char* const name = "printf_operands_scalar_fp32";
+    const char* const expectedOutput = R"(a = 0x1.0p+1
+A = 0X1.0P+1
+e = 2.0e+00
+E = 2.0E+00
+f = 2.0
+F = 2.0
+g = 2
+G = 2
+)";
+
+    return printf_operands_helper(device, name, name, expectedOutput, 2.0f);
+}
+
+// Runs the printf_operands_scalar_int64 SPIR-V kernel with the 64-bit integer
+// argument 4 and compares the captured printf output with the expected text.
+// Skipped when gHasLong is false.
+REGISTER_TEST(extinst_printf_operands_scalar_int64)
+{
+    static const char* expected = R"(ld = 4
+li = 4
+lo = 4
+lu = 4
+lx = 4
+lX = 4
+)";
+
+    if (!gHasLong)
+    {
+        log_info("Device does not support 64-bit integers. Skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
+
+    // The helper passes sizeof(value) to clSetKernelArg. A plain `long`
+    // literal is only 4 bytes on LLP64 and 32-bit targets, which would not
+    // match the kernel's 64-bit parameter, so use the fixed-width OpenCL type.
+    return printf_operands_helper(device, "printf_operands_scalar_int64",
+                                  "printf_operands_scalar_int64", expected,
+                                  static_cast<cl_ulong>(4));
+}
+
+// Runs the printf_operands_scalar_fp64 SPIR-V kernel with the double argument
+// 8.0 and compares the captured printf output with the expected text.
+// Skipped when the device does not report cl_khr_fp64.
+REGISTER_TEST(extinst_printf_operands_scalar_fp64)
+{
+    const bool hasFp64 = is_extension_available(device, "cl_khr_fp64");
+    if (!hasFp64)
+    {
+        log_info("Device does not support fp64. Skipping test.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
+
+    // The SPIR-V file name and the kernel name passed to the helper are the
+    // same string.
+    const char* const name = "printf_operands_scalar_fp64";
+    const char* const expectedOutput = R"(a = 0x1.0p+3
+A = 0X1.0P+3
+e = 8.0e+00
+E = 8.0E+00
+f = 8.0
+F = 8.0
+g = 8
+G = 8
+)";
+
+    return printf_operands_helper(device, name, name, expectedOutput, 8.0);
+}
--- a/test_conformance/subgroups/subhelpers.h
+++ b/test_conformance/subgroups/subhelpers.h
@@ -1611,7 +1611,7 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct subgroup_test
        test_params.subgroup_size = subgroup_size;
        Fns::gen(idata.data(), mapin.data(), sgmap.data(), test_params);

-        test_status status;
+        test_status status = TEST_FAIL;

        if (test_params.divergence_mask_arg != -1)
        {
--- a/Show More
+++ b/Show More