diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt
index 957233ca..96433945 100644
--- a/test_conformance/math_brute_force/CMakeLists.txt
+++ b/test_conformance/math_brute_force/CMakeLists.txt
@@ -1,23 +1,36 @@
 set(MODULE_NAME BRUTEFORCE)
 
 set(${MODULE_NAME}_SOURCES
-    binary.cpp
-    binary_i.cpp
-    binary_operator.cpp
-    binary_two_results_i.cpp
+    binary_double.cpp
+    binary_float.cpp
+    binary_i_double.cpp
+    binary_i_float.cpp
+    binary_operator_double.cpp
+    binary_operator_float.cpp
+    binary_two_results_i_double.cpp
+    binary_two_results_i_float.cpp
     function_list.cpp
-    i_unary.cpp
-    macro_binary.cpp
-    macro_unary.cpp
-    mad.cpp
+    i_unary_double.cpp
+    i_unary_float.cpp
+    macro_binary_double.cpp
+    macro_binary_float.cpp
+    macro_unary_double.cpp
+    macro_unary_float.cpp
+    mad_double.cpp
+    mad_float.cpp
     main.cpp
     reference_math.cpp
     sleep.cpp
-    ternary.cpp
-    unary.cpp
-    unary_two_results.cpp
-    unary_two_results_i.cpp
-    unary_u.cpp
+    ternary_double.cpp
+    ternary_float.cpp
+    unary_double.cpp
+    unary_float.cpp
+    unary_two_results_double.cpp
+    unary_two_results_float.cpp
+    unary_two_results_i_double.cpp
+    unary_two_results_i_float.cpp
+    unary_u_double.cpp
+    unary_u_float.cpp
     utility.cpp
 )
 
diff --git a/test_conformance/math_brute_force/binary_double.cpp b/test_conformance/math_brute_force/binary_double.cpp
new file mode 100644
index 00000000..7bff9aca
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_double.cpp
@@ -0,0 +1,947 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+// 2^-1022, spelled through the project's MAKE_HEX_DOUBLE helper. TestDouble
+// gives this magnitude the sign of the second argument and accepts it as an
+// alternative nextafter result when running in flush-to-zero mode.
+const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
+
+// Assemble the OpenCL C source of "math_kernel<suffix>", a kernel that applies
+// the two-argument double builtin `name` to in1[i] / in2[i] and stores the
+// result in out[i], then pass the source fragments to MakeKernels.
+//
+//   name          builtin name pasted into the kernel body
+//   vectorSize    index into sizeNames / sizeValues
+//   kernel_count, k, p, relaxedMode
+//                 forwarded to MakeKernels unchanged
+//
+// Returns the value produced by MakeKernels.
+static int BuildKernelDouble(const char *name, int vectorSize,
+                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                             bool relaxedMode)
+{
+    // Source fragments for every vector width other than 3: plain indexed
+    // loads and stores of doubleN.
+    const char *genericSource[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double",
+        sizeNames[vectorSize],
+        "* out, __global double",
+        sizeNames[vectorSize],
+        "* in1, __global double",
+        sizeNames[vectorSize],
+        "* in2 )\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   out[i] = ",
+        name,
+        "( in1[i], in2[i] );\n"
+        "}\n"
+    };
+
+    // Source fragments for width 3: the kernel goes through vload3 / vstore3
+    // on scalar pointers, and the last work-item handles the one or two
+    // leftover elements by padding the vector with NAN.
+    const char *vec3Source[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       d0 = ",
+        name,
+        "( d0, d1 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, d1 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    // Pick the fragment table that matches the requested width.
+    const bool isVec3 = (sizeValues[vectorSize] == 3);
+    const char **fragments = isVec3 ? vec3Source : genericSource;
+    const size_t fragmentCount = isVec3
+        ? sizeof(vec3Source) / sizeof(vec3Source[0])
+        : sizeof(genericSource) / sizeof(genericSource[0]);
+
+    // Name of the kernel entry point inside the generated program.
+    char kernelName[32];
+    snprintf(kernelName, sizeof(kernelName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernels(fragments, (cl_uint)fragmentCount, kernelName,
+                       kernel_count, k, p, relaxedMode);
+}
+
+// Argument block passed through ThreadPool_Do to BuildKernel_DoubleFn.
+// The driver fills it with a positional brace initializer, so the field order
+// must stay as it is.
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build; the job id is added to
+                    // this to get the vector size index
+    cl_uint kernel_count; // forwarded to BuildKernelDouble; the driver passes
+                          // its thread count here
+    cl_kernel **kernels; // kernels[vector size index] is the kernel array
+                         // handed to BuildKernelDouble
+    cl_program *programs; // programs + vector size index receives the program
+    const char *nameInCode; // function name as written into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+// ThreadPool_Do callback: builds the kernels for one vector size.
+// `p` points at a BuildKernelInfo; job `job_id` handles vector size index
+// `offset + job_id`. The thread id is not used.
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{
+    BuildKernelInfo *buildInfo = (BuildKernelInfo *)p;
+    const cl_uint sizeIndex = buildInfo->offset + job_id;
+
+    cl_kernel *kernelsForSize = buildInfo->kernels[sizeIndex];
+    cl_program *programForSize = buildInfo->programs + sizeIndex;
+
+    return BuildKernelDouble(buildInfo->nameInCode, sizeIndex,
+                             buildInfo->kernel_count, kernelsForSize,
+                             programForSize, buildInfo->relaxedMode);
+}
+
+// Thread specific data for a worker thread. The driver allocates one entry
+// per worker thread, zero-fills it, and releases every member at exit.
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // sub-buffer of gInBuffer: first input for the thread
+    cl_mem inBuf2; // sub-buffer of gInBuffer2: second input for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // sub-buffers of gOutBuffer[], one per
+                                      // vector size: outputs for the thread
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    double maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdata d; // per-thread random generator, seeded from the driver's MTdata
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+// State shared by all TestDouble jobs of one test run. The driver zero-fills
+// it with memset and then sets the fields it needs.
+typedef struct TestInfo
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next
+                  // (subBufferSize * scale).
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    int isFDim; // non-zero when the function's nameInCode is "fdim"
+    int skipNanInf; // the driver sets this to 0
+    int isNextafter; // copied from the driver's isNextafter argument
+    bool relaxedMode; // True if test is running in relaxed mode, false
+                      // otherwise.
+                      // NOTE(review): the driver in this file never assigns
+                      // it, so it keeps the memset value (false) -- confirm
+                      // that is intended.
+} TestInfo;
+
+// A table of more difficult cases to get right. The negative half comes
+// first and the positive half mirrors it entry for entry. TestDouble pairs
+// every entry with every entry (count * count input pairs) before it falls
+// back to random inputs.
+static const double specialValuesDouble[] = {
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
+    -1000.,
+    -100.,
+    -4.0,
+    -3.5,
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
+    -0.5,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
+    -0.25,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
+
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000.,
+    +100.,
+    +4.0,
+    +3.5,
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    // Positive counterpart of -0x1.0000000000001p0 (one ulp above +1.0).
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
+    +0.5,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
+    +0.25,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
+};
+
+// Number of entries in specialValuesDouble.
+static const size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
+
+// Per-job worker, defined further down in this file; run via ThreadPool_Do.
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
+
+// Driver for one double-precision function taking two double arguments.
+//
+// It sizes the work, creates the per-thread kernels, sub-buffers, command
+// queues and random generators, builds the kernels for every enabled vector
+// size, runs TestDouble over all jobs (unless gSkipCorrectnessTesting),
+// optionally times the kernels (gMeasureTimes), logs the worst error seen and
+// releases everything it created.
+//
+//   f            function table entry under test
+//   d            random generator used to seed the per-thread generators and
+//                to fill the timing inputs
+//   isNextafter  stored in the TestInfo handed to TestDouble
+//   relaxedMode  forwarded to the kernel build and to logFunctionInfo
+//
+// Returns 0 on success, otherwise the first error code encountered. Every
+// path taken after test_info has been initialised leaves through the `exit`
+// label so that no OpenCL object or host allocation is leaked.
+static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
+                                                int isNextafter,
+                                                bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error = CL_SUCCESS;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    }
+
+    // The step is kept in 32 bits; dividing back detects a wrapped product.
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
+    test_info.skipNanInf = 0;
+    test_info.isNextafter = isNextafter;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        // Thread i owns the i-th slice of each global buffer.
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input arrays
+        double *p = (double *)gIn;
+        double *p2 = (double *)gIn2;
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+        {
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+            p2[j] = DoubleFromUInt32(genrand_int32(d));
+        }
+
+        // A failed upload must still go through the cleanup at `exit`;
+        // returning directly would leak every object created above.
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit;
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            // Timing uses thread 0's kernel bound to the full-size buffers.
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            // Report the mean instead of the minimum when asked to.
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+        // j already equals gMaxVectorSizeIndex here, so this loop does not
+        // iterate; it is kept as in the original.
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
+
+exit:
+    // Release
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
+
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+
+    int isNextafter = job->isNextafter;
+    cl_ulong *t;
+    cl_double *r;
+    cl_double *s;
+    cl_double *s2;
+
+    Force64BitFPUPrecision();
+
+    // start the map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Init input array
+    cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
+    cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
+    j = 0;
+    int totalSpecialValueCount =
+        specialValuesDoubleCount * specialValuesDoubleCount;
+    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)indx)
+    { // test edge cases
+        cl_double *fp = (cl_double *)p;
+        cl_double *fp2 = (cl_double *)p2;
+        uint32_t x, y;
+
+        x = (job_id * buffer_elements) % specialValuesDoubleCount;
+        y = (job_id * buffer_elements) / specialValuesDoubleCount;
+
+        for (; j < buffer_elements; j++)
+        {
+            fp[j] = specialValuesDouble[x];
+            fp2[j] = specialValuesDouble[y];
+            if (++x >= specialValuesDoubleCount)
+            {
+                x = 0;
+                y++;
+                if (y >= specialValuesDoubleCount) break;
+            }
+        }
+    }
+
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
+    {
+        p[j] = genrand_int64(d);
+        p2[j] = genrand_int64(d);
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            goto exit;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint32_t pattern = 0xffffdead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            goto exit;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
+    s = (cl_double *)gIn + thread_id * buffer_elements;
+    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        r[j] = (cl_double)func.f_ff(s[j], s2[j]);
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            goto exit;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        goto exit;
+    }
+
+    // Verify data
+    t = (cl_ulong *)r;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_ulong *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (t[j] != q[j])
+            {
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_ff(s[j], s2[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail && ftz)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsDoubleResultSubnormal(correct, ulps))
+                    {
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // nextafter on FTZ platforms may return the smallest
+                    // normal float (2^-126) given a denormal or a zero
+                    // as the first argument. The rationale here is that
+                    // nextafter flushes the argument to zero and then
+                    // returns the next representable number in the
+                    // direction of the second argument, and since
+                    // denorms are considered as zero, the smallest
+                    // normal number is the next representable number.
+                    // In which case, it should have the same sign as the
+                    // second argument.
+                    if (isNextafter)
+                    {
+                        if (IsDoubleSubnormal(s[j]) || s[j] == 0.0f)
+                        {
+                            cl_double value = copysign(twoToMinus1022, s2[j]);
+                            fail = fail && (test != value);
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                    else
+                    {
+                        // retry per section 6.5.3.3
+                        if (IsDoubleSubnormal(s[j]))
+                        {
+                            long double correct2 = func.f_ff(0.0, s2[j]);
+                            long double correct3 = func.f_ff(-0.0, s2[j]);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+
+                            // try with both args as zero
+                            if (IsDoubleSubnormal(s2[j]))
+                            {
+                                correct2 = func.f_ff(0.0, 0.0);
+                                correct3 = func.f_ff(-0.0, 0.0);
+                                long double correct4 = func.f_ff(0.0, -0.0);
+                                long double correct5 = func.f_ff(-0.0, -0.0);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= ulps))
+                                        && (!(fabsf(err3) <= ulps))
+                                        && (!(fabsf(err4) <= ulps))
+                                        && (!(fabsf(err5) <= ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+
+                                // retry per section 6.5.3.4
+                                if (IsDoubleResultSubnormal(correct2, ulps)
+                                    || IsDoubleResultSubnormal(correct3, ulps)
+                                    || IsDoubleResultSubnormal(correct4, ulps)
+                                    || IsDoubleResultSubnormal(correct5, ulps))
+                                {
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
+                                }
+                            }
+                        }
+                        else if (IsDoubleSubnormal(s2[j]))
+                        {
+                            long double correct2 = func.f_ff(s[j], 0.0);
+                            long double correct3 = func.f_ff(s[j], -0.0);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                }
+
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                    tinfo->maxErrorValue2 = s2[j];
+                }
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, "
+                               "%.13la}: *%.13la vs. %.13la\n",
+                               name, sizeNames[k], err, s[j], s2[j], r[j],
+                               test);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+exit:
+    return error;
+}
+
+int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+{
+    return TestFunc_Double_Double_Double_common(f, d, 0, relaxedMode);
+}
+
+int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata d,
+                                            bool relaxedMode)
+{
+    return TestFunc_Double_Double_Double_common(f, d, 1, relaxedMode);
+}
diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary_float.cpp
similarity index 55%
rename from test_conformance/math_brute_force/binary.cpp
rename to test_conformance/math_brute_force/binary_float.cpp
index 699c0944..0ad7b87a 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary_float.cpp
@@ -21,7 +21,6 @@
 #include <string.h>
 
 const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
-const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
 
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
@@ -108,94 +107,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* in1, __global double",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global double* in, __global double* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 d0 = vload3( 0, in + 3 * i );\n"
-        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-        "       d0 = ",
-        name,
-        "( d0, d1 );\n"
-        "       vstore3( d0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 d0;\n"
-        "       double3 d1;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       d0 = ",
-        name,
-        "( d0, d1 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = d0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = d0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
 typedef struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
@@ -215,16 +126,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
-                             info->kernels[i], info->programs + i,
-                             info->relaxedMode);
-}
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -1194,790 +1095,13 @@ exit:
     return error;
 }
 
-// A table of more difficult cases to get right
-static const double specialValuesDouble[] = {
-    -NAN,
-    -INFINITY,
-    -DBL_MAX,
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
-    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
-    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
-    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
-    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
-    -1000.,
-    -100.,
-    -4.0,
-    -3.5,
-    -3.0,
-    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
-    -2.5,
-    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
-    -2.0,
-    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
-    -1.5,
-    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
-    -1.0,
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
-    -0.5,
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
-    -0.25,
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
-    -DBL_MIN,
-    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
-    -0.0,
-
-    +NAN,
-    +INFINITY,
-    +DBL_MAX,
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
-    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
-    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
-    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
-    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
-    +1000.,
-    +100.,
-    +4.0,
-    +3.5,
-    +3.0,
-    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
-    +2.5,
-    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
-    +2.0,
-    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
-    +1.5,
-    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
-    +1.0,
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
-    +0.5,
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
-    +0.25,
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
-    +DBL_MIN,
-    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
-    +0.0,
-};
-
-static size_t specialValuesDoubleCount =
-    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
-
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
-
-static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
-                                                int isNextafter,
-                                                bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    size_t i, j;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    double maxErrorVal2 = 0.0;
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_double)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->double_ulps;
-    test_info.ftz = f->ftz || gForceFTZ;
-
-    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
-    test_info.skipNanInf = 0;
-    test_info.isNextafter = isNextafter;
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        double *p = (double *)gIn;
-        double *p2 = (double *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
-    vlog("\n");
-
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
-{
-    const TestInfo *job = (const TestInfo *)data;
-    size_t buffer_elements = job->subBufferSize;
-    size_t buffer_size = buffer_elements * sizeof(cl_double);
-    cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
-    float ulps = job->ulps;
-    dptr func = job->f->dfunc;
-    int ftz = job->ftz;
-    MTdata d = tinfo->d;
-    cl_uint j, k;
-    cl_int error;
-    const char *name = job->f->name;
-
-    int isNextafter = job->isNextafter;
-    cl_ulong *t;
-    cl_double *r;
-    cl_double *s;
-    cl_double *s2;
-
-    Force64BitFPUPrecision();
-
-    // start the map of the output arrays
-    cl_event e[VECTOR_SIZE_COUNT];
-    cl_ulong *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
-
-    // Init input array
-    cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
-    cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
-    j = 0;
-    int totalSpecialValueCount =
-        specialValuesDoubleCount * specialValuesDoubleCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
-
-    if (job_id <= (cl_uint)indx)
-    { // test edge cases
-        cl_double *fp = (cl_double *)p;
-        cl_double *fp2 = (cl_double *)p2;
-        uint32_t x, y;
-
-        x = (job_id * buffer_elements) % specialValuesDoubleCount;
-        y = (job_id * buffer_elements) / specialValuesDoubleCount;
-
-        for (; j < buffer_elements; j++)
-        {
-            fp[j] = specialValuesDouble[x];
-            fp2[j] = specialValuesDouble[y];
-            if (++x >= specialValuesDoubleCount)
-            {
-                x = 0;
-                y++;
-                if (y >= specialValuesDoubleCount) break;
-            }
-        }
-    }
-
-    // Init any remaining values.
-    for (; j < buffer_elements; j++)
-    {
-        p[j] = genrand_int64(d);
-        p2[j] = genrand_int64(d);
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
-                                      buffer_size, p, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
-                                      buffer_size, p2, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            goto exit;
-        }
-        if ((error = clReleaseEvent(e[j])))
-        {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // Fill the result buffer with garbage, so that old results don't carry
-        // over
-        uint32_t pattern = 0xffffdead;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // run the kernel
-        size_t vectorCount =
-            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
-                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];
-
-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-
-        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
-                                            &vectorCount, NULL, 0, NULL, NULL)))
-        {
-            vlog_error("FAILED -- could not execute kernel\n");
-            goto exit;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
-
-    if (gSkipCorrectnessTesting) return CL_SUCCESS;
-
-    // Calculate the correctly rounded reference result
-    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
-    s = (cl_double *)gIn + thread_id * buffer_elements;
-    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
-        r[j] = (cl_double)func.f_ff(s[j], s2[j]);
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
-            buffer_size, 0, NULL, NULL, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            goto exit;
-        }
-    }
-
-    // Wait for the last buffer
-    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
-    // Verify data
-    t = (cl_ulong *)r;
-    for (j = 0; j < buffer_elements; j++)
-    {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        {
-            cl_ulong *q = out[k];
-
-            // If we aren't getting the correctly rounded result
-            if (t[j] != q[j])
-            {
-                cl_double test = ((cl_double *)q)[j];
-                long double correct = func.f_ff(s[j], s2[j]);
-                float err = Bruteforce_Ulp_Error_Double(test, correct);
-                int fail = !(fabsf(err) <= ulps);
-
-                if (fail && ftz)
-                {
-                    // retry per section 6.5.3.2
-                    if (IsDoubleResultSubnormal(correct, ulps))
-                    {
-                        fail = fail && (test != 0.0f);
-                        if (!fail) err = 0.0f;
-                    }
-
-                    // nextafter on FTZ platforms may return the smallest
-                    // normal float (2^-126) given a denormal or a zero
-                    // as the first argument. The rationale here is that
-                    // nextafter flushes the argument to zero and then
-                    // returns the next representable number in the
-                    // direction of the second argument, and since
-                    // denorms are considered as zero, the smallest
-                    // normal number is the next representable number.
-                    // In which case, it should have the same sign as the
-                    // second argument.
-                    if (isNextafter)
-                    {
-                        if (IsDoubleSubnormal(s[j]) || s[j] == 0.0f)
-                        {
-                            cl_double value = copysign(twoToMinus1022, s2[j]);
-                            fail = fail && (test != value);
-                            if (!fail) err = 0.0f;
-                        }
-                    }
-                    else
-                    {
-                        // retry per section 6.5.3.3
-                        if (IsDoubleSubnormal(s[j]))
-                        {
-                            long double correct2 = func.f_ff(0.0, s2[j]);
-                            long double correct3 = func.f_ff(-0.0, s2[j]);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct2);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            fail = fail
-                                && ((!(fabsf(err2) <= ulps))
-                                    && (!(fabsf(err3) <= ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2, ulps)
-                                || IsDoubleResultSubnormal(correct3, ulps))
-                            {
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-
-                            // try with both args as zero
-                            if (IsDoubleSubnormal(s2[j]))
-                            {
-                                correct2 = func.f_ff(0.0, 0.0);
-                                correct3 = func.f_ff(-0.0, 0.0);
-                                long double correct4 = func.f_ff(0.0, -0.0);
-                                long double correct5 = func.f_ff(-0.0, -0.0);
-                                err2 =
-                                    Bruteforce_Ulp_Error_Double(test, correct2);
-                                err3 =
-                                    Bruteforce_Ulp_Error_Double(test, correct3);
-                                float err4 =
-                                    Bruteforce_Ulp_Error_Double(test, correct4);
-                                float err5 =
-                                    Bruteforce_Ulp_Error_Double(test, correct5);
-                                fail = fail
-                                    && ((!(fabsf(err2) <= ulps))
-                                        && (!(fabsf(err3) <= ulps))
-                                        && (!(fabsf(err4) <= ulps))
-                                        && (!(fabsf(err5) <= ulps)));
-                                if (fabsf(err2) < fabsf(err)) err = err2;
-                                if (fabsf(err3) < fabsf(err)) err = err3;
-                                if (fabsf(err4) < fabsf(err)) err = err4;
-                                if (fabsf(err5) < fabsf(err)) err = err5;
-
-                                // retry per section 6.5.3.4
-                                if (IsDoubleResultSubnormal(correct2, ulps)
-                                    || IsDoubleResultSubnormal(correct3, ulps)
-                                    || IsDoubleResultSubnormal(correct4, ulps)
-                                    || IsDoubleResultSubnormal(correct5, ulps))
-                                {
-                                    fail = fail && (test != 0.0f);
-                                    if (!fail) err = 0.0f;
-                                }
-                            }
-                        }
-                        else if (IsDoubleSubnormal(s2[j]))
-                        {
-                            long double correct2 = func.f_ff(s[j], 0.0);
-                            long double correct3 = func.f_ff(s[j], -0.0);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct2);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            fail = fail
-                                && ((!(fabsf(err2) <= ulps))
-                                    && (!(fabsf(err3) <= ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2, ulps)
-                                || IsDoubleResultSubnormal(correct3, ulps))
-                            {
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-                }
-
-                if (fabsf(err) > tinfo->maxError)
-                {
-                    tinfo->maxError = fabsf(err);
-                    tinfo->maxErrorValue = s[j];
-                    tinfo->maxErrorValue2 = s2[j];
-                }
-                if (fail)
-                {
-                    vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, "
-                               "%.13la}: *%.13la vs. %.13la\n",
-                               name, sizeNames[k], err, s[j], s2[j], r[j],
-                               test);
-                    error = -1;
-                    goto exit;
-                }
-            }
-        }
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
-                       j, error);
-            return error;
-        }
-    }
-
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
-
-
-    if (0 == (base & 0x0fffffff))
-    {
-        if (gVerboseBruteForce)
-        {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
-                 "ThreadCount:%2u\n",
-                 base, job->step, job->scale, buffer_elements, job->ulps,
-                 job->threadCount);
-        }
-        else
-        {
-            vlog(".");
-        }
-        fflush(stdout);
-    }
-
-exit:
-    return error;
-}
-
 int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     return TestFunc_Float_Float_Float_common(f, d, 0, relaxedMode);
 }
 
-int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    return TestFunc_Double_Double_Double_common(f, d, 0, relaxedMode);
-}
-
 int TestFunc_Float_Float_Float_nextafter(const Func *f, MTdata d,
                                          bool relaxedMode)
 {
     return TestFunc_Float_Float_Float_common(f, d, 1, relaxedMode);
 }
-
-int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata d,
-                                            bool relaxedMode)
-{
-    return TestFunc_Double_Double_Double_common(f, d, 1, relaxedMode);
-}
diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
similarity index 54%
rename from test_conformance/math_brute_force/binary_i.cpp
rename to test_conformance/math_brute_force/binary_i_double.cpp
index 50d14f33..4d6cb860 100644
--- a/test_conformance/math_brute_force/binary_i.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -21,91 +21,6 @@
 #include <limits.h>
 #include <string.h>
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global float",
-                        sizeNames[vectorSize],
-                        "* out, __global float",
-                        sizeNames[vectorSize],
-                        "* in1, __global int",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global float* out, __global float* in, __global int* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       float3 f0 = vload3( 0, in + 3 * i );\n"
-        "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
-        "       f0 = ",
-        name,
-        "( f0, i0 );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       float3 f0;\n"
-        "       int3 i0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-        "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0, i0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
 static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
                              bool relaxedMode)
@@ -204,15 +119,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
-}
-
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
@@ -223,112 +129,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->relaxedMode);
 }
 
-// A table of more difficult cases to get right
-static const float specialValuesFloat[] = {
-    -NAN,
-    -INFINITY,
-    -FLT_MAX,
-    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
-    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
-    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
-    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
-    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
-    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
-    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
-    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
-    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
-    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
-    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
-    -1000.f,
-    -100.f,
-    -4.0f,
-    -3.5f,
-    -3.0f,
-    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
-    -2.5f,
-    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
-    -2.0f,
-    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
-    -1.5f,
-    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
-    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
-    -1.0f,
-    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
-    -0.5f,
-    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
-    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
-    -0.25f,
-    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
-    -FLT_MIN,
-    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
-    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
-    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
-    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
-    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
-    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
-    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
-    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
-    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
-    -0.0f,
-
-    +NAN,
-    +INFINITY,
-    +FLT_MAX,
-    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
-    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
-    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
-    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
-    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
-    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
-    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
-    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
-    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
-    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
-    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
-    +1000.f,
-    +100.f,
-    +4.0f,
-    +3.5f,
-    +3.0f,
-    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
-    2.5f,
-    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
-    +2.0f,
-    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
-    1.5f,
-    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
-    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
-    +1.0f,
-    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
-    +0.5f,
-    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
-    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
-    +0.25f,
-    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
-    +FLT_MIN,
-    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
-    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
-    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
-    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
-    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
-    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
-    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
-    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
-    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
-    +0.0f
-};
-
-static const size_t specialValuesFloatCount =
-    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
-
 static const int specialValuesInt[] = {
     0,           1,           2,          3,          126,        127,
     128,         0x02000001,  0x04000001, 1465264071, 1488522147, -1,
@@ -373,576 +173,6 @@ typedef struct TestInfo
     // no special values
 } TestInfo;
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
-
-int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    size_t i, j;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    cl_int maxErrorVal2 = 0;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_float)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        cl_buffer_region region2 = { i * test_info.subBufferSize
-                                         * sizeof(cl_int),
-                                     test_info.subBufferSize * sizeof(cl_int) };
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gInBuffer for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
-        {
-            p[j] = (genrand_int32(d) & ~0x40000000) | 0x38000000;
-            p2[j] = 3;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
-    vlog("\n");
-
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
-{
-    const TestInfo *job = (const TestInfo *)data;
-    size_t buffer_elements = job->subBufferSize;
-    size_t buffer_size = buffer_elements * sizeof(cl_float);
-    cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
-    fptr func = job->f->func;
-    int ftz = job->ftz;
-    float ulps = job->ulps;
-    MTdata d = tinfo->d;
-    cl_uint j, k;
-    cl_int error;
-    const char *name = job->f->name;
-    cl_uint *t = 0;
-    cl_float *r = 0;
-    cl_float *s = 0;
-    cl_int *s2 = 0;
-
-    // start the map of the output arrays
-    cl_event e[VECTOR_SIZE_COUNT];
-    cl_uint *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_uint *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
-
-    // Init input array
-    cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
-    cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
-    j = 0;
-
-    int totalSpecialValueCount =
-        specialValuesFloatCount * specialValuesIntCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
-
-    if (job_id <= (cl_uint)indx)
-    { // test edge cases
-        float *fp = (float *)p;
-        cl_int *ip2 = (cl_int *)p2;
-        uint32_t x, y;
-
-        x = (job_id * buffer_elements) % specialValuesFloatCount;
-        y = (job_id * buffer_elements) / specialValuesFloatCount;
-
-        for (; j < buffer_elements; j++)
-        {
-            fp[j] = specialValuesFloat[x];
-            ip2[j] = specialValuesInt[y];
-            ++x;
-            if (x >= specialValuesFloatCount)
-            {
-                x = 0;
-                y++;
-                if (y >= specialValuesIntCount) break;
-            }
-        }
-    }
-
-    // Init any remaining values.
-    for (; j < buffer_elements; j++)
-    {
-        p[j] = genrand_int32(d);
-        p2[j] = genrand_int32(d);
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
-                                      buffer_size, p, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
-                                      buffer_size, p2, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            goto exit;
-        }
-        if ((error = clReleaseEvent(e[j])))
-        {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // Fill the result buffer with garbage, so that old results don't carry
-        // over
-        uint32_t pattern = 0xffffdead;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // run the kernel
-        size_t vectorCount =
-            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
-                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];
-
-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-
-        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
-                                            &vectorCount, NULL, 0, NULL, NULL)))
-        {
-            vlog_error("FAILED -- could not execute kernel\n");
-            goto exit;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
-
-    if (gSkipCorrectnessTesting) return CL_SUCCESS;
-
-    // Calculate the correctly rounded reference result
-    r = (float *)gOut_Ref + thread_id * buffer_elements;
-    s = (float *)gIn + thread_id * buffer_elements;
-    s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_fi(s[j], s2[j]);
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_uint *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
-            buffer_size, 0, NULL, NULL, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            goto exit;
-        }
-    }
-
-    // Wait for the last buffer
-    out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
-                                           0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
-    // Verify data
-    t = (cl_uint *)r;
-    for (j = 0; j < buffer_elements; j++)
-    {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        {
-            cl_uint *q = out[k];
-
-            // If we aren't getting the correctly rounded result
-            if (t[j] != q[j])
-            {
-                float test = ((float *)q)[j];
-                double correct = func.f_fi(s[j], s2[j]);
-                float err = Ulp_Error(test, correct);
-                int fail = !(fabsf(err) <= ulps);
-
-                if (fail && ftz)
-                {
-                    // retry per section 6.5.3.2
-                    if (IsFloatResultSubnormal(correct, ulps))
-                    {
-                        fail = fail && (test != 0.0f);
-                        if (!fail) err = 0.0f;
-                    }
-
-                    // retry per section 6.5.3.3
-                    if (IsFloatSubnormal(s[j]))
-                    {
-                        double correct2, correct3;
-                        float err2, err3;
-                        correct2 = func.f_fi(0.0, s2[j]);
-                        correct3 = func.f_fi(-0.0, s2[j]);
-                        err2 = Ulp_Error(test, correct2);
-                        err3 = Ulp_Error(test, correct3);
-                        fail = fail
-                            && ((!(fabsf(err2) <= ulps))
-                                && (!(fabsf(err3) <= ulps)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
-
-                        // retry per section 6.5.3.4
-                        if (IsFloatResultSubnormal(correct2, ulps)
-                            || IsFloatResultSubnormal(correct3, ulps))
-                        {
-                            fail = fail && (test != 0.0f);
-                            if (!fail) err = 0.0f;
-                        }
-                    }
-                }
-
-                if (fabsf(err) > tinfo->maxError)
-                {
-                    tinfo->maxError = fabsf(err);
-                    tinfo->maxErrorValue = s[j];
-                    tinfo->maxErrorValue2 = s2[j];
-                }
-                if (fail)
-                {
-                    vlog_error(
-                        "\nERROR: %s%s: %f ulp error at {%a (0x%8.8x), %d}: "
-                        "*%a (0x%8.8x) vs. %a (0x%8.8x) at index: %d\n",
-                        name, sizeNames[k], err, s[j], ((uint32_t *)s)[j],
-                        s2[j], r[j], ((uint32_t *)r)[j], test,
-                        ((cl_uint *)&test)[0], j);
-                    error = -1;
-                    goto exit;
-                }
-            }
-        }
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
-                       j, error);
-            return error;
-        }
-    }
-
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
-
-
-    if (0 == (base & 0x0fffffff))
-    {
-        if (gVerboseBruteForce)
-        {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
-                 "ThreadCount:%2u\n",
-                 base, job->step, job->scale, buffer_elements, job->ulps,
-                 job->threadCount);
-        }
-        else
-        {
-            vlog(".");
-        }
-        fflush(stdout);
-    }
-
-exit:
-    return error;
-}
-
-
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
     -NAN,
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
new file mode 100644
index 00000000..0ff9b57f
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -0,0 +1,845 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <limits.h>
+#include <string.h>
+// Assembles kernel source calling `name` and passes it to MakeKernels.
+static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                       cl_kernel *k, cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global int",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
+    // 3-element variant: scalar pointers with vload3/vstore3 plus a tail case.
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in, __global int* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0, i0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       int3 i0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    // Use the generic source unless this vector size has 3 elements.
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+    // Kernel entry point name: "math_kernel" followed by the size suffix.
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
+                       relaxedMode);
+}
+
+typedef struct BuildKernelInfo // job argument read by BuildKernel_FloatFn
+{
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count; // forwarded to MakeKernels via BuildKernel
+    cl_kernel **kernels; // kernels[i]: kernel array for vector size index i
+    cl_program *programs; // programs[i]: program for vector size index i
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
+{ // ThreadPool job: build the kernels for vector size (offset + job_id).
+    const BuildKernelInfo *b = (const BuildKernelInfo *)p;
+    const cl_uint vs = b->offset + job_id;
+    return BuildKernel(b->nameInCode, vs, b->kernel_count, b->kernels[vs],
+                       &b->programs[vs], b->relaxedMode);
+}
+
+// Hard float inputs: NaN, Inf, near powers of two, subnormals, signed zero
+static const float specialValuesFloat[] = {
+    -NAN,
+    -INFINITY,
+    -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
+    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
+    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
+    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
+    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
+    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
+    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
+    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
+    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
+    -1000.f,
+    -100.f,
+    -4.0f,
+    -3.5f,
+    -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
+    -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
+    -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
+    -1.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
+    -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
+    -0.5f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
+    -0.25f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
+    -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
+    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
+    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
+    -0.0f,
+    // The same magnitudes again, with positive sign.
+    +NAN,
+    +INFINITY,
+    +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
+    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
+    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
+    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
+    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
+    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
+    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
+    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
+    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
+    +1000.f,
+    +100.f,
+    +4.0f,
+    +3.5f,
+    +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
+    2.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
+    +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
+    1.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
+    +1.0f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
+    +0.5f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
+    +0.25f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
+    +FLT_MIN,
+    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
+    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
+    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
+    +0.0f
+};
+
+static const size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
+// Integer second operands; TestFloat pairs them with the special floats.
+static const int specialValuesInt[] = {
+    0,           1,           2,          3,          126,        127,
+    128,         0x02000001,  0x04000001, 1465264071, 1488522147, -1,
+    -2,          -3,          -126,       -127,       -128,       -0x02000001,
+    -0x04000001, -1465264071, -1488522147
+};
+static const size_t specialValuesIntCount =
+    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
+
+// Thread specific data for a worker thread
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // float (first operand) sub-buffer of gInBuffer
+    cl_mem inBuf2; // int (second operand) sub-buffer of gInBuffer2
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output sub-buffer per vector size
+    float maxError; // largest error recorded by this thread. Init to 0.
+    double
+        maxErrorValue; // first operand where maxError was recorded.  Init to 0.
+    cl_int maxErrorValue2; // second operand where maxError was recorded.  Init
+                           // to 0.
+    MTdata d; // per-thread random number generator state
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo // job description shared by all TestFloat jobs
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs passed to ThreadPool_Do
+    cl_uint step; // step between each chunk and the next (subBufferSize * scale)
+    cl_uint scale; // stride between individual test values
+    float ulps; // max allowed error in ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    // no special values
+} TestInfo;
+
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
+// Tests f (float, int -> float) for each enabled vector size; returns error.
+int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    cl_int maxErrorVal2 = 0;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    }
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // the multiplication overflowed cl_uint; fall back to a single job
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        cl_buffer_region region2 = { i * test_info.subBufferSize
+                                         * sizeof(cl_int),
+                                     test_info.subBufferSize * sizeof(cl_int) };
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n",
+                       region2.origin, region2.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer for region {%zu, %zu}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels: one ThreadPool job per enabled vector size
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input arrays
+        uint32_t *p = (uint32_t *)gIn;
+        uint32_t *p2 = (uint32_t *)gIn2;
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        {
+            p[j] = (genrand_int32(d) & ~0x40000000) | 0x38000000;
+            p2[j] = 3;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release kernels, buffers and queues before returning
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit; // release kernels, buffers and queues before returning
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
+
+exit:
+    // Release
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
+
+// Worker for one job_id of the float = f(float, int) brute-force test: fills
+// this thread's inputs with edge cases and/or random bits, runs the kernel at
+// every enabled vector size and checks the results against func.f_fi within
+// ulps (with FTZ retries). Returns CL_SUCCESS, a CL error, or -1 on mismatch.
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    float ulps = job->ulps;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_uint *t = 0;
+    cl_float *r = 0;
+    cl_float *s = 0;
+    cl_int *s2 = 0;
+
+    // start the map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_uint *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %u failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Init input array
+    cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
+    cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
+    j = 0;
+
+    int totalSpecialValueCount =
+        specialValuesFloatCount * specialValuesIntCount;
+    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)indx)
+    { // test edge cases
+        float *fp = (float *)p;
+        cl_int *ip2 = (cl_int *)p2;
+        uint32_t x, y;
+        x = (job_id * buffer_elements) % specialValuesFloatCount;
+        y = (job_id * buffer_elements) / specialValuesFloatCount;
+
+        // Leave via the loop condition, not a break, so that j also advances
+        // past the last pair and the random fill below cannot overwrite it.
+        for (; j < buffer_elements && y < specialValuesIntCount; j++)
+        {
+            fp[j] = specialValuesFloat[x];
+            ip2[j] = specialValuesInt[y];
+            ++x;
+            if (x >= specialValuesFloatCount)
+            {
+                x = 0;
+                y++;
+            }
+        }
+    }
+
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
+    {
+        p[j] = genrand_int32(d);
+        p2[j] = genrand_int32(d);
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            goto exit;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // Fill the output with garbage so that stale results cannot carry over
+        uint32_t pattern = 0xffffdead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
+            goto exit;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // this thread's own kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            goto exit;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    r = (float *)gOut_Ref + thread_id * buffer_elements;
+    s = (float *)gIn + thread_id * buffer_elements;
+    s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) r[j] = (float)func.f_fi(s[j], s2[j]);
+
+    // Read the data back. In-order queue: only the last map needs to block.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %u failed! err: %d\n", j,
+                       error);
+            goto exit;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                           0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %u failed! err: %d\n", j, error);
+        goto exit;
+    }
+
+    // Verify data
+    t = (cl_uint *)r;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_uint *q = out[k];
+            // If we aren't getting the correctly rounded result
+            if (t[j] != q[j])
+            {
+                float test = ((float *)q)[j];
+                double correct = func.f_fi(s[j], s2[j]);
+                float err = Ulp_Error(test, correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail && ftz)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsFloatResultSubnormal(correct, ulps))
+                    {
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsFloatSubnormal(s[j]))
+                    {
+                        double correct2, correct3;
+                        float err2, err3;
+                        correct2 = func.f_fi(0.0, s2[j]);
+                        correct3 = func.f_fi(-0.0, s2[j]);
+                        err2 = Ulp_Error(test, correct2);
+                        err3 = Ulp_Error(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsFloatResultSubnormal(correct2, ulps)
+                            || IsFloatResultSubnormal(correct3, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                    tinfo->maxErrorValue2 = s2[j];
+                }
+                if (fail)
+                {
+                    vlog_error(
+                        "\nERROR: %s%s: %f ulp error at {%a (0x%8.8x), %d}: "
+                        "*%a (0x%8.8x) vs. %a (0x%8.8x) at index: %u\n",
+                        name, sizeNames[k], err, s[j], ((uint32_t *)s)[j],
+                        s2[j], r[j], ((uint32_t *)r)[j], test,
+                        ((cl_uint *)&test)[0], j);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %u failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10zu "
+                 "ulps:%5.3f ThreadCount:%2u\n",
+                 base, job->step, (size_t)job->scale, buffer_elements,
+                 job->ulps, job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+exit:
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
new file mode 100644
index 00000000..7f86afde
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -0,0 +1,911 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+// Assembles the OpenCL C source of "<name>_kernel<size>", which stores
+// in1[i] <operator_symbol> in2[i] for doubles of the vector size at index
+// vectorSize, and returns the result of handing it to MakeKernels.
+static int BuildKernelDouble(const char *name, const char *operator_symbol,
+                             int vectorSize, cl_uint kernel_count, cl_kernel *k,
+                             cl_program *p, bool relaxedMode)
+{
+    // Generic form: each work-item handles one element of the vector type.
+    const char *src[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                          "__kernel void ",
+                          name,
+                          "_kernel",
+                          sizeNames[vectorSize],
+                          "( __global double",
+                          sizeNames[vectorSize],
+                          "* out, __global double",
+                          sizeNames[vectorSize],
+                          "* in1, __global double",
+                          sizeNames[vectorSize],
+                          "* in2 )\n"
+                          "{\n"
+                          "   size_t i = get_global_id(0);\n"
+                          "   out[i] =  in1[i] ",
+                          operator_symbol,
+                          " in2[i];\n"
+                          "}\n" };
+
+    // double3 form: vload3/vstore3 on packed data plus a scalar tail path.
+    const char *src3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void ",
+        name,
+        "_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       d0 = d0 ",
+        operator_symbol,
+        " d1;\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = d0 ",
+        operator_symbol,
+        " d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    const bool isVec3 = (sizeValues[vectorSize] == 3);
+    const char **kernelSrc = isVec3 ? src3 : src;
+    const size_t fragments = isVec3 ? sizeof(src3) / sizeof(src3[0])
+                                    : sizeof(src) / sizeof(src[0]);
+
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "%s_kernel%s", name,
+             sizeNames[vectorSize]);
+    return MakeKernels(kernelSrc, (cl_uint)fragments, testName, kernel_count,
+                       k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // index of the first vector size to build
+    cl_uint kernel_count; // kernels to create per vector size
+    cl_kernel **kernels; // kernels[i]: kernel array for vector size index i
+    cl_program *programs; // programs[i]: program for vector size index i
+    const char *name; // function name, used to form the kernel name
+    const char *operator_symbol; // operator spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{ // ThreadPool_Do callback: builds vector-size index (offset + job_id)
+    const BuildKernelInfo *build = (const BuildKernelInfo *)p;
+    const cl_uint vecIdx = build->offset + job_id;
+    return BuildKernelDouble(
+        build->name, build->operator_symbol, vecIdx, build->kernel_count,
+        build->kernels[vecIdx], build->programs + vecIdx, build->relaxedMode);
+}
+
+// Thread specific data for a worker thread
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // first input: this thread's sub-buffer of gInBuffer
+    cl_mem inBuf2; // second input: this thread's sub-buffer of gInBuffer2
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output sub-buffers, one per vector size
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    double maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdata d; // this thread's random number generator state
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo // state shared by all jobs of one test run
+{
+    size_t subBufferSize; // Size of each thread's sub-buffer in elements
+    const Func *f; // A pointer to the info of the function under test
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs handed to ThreadPool_Do
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max allowed error, in ulps
+    int ftz; // non-zero if running in flush to zero mode
+    bool relaxedMode; // True if the test is being run in relaxed mode, false
+                      // otherwise.
+
+    // no special fields
+} TestInfo;
+
+// A table of more difficult cases to get right: NaN, infinities, values next
+// to powers of two, values around the subnormal range and signed zeros.
+static const double specialValuesDouble[] = {
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
+    -1000.,
+    -100.,
+    -4.0,
+    -3.5,
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
+    -0.5,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
+    -0.25,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
+    // The same magnitudes as above, with positive sign.
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000.,
+    +100.,
+    +4.0,
+    +3.5,
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
+    +0.5,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
+    +0.25,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
+};
+// Number of entries in specialValuesDouble.
+static const size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
+
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
+
+// Brute-force tests the double operator described by f: builds one kernel per
+// vector size and thread, runs TestDouble over all jobs via ThreadPool_Do,
+// optionally times the kernels, and returns the error code (0 on success).
+int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
+                                           bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    }
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+    test_info.relaxedMode = relaxedMode; // TestDouble reads job->relaxedMode
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zu, %zu}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex,
+                                       test_info.threadCount,
+                                       test_info.k,
+                                       test_info.programs,
+                                       f->name,
+                                       f->nameInCode,
+                                       relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        if (error) goto exit;
+
+        vlog("%s", gWimpyMode ? "Wimp pass" : "passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input arrays
+        double *p = (double *)gIn;
+        double *p2 = (double *)gIn2;
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+        {
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+            p2[j] = DoubleFromUInt32(genrand_int32(d));
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit;
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    vlog("\n");
+
+exit:
+    // Release
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
+
+// Per-job worker: tests one sub-buffer of double inputs for a two-argument
+// function. Fills this thread's slices of gIn/gIn2 (pairs of special values
+// for the first jobs, random 64-bit patterns after that), runs the kernel for
+// every enabled vector size and compares each result with the host reference
+// job->f->dfunc.f_ff, retrying with flushed-to-zero inputs and results when
+// job->ftz is set. The worst ulp error is kept in the thread's ThreadInfo.
+// Returns CL_SUCCESS, an OpenCL error code, or -1 on an ulp failure.
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_ulong *t;
+    cl_double *r, *s, *s2;
+
+    Force64BitFPUPrecision();
+
+    // start the map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Init this thread's slice of the shared input arrays
+    cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
+    cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
+    j = 0;
+    int totalSpecialValueCount =
+        specialValuesDoubleCount * specialValuesDoubleCount;
+    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)indx)
+    { // test edge cases: walk the (x, y) grid of special value pairs
+        cl_double *fp = (cl_double *)p;
+        cl_double *fp2 = (cl_double *)p2;
+        uint32_t x = (job_id * buffer_elements) % specialValuesDoubleCount;
+        uint32_t y = (job_id * buffer_elements) / specialValuesDoubleCount;
+
+        for (; j < buffer_elements; j++)
+        {
+            fp[j] = specialValuesDouble[x];
+            fp2[j] = specialValuesDouble[y];
+            if (++x >= specialValuesDoubleCount)
+            {
+                x = 0;
+                y++;
+                if (y >= specialValuesDoubleCount) break;
+            }
+        }
+    }
+
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
+    {
+        p[j] = genrand_int64(d);
+        p2[j] = genrand_int64(d);
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            goto exit;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // Fill the result buffer with garbage so old results don't carry over
+        uint32_t pattern = 0xffffdead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
+            goto exit;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            goto exit;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
+    s = (cl_double *)gIn + thread_id * buffer_elements;
+    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        r[j] = (cl_double)func.f_ff(s[j], s2[j]);
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            goto exit;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        goto exit;
+    }
+
+    // Verify data: compare bit patterns first, compute ulps only on mismatch
+    t = (cl_ulong *)r;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_ulong *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (t[j] != q[j])
+            {
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_ff(s[j], s2[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail && ftz)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsDoubleResultSubnormal(correct, ulps))
+                    {
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsDoubleSubnormal(s[j]))
+                    {
+                        long double correct2 = func.f_ff(0.0, s2[j]);
+                        long double correct3 = func.f_ff(-0.0, s2[j]);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct2);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsDoubleResultSubnormal(correct2, ulps)
+                            || IsDoubleResultSubnormal(correct3, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // try with both args as zero
+                        if (IsDoubleSubnormal(s2[j]))
+                        {
+                            correct2 = func.f_ff(0.0, 0.0);
+                            correct3 = func.f_ff(-0.0, 0.0);
+                            long double correct4 = func.f_ff(0.0, -0.0);
+                            long double correct5 = func.f_ff(-0.0, -0.0);
+                            err2 = Bruteforce_Ulp_Error_Double(test, correct2);
+                            err3 = Bruteforce_Ulp_Error_Double(test, correct3);
+                            float err4 =
+                                Bruteforce_Ulp_Error_Double(test, correct4);
+                            float err5 =
+                                Bruteforce_Ulp_Error_Double(test, correct5);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps))
+                                    && (!(fabsf(err4) <= ulps))
+                                    && (!(fabsf(err5) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps)
+                                || IsDoubleResultSubnormal(correct4, ulps)
+                                || IsDoubleResultSubnormal(correct5, ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    else if (IsDoubleSubnormal(s2[j]))
+                    {
+                        long double correct2 = func.f_ff(s[j], 0.0);
+                        long double correct3 = func.f_ff(s[j], -0.0);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct2);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsDoubleResultSubnormal(correct2, ulps)
+                            || IsDoubleResultSubnormal(correct3, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                    tinfo->maxErrorValue2 = s2[j];
+                }
+                if (fail)
+                {
+                    vlog_error(
+                        "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a\n",
+                        name, sizeNames[k], err, s[j], s2[j], r[j], test);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10zu "
+                 "ulps:%5.3f ThreadCount:%2u\n",
+                 base, (cl_uint)job->step, (size_t)job->scale, buffer_elements,
+                 job->ulps, job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+exit:
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_operator.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
similarity index 54%
rename from test_conformance/math_brute_force/binary_operator.cpp
rename to test_conformance/math_brute_force/binary_operator_float.cpp
index 65756901..56b0280c 100644
--- a/test_conformance/math_brute_force/binary_operator.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -110,98 +110,6 @@ static int BuildKernel(const char *name, const char *operator_symbol,
                        relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, const char *operator_symbol,
-                             int vectorSize, cl_uint kernel_count, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void ",
-                        name,
-                        "_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* in1, __global double",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] =  in1[i] ",
-                        operator_symbol,
-                        " in2[i];\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void ",
-        name,
-        "_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global double* in, __global double* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 d0 = vload3( 0, in + 3 * i );\n"
-        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-        "       d0 = d0 ",
-        operator_symbol,
-        " d1;\n"
-        "       vstore3( d0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 d0;\n"
-        "       double3 d1;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       d0 = d0 ",
-        operator_symbol,
-        " d1;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = d0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = d0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "%s_kernel%s", name,
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
 typedef struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
@@ -222,16 +130,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->name, info->operator_symbol, i,
-                             info->kernel_count, info->kernels[i],
-                             info->programs + i, info->relaxedMode);
-}
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -1139,743 +1037,3 @@ exit:
     if (overflow) free(overflow);
     return error;
 }
-
-// A table of more difficult cases to get right
-static const double specialValuesDouble[] = {
-    -NAN,
-    -INFINITY,
-    -DBL_MAX,
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
-    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
-    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
-    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
-    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
-    -1000.,
-    -100.,
-    -4.0,
-    -3.5,
-    -3.0,
-    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
-    -2.5,
-    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
-    -2.0,
-    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
-    -1.5,
-    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
-    -1.0,
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
-    -0.5,
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
-    -0.25,
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
-    -DBL_MIN,
-    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
-    -0.0,
-
-    +NAN,
-    +INFINITY,
-    +DBL_MAX,
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
-    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
-    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
-    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
-    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
-    +1000.,
-    +100.,
-    +4.0,
-    +3.5,
-    +3.0,
-    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
-    +2.5,
-    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
-    +2.0,
-    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
-    +1.5,
-    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
-    +1.0,
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
-    +0.5,
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
-    +0.25,
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
-    +DBL_MIN,
-    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
-    +0.0,
-};
-
-static size_t specialValuesDoubleCount =
-    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
-
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
-
-int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
-                                           bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    size_t i, j;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    double maxErrorVal2 = 0.0;
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_double)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->double_ulps;
-    test_info.ftz = f->ftz || gForceFTZ;
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex,
-                                       test_info.threadCount,
-                                       test_info.k,
-                                       test_info.programs,
-                                       f->name,
-                                       f->nameInCode,
-                                       relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        double *p = (double *)gIn;
-        double *p2 = (double *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
-    vlog("\n");
-
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
-{
-    const TestInfo *job = (const TestInfo *)data;
-    size_t buffer_elements = job->subBufferSize;
-    size_t buffer_size = buffer_elements * sizeof(cl_double);
-    cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
-    float ulps = job->ulps;
-    dptr func = job->f->dfunc;
-    int ftz = job->ftz;
-    bool relaxedMode = job->relaxedMode;
-    MTdata d = tinfo->d;
-    cl_uint j, k;
-    cl_int error;
-    const char *name = job->f->name;
-    cl_ulong *t;
-    cl_double *r;
-    cl_double *s;
-    cl_double *s2;
-
-    Force64BitFPUPrecision();
-
-    // start the map of the output arrays
-    cl_event e[VECTOR_SIZE_COUNT];
-    cl_ulong *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
-
-    // Init input array
-    cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
-    cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
-    j = 0;
-    int totalSpecialValueCount =
-        specialValuesDoubleCount * specialValuesDoubleCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
-
-    if (job_id <= (cl_uint)indx)
-    { // test edge cases
-        cl_double *fp = (cl_double *)p;
-        cl_double *fp2 = (cl_double *)p2;
-        uint32_t x, y;
-
-        x = (job_id * buffer_elements) % specialValuesDoubleCount;
-        y = (job_id * buffer_elements) / specialValuesDoubleCount;
-
-        for (; j < buffer_elements; j++)
-        {
-            fp[j] = specialValuesDouble[x];
-            fp2[j] = specialValuesDouble[y];
-            if (++x >= specialValuesDoubleCount)
-            {
-                x = 0;
-                y++;
-                if (y >= specialValuesDoubleCount) break;
-            }
-        }
-    }
-
-    // Init any remaining values.
-    for (; j < buffer_elements; j++)
-    {
-        p[j] = genrand_int64(d);
-        p2[j] = genrand_int64(d);
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
-                                      buffer_size, p, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
-                                      buffer_size, p2, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            goto exit;
-        }
-        if ((error = clReleaseEvent(e[j])))
-        {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // Fill the result buffer with garbage, so that old results don't carry
-        // over
-        uint32_t pattern = 0xffffdead;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // run the kernel
-        size_t vectorCount =
-            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
-                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];
-
-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-
-        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
-                                            &vectorCount, NULL, 0, NULL, NULL)))
-        {
-            vlog_error("FAILED -- could not execute kernel\n");
-            goto exit;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
-
-    if (gSkipCorrectnessTesting) return CL_SUCCESS;
-
-    // Calculate the correctly rounded reference result
-    r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
-    s = (cl_double *)gIn + thread_id * buffer_elements;
-    s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
-        r[j] = (cl_double)func.f_ff(s[j], s2[j]);
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
-            buffer_size, 0, NULL, NULL, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            goto exit;
-        }
-    }
-
-    // Wait for the last buffer
-    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
-    // Verify data
-    t = (cl_ulong *)r;
-    for (j = 0; j < buffer_elements; j++)
-    {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        {
-            cl_ulong *q = out[k];
-
-            // If we aren't getting the correctly rounded result
-            if (t[j] != q[j])
-            {
-                cl_double test = ((cl_double *)q)[j];
-                long double correct = func.f_ff(s[j], s2[j]);
-                float err = Bruteforce_Ulp_Error_Double(test, correct);
-                int fail = !(fabsf(err) <= ulps);
-
-                if (fail && ftz)
-                {
-                    // retry per section 6.5.3.2
-                    if (IsDoubleResultSubnormal(correct, ulps))
-                    {
-                        fail = fail && (test != 0.0f);
-                        if (!fail) err = 0.0f;
-                    }
-
-
-                    // retry per section 6.5.3.3
-                    if (IsDoubleSubnormal(s[j]))
-                    {
-                        long double correct2 = func.f_ff(0.0, s2[j]);
-                        long double correct3 = func.f_ff(-0.0, s2[j]);
-                        float err2 =
-                            Bruteforce_Ulp_Error_Double(test, correct2);
-                        float err3 =
-                            Bruteforce_Ulp_Error_Double(test, correct3);
-                        fail = fail
-                            && ((!(fabsf(err2) <= ulps))
-                                && (!(fabsf(err3) <= ulps)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
-
-                        // retry per section 6.5.3.4
-                        if (IsDoubleResultSubnormal(correct2, ulps)
-                            || IsDoubleResultSubnormal(correct3, ulps))
-                        {
-                            fail = fail && (test != 0.0f);
-                            if (!fail) err = 0.0f;
-                        }
-
-                        // try with both args as zero
-                        if (IsDoubleSubnormal(s2[j]))
-                        {
-                            correct2 = func.f_ff(0.0, 0.0);
-                            correct3 = func.f_ff(-0.0, 0.0);
-                            long double correct4 = func.f_ff(0.0, -0.0);
-                            long double correct5 = func.f_ff(-0.0, -0.0);
-                            err2 = Bruteforce_Ulp_Error_Double(test, correct2);
-                            err3 = Bruteforce_Ulp_Error_Double(test, correct3);
-                            float err4 =
-                                Bruteforce_Ulp_Error_Double(test, correct4);
-                            float err5 =
-                                Bruteforce_Ulp_Error_Double(test, correct5);
-                            fail = fail
-                                && ((!(fabsf(err2) <= ulps))
-                                    && (!(fabsf(err3) <= ulps))
-                                    && (!(fabsf(err4) <= ulps))
-                                    && (!(fabsf(err5) <= ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-                            if (fabsf(err4) < fabsf(err)) err = err4;
-                            if (fabsf(err5) < fabsf(err)) err = err5;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2, ulps)
-                                || IsDoubleResultSubnormal(correct3, ulps)
-                                || IsDoubleResultSubnormal(correct4, ulps)
-                                || IsDoubleResultSubnormal(correct5, ulps))
-                            {
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-                    else if (IsDoubleSubnormal(s2[j]))
-                    {
-                        long double correct2 = func.f_ff(s[j], 0.0);
-                        long double correct3 = func.f_ff(s[j], -0.0);
-                        float err2 =
-                            Bruteforce_Ulp_Error_Double(test, correct2);
-                        float err3 =
-                            Bruteforce_Ulp_Error_Double(test, correct3);
-                        fail = fail
-                            && ((!(fabsf(err2) <= ulps))
-                                && (!(fabsf(err3) <= ulps)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
-
-                        // retry per section 6.5.3.4
-                        if (IsDoubleResultSubnormal(correct2, ulps)
-                            || IsDoubleResultSubnormal(correct3, ulps))
-                        {
-                            fail = fail && (test != 0.0f);
-                            if (!fail) err = 0.0f;
-                        }
-                    }
-                }
-
-                if (fabsf(err) > tinfo->maxError)
-                {
-                    tinfo->maxError = fabsf(err);
-                    tinfo->maxErrorValue = s[j];
-                    tinfo->maxErrorValue2 = s2[j];
-                }
-                if (fail)
-                {
-                    vlog_error(
-                        "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a\n",
-                        name, sizeNames[k], err, s[j], s2[j], r[j], test);
-                    error = -1;
-                    goto exit;
-                }
-            }
-        }
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
-                       j, error);
-            return error;
-        }
-    }
-
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
-
-
-    if (0 == (base & 0x0fffffff))
-    {
-        if (gVerboseBruteForce)
-        {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
-                 "ThreadCount:%2u\n",
-                 base, job->step, job->scale, buffer_elements, job->ulps,
-                 job->threadCount);
-        }
-        else
-        {
-            vlog(".");
-        }
-        fflush(stdout);
-    }
-
-exit:
-    return error;
-}
diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
deleted file mode 100644
index a20c0571..00000000
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ /dev/null
@@ -1,1298 +0,0 @@
-//
-// Copyright (c) 2017 The Khronos Group Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-#include "function_list.h"
-#include "test_functions.h"
-#include "utility.h"
-
-#include <limits.h>
-#include <string.h>
-
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global float",
-                        sizeNames[vectorSize],
-                        "* out, __global int",
-                        sizeNames[vectorSize],
-                        "* out2, __global float",
-                        sizeNames[vectorSize],
-                        "* in1, __global float",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i], out2 + i );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global float* out, __global int* out2, __global float* in, "
-        "__global float* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       float3 f0 = vload3( 0, in + 3 * i );\n"
-        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-        "       int3 i0 = 0xdeaddead;\n"
-        "       f0 = ",
-        name,
-        "( f0, f1, &i0 );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "       vstore3( i0, 0, out2 + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       float3 f0;\n"
-        "       float3 f1;\n"
-        "       int3 i0 = 0xdeaddead;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0, f1, &i0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               out2[3*i+1] = i0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               out2[3*i] = i0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global int",
-                        sizeNames[vectorSize],
-                        "* out2, __global double",
-                        sizeNames[vectorSize],
-                        "* in1, __global double",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i], out2 + i );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global int* out2, __global double* in, "
-        "__global double* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 d0 = vload3( 0, in + 3 * i );\n"
-        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-        "       int3 i0 = 0xdeaddead;\n"
-        "       d0 = ",
-        name,
-        "( d0, d1, &i0 );\n"
-        "       vstore3( d0, 0, out + 3*i );\n"
-        "       vstore3( i0, 0, out2 + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 d0;\n"
-        "       double3 d1;\n"
-        "       int3 i0 = 0xdeaddead;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       d0 = ",
-        name,
-        "( d0, d1, &i0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = d0.y; \n"
-        "               out2[3*i+1] = i0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = d0.x; \n"
-        "               out2[3*i] = i0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
-typedef struct BuildKernelInfo
-{
-    cl_uint offset; // the first vector size to build
-    cl_kernel *kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
-}
-
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
-}
-
-typedef struct ComputeReferenceInfoF_
-{
-    const float *x;
-    const float *y;
-    float *r;
-    int *i;
-    double (*f_ffpI)(double, double, int *);
-    cl_uint lim;
-    cl_uint count;
-} ComputeReferenceInfoF;
-
-typedef struct ComputeReferenceInfoD_
-{
-    const double *x;
-    const double *y;
-    double *r;
-    int *i;
-    long double (*f_ffpI)(long double, long double, int *);
-    cl_uint lim;
-    cl_uint count;
-} ComputeReferenceInfoD;
-
-static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
-{
-    ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
-    cl_uint lim = cri->lim;
-    cl_uint count = cri->count;
-    cl_uint off = jid * count;
-    const float *x = cri->x + off;
-    const float *y = cri->y + off;
-    float *r = cri->r + off;
-    int *i = cri->i + off;
-    double (*f)(double, double, int *) = cri->f_ffpI;
-    cl_uint j;
-
-    if (off + count > lim) count = lim - off;
-
-    for (j = 0; j < count; ++j)
-        r[j] = (float)f((double)x[j], (double)y[j], i + j);
-
-    return CL_SUCCESS;
-}
-
-static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
-{
-    ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
-    cl_uint lim = cri->lim;
-    cl_uint count = cri->count;
-    cl_uint off = jid * count;
-    const double *x = cri->x + off;
-    const double *y = cri->y + off;
-    double *r = cri->r + off;
-    int *i = cri->i + off;
-    long double (*f)(long double, long double, int *) = cri->f_ffpI;
-    cl_uint j;
-
-    if (off + count > lim) count = lim - off;
-
-    Force64BitFPUPrecision();
-
-    for (j = 0; j < count; ++j)
-        r[j] = (double)f((long double)x[j], (long double)y[j], i + j);
-
-    return CL_SUCCESS;
-}
-
-int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    float maxError = 0.0f;
-    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    int64_t maxError2 = 0;
-    float maxErrorVal = 0.0f;
-    float maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
-
-    cl_uint threadCount = GetThreadCount();
-
-    float float_ulps;
-    if (gIsEmbedded)
-        float_ulps = f->float_embedded_ulps;
-    else
-        float_ulps = f->float_ulps;
-
-    int testingRemquo = !strcmp(f->name, "remquo");
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        cl_uint *p = (cl_uint *)gIn;
-        cl_uint *p2 = (cl_uint *)gIn2;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            p[j] = genrand_int32(d);
-            p2[j] = genrand_int32(d);
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-
-            memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
-                                          bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        float *s = (float *)gIn;
-        float *s2 = (float *)gIn2;
-
-        if (threadCount > 1)
-        {
-            ComputeReferenceInfoF cri;
-            cri.x = s;
-            cri.y = s2;
-            cri.r = (float *)gOut_Ref;
-            cri.i = (int *)gOut_Ref2;
-            cri.f_ffpI = f->func.f_ffpI;
-            cri.lim = bufferSize / sizeof(float);
-            cri.count = (cri.lim + threadCount - 1) / threadCount;
-            ThreadPool_Do(ReferenceF, threadCount, &cri);
-        }
-        else
-        {
-            float *r = (float *)gOut_Ref;
-            int *r2 = (int *)gOut_Ref2;
-            for (j = 0; j < bufferSize / sizeof(float); j++)
-                r[j] = (float)f->func.f_ffpI(s[j], s2[j], r2 + j);
-        }
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray2 failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data
-        uint32_t *t = (uint32_t *)gOut_Ref;
-        int32_t *t2 = (int32_t *)gOut_Ref2;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-            {
-                uint32_t *q = (uint32_t *)(gOut[k]);
-                int32_t *q2 = (int32_t *)gOut2[k];
-
-                // Check for exact match to correctly rounded result
-                if (t[j] == q[j] && t2[j] == q2[j]) continue;
-
-                // Check for paired NaNs
-                if ((t[j] & 0x7fffffff) > 0x7f800000
-                    && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j])
-                    continue;
-
-                float test = ((float *)q)[j];
-                int correct2 = INT_MIN;
-                double correct = f->func.f_ffpI(s[j], s2[j], &correct2);
-                float err = Ulp_Error(test, correct);
-                int64_t iErr;
-
-                // in case of remquo, we only care about the sign and last
-                // seven bits of integer as per the spec.
-                if (testingRemquo)
-                    iErr = (long long)(q2[j] & 0x0000007f)
-                        - (long long)(correct2 & 0x0000007f);
-                else
-                    iErr = (long long)q2[j] - (long long)correct2;
-
-                // For remquo, if y = 0, x is infinite, or either is NaN
-                // then the standard either neglects to say what is returned
-                // in iptr or leaves it undefined or implementation defined.
-                int iptrUndefined = fabs(((float *)gIn)[j]) == INFINITY
-                    || ((float *)gIn2)[j] == 0.0f || isnan(((float *)gIn2)[j])
-                    || isnan(((float *)gIn)[j]);
-                if (iptrUndefined) iErr = 0;
-
-                int fail = !(fabsf(err) <= float_ulps && iErr == 0);
-                if (ftz && fail)
-                {
-                    // retry per section 6.5.3.2
-                    if (IsFloatResultSubnormal(correct, float_ulps))
-                    {
-                        fail = fail && !(test == 0.0f && iErr == 0);
-                        if (!fail) err = 0.0f;
-                    }
-
-                    // retry per section 6.5.3.3
-                    if (IsFloatSubnormal(s[j]))
-                    {
-                        int correct3i, correct4i;
-                        double correct3 =
-                            f->func.f_ffpI(0.0, s2[j], &correct3i);
-                        double correct4 =
-                            f->func.f_ffpI(-0.0, s2[j], &correct4i);
-                        float err2 = Ulp_Error(test, correct3);
-                        float err3 = Ulp_Error(test, correct4);
-                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
-                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
-                        fail = fail
-                            && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
-                                && (!(fabsf(err3) <= float_ulps
-                                      && iErr4 == 0)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
-                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-
-                        // retry per section 6.5.3.4
-                        if (IsFloatResultSubnormal(correct2, float_ulps)
-                            || IsFloatResultSubnormal(correct3, float_ulps))
-                        {
-                            fail = fail
-                                && !(test == 0.0f
-                                     && (iErr3 == 0 || iErr4 == 0));
-                            if (!fail) err = 0.0f;
-                        }
-
-                        // try with both args as zero
-                        if (IsFloatSubnormal(s2[j]))
-                        {
-                            int correct7i, correct8i;
-                            correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i);
-                            correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i);
-                            double correct7 =
-                                f->func.f_ffpI(0.0, -0.0, &correct7i);
-                            double correct8 =
-                                f->func.f_ffpI(-0.0, -0.0, &correct8i);
-                            err2 = Ulp_Error(test, correct3);
-                            err3 = Ulp_Error(test, correct4);
-                            float err4 = Ulp_Error(test, correct7);
-                            float err5 = Ulp_Error(test, correct8);
-                            iErr3 = (long long)q2[j] - (long long)correct3i;
-                            iErr4 = (long long)q2[j] - (long long)correct4i;
-                            int64_t iErr7 =
-                                (long long)q2[j] - (long long)correct7i;
-                            int64_t iErr8 =
-                                (long long)q2[j] - (long long)correct8i;
-                            fail = fail
-                                && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
-                                    && (!(fabsf(err3) <= float_ulps
-                                          && iErr4 == 0))
-                                    && (!(fabsf(err4) <= float_ulps
-                                          && iErr7 == 0))
-                                    && (!(fabsf(err5) <= float_ulps
-                                          && iErr8 == 0)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-                            if (fabsf(err4) < fabsf(err)) err = err4;
-                            if (fabsf(err5) < fabsf(err)) err = err5;
-                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-                            if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
-                            if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
-
-                            // retry per section 6.5.3.4
-                            if (IsFloatResultSubnormal(correct3, float_ulps)
-                                || IsFloatResultSubnormal(correct4, float_ulps)
-                                || IsFloatResultSubnormal(correct7, float_ulps)
-                                || IsFloatResultSubnormal(correct8, float_ulps))
-                            {
-                                fail = fail
-                                    && !(test == 0.0f
-                                         && (iErr3 == 0 || iErr4 == 0
-                                             || iErr7 == 0 || iErr8 == 0));
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-                    else if (IsFloatSubnormal(s2[j]))
-                    {
-                        int correct3i, correct4i;
-                        double correct3 = f->func.f_ffpI(s[j], 0.0, &correct3i);
-                        double correct4 =
-                            f->func.f_ffpI(s[j], -0.0, &correct4i);
-                        float err2 = Ulp_Error(test, correct3);
-                        float err3 = Ulp_Error(test, correct4);
-                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
-                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
-                        fail = fail
-                            && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
-                                && (!(fabsf(err3) <= float_ulps
-                                      && iErr4 == 0)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
-                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-
-                        // retry per section 6.5.3.4
-                        if (IsFloatResultSubnormal(correct2, float_ulps)
-                            || IsFloatResultSubnormal(correct3, float_ulps))
-                        {
-                            fail = fail
-                                && !(test == 0.0f
-                                     && (iErr3 == 0 || iErr4 == 0));
-                            if (!fail) err = 0.0f;
-                        }
-                    }
-                }
-                    if (fabsf(err) > maxError)
-                    {
-                        maxError = fabsf(err);
-                        maxErrorVal = s[j];
-                    }
-                    if (llabs(iErr) > maxError2)
-                    {
-                        maxError2 = llabs(iErr);
-                        maxErrorVal2 = s[j];
-                    }
-
-                    if (fail)
-                    {
-                        vlog_error(
-                            "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} "
-                            "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, "
-                            "0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n",
-                            f->name, sizeNames[k], err, iErr, ((float *)gIn)[j],
-                            ((float *)gIn2)[j], ((cl_uint *)gIn)[j],
-                            ((cl_uint *)gIn2)[j], ((float *)gOut_Ref)[j],
-                            ((int *)gOut_Ref2)[j], ((cl_uint *)gOut_Ref)[j],
-                            ((cl_uint *)gOut_Ref2)[j], test, q2[j],
-                            ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]);
-                        error = -1;
-                        goto exit;
-                    }
-            }
-        }
-
-        if (0 == (i & 0x0fffffff))
-        {
-            if (gVerboseBruteForce)
-            {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
-            }
-            else
-            {
-                vlog(".");
-            }
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        cl_uint *p = (cl_uint *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            p[j] = genrand_int32(d);
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
-    vlog("\n");
-
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}
-
-int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    float maxError = 0.0f;
-    int64_t maxError2 = 0;
-    int ftz = f->ftz || gForceFTZ;
-    double maxErrorVal = 0.0f;
-    double maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(double), bufferSize);
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    cl_uint threadCount = GetThreadCount();
-
-    Force64BitFPUPrecision();
-
-    int testingRemquo = !strcmp(f->name, "remquo");
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        double *p2 = (double *)gIn2;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-
-            memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                          bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        double *s = (double *)gIn;
-        double *s2 = (double *)gIn2;
-
-        if (threadCount > 1)
-        {
-            ComputeReferenceInfoD cri;
-            cri.x = s;
-            cri.y = s2;
-            cri.r = (double *)gOut_Ref;
-            cri.i = (int *)gOut_Ref2;
-            cri.f_ffpI = f->dfunc.f_ffpI;
-            cri.lim = bufferSize / sizeof(double);
-            cri.count = (cri.lim + threadCount - 1) / threadCount;
-            ThreadPool_Do(ReferenceD, threadCount, &cri);
-        }
-        else
-        {
-            double *r = (double *)gOut_Ref;
-            int *r2 = (int *)gOut_Ref2;
-            for (j = 0; j < bufferSize / sizeof(double); j++)
-                r[j] = (double)f->dfunc.f_ffpI(s[j], s2[j], r2 + j);
-        }
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray2 failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data
-        uint64_t *t = (uint64_t *)gOut_Ref;
-        int32_t *t2 = (int32_t *)gOut_Ref2;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-        {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-            {
-                uint64_t *q = (uint64_t *)gOut[k];
-                int32_t *q2 = (int32_t *)gOut2[k];
-
-                // Check for exact match to correctly rounded result
-                if (t[j] == q[j] && t2[j] == q2[j]) continue;
-
-                // Check for paired NaNs
-                if ((t[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL
-                    && (q[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL
-                    && t2[j] == q2[j])
-                    continue;
-
-                double test = ((double *)q)[j];
-                int correct2 = INT_MIN;
-                long double correct = f->dfunc.f_ffpI(s[j], s2[j], &correct2);
-                float err = Bruteforce_Ulp_Error_Double(test, correct);
-                int64_t iErr;
-
-                // in case of remquo, we only care about the sign and last
-                // seven bits of integer as per the spec.
-                if (testingRemquo)
-                    iErr = (long long)(q2[j] & 0x0000007f)
-                        - (long long)(correct2 & 0x0000007f);
-                else
-                    iErr = (long long)q2[j] - (long long)correct2;
-
-                // For remquo, if y = 0, x is infinite, or either is NaN
-                // then the standard either neglects to say what is returned
-                // in iptr or leaves it undefined or implementation defined.
-                int iptrUndefined = fabs(((double *)gIn)[j]) == INFINITY
-                    || ((double *)gIn2)[j] == 0.0 || isnan(((double *)gIn2)[j])
-                    || isnan(((double *)gIn)[j]);
-                if (iptrUndefined) iErr = 0;
-
-                int fail = !(fabsf(err) <= f->double_ulps && iErr == 0);
-                if (ftz && fail)
-                {
-                    // retry per section 6.5.3.2
-                    if (IsDoubleResultSubnormal(correct, f->double_ulps))
-                    {
-                        fail = fail && !(test == 0.0f && iErr == 0);
-                        if (!fail) err = 0.0f;
-                    }
-
-                    // retry per section 6.5.3.3
-                    if (IsDoubleSubnormal(s[j]))
-                    {
-                        int correct3i, correct4i;
-                        long double correct3 =
-                            f->dfunc.f_ffpI(0.0, s2[j], &correct3i);
-                        long double correct4 =
-                            f->dfunc.f_ffpI(-0.0, s2[j], &correct4i);
-                        float err2 =
-                            Bruteforce_Ulp_Error_Double(test, correct3);
-                        float err3 =
-                            Bruteforce_Ulp_Error_Double(test, correct4);
-                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
-                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
-                        fail = fail
-                            && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0))
-                                && (!(fabsf(err3) <= f->double_ulps
-                                      && iErr4 == 0)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
-                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-
-                        // retry per section 6.5.3.4
-                        if (IsDoubleResultSubnormal(correct2, f->double_ulps)
-                            || IsDoubleResultSubnormal(correct3,
-                                                       f->double_ulps))
-                        {
-                            fail = fail
-                                && !(test == 0.0f
-                                     && (iErr3 == 0 || iErr4 == 0));
-                            if (!fail) err = 0.0f;
-                        }
-
-                        // try with both args as zero
-                        if (IsDoubleSubnormal(s2[j]))
-                        {
-                            int correct7i, correct8i;
-                            correct3 = f->dfunc.f_ffpI(0.0, 0.0, &correct3i);
-                            correct4 = f->dfunc.f_ffpI(-0.0, 0.0, &correct4i);
-                            long double correct7 =
-                                f->dfunc.f_ffpI(0.0, -0.0, &correct7i);
-                            long double correct8 =
-                                f->dfunc.f_ffpI(-0.0, -0.0, &correct8i);
-                            err2 = Bruteforce_Ulp_Error_Double(test, correct3);
-                            err3 = Bruteforce_Ulp_Error_Double(test, correct4);
-                            float err4 =
-                                Bruteforce_Ulp_Error_Double(test, correct7);
-                            float err5 =
-                                Bruteforce_Ulp_Error_Double(test, correct8);
-                            iErr3 = (long long)q2[j] - (long long)correct3i;
-                            iErr4 = (long long)q2[j] - (long long)correct4i;
-                            int64_t iErr7 =
-                                (long long)q2[j] - (long long)correct7i;
-                            int64_t iErr8 =
-                                (long long)q2[j] - (long long)correct8i;
-                            fail = fail
-                                && ((!(fabsf(err2) <= f->double_ulps
-                                       && iErr3 == 0))
-                                    && (!(fabsf(err3) <= f->double_ulps
-                                          && iErr4 == 0))
-                                    && (!(fabsf(err4) <= f->double_ulps
-                                          && iErr7 == 0))
-                                    && (!(fabsf(err5) <= f->double_ulps
-                                          && iErr8 == 0)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-                            if (fabsf(err4) < fabsf(err)) err = err4;
-                            if (fabsf(err5) < fabsf(err)) err = err5;
-                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-                            if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
-                            if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct3,
-                                                        f->double_ulps)
-                                || IsDoubleResultSubnormal(correct4,
-                                                           f->double_ulps)
-                                || IsDoubleResultSubnormal(correct7,
-                                                           f->double_ulps)
-                                || IsDoubleResultSubnormal(correct8,
-                                                           f->double_ulps))
-                            {
-                                fail = fail
-                                    && !(test == 0.0f
-                                         && (iErr3 == 0 || iErr4 == 0
-                                             || iErr7 == 0 || iErr8 == 0));
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-                    else if (IsDoubleSubnormal(s2[j]))
-                    {
-                        int correct3i, correct4i;
-                        long double correct3 =
-                            f->dfunc.f_ffpI(s[j], 0.0, &correct3i);
-                        long double correct4 =
-                            f->dfunc.f_ffpI(s[j], -0.0, &correct4i);
-                        float err2 =
-                            Bruteforce_Ulp_Error_Double(test, correct3);
-                        float err3 =
-                            Bruteforce_Ulp_Error_Double(test, correct4);
-                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
-                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
-                        fail = fail
-                            && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0))
-                                && (!(fabsf(err3) <= f->double_ulps
-                                      && iErr4 == 0)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
-                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
-                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
-
-                        // retry per section 6.5.3.4
-                        if (IsDoubleResultSubnormal(correct2, f->double_ulps)
-                            || IsDoubleResultSubnormal(correct3,
-                                                       f->double_ulps))
-                        {
-                            fail = fail
-                                && !(test == 0.0f
-                                     && (iErr3 == 0 || iErr4 == 0));
-                            if (!fail) err = 0.0f;
-                        }
-                    }
-                }
-                    if (fabsf(err) > maxError)
-                    {
-                        maxError = fabsf(err);
-                        maxErrorVal = s[j];
-                    }
-                    if (llabs(iErr) > maxError2)
-                    {
-                        maxError2 = llabs(iErr);
-                        maxErrorVal2 = s[j];
-                    }
-
-                    if (fail)
-                    {
-                        vlog_error(
-                            "\nERROR: %sD%s: {%f, %lld} ulp error at {%.13la, "
-                            "%.13la} ({ 0x%16.16llx, 0x%16.16llx}): *{%.13la, "
-                            "%d} ({ 0x%16.16llx, 0x%8.8x}) vs. {%.13la, %d} ({ "
-                            "0x%16.16llx, 0x%8.8x})\n",
-                            f->name, sizeNames[k], err, iErr,
-                            ((double *)gIn)[j], ((double *)gIn2)[j],
-                            ((cl_ulong *)gIn)[j], ((cl_ulong *)gIn2)[j],
-                            ((double *)gOut_Ref)[j], ((int *)gOut_Ref2)[j],
-                            ((cl_ulong *)gOut_Ref)[j],
-                            ((cl_uint *)gOut_Ref2)[j], test, q2[j],
-                            ((cl_ulong *)q)[j], ((cl_uint *)q2)[j]);
-                        error = -1;
-                        goto exit;
-                    }
-            }
-        }
-
-        if (0 == (i & 0x0fffffff))
-        {
-            if (gVerboseBruteForce)
-            {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
-            }
-            else
-            {
-                vlog(".");
-            }
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
-    vlog("\n");
-
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}
diff --git a/test_conformance/math_brute_force/binary_two_results_i_double.cpp b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
new file mode 100644
index 00000000..5f1ba3b2
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_two_results_i_double.cpp
@@ -0,0 +1,671 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <limits.h>
+#include <string.h>
+
+// Assemble the source of OpenCL kernel "math_kernel<sizeNames[vectorSize]>",
+// which calls `name` on two double inputs and stores the double result plus
+// the int result, then hand it to MakeKernel and return MakeKernel's result.
+static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
+                             cl_program *p, bool relaxedMode)
+{
+    const char *src[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                          "__kernel void math_kernel",
+                          sizeNames[vectorSize],
+                          "( __global double",
+                          sizeNames[vectorSize],
+                          "* out, __global int",
+                          sizeNames[vectorSize],
+                          "* out2, __global double",
+                          sizeNames[vectorSize],
+                          "* in1, __global double",
+                          sizeNames[vectorSize],
+                          "* in2 )\n"
+                          "{\n"
+                          "   size_t i = get_global_id(0);\n"
+                          "   out[i] = ",
+                          name,
+                          "( in1[i], in2[i], out2 + i );\n"
+                          "}\n" };
+
+    const char *src3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global int* out2, __global double* in, "
+        "__global double* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, &i0 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "       vstore3( i0, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, &i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               out2[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               out2[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    // Three-component vectors take the vload3/vstore3 variant.
+    const bool isVec3 = sizeValues[vectorSize] == 3;
+    const char **source = isVec3 ? src3 : src;
+    const size_t pieces = isVec3 ? sizeof(src3) / sizeof(src3[0])
+                                 : sizeof(src) / sizeof(src[0]);
+
+    char kernelName[32];
+    snprintf(kernelName, sizeof(kernelName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernel(source, (cl_uint)pieces, kernelName, k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo // job arguments for BuildKernel_DoubleFn
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // kernel array; job n fills slot offset + n
+    cl_program *programs; // program array, indexed like kernels
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{
+    const BuildKernelInfo *job = (const BuildKernelInfo *)p;
+    const cl_uint slot = job->offset + job_id; // vector-size index to build
+    return BuildKernelDouble(job->nameInCode, slot, &job->kernels[slot],
+                             &job->programs[slot], job->relaxedMode);
+}
+
+typedef struct ComputeReferenceInfoD_ // shared arguments for ReferenceD jobs
+{
+    const double *x; // first input array
+    const double *y; // second input array
+    double *r; // reference floating-point results
+    int *i; // reference integer results (third argument of f_ffpI)
+    long double (*f_ffpI)(long double, long double, int *); // reference fn
+    cl_uint lim; // total number of elements to compute
+    cl_uint count; // elements handled per job
+} ComputeReferenceInfoD;
+
+// Thread-pool job: computes slice `jid` (cri->count elements starting at
+// jid * cri->count) of the reference results, never going past cri->lim.
+static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
+{
+    ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
+    cl_uint count = cri->count;
+    cl_uint off = jid * count;
+    Force64BitFPUPrecision();
+
+    // count is rounded up by the caller, so trailing jobs can start at or
+    // past lim; `lim - off` would then wrap (unsigned) and overrun the arrays.
+    if (off >= cri->lim) return CL_SUCCESS;
+    if (count > cri->lim - off) count = cri->lim - off;
+
+    const double *x = cri->x + off;
+    const double *y = cri->y + off;
+    double *r = cri->r + off;
+    int *i = cri->i + off;
+    for (cl_uint j = 0; j < count; ++j)
+        r[j] = (double)cri->f_ffpI((long double)x[j], (long double)y[j], i + j);
+    return CL_SUCCESS;
+}
+
+// Brute-force test of a double-precision function taking two doubles and
+// returning a double plus an int through a pointer (remquo is special-cased).
+// Random inputs are run on the device for each enabled vector size and the
+// results are checked against f->dfunc.f_ffpI within f->double_ulps.
+// Returns 0 on success, the failing call's error code, or -1 on a mismatch.
+int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError = 0.0f;
+    int64_t maxError2 = 0;
+    int ftz = f->ftz || gForceFTZ;
+    double maxErrorVal = 0.0f;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(double), bufferSize);
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    cl_uint threadCount = GetThreadCount();
+
+    Force64BitFPUPrecision();
+
+    int testingRemquo = !strcmp(f->name, "remquo");
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        double *p = (double *)gIn;
+        double *p2 = (double *)gIn2;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+        {
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+            p2[j] = DoubleFromUInt32(genrand_int32(d));
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release the kernels/programs built above
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit; // release the kernels/programs built above
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+
+            memset_pattern4(gOut2[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        double *s = (double *)gIn;
+        double *s2 = (double *)gIn2;
+
+        if (threadCount > 1)
+        {
+            ComputeReferenceInfoD cri;
+            cri.x = s;
+            cri.y = s2;
+            cri.r = (double *)gOut_Ref;
+            cri.i = (int *)gOut_Ref2;
+            cri.f_ffpI = f->dfunc.f_ffpI;
+            cri.lim = bufferSize / sizeof(double);
+            cri.count = (cri.lim + threadCount - 1) / threadCount;
+            ThreadPool_Do(ReferenceD, threadCount, &cri);
+        }
+        else
+        {
+            double *r = (double *)gOut_Ref;
+            int *r2 = (int *)gOut_Ref2;
+            for (j = 0; j < bufferSize / sizeof(double); j++)
+                r[j] = (double)f->dfunc.f_ffpI(s[j], s2[j], r2 + j);
+        }
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint64_t *t = (uint64_t *)gOut_Ref;
+        int32_t *t2 = (int32_t *)gOut_Ref2;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint64_t *q = (uint64_t *)gOut[k];
+                int32_t *q2 = (int32_t *)gOut2[k];
+
+                // Check for exact match to correctly rounded result
+                if (t[j] == q[j] && t2[j] == q2[j]) continue;
+
+                // Check for paired NaNs
+                if ((t[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL
+                    && (q[j] & 0x7fffffffffffffffUL) > 0x7ff0000000000000UL
+                    && t2[j] == q2[j])
+                    continue;
+
+                double test = ((double *)q)[j];
+                int correct2 = INT_MIN;
+                long double correct = f->dfunc.f_ffpI(s[j], s2[j], &correct2);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int64_t iErr;
+
+                // in case of remquo, we only care about the sign and last
+                // seven bits of integer as per the spec.
+                if (testingRemquo)
+                    iErr = (long long)(q2[j] & 0x0000007f)
+                        - (long long)(correct2 & 0x0000007f);
+                else
+                    iErr = (long long)q2[j] - (long long)correct2;
+
+                // For remquo, if y = 0, x is infinite, or either is NaN
+                // then the standard either neglects to say what is returned
+                // in iptr or leaves it undefined or implementation defined.
+                int iptrUndefined = fabs(((double *)gIn)[j]) == INFINITY
+                    || ((double *)gIn2)[j] == 0.0 || isnan(((double *)gIn2)[j])
+                    || isnan(((double *)gIn)[j]);
+                if (iptrUndefined) iErr = 0;
+
+                int fail = !(fabsf(err) <= f->double_ulps && iErr == 0);
+                if (ftz && fail)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsDoubleResultSubnormal(correct, f->double_ulps))
+                    {
+                        fail = fail && !(test == 0.0f && iErr == 0);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsDoubleSubnormal(s[j]))
+                    {
+                        int correct3i, correct4i;
+                        long double correct3 =
+                            f->dfunc.f_ffpI(0.0, s2[j], &correct3i);
+                        long double correct4 =
+                            f->dfunc.f_ffpI(-0.0, s2[j], &correct4i);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= f->double_ulps
+                                      && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4 (flushed-input results)
+                        if (IsDoubleResultSubnormal(correct3, f->double_ulps)
+                            || IsDoubleResultSubnormal(correct4,
+                                                       f->double_ulps))
+                        {
+                            fail = fail
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // try with both args as zero
+                        if (IsDoubleSubnormal(s2[j]))
+                        {
+                            int correct7i, correct8i;
+                            correct3 = f->dfunc.f_ffpI(0.0, 0.0, &correct3i);
+                            correct4 = f->dfunc.f_ffpI(-0.0, 0.0, &correct4i);
+                            long double correct7 =
+                                f->dfunc.f_ffpI(0.0, -0.0, &correct7i);
+                            long double correct8 =
+                                f->dfunc.f_ffpI(-0.0, -0.0, &correct8i);
+                            err2 = Bruteforce_Ulp_Error_Double(test, correct3);
+                            err3 = Bruteforce_Ulp_Error_Double(test, correct4);
+                            float err4 =
+                                Bruteforce_Ulp_Error_Double(test, correct7);
+                            float err5 =
+                                Bruteforce_Ulp_Error_Double(test, correct8);
+                            iErr3 = (long long)q2[j] - (long long)correct3i;
+                            iErr4 = (long long)q2[j] - (long long)correct4i;
+                            int64_t iErr7 =
+                                (long long)q2[j] - (long long)correct7i;
+                            int64_t iErr8 =
+                                (long long)q2[j] - (long long)correct8i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps
+                                       && iErr3 == 0))
+                                    && (!(fabsf(err3) <= f->double_ulps
+                                          && iErr4 == 0))
+                                    && (!(fabsf(err4) <= f->double_ulps
+                                          && iErr7 == 0))
+                                    && (!(fabsf(err5) <= f->double_ulps
+                                          && iErr8 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+                            if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
+                            if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct3,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct4,
+                                                           f->double_ulps)
+                                || IsDoubleResultSubnormal(correct7,
+                                                           f->double_ulps)
+                                || IsDoubleResultSubnormal(correct8,
+                                                           f->double_ulps))
+                            {
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0
+                                             || iErr7 == 0 || iErr8 == 0));
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    else if (IsDoubleSubnormal(s2[j]))
+                    {
+                        int correct3i, correct4i;
+                        long double correct3 =
+                            f->dfunc.f_ffpI(s[j], 0.0, &correct3i);
+                        long double correct4 =
+                            f->dfunc.f_ffpI(s[j], -0.0, &correct4i);
+                        float err2 =
+                            Bruteforce_Ulp_Error_Double(test, correct3);
+                        float err3 =
+                            Bruteforce_Ulp_Error_Double(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= f->double_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= f->double_ulps
+                                      && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4 (flushed-input results)
+                        if (IsDoubleResultSubnormal(correct3, f->double_ulps)
+                            || IsDoubleResultSubnormal(correct4,
+                                                       f->double_ulps))
+                        {
+                            fail = fail
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+                if (fabsf(err) > maxError)
+                {
+                    maxError = fabsf(err);
+                    maxErrorVal = s[j];
+                }
+                if (llabs(iErr) > maxError2) maxError2 = llabs(iErr);
+
+                if (fail)
+                {
+                    vlog_error(
+                        "\nERROR: %sD%s: {%f, %lld} ulp error at {%.13la, "
+                        "%.13la} ({ 0x%16.16llx, 0x%16.16llx}): *{%.13la, "
+                        "%d} ({ 0x%16.16llx, 0x%8.8x}) vs. {%.13la, %d} ({ "
+                        "0x%16.16llx, 0x%8.8x})\n",
+                        f->name, sizeNames[k], err, iErr, ((double *)gIn)[j],
+                        ((double *)gIn2)[j], ((cl_ulong *)gIn)[j],
+                        ((cl_ulong *)gIn2)[j], ((double *)gOut_Ref)[j],
+                        ((int *)gOut_Ref2)[j], ((cl_ulong *)gOut_Ref)[j],
+                        ((cl_uint *)gOut_Ref2)[j], test, q2[j],
+                        ((cl_ulong *)q)[j], ((cl_uint *)q2)[j]);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14llu step:%10llu  bufferSize:%10zu \n",
+                     (unsigned long long)i, (unsigned long long)step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        double *p = (double *)gIn;
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release the kernels/programs built above
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit; // release the kernels/programs built above
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_two_results_i_float.cpp b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
new file mode 100644
index 00000000..4ea7a85d
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_two_results_i_float.cpp
@@ -0,0 +1,657 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <limits.h>
+#include <string.h>
+
+// Assembles the OpenCL C source for math_kernel<N>, which stores
+// name(in1[i], in2[i], &int result) into out/out2, and builds it with
+// MakeKernel. vectorSize indexes sizeNames/sizeValues; relaxedMode is
+// forwarded to MakeKernel. Returns MakeKernel's result.
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global int",
+                        sizeNames[vectorSize],
+                        "* out2, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], out2 + i );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global int* out2, __global float* in, "
+        "__global float* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, &i0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( i0, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
+        "       int3 i0 = 0xdeaddead;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, &i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    // Three-element vectors need the vload3/vstore3 variant.
+    const bool isVec3 = (sizeValues[vectorSize] == 3);
+    const char **kern = isVec3 ? c3 : c;
+    size_t kernSize =
+        isVec3 ? sizeof(c3) / sizeof(c3[0]) : sizeof(c) / sizeof(c[0]);
+
+    char testName[32]; // snprintf NUL-terminates within the size it is given
+    snprintf(testName, sizeof(testName), "math_kernel%s",
+             sizeNames[vectorSize]);
+    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo // job description for BuildKernel_FloatFn
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // kernels + i is handed to BuildKernel for size i
+    cl_program *programs; // programs + i is handed to BuildKernel for size i
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
+{
+    const BuildKernelInfo *job = (const BuildKernelInfo *)p; // ThreadPool arg
+    const cl_uint n = job->offset + job_id; // vector size index to build
+    return BuildKernel(job->nameInCode, n, &job->kernels[n],
+                       &job->programs[n], job->relaxedMode);
+}
+
+typedef struct ComputeReferenceInfoF_ // shared arguments for ReferenceF jobs
+{
+    const float *x; // first input array
+    const float *y; // second input array
+    float *r; // reference float results, one per input pair
+    int *i; // reference int results, written through f_ffpI's pointer
+    double (*f_ffpI)(double, double, int *); // reference implementation
+    cl_uint lim; // total number of elements
+    cl_uint count; // elements handled by each job
+} ComputeReferenceInfoF;
+
+// ThreadPool job: fills cri->r and cri->i for the jid-th slice of the inputs.
+static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
+{
+    ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
+    cl_uint lim = cri->lim;
+    cl_uint count = cri->count;
+    cl_uint off = jid * count;
+
+    // A late job may start at or past lim; lim - off below would then wrap.
+    if (off >= lim) return CL_SUCCESS;
+    if (off + count > lim) count = lim - off; // clamp the final slice
+
+    const float *x = cri->x + off;
+    const float *y = cri->y + off;
+    float *r = cri->r + off;
+    int *i = cri->i + off;
+    for (cl_uint j = 0; j < count; ++j)
+        r[j] = (float)cri->f_ffpI((double)x[j], (double)y[j], i + j);
+    return CL_SUCCESS;
+}
+
+// Tests a function of the form float f(float, float, int *), e.g. remquo, for
+// every vector size in [gMinVectorSizeIndex, gMaxVectorSizeIndex). Random
+// inputs are run on the device and compared with the reference f->func.f_ffpI:
+// the float result may be off by float_ulps and the int result must match
+// (low seven bits only for remquo), with flush-to-zero retries when ftz is
+// set. With gMeasureTimes, kernel timings are reported too. Returns 0 on
+// success, an OpenCL error code, or -1 on a mismatch; once the kernels are
+// built, every exit path releases them and their programs.
+int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError = 0.0f;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    int64_t maxError2 = 0;
+    float maxErrorVal = 0.0f;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
+
+    cl_uint threadCount = GetThreadCount();
+
+    // Pick the ULP tolerance that applies to this device (embedded or not).
+    float float_ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
+
+    int testingRemquo = !strcmp(f->name, "remquo");
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
+            p[j] = genrand_int32(d);
+            p2[j] = genrand_int32(d);
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // kernels and programs still need releasing
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_TRUE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit; // kernels and programs still need releasing
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+
+            memset_pattern4(gOut2[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        float *s = (float *)gIn;
+        float *s2 = (float *)gIn2;
+
+        if (threadCount > 1)
+        {
+            ComputeReferenceInfoF cri;
+            cri.x = s;
+            cri.y = s2;
+            cri.r = (float *)gOut_Ref;
+            cri.i = (int *)gOut_Ref2;
+            cri.f_ffpI = f->func.f_ffpI;
+            cri.lim = bufferSize / sizeof(float);
+            cri.count = (cri.lim + threadCount - 1) / threadCount;
+            ThreadPool_Do(ReferenceF, threadCount, &cri);
+        }
+        else
+        {
+            float *r = (float *)gOut_Ref;
+            int *r2 = (int *)gOut_Ref2;
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                r[j] = (float)f->func.f_ffpI(s[j], s2[j], r2 + j);
+        }
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint32_t *t = (uint32_t *)gOut_Ref;
+        int32_t *t2 = (int32_t *)gOut_Ref2;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint32_t *q = (uint32_t *)(gOut[k]);
+                int32_t *q2 = (int32_t *)gOut2[k];
+
+                // Check for exact match to correctly rounded result
+                if (t[j] == q[j] && t2[j] == q2[j]) continue;
+
+                // Check for paired NaNs
+                if ((t[j] & 0x7fffffff) > 0x7f800000
+                    && (q[j] & 0x7fffffff) > 0x7f800000 && t2[j] == q2[j])
+                    continue;
+
+                float test = ((float *)q)[j];
+                int correct2 = INT_MIN;
+                double correct = f->func.f_ffpI(s[j], s2[j], &correct2);
+                float err = Ulp_Error(test, correct);
+                int64_t iErr;
+
+                // in case of remquo, we only care about the sign and last
+                // seven bits of integer as per the spec.
+                if (testingRemquo)
+                    iErr = (long long)(q2[j] & 0x0000007f)
+                        - (long long)(correct2 & 0x0000007f);
+                else
+                    iErr = (long long)q2[j] - (long long)correct2;
+
+                // For remquo, if y = 0, x is infinite, or either is NaN
+                // then the standard either neglects to say what is returned
+                // in iptr or leaves it undefined or implementation defined.
+                int iptrUndefined = fabs(((float *)gIn)[j]) == INFINITY
+                    || ((float *)gIn2)[j] == 0.0f || isnan(((float *)gIn2)[j])
+                    || isnan(((float *)gIn)[j]);
+                if (iptrUndefined) iErr = 0;
+
+                int fail = !(fabsf(err) <= float_ulps && iErr == 0);
+                if (ftz && fail)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsFloatResultSubnormal(correct, float_ulps))
+                    {
+                        fail = fail && !(test == 0.0f && iErr == 0);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsFloatSubnormal(s[j]))
+                    {
+                        int correct3i, correct4i;
+                        double correct3 =
+                            f->func.f_ffpI(0.0, s2[j], &correct3i);
+                        double correct4 =
+                            f->func.f_ffpI(-0.0, s2[j], &correct4i);
+                        float err2 = Ulp_Error(test, correct3);
+                        float err3 = Ulp_Error(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= float_ulps
+                                      && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4 (float results only)
+                        if (IsFloatResultSubnormal(correct3, float_ulps)
+                            || IsFloatResultSubnormal(correct4, float_ulps))
+                        {
+                            fail = fail
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // try with both args as zero
+                        if (IsFloatSubnormal(s2[j]))
+                        {
+                            int correct7i, correct8i;
+                            correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i);
+                            correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i);
+                            double correct7 =
+                                f->func.f_ffpI(0.0, -0.0, &correct7i);
+                            double correct8 =
+                                f->func.f_ffpI(-0.0, -0.0, &correct8i);
+                            err2 = Ulp_Error(test, correct3);
+                            err3 = Ulp_Error(test, correct4);
+                            float err4 = Ulp_Error(test, correct7);
+                            float err5 = Ulp_Error(test, correct8);
+                            iErr3 = (long long)q2[j] - (long long)correct3i;
+                            iErr4 = (long long)q2[j] - (long long)correct4i;
+                            int64_t iErr7 =
+                                (long long)q2[j] - (long long)correct7i;
+                            int64_t iErr8 =
+                                (long long)q2[j] - (long long)correct8i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
+                                    && (!(fabsf(err3) <= float_ulps
+                                          && iErr4 == 0))
+                                    && (!(fabsf(err4) <= float_ulps
+                                          && iErr7 == 0))
+                                    && (!(fabsf(err5) <= float_ulps
+                                          && iErr8 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+                            if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
+                            if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
+
+                            // retry per section 6.5.3.4
+                            if (IsFloatResultSubnormal(correct3, float_ulps)
+                                || IsFloatResultSubnormal(correct4, float_ulps)
+                                || IsFloatResultSubnormal(correct7, float_ulps)
+                                || IsFloatResultSubnormal(correct8, float_ulps))
+                            {
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0
+                                             || iErr7 == 0 || iErr8 == 0));
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    else if (IsFloatSubnormal(s2[j]))
+                    {
+                        int correct3i, correct4i;
+                        double correct3 = f->func.f_ffpI(s[j], 0.0, &correct3i);
+                        double correct4 =
+                            f->func.f_ffpI(s[j], -0.0, &correct4i);
+                        float err2 = Ulp_Error(test, correct3);
+                        float err3 = Ulp_Error(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= float_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= float_ulps
+                                      && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4 (float results only)
+                        if (IsFloatResultSubnormal(correct3, float_ulps)
+                            || IsFloatResultSubnormal(correct4, float_ulps))
+                        {
+                            fail = fail
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+                if (fabsf(err) > maxError)
+                {
+                    maxError = fabsf(err);
+                    maxErrorVal = s[j];
+                }
+                if (llabs(iErr) > maxError2) maxError2 = llabs(iErr);
+
+                if (fail)
+                {
+                    vlog_error(
+                        "\nERROR: %s%s: {%f, %lld} ulp error at {%a, %a} "
+                        "({0x%8.8x, 0x%8.8x}): *{%a, %d} ({0x%8.8x, "
+                        "0x%8.8x}) vs. {%a, %d} ({0x%8.8x, 0x%8.8x})\n",
+                        f->name, sizeNames[k], err, iErr, ((float *)gIn)[j],
+                        ((float *)gIn2)[j], ((cl_uint *)gIn)[j],
+                        ((cl_uint *)gIn2)[j], ((float *)gOut_Ref)[j],
+                        ((int *)gOut_Ref2)[j], ((cl_uint *)gOut_Ref)[j],
+                        ((cl_uint *)gOut_Ref2)[j], test, q2[j],
+                        ((cl_uint *)&test)[0], ((cl_uint *)q2)[j]);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14llu step:%10llu  bufferSize:%10zu \n",
+                     (unsigned long long)i, (unsigned long long)step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        cl_uint *p = (cl_uint *)gIn;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
+            p[j] = genrand_int32(d);
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // kernels and programs still need releasing
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit; // kernels and programs still need releasing
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary_double.cpp
similarity index 52%
rename from test_conformance/math_brute_force/i_unary.cpp
rename to test_conformance/math_brute_force/i_unary_double.cpp
index 9418d44d..8cb863b3 100644
--- a/test_conformance/math_brute_force/i_unary.cpp
+++ b/test_conformance/math_brute_force/i_unary_double.cpp
@@ -20,84 +20,6 @@
 
 #include <string.h>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global int",
-                        sizeNames[vectorSize],
-                        "* out, __global float",
-                        sizeNames[vectorSize],
-                        "* in)\n"
-                        "{\n"
-                        "   int i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global int* out, __global float* in)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       float3 f0 = vload3( 0, in + 3 * i );\n"
-        "       int3 i0 = ",
-        name,
-        "( f0 );\n"
-        "       vstore3( i0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       float3 f0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       int3 i0 = ",
-        name,
-        "( f0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = i0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = i0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
@@ -187,15 +109,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
-}
-
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
@@ -205,259 +118,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->programs + i, info->relaxedMode);
 }
 
-int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    // This test is not using ThreadPool so we need to disable FTZ here
-    // for reference computations
-    FPU_mode_type oldMode;
-    DisableFTZ(&oldMode);
-
-    Force64BitFPUPrecision();
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        cl_uint *p = (cl_uint *)gIn;
-        if (gWimpyMode)
-        {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
-                p[j] = (cl_uint)i + j * scale;
-        }
-        else
-        {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
-                p[j] = (uint32_t)i + j;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        int *r = (int *)gOut_Ref;
-        float *s = (float *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-            r[j] = f->func.i_f(s[j]);
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data
-        uint32_t *t = (uint32_t *)gOut_Ref;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-            {
-                uint32_t *q = (uint32_t *)(gOut[k]);
-                // If we aren't getting the correctly rounded result
-                if (t[j] != q[j])
-                {
-                    if (ftz && IsFloatSubnormal(s[j]))
-                    {
-                        unsigned int correct0 = f->func.i_f(0.0);
-                        unsigned int correct1 = f->func.i_f(-0.0);
-                        if (q[j] == correct0 || q[j] == correct1) continue;
-                    }
-
-                    uint32_t err = t[j] - q[j];
-                    if (q[j] > t[j]) err = q[j] - t[j];
-                    vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%8.8x): "
-                               "*%d vs. %d\n",
-                               f->name, sizeNames[k], err, ((float *)gIn)[j],
-                               ((cl_uint *)gIn)[j], t[j], q[j]);
-                    error = -1;
-                    goto exit;
-                }
-            }
-        }
-
-        if (0 == (i & 0x0fffffff))
-        {
-            if (gVerboseBruteForce)
-            {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
-            }
-            else
-            {
-                vlog(".");
-            }
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-            p[j] = genrand_int32(d);
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    vlog("\n");
-
-exit:
-    RestoreFPState(&oldMode);
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}
-
 int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     uint64_t i;
diff --git a/test_conformance/math_brute_force/i_unary_float.cpp b/test_conformance/math_brute_force/i_unary_float.cpp
new file mode 100644
index 00000000..feecb54c
--- /dev/null
+++ b/test_conformance/math_brute_force/i_unary_float.cpp
@@ -0,0 +1,390 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+// Build the OpenCL program and kernel "math_kernel<size>" that applies the
+// function `name` to float input and stores the int result.  The generic
+// source processes one whole vector per work-item; the 3-element variant
+// uses vload3/vstore3 and falls back to scalar accesses for the elements
+// left over in the last work-item.  Returns the result of MakeKernel().
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global int",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global int* out, __global float* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( i0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+}
+
+// Arguments handed to BuildKernel_FloatFn through ThreadPool_Do().
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels;
+    cl_program *programs;
+    const char *nameInCode;
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+// ThreadPool_Do() callback: builds the kernel for vector size index
+// `offset + job_id`.
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
+{
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
+    cl_uint i = info->offset + job_id;
+    return BuildKernel(info->nameInCode, i, info->kernels + i,
+                       info->programs + i, info->relaxedMode);
+}
+
+// Test an int-returning unary float function `f` over 32-bit input patterns
+// from 0 up to 2^32 in increments of `step`: run the kernel for every enabled
+// vector size, compare against the host reference f->func.i_f(), and
+// optionally time the kernels.  Returns 0 when every step succeeded,
+// otherwise the failing call's error code, or -1 on a result mismatch.
+int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    // This test is not using ThreadPool so we need to disable FTZ here
+    // for reference computations
+    FPU_mode_type oldMode;
+    DisableFTZ(&oldMode);
+
+    Force64BitFPUPrecision();
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+        {
+            // The kernels may be only partially built, so bypass the release
+            // loop at `exit`, but still undo DisableFTZ() before leaving.
+            RestoreFPState(&oldMode);
+            return error;
+        }
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_uint *p = (cl_uint *)gIn;
+        if (gWimpyMode)
+        {
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (cl_uint)i + j * scale;
+        }
+        else
+        {
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit;
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        int *r = (int *)gOut_Ref;
+        float *s = (float *)gIn;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            r[j] = f->func.i_f(s[j]);
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint32_t *t = (uint32_t *)gOut_Ref;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint32_t *q = (uint32_t *)(gOut[k]);
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j])
+                {
+                    if (ftz && IsFloatSubnormal(s[j]))
+                    {
+                        unsigned int correct0 = f->func.i_f(0.0);
+                        unsigned int correct1 = f->func.i_f(-0.0);
+                        if (q[j] == correct0 || q[j] == correct1) continue;
+                    }
+
+                    uint32_t err = t[j] - q[j];
+                    if (q[j] > t[j]) err = q[j] - t[j];
+                    vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%8.8x): "
+                               "*%d vs. %d\n",
+                               f->name, sizeNames[k], err, ((float *)gIn)[j],
+                               ((cl_uint *)gIn)[j], t[j], q[j]);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                // i and step are 64-bit, so print them as unsigned long long.
+                vlog("base:%14llu step:%10llu  bufferSize:%10zu \n",
+                     (unsigned long long)i, (unsigned long long)step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        uint32_t *p = (uint32_t *)gIn;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            p[j] = genrand_int32(d);
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit;
+        }
+
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    vlog("\n");
+
+exit:
+    RestoreFPState(&oldMode);
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
similarity index 53%
rename from test_conformance/math_brute_force/macro_binary.cpp
rename to test_conformance/math_brute_force/macro_binary_double.cpp
index fb88e607..9b5d8f24 100644
--- a/test_conformance/math_brute_force/macro_binary.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -20,91 +20,6 @@
 
 #include <string.h>
 
-static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
-                       cl_kernel *k, cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global int",
-                        sizeNames[vectorSize],
-                        "* out, __global float",
-                        sizeNames[vectorSize],
-                        "* in1, __global float",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global int* out, __global float* in, __global float* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       float3 f0 = vload3( 0, in + 3 * i );\n"
-        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-        "       int3 i0 = ",
-        name,
-        "( f0, f1 );\n"
-        "       vstore3( i0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       float3 f0;\n"
-        "       float3 f1;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       int3 i0 = ",
-        name,
-        "( f0, f1 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = i0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = i0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
 static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
                              bool relaxedMode)
@@ -203,15 +118,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
-}
-
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
@@ -222,112 +128,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->relaxedMode);
 }
 
-// A table of more difficult cases to get right
-static const float specialValuesFloat[] = {
-    -NAN,
-    -INFINITY,
-    -FLT_MAX,
-    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
-    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
-    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
-    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
-    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
-    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
-    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
-    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
-    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
-    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
-    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
-    -1000.f,
-    -100.f,
-    -4.0f,
-    -3.5f,
-    -3.0f,
-    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
-    -2.5f,
-    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
-    -2.0f,
-    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
-    -1.5f,
-    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
-    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
-    -1.0f,
-    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
-    -0.5f,
-    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
-    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
-    -0.25f,
-    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
-    -FLT_MIN,
-    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
-    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
-    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
-    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
-    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
-    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
-    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
-    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
-    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
-    -0.0f,
-
-    +NAN,
-    +INFINITY,
-    +FLT_MAX,
-    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
-    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
-    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
-    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
-    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
-    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
-    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
-    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
-    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
-    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
-    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
-    +1000.f,
-    +100.f,
-    +4.0f,
-    +3.5f,
-    +3.0f,
-    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
-    2.5f,
-    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
-    +2.0f,
-    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
-    1.5f,
-    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
-    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
-    +1.0f,
-    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
-    +0.5f,
-    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
-    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
-    +0.25f,
-    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
-    +FLT_MIN,
-    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
-    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
-    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
-    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
-    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
-    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
-    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
-    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
-    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
-    +0.0f
-};
-
-static const size_t specialValuesFloatCount =
-    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
-
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
@@ -356,579 +156,6 @@ typedef struct TestInfo
 
 } TestInfo;
 
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
-
-int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    size_t i, j;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_float));
-
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_float)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_float),
-            test_info.subBufferSize * sizeof(cl_float)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf2)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of "
-                           "gInBuffer for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input arrays
-        cl_uint *p = (cl_uint *)gIn;
-        cl_uint *p2 = (cl_uint *)gIn2;
-        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
-        {
-            p[j] = genrand_int32(d);
-            p2[j] = genrand_int32(d);
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 2,
-                                        sizeof(gInBuffer2), &gInBuffer2)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
-{
-    const TestInfo *job = (const TestInfo *)data;
-    size_t buffer_elements = job->subBufferSize;
-    size_t buffer_size = buffer_elements * sizeof(cl_float);
-    cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
-    fptr func = job->f->func;
-    int ftz = job->ftz;
-    MTdata d = tinfo->d;
-    cl_uint j, k;
-    cl_int error;
-    const char *name = job->f->name;
-    cl_int *t = 0;
-    cl_int *r = 0;
-    cl_float *s = 0;
-    cl_float *s2 = 0;
-
-    // start the map of the output arrays
-    cl_event e[VECTOR_SIZE_COUNT];
-    cl_int *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_int *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
-
-    // Init input array
-    cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
-    cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
-    j = 0;
-
-    int totalSpecialValueCount =
-        specialValuesFloatCount * specialValuesFloatCount;
-    int indx = (totalSpecialValueCount - 1) / buffer_elements;
-
-    if (job_id <= (cl_uint)indx)
-    { // test edge cases
-        float *fp = (float *)p;
-        float *fp2 = (float *)p2;
-        uint32_t x, y;
-
-        x = (job_id * buffer_elements) % specialValuesFloatCount;
-        y = (job_id * buffer_elements) / specialValuesFloatCount;
-
-        for (; j < buffer_elements; j++)
-        {
-            fp[j] = specialValuesFloat[x];
-            fp2[j] = specialValuesFloat[y];
-            ++x;
-            if (x >= specialValuesFloatCount)
-            {
-                x = 0;
-                y++;
-                if (y >= specialValuesFloatCount) break;
-            }
-        }
-    }
-
-    // Init any remaining values.
-    for (; j < buffer_elements; j++)
-    {
-        p[j] = genrand_int32(d);
-        p2[j] = genrand_int32(d);
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
-                                      buffer_size, p, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
-                                      buffer_size, p2, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            goto exit;
-        }
-        if ((error = clReleaseEvent(e[j])))
-        {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // Fill the result buffer with garbage, so that old results don't carry
-        // over
-        uint32_t pattern = 0xffffdead;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            goto exit;
-        }
-
-        // run the kernel
-        size_t vectorCount =
-            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
-                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];
-
-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
-                                    &tinfo->inBuf2)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-
-        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
-                                            &vectorCount, NULL, 0, NULL, NULL)))
-        {
-            vlog_error("FAILED -- could not execute kernel\n");
-            goto exit;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
-
-    if (gSkipCorrectnessTesting) return CL_SUCCESS;
-
-    // Calculate the correctly rounded reference result
-    r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
-    s = (float *)gIn + thread_id * buffer_elements;
-    s2 = (float *)gIn2 + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++) r[j] = func.i_ff(s[j], s2[j]);
-
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_int *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
-            buffer_size, 0, NULL, NULL, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            goto exit;
-        }
-    }
-
-    // Wait for the last buffer
-    out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                          CL_TRUE, CL_MAP_READ, 0, buffer_size,
-                                          0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
-    }
-
-    // Verify data
-    t = (cl_int *)r;
-    for (j = 0; j < buffer_elements; j++)
-    {
-        cl_int *q = out[0];
-
-        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
-        {
-            if (ftz)
-            {
-                if (IsFloatSubnormal(s[j]))
-                {
-                    if (IsFloatSubnormal(s2[j]))
-                    {
-                        int correct = func.i_ff(0.0f, 0.0f);
-                        int correct2 = func.i_ff(0.0f, -0.0f);
-                        int correct3 = func.i_ff(-0.0f, 0.0f);
-                        int correct4 = func.i_ff(-0.0f, -0.0f);
-
-                        if (correct == q[j] || correct2 == q[j]
-                            || correct3 == q[j] || correct4 == q[j])
-                            continue;
-                    }
-                    else
-                    {
-                        int correct = func.i_ff(0.0f, s2[j]);
-                        int correct2 = func.i_ff(-0.0f, s2[j]);
-                        if (correct == q[j] || correct2 == q[j]) continue;
-                    }
-                }
-                else if (IsFloatSubnormal(s2[j]))
-                {
-                    int correct = func.i_ff(s[j], 0.0f);
-                    int correct2 = func.i_ff(s[j], -0.0f);
-                    if (correct == q[j] || correct2 == q[j]) continue;
-                }
-            }
-
-            uint32_t err = t[j] - q[j];
-            if (q[j] > t[j]) err = q[j] - t[j];
-            vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. "
-                       "0x%8.8x (index: %d)\n",
-                       name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j],
-                       j);
-            error = -1;
-            goto exit;
-        }
-
-        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
-        {
-            q = out[k];
-            // If we aren't getting the correctly rounded result
-            if (-t[j] != q[j])
-            {
-                if (ftz)
-                {
-                    if (IsFloatSubnormal(s[j]))
-                    {
-                        if (IsFloatSubnormal(s2[j]))
-                        {
-                            int correct = -func.i_ff(0.0f, 0.0f);
-                            int correct2 = -func.i_ff(0.0f, -0.0f);
-                            int correct3 = -func.i_ff(-0.0f, 0.0f);
-                            int correct4 = -func.i_ff(-0.0f, -0.0f);
-
-                            if (correct == q[j] || correct2 == q[j]
-                                || correct3 == q[j] || correct4 == q[j])
-                                continue;
-                        }
-                        else
-                        {
-                            int correct = -func.i_ff(0.0f, s2[j]);
-                            int correct2 = -func.i_ff(-0.0f, s2[j]);
-                            if (correct == q[j] || correct2 == q[j]) continue;
-                        }
-                    }
-                    else if (IsFloatSubnormal(s2[j]))
-                    {
-                        int correct = -func.i_ff(s[j], 0.0f);
-                        int correct2 = -func.i_ff(s[j], -0.0f);
-                        if (correct == q[j] || correct2 == q[j]) continue;
-                    }
-                }
-                cl_uint err = -t[j] - q[j];
-                if (q[j] > -t[j]) err = q[j] + t[j];
-                vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x "
-                           "vs. 0x%8.8x (index: %d)\n",
-                           name, sizeNames[k], err, ((float *)s)[j],
-                           ((float *)s2)[j], -t[j], q[j], j);
-                error = -1;
-                goto exit;
-            }
-        }
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
-                       j, error);
-            return error;
-        }
-    }
-
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
-
-
-    if (0 == (base & 0x0fffffff))
-    {
-        if (gVerboseBruteForce)
-        {
-            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
-                 "ThreadCount:%2u\n",
-                 base, job->step, job->scale, buffer_elements,
-                 job->threadCount);
-        }
-        else
-        {
-            vlog(".");
-        }
-        fflush(stdout);
-    }
-
-exit:
-    return error;
-}
-
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
     -NAN,
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
new file mode 100644
index 00000000..ece96037
--- /dev/null
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -0,0 +1,832 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
+                       cl_kernel *k, cl_program *p, bool relaxedMode)
+{ // Assembles the OpenCL source calling `name` and hands it to MakeKernels.
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global int",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = { // vec3 variant: vload3/vstore3 plus a scalar tail
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global int* out, __global float* in, __global float* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0, f1 );\n"
+        "       vstore3( i0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       int3 i0 = ",
+        name,
+        "( f0, f1 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    const char **kern = c; // default: the generic per-vector-size kernel
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3) // 3-wide vectors use the c3 source
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+
+    char testName[32]; // kernel entry point: "math_kernel" + size suffix
+    snprintf(testName, sizeof(testName), "math_kernel%s", // size includes NUL
+             sizeNames[vectorSize]);
+
+    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
+                       relaxedMode);
+}
+
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count; // forwarded to BuildKernel as its kernel_count
+    cl_kernel **kernels; // kernels[i] is passed to BuildKernel for size index i
+    cl_program *programs; // programs + i is passed to BuildKernel likewise
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
+{
+    BuildKernelInfo *bki = (BuildKernelInfo *)p;
+    cl_uint sz = bki->offset + job_id; // vector size index built by this job
+    return BuildKernel(bki->nameInCode, sz, bki->kernel_count,
+                       bki->kernels[sz], bki->programs + sz, bki->relaxedMode);
+}
+
+// Difficult inputs: NaN, +/-inf, extremes, near-power-of-2 values, subnormals
+static const float specialValuesFloat[] = {
+    -NAN,
+    -INFINITY,
+    -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
+    MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
+    MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
+    MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
+    MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
+    MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
+    MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
+    MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
+    MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
+    -1000.f,
+    -100.f,
+    -4.0f,
+    -3.5f,
+    -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
+    -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
+    -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
+    -1.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
+    -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
+    -0.5f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
+    -0.25f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
+    -FLT_MIN, // smallest-magnitude normal; the entries below are subnormal
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
+    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
+    MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
+    -0.0f,
+
+    +NAN, // positive counterparts of the entries above, in the same order
+    +INFINITY,
+    +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
+    MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
+    MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
+    MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
+    MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
+    MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
+    MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
+    MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
+    MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
+    +1000.f,
+    +100.f,
+    +4.0f,
+    +3.5f,
+    +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
+    2.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
+    +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
+    1.5f,
+    MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
+    +1.0f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
+    +0.5f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
+    +0.25f,
+    MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
+    +FLT_MIN, // smallest positive normal; the entries below are subnormal
+    MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
+    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
+    MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
+    +0.0f
+};
+
+static const size_t specialValuesFloatCount = // number of table entries
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
+
+// Thread specific data for a worker thread
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // first input buffer for the thread (from gInBuffer)
+    cl_mem inBuf2; // second input buffer for the thread (from gInBuffer2)
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    MTdata d; // presumably per-thread random generator state -- TODO confirm
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads (from GetThreadCount)
+    cl_uint jobCount; // Number of jobs: 2^32 / step, or 1 if step overflowed
+    cl_uint step; // step between each chunk and the next: subBufferSize * scale
+    cl_uint scale; // stride between individual test values (from getTestScale)
+    int ftz; // non-zero if running in flush to zero mode
+
+} TestInfo;
+
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
+
+// Brute-force driver for float "macro" builtins of the form int f(float,
+// float). f names the builtin and supplies the host reference (func.i_ff), d
+// seeds the per-thread generators and the timing inputs, relaxedMode is passed
+// on to the kernel build. Builds one kernel per vector size and worker thread,
+// runs the TestFloat jobs, optionally times the kernels, and cleans up.
+int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_float));
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_float)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    }
+    // step: distance between consecutive job bases; jobCount: steps per 2^32.
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+    test_info.f = f;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+
+    // cl_kernels aren't thread safe: one per vector size for every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_float),
+            test_info.subBufferSize * sizeof(cl_float)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input arrays
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
+        {
+            p[j] = genrand_int32(d);
+            p2[j] = genrand_int32(d);
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // still release everything acquired above
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit; // still release everything acquired above
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+    vlog("\n");
+
+exit:
+    // Release
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error; // CL_SUCCESS or the code of the step that failed
+}
+
+// ThreadPool job: fills this thread's slice of gIn/gIn2 (special-value pairs
+// for the first jobs, random bits otherwise), runs the kernel of every enabled
+// vector size and checks the results against func.i_ff (negated for vector
+// indices >= 1). Returns 0, an OpenCL error code, or -1 on a mismatch.
+static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_float);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_int *t = 0;
+    cl_int *r = 0;
+    cl_float *s = 0;
+    cl_float *s2 = 0;
+
+    // start the map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_int *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_int *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Init input array
+    cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
+    cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
+    j = 0;
+
+    int totalSpecialValueCount =
+        specialValuesFloatCount * specialValuesFloatCount;
+    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)indx)
+    { // test edge cases
+        float *fp = (float *)p;
+        float *fp2 = (float *)p2;
+        uint32_t x, y;
+
+        x = (job_id * buffer_elements) % specialValuesFloatCount;
+        y = (job_id * buffer_elements) / specialValuesFloatCount;
+        // Emit the remaining (x, y) pairs. j ends one past the last pair
+        // written, so the random fill below does not overwrite it.
+        for (; j < buffer_elements && y < specialValuesFloatCount; j++)
+        {
+            fp[j] = specialValuesFloat[x];
+            fp2[j] = specialValuesFloat[y];
+            ++x;
+            if (x >= specialValuesFloatCount)
+            {
+                x = 0;
+                y++;
+            }
+        }
+    }
+
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
+    {
+        p[j] = genrand_int32(d);
+        p2[j] = genrand_int32(d);
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            goto exit;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // Fill the result buffer with garbage so old results don't carry over
+        uint32_t pattern = 0xffffdead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
+            goto exit;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // this thread's kernel copy
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            goto exit;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
+    s = (float *)gIn + thread_id * buffer_elements;
+    s2 = (float *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) r[j] = func.i_ff(s[j], s2[j]);
+
+    // Read the data back; only the last map blocks (in-order queue).
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_int *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            goto exit;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                          CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                          0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        goto exit;
+    }
+
+    // Verify data
+    t = (cl_int *)r;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        cl_int *q = out[0];
+
+        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
+        {
+            if (ftz)
+            {
+                if (IsFloatSubnormal(s[j]))
+                {
+                    if (IsFloatSubnormal(s2[j]))
+                    {
+                        int correct = func.i_ff(0.0f, 0.0f);
+                        int correct2 = func.i_ff(0.0f, -0.0f);
+                        int correct3 = func.i_ff(-0.0f, 0.0f);
+                        int correct4 = func.i_ff(-0.0f, -0.0f);
+
+                        if (correct == q[j] || correct2 == q[j]
+                            || correct3 == q[j] || correct4 == q[j])
+                            continue;
+                    }
+                    else
+                    {
+                        int correct = func.i_ff(0.0f, s2[j]);
+                        int correct2 = func.i_ff(-0.0f, s2[j]);
+                        if (correct == q[j] || correct2 == q[j]) continue;
+                    }
+                }
+                else if (IsFloatSubnormal(s2[j]))
+                {
+                    int correct = func.i_ff(s[j], 0.0f);
+                    int correct2 = func.i_ff(s[j], -0.0f);
+                    if (correct == q[j] || correct2 == q[j]) continue;
+                }
+            }
+
+            uint32_t err = t[j] - q[j];
+            if (q[j] > t[j]) err = q[j] - t[j];
+            vlog_error("\nERROR: %s: %d ulp error at {%a, %a}: *0x%8.8x vs. "
+                       "0x%8.8x (index: %d)\n",
+                       name, err, ((float *)s)[j], ((float *)s2)[j], t[j], q[j],
+                       j);
+            error = -1;
+            goto exit;
+        }
+
+        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
+        {
+            q = out[k];
+            // If we aren't getting the correctly rounded result
+            if (-t[j] != q[j])
+            {
+                if (ftz)
+                {
+                    if (IsFloatSubnormal(s[j]))
+                    {
+                        if (IsFloatSubnormal(s2[j]))
+                        {
+                            int correct = -func.i_ff(0.0f, 0.0f);
+                            int correct2 = -func.i_ff(0.0f, -0.0f);
+                            int correct3 = -func.i_ff(-0.0f, 0.0f);
+                            int correct4 = -func.i_ff(-0.0f, -0.0f);
+
+                            if (correct == q[j] || correct2 == q[j]
+                                || correct3 == q[j] || correct4 == q[j])
+                                continue;
+                        }
+                        else
+                        {
+                            int correct = -func.i_ff(0.0f, s2[j]);
+                            int correct2 = -func.i_ff(-0.0f, s2[j]);
+                            if (correct == q[j] || correct2 == q[j]) continue;
+                        }
+                    }
+                    else if (IsFloatSubnormal(s2[j]))
+                    {
+                        int correct = -func.i_ff(s[j], 0.0f);
+                        int correct2 = -func.i_ff(s[j], -0.0f);
+                        if (correct == q[j] || correct2 == q[j]) continue;
+                    }
+                }
+                cl_uint err = -t[j] - q[j];
+                if (q[j] > -t[j]) err = q[j] + t[j];
+                vlog_error("\nERROR: %s%s: %d ulp error at {%a, %a}: *0x%8.8x "
+                           "vs. 0x%8.8x (index: %d)\n",
+                           name, sizeNames[k], err, ((float *)s)[j],
+                           ((float *)s2)[j], -t[j], q[j], j);
+                error = -1;
+                goto exit;
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+exit:
+    return error;
+}
diff --git a/test_conformance/math_brute_force/macro_unary_double.cpp b/test_conformance/math_brute_force/macro_unary_double.cpp
new file mode 100644
index 00000000..8d80abb4
--- /dev/null
+++ b/test_conformance/math_brute_force/macro_unary_double.cpp
@@ -0,0 +1,598 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+// Assembles the OpenCL C source for math_kernel<size>, which stores
+// name(in[i]) (double input) into a long output buffer, and passes it to
+// MakeKernels together with kernel_count, k, p and relaxedMode.
+static int BuildKernelDouble(const char *name, int vectorSize,
+                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                             bool relaxedMode)
+{
+    const char *src[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                          "__kernel void math_kernel",
+                          sizeNames[vectorSize],
+                          "( __global long",
+                          sizeNames[vectorSize],
+                          "* out, __global double",
+                          sizeNames[vectorSize],
+                          "* in )\n"
+                          "{\n"
+                          "   size_t i = get_global_id(0);\n"
+                          "   out[i] = ",
+                          name,
+                          "( in[i] );\n"
+                          "}\n" };
+
+    const char *src3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global long* out, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       long3 l0 = ",
+        name,
+        "( d0 );\n"
+        "       vstore3( l0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       long3 l0 = ",
+        name,
+        "( d0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = l0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = l0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    // A vector width of 3 uses the vload3/vstore3 source, all others src.
+    const bool isVec3 = (sizeValues[vectorSize] == 3);
+    const char **parts = isVec3 ? src3 : src;
+    const size_t partCount = isVec3 ? sizeof(src3) / sizeof(src3[0])
+                                    : sizeof(src) / sizeof(src[0]);
+
+    char kernelName[32];
+    snprintf(kernelName, sizeof(kernelName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernels(parts, (cl_uint)partCount, kernelName, kernel_count, k,
+                       p, relaxedMode);
+}
+// Arguments shared by all BuildKernel_DoubleFn jobs of one ThreadPool_Do call.
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count; // kernel count handed to BuildKernelDouble per size
+    cl_kernel **kernels; // kernels[vector_size] is that size's kernel array
+    cl_program *programs; // programs + vector_size is that size's program slot
+    const char *nameInCode; // builtin name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{
+    // ThreadPool job: job_id selects the vector size, counted from offset.
+    const BuildKernelInfo *bi = (const BuildKernelInfo *)p;
+    const cl_uint n = bi->offset + job_id;
+    return BuildKernelDouble(bi->nameInCode, n, bi->kernel_count,
+                             bi->kernels[n], &bi->programs[n], bi->relaxedMode);
+}
+
+// Thread specific data for a worker thread (one entry of TestInfo::tinfo)
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // input buffer for the thread (sub-buffer of gInBuffer)
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers, one per vector size
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+// Parameters and resources of one TestMacro_Int_Double run, shared by all jobs
+typedef struct TestInfo
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs: 2^32 / step, or 1 if step overflowed
+    cl_uint step; // subBufferSize * scale: step between consecutive chunks
+    cl_uint scale; // stride between individual test values
+    int ftz; // non-zero if running in flush to zero mode
+} TestInfo;
+
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data);
+
+// Brute-force driver for "macro" builtins taking one double; the kernels
+// store their results as long. f names the builtin, d seeds the timing
+// inputs, relaxedMode is passed on to the kernel build. Builds one kernel per
+// vector size and worker thread, runs the TestDouble jobs, optionally times
+// the kernels, and releases everything before returning the error status.
+int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    }
+    // step: distance between consecutive job bases; jobCount: steps per 2^32.
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+    test_info.f = f;
+    test_info.ftz = f->ftz || gForceFTZ;
+
+    // cl_kernels aren't thread safe: one per vector size for every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array. NOTE(review): p is cl_ulong*; if DoubleFromUInt32
+        // returns double the values are converted, not bit-copied -- confirm.
+        cl_ulong *p = (cl_ulong *)gIn;
+        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // still release everything acquired above
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    vlog("\n");
+
+exit:
+    // Release
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
+
+// Worker job: verifies one range of double inputs for a 64-bit integer result.
+// Runs inputs DoubleFromUInt32(base + i * scale), base = job_id * job->step,
+// through each enabled vector size and compares with job->f->dfunc.i_f.
+//   thread_id - picks this worker's ThreadInfo, kernels and gIn/gOut_Ref slice
+//   data      - the shared TestInfo describing the run (only read here)
+// Returns CL_SUCCESS, the failing OpenCL error code, or -1 on a wrong result.
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint scale = job->scale;
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    dptr dfunc = job->f->dfunc;
+    int ftz = job->ftz;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+
+    Force64BitFPUPrecision();
+
+    // Start the map of the output arrays; slots not under test stay NULL
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_long *out[VECTOR_SIZE_COUNT] = { NULL };
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_long *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %u failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Write the new values to this worker's slice of the input array
+    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        p[j] = DoubleFromUInt32(base + j * scale);
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            return error;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            return error;
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint32_t pattern = 0xffffdead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
+            return error;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the reference result on the host while the device is working
+    cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(p[j]);
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_long *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %u failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                           0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %u failed! err: %d\n", j, error);
+        return error;
+    }
+
+    // Verify data
+    for (j = 0; j < buffer_elements; j++)
+    {
+        cl_long *q = out[0];
+
+        // Scalar result (vector size index 0) must equal the reference
+        if (gMinVectorSizeIndex == 0 && r[j] != q[j])
+        {
+            // With FTZ a subnormal input may be evaluated as +0.0 or -0.0
+            if (ftz && IsDoubleSubnormal(p[j]))
+            {
+                cl_long correct = dfunc.i_f(+0.0f);
+                cl_long correct2 = dfunc.i_f(-0.0f);
+                if (correct == q[j] || correct2 == q[j]) continue;
+            }
+
+            cl_ulong err = r[j] - q[j];
+            if (q[j] > r[j]) err = q[j] - r[j];
+            vlog_error(
+                "\nERROR: %sD: %llu ulp error at %.13la: *%lld vs. %lld\n",
+                name, (unsigned long long)err, p[j], (long long)r[j],
+                (long long)q[j]);
+            return -1;
+        }
+
+        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
+        {
+            q = out[k];
+            // Vector lanes are expected to hold the negated reference value
+            if (-r[j] != q[j])
+            {
+                if (ftz && IsDoubleSubnormal(p[j]))
+                {
+                    int64_t correct = -dfunc.i_f(+0.0f);
+                    int64_t correct2 = -dfunc.i_f(-0.0f);
+                    if (correct == q[j] || correct2 == q[j]) continue;
+                }
+
+                cl_ulong err = -r[j] - q[j];
+                if (q[j] > -r[j]) err = q[j] + r[j];
+                vlog_error("\nERROR: %sD%s: %llu ulp error at %.13la: *%lld "
+                           "vs. %lld\n",
+                           name, sizeNames[k], (unsigned long long)err, p[j],
+                           (long long)(-r[j]), (long long)q[j]);
+                return -1;
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %u failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+    return CL_SUCCESS;
+}
diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary_float.cpp
similarity index 52%
rename from test_conformance/math_brute_force/macro_unary.cpp
rename to test_conformance/math_brute_force/macro_unary_float.cpp
index e5aa9e70..2a37c95b 100644
--- a/test_conformance/math_brute_force/macro_unary.cpp
+++ b/test_conformance/math_brute_force/macro_unary_float.cpp
@@ -100,88 +100,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global long",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* in )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global long* out, __global double* in)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 d0 = vload3( 0, in + 3 * i );\n"
-        "       long3 l0 = ",
-        name,
-        "( d0 );\n"
-        "       vstore3( l0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 d0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       long3 l0 = ",
-        name,
-        "( d0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = l0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = l0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
 typedef struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
@@ -201,16 +119,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
-                             info->kernels[i], info->programs + i,
-                             info->relaxedMode);
-}
-
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
@@ -699,452 +607,3 @@ exit:
 
     return ret;
 }
-
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data);
-
-int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    size_t i, j;
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_double)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ftz = f->ftz || gForceFTZ;
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        cl_ulong *p = (cl_ulong *)gIn;
-        for (j = 0; j < BUFFER_SIZE / sizeof(cl_double); j++)
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    vlog("\n");
-
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
-
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
-{
-    const TestInfo *job = (const TestInfo *)data;
-    size_t buffer_elements = job->subBufferSize;
-    size_t buffer_size = buffer_elements * sizeof(cl_double);
-    cl_uint scale = job->scale;
-    cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
-    dptr dfunc = job->f->dfunc;
-    int ftz = job->ftz;
-    cl_uint j, k;
-    cl_int error;
-    const char *name = job->f->name;
-
-    Force64BitFPUPrecision();
-
-    // start the map of the output arrays
-    cl_event e[VECTOR_SIZE_COUNT];
-    cl_long *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_long *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
-
-    // Write the new values to the input array
-    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
-        p[j] = DoubleFromUInt32(base + j * scale);
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
-                                      buffer_size, p, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        return error;
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            return error;
-        }
-        if ((error = clReleaseEvent(e[j])))
-        {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            return error;
-        }
-
-        // Fill the result buffer with garbage, so that old results don't carry
-        // over
-        uint32_t pattern = 0xffffdead;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            return error;
-        }
-
-        // run the kernel
-        size_t vectorCount =
-            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
-                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];
-
-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-
-        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
-                                            &vectorCount, NULL, 0, NULL, NULL)))
-        {
-            vlog_error("FAILED -- could not execute kernel\n");
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
-
-    if (gSkipCorrectnessTesting) return CL_SUCCESS;
-
-    // Calculate the correctly rounded reference result
-    cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
-    cl_double *s = (cl_double *)p;
-    for (j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]);
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_long *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
-            buffer_size, 0, NULL, NULL, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Wait for the last buffer
-    out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
-                                           0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        return error;
-    }
-
-    // Verify data
-    cl_long *t = (cl_long *)r;
-    for (j = 0; j < buffer_elements; j++)
-    {
-        cl_long *q = out[0];
-
-        // If we aren't getting the correctly rounded result
-        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
-        {
-            // If we aren't getting the correctly rounded result
-            if (ftz)
-            {
-                if (IsDoubleSubnormal(s[j]))
-                {
-                    cl_long correct = dfunc.i_f(+0.0f);
-                    cl_long correct2 = dfunc.i_f(-0.0f);
-                    if (correct == q[j] || correct2 == q[j]) continue;
-                }
-            }
-
-            cl_ulong err = t[j] - q[j];
-            if (q[j] > t[j]) err = q[j] - t[j];
-            vlog_error("\nERROR: %sD: %zd ulp error at %.13la: *%zd vs. %zd\n",
-                       name, err, ((double *)gIn)[j], t[j], q[j]);
-            return -1;
-        }
-
-
-        for (k = MAX(1, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; k++)
-        {
-            q = out[k];
-            // If we aren't getting the correctly rounded result
-            if (-t[j] != q[j])
-            {
-                if (ftz)
-                {
-                    if (IsDoubleSubnormal(s[j]))
-                    {
-                        int64_t correct = -dfunc.i_f(+0.0f);
-                        int64_t correct2 = -dfunc.i_f(-0.0f);
-                        if (correct == q[j] || correct2 == q[j]) continue;
-                    }
-                }
-
-                cl_ulong err = -t[j] - q[j];
-                if (q[j] > -t[j]) err = q[j] + t[j];
-                vlog_error(
-                    "\nERROR: %sD%s: %zd ulp error at %.13la: *%zd vs. %zd\n",
-                    name, sizeNames[k], err, ((double *)gIn)[j], -t[j], q[j]);
-                return -1;
-            }
-        }
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
-                       j, error);
-            return error;
-        }
-    }
-
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
-
-
-    if (0 == (base & 0x0fffffff))
-    {
-        if (gVerboseBruteForce)
-        {
-            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
-                 "ThreadCount:%2u\n",
-                 base, job->step, job->scale, buffer_elements,
-                 job->threadCount);
-        }
-        else
-        {
-            vlog(".");
-        }
-        fflush(stdout);
-    }
-
-    return CL_SUCCESS;
-}
diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad_double.cpp
similarity index 52%
rename from test_conformance/math_brute_force/mad.cpp
rename to test_conformance/math_brute_force/mad_double.cpp
index 0d8c6d44..cbbc1951 100644
--- a/test_conformance/math_brute_force/mad.cpp
+++ b/test_conformance/math_brute_force/mad_double.cpp
@@ -20,97 +20,6 @@
 
 #include <string.h>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global float",
-                        sizeNames[vectorSize],
-                        "* out, __global float",
-                        sizeNames[vectorSize],
-                        "* in1, __global float",
-                        sizeNames[vectorSize],
-                        "* in2,  __global float",
-                        sizeNames[vectorSize],
-                        "* in3 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i], in3[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global float* out, __global float* in, __global float* in2, "
-        "__global float* in3)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       float3 f0 = vload3( 0, in + 3 * i );\n"
-        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
-        "       float3 f2 = vload3( 0, in3 + 3 * i );\n"
-        "       f0 = ",
-        name,
-        "( f0, f1, f2 );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       float3 f0;\n"
-        "       float3 f1;\n"
-        "       float3 f2;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
-        "               f2 = (float3)( in3[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0, f1, f2 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
@@ -213,15 +122,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
-}
-
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
@@ -231,278 +131,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->programs + i, info->relaxedMode);
 }
 
-int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    float maxError = 0.0f;
-    float maxErrorVal = 0.0f;
-    float maxErrorVal2 = 0.0f;
-    float maxErrorVal3 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        cl_uint *p = (cl_uint *)gIn;
-        cl_uint *p2 = (cl_uint *)gIn2;
-        cl_uint *p3 = (cl_uint *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            p[j] = genrand_int32(d);
-            p2[j] = genrand_int32(d);
-            p3[j] = genrand_int32(d);
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
-                                        &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        float *r = (float *)gOut_Ref;
-        float *s = (float *)gIn;
-        float *s2 = (float *)gIn2;
-        float *s3 = (float *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-            r[j] = (float)f->func.f_fff(s[j], s2[j], s3[j]);
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data -- No verification possible.
-        // MAD is a random number generator.
-        if (0 == (i & 0x0fffffff))
-        {
-            vlog(".");
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        cl_uint *p = (cl_uint *)gIn;
-        cl_uint *p2 = (cl_uint *)gIn2;
-        cl_uint *p3 = (cl_uint *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            p[j] = genrand_int32(d);
-            p2[j] = genrand_int32(d);
-            p3[j] = genrand_int32(d);
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
-                                        &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
-             maxErrorVal3);
-    vlog("\n");
-
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}
-
 int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     uint64_t i;
diff --git a/test_conformance/math_brute_force/mad_float.cpp b/test_conformance/math_brute_force/mad_float.cpp
new file mode 100644
index 00000000..2124b268
--- /dev/null
+++ b/test_conformance/math_brute_force/mad_float.cpp
@@ -0,0 +1,402 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+// Assembles the OpenCL C source of "math_kernel<size>", which applies the
+// three-argument float function `name` element-wise, and passes it to
+// MakeKernel together with k, p and relaxedMode. Returns MakeKernel's result.
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global float",
+                        sizeNames[vectorSize],
+                        "* in1, __global float",
+                        sizeNames[vectorSize],
+                        "* in2,  __global float",
+                        sizeNames[vectorSize],
+                        "* in3 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], in3[i] );\n"
+                        "}\n" };
+
+    // 3-element variant: scalar pointers, vload3/vstore3, NAN-padded tail.
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global float* in, __global float* in2, "
+        "__global float* in3)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       float3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       float3 f2 = vload3( 0, in3 + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, f2 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
+        "       float3 f2;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (float3)( in2[3*i], NAN, NAN ); \n"
+        "               f2 = (float3)( in3[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               f2 = (float3)( in3[3*i], in3[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, f1, f2 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    // Only the 3-element vector size uses the vload3/vstore3 variant.
+    const bool isVec3 = sizeValues[vectorSize] == 3;
+    const char **kern = isVec3 ? c3 : c;
+    const size_t kernSize =
+        isVec3 ? sizeof(c3) / sizeof(c3[0]) : sizeof(c) / sizeof(c[0]);
+
+    char testName[32]; // snprintf's size argument already includes the NUL
+    snprintf(testName, sizeof(testName), "math_kernel%s",
+             sizeNames[vectorSize]);
+    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo // argument block for BuildKernel_FloatFn
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // per-vector-size kernel slots passed to BuildKernel
+    cl_program *programs; // per-vector-size program slots, same indexing
+    const char *nameInCode; // function name emitted into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
+{
+    const BuildKernelInfo *job = (const BuildKernelInfo *)p;
+    const cl_uint sizeIndex = job->offset + job_id; // absolute size index
+    return BuildKernel(job->nameInCode, sizeIndex, &job->kernels[sizeIndex],
+                       &job->programs[sizeIndex], job->relaxedMode);
+}
+
+// Runs the float mad kernels (f->nameInCode) on random inputs for each vector
+// size index in [gMinVectorSizeIndex, gMaxVectorSizeIndex), optionally timing
+// them. Outputs are read back but never checked against the reference, so
+// only failing calls fail the test. Returns 0 or the failing call's error.
+int TestFunc_mad_Float(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint32_t j, k;
+    int error;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError = 0.0f;
+    float maxErrorVal = 0.0f;
+    float maxErrorVal2 = 0.0f;
+    float maxErrorVal3 = 0.0f;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
+
+    // Build one kernel/program pair per vector size index via ThreadPool_Do
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error; // kernels[] may be incomplete; skip release loop
+    }
+
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        cl_uint *p3 = (cl_uint *)gIn3;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
+            p[j] = genrand_int32(d);
+            p2[j] = genrand_int32(d);
+            p3[j] = genrand_int32(d);
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release kernels/programs
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit; // release kernels/programs
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
+            goto exit; // release kernels/programs
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        float *r = (float *)gOut_Ref;
+        float *s = (float *)gIn;
+        float *s2 = (float *)gIn2;
+        float *s3 = (float *)gIn3;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            r[j] = (float)f->func.f_fff(s[j], s2[j], s3[j]);
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data -- No verification possible.
+        // MAD is a random number generator.
+        if (0 == (i & 0x0fffffff))
+        {
+            vlog(".");
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        cl_uint *p3 = (cl_uint *)gIn3;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
+            p[j] = genrand_int32(d);
+            p2[j] = genrand_int32(d);
+            p3[j] = genrand_int32(d);
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release kernels/programs
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit; // release kernels/programs
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
+            goto exit; // release kernels/programs
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp
new file mode 100644
index 00000000..427f4efd
--- /dev/null
+++ b/test_conformance/math_brute_force/ternary_double.cpp
@@ -0,0 +1,842 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+#define CORRECTLY_ROUNDED 0
+#define FLUSHED 1
+// Assembles the fp64 math_kernel<size> source and hands it to MakeKernel.
+static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
+                             cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* in1, __global double",
+                        sizeNames[vectorSize],
+                        "* in2,  __global double",
+                        sizeNames[vectorSize],
+                        "* in3 )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], in3[i] );\n"
+                        "}\n" };
+    // Size-3 variant: packed vload3/vstore3 plus a NAN-padded tail case.
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in, __global double* in2, "
+        "__global double* in3)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 d0 = vload3( 0, in + 3 * i );\n"
+        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       double3 d2 = vload3( 0, in3 + 3 * i );\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, d2 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
+        "       double3 d2;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
+        "               d2 = (double3)( in3[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, d2 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    // Use the generic source unless this vector size is 3.
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+    // Entry-point name; matches the name emitted in the source above.
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo // shared argument for BuildKernel_DoubleFn
+{
+    cl_uint offset; // first vector-size index to build; job_id is added to it
+    cl_kernel *kernels; // kernel slots, indexed by offset + job_id
+    cl_program *programs; // program slots, indexed by offset + job_id
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{
+    const BuildKernelInfo *job = (const BuildKernelInfo *)p;
+    const cl_uint sizeIdx = job->offset + job_id; // vector-size index to build
+    return BuildKernelDouble(job->nameInCode, sizeIdx, &job->kernels[sizeIdx],
+                             &job->programs[sizeIdx], job->relaxedMode);
+}
+
+// Hard cases: NaN, inf, near 2^63/2^64, near n.5, near 1, denormals, zero.
+static const double specialValuesDouble[] = {
+    -NAN,
+    -INFINITY,
+    -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
+    -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
+    -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
+    -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
+    -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0,
+    // Positive half: must mirror the negative entries above one-for-one.
+    +NAN,
+    +INFINITY,
+    +DBL_MAX,
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
+    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    +3.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
+    +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
+    +2.0,
+    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52),
+    +1.0,
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN,
+    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
+    +0.0,
+};
+
+static const size_t specialValuesDoubleCount =
+    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
+
+int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
+                                         bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError = 0.0f;
+    int ftz = f->ftz || gForceFTZ;
+    double maxErrorVal = 0.0f;
+    double maxErrorVal2 = 0.0f;
+    double maxErrorVal3 = 0.0f;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(double), bufferSize);
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    Force64BitFPUPrecision();
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        double *p = (double *)gIn;
+        double *p2 = (double *)gIn2;
+        double *p3 = (double *)gIn3;
+        j = 0;
+        if (i == 0)
+        { // test edge cases
+            uint32_t x, y, z;
+            x = y = z = 0;
+            for (; j < bufferSize / sizeof(double); j++)
+            {
+                p[j] = specialValuesDouble[x];
+                p2[j] = specialValuesDouble[y];
+                p3[j] = specialValuesDouble[z];
+                if (++x >= specialValuesDoubleCount)
+                {
+                    x = 0;
+                    if (++y >= specialValuesDoubleCount)
+                    {
+                        y = 0;
+                        if (++z >= specialValuesDoubleCount) break;
+                    }
+                }
+            }
+            if (j == bufferSize / sizeof(double))
+                vlog_error("Test Error: not all special cases tested!\n");
+        }
+
+        for (; j < bufferSize / sizeof(double); j++)
+        {
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+            p2[j] = DoubleFromUInt32(genrand_int32(d));
+            p3[j] = DoubleFromUInt32(genrand_int32(d));
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
+            return error;
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        double *r = (double *)gOut_Ref;
+        double *s = (double *)gIn;
+        double *s2 = (double *)gIn2;
+        double *s3 = (double *)gIn3;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+            r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]);
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint64_t *t = (uint64_t *)gOut_Ref;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint64_t *q = (uint64_t *)(gOut[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j])
+                {
+                    double test = ((double *)q)[j];
+                    long double correct = f->dfunc.f_fff(s[j], s2[j], s3[j]);
+                    float err = Bruteforce_Ulp_Error_Double(test, correct);
+                    int fail = !(fabsf(err) <= f->double_ulps);
+
+                    if (fail && ftz)
+                    {
+                        // retry per section 6.5.3.2
+                        if (IsDoubleSubnormal(correct))
+                        { // look at me,
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (fail && IsDoubleSubnormal(s[j]))
+                        { // look at me,
+                            long double correct2 =
+                                f->dfunc.f_fff(0.0, s2[j], s3[j]);
+                            long double correct3 =
+                                f->dfunc.f_fff(-0.0, s2[j], s3[j]);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps))
+                                    && (!(fabsf(err3) <= f->double_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
+                            { // look at me now,
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+
+                            // try with first two args as zero
+                            if (IsDoubleSubnormal(s2[j]))
+                            { // its fun to have fun,
+                                correct2 = f->dfunc.f_fff(0.0, 0.0, s3[j]);
+                                correct3 = f->dfunc.f_fff(-0.0, 0.0, s3[j]);
+                                long double correct4 =
+                                    f->dfunc.f_fff(0.0, -0.0, s3[j]);
+                                long double correct5 =
+                                    f->dfunc.f_fff(-0.0, -0.0, s3[j]);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= f->double_ulps))
+                                        && (!(fabsf(err3) <= f->double_ulps))
+                                        && (!(fabsf(err4) <= f->double_ulps))
+                                        && (!(fabsf(err5) <= f->double_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+
+                                // retry per section 6.5.3.4
+                                if (IsDoubleResultSubnormal(correct2,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct3,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct4,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct5,
+                                                               f->double_ulps))
+                                {
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
+                                }
+
+                                if (IsDoubleSubnormal(s3[j]))
+                                { // but you have to know how!
+                                    correct2 = f->dfunc.f_fff(0.0, 0.0, 0.0f);
+                                    correct3 = f->dfunc.f_fff(-0.0, 0.0, 0.0f);
+                                    correct4 = f->dfunc.f_fff(0.0, -0.0, 0.0f);
+                                    correct5 = f->dfunc.f_fff(-0.0, -0.0, 0.0f);
+                                    long double correct6 =
+                                        f->dfunc.f_fff(0.0, 0.0, -0.0f);
+                                    long double correct7 =
+                                        f->dfunc.f_fff(-0.0, 0.0, -0.0f);
+                                    long double correct8 =
+                                        f->dfunc.f_fff(0.0, -0.0, -0.0f);
+                                    long double correct9 =
+                                        f->dfunc.f_fff(-0.0, -0.0, -0.0f);
+                                    err2 = Bruteforce_Ulp_Error_Double(
+                                        test, correct2);
+                                    err3 = Bruteforce_Ulp_Error_Double(
+                                        test, correct3);
+                                    err4 = Bruteforce_Ulp_Error_Double(
+                                        test, correct4);
+                                    err5 = Bruteforce_Ulp_Error_Double(
+                                        test, correct5);
+                                    float err6 = Bruteforce_Ulp_Error_Double(
+                                        test, correct6);
+                                    float err7 = Bruteforce_Ulp_Error_Double(
+                                        test, correct7);
+                                    float err8 = Bruteforce_Ulp_Error_Double(
+                                        test, correct8);
+                                    float err9 = Bruteforce_Ulp_Error_Double(
+                                        test, correct9);
+                                    fail = fail
+                                        && ((!(fabsf(err2) <= f->double_ulps))
+                                            && (!(fabsf(err3)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err4)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err5)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err5)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err6)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err7)
+                                                  <= f->double_ulps))
+                                            && (!(fabsf(err8)
+                                                  <= f->double_ulps)));
+                                    if (fabsf(err2) < fabsf(err)) err = err2;
+                                    if (fabsf(err3) < fabsf(err)) err = err3;
+                                    if (fabsf(err4) < fabsf(err)) err = err4;
+                                    if (fabsf(err5) < fabsf(err)) err = err5;
+                                    if (fabsf(err6) < fabsf(err)) err = err6;
+                                    if (fabsf(err7) < fabsf(err)) err = err7;
+                                    if (fabsf(err8) < fabsf(err)) err = err8;
+                                    if (fabsf(err9) < fabsf(err)) err = err9;
+
+                                    // retry per section 6.5.3.4
+                                    if (IsDoubleResultSubnormal(correct2,
+                                                                f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct3, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct4, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct5, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct6, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct7, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct8, f->double_ulps)
+                                        || IsDoubleResultSubnormal(
+                                            correct9, f->double_ulps))
+                                    {
+                                        fail = fail && (test != 0.0f);
+                                        if (!fail) err = 0.0f;
+                                    }
+                                }
+                            }
+                            else if (IsDoubleSubnormal(s3[j]))
+                            {
+                                correct2 = f->dfunc.f_fff(0.0, s2[j], 0.0);
+                                correct3 = f->dfunc.f_fff(-0.0, s2[j], 0.0);
+                                long double correct4 =
+                                    f->dfunc.f_fff(0.0, s2[j], -0.0);
+                                long double correct5 =
+                                    f->dfunc.f_fff(-0.0, s2[j], -0.0);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= f->double_ulps))
+                                        && (!(fabsf(err3) <= f->double_ulps))
+                                        && (!(fabsf(err4) <= f->double_ulps))
+                                        && (!(fabsf(err5) <= f->double_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+
+                                // retry per section 6.5.3.4
+                                if (IsDoubleResultSubnormal(correct2,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct3,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct4,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct5,
+                                                               f->double_ulps))
+                                {
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
+                                }
+                            }
+                        }
+                        else if (fail && IsDoubleSubnormal(s2[j]))
+                        {
+                            long double correct2 =
+                                f->dfunc.f_fff(s[j], 0.0, s3[j]);
+                            long double correct3 =
+                                f->dfunc.f_fff(s[j], -0.0, s3[j]);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps))
+                                    && (!(fabsf(err3) <= f->double_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+
+                            // try with second two args as zero
+                            if (IsDoubleSubnormal(s3[j]))
+                            {
+                                correct2 = f->dfunc.f_fff(s[j], 0.0, 0.0);
+                                correct3 = f->dfunc.f_fff(s[j], -0.0, 0.0);
+                                long double correct4 =
+                                    f->dfunc.f_fff(s[j], 0.0, -0.0);
+                                long double correct5 =
+                                    f->dfunc.f_fff(s[j], -0.0, -0.0);
+                                err2 =
+                                    Bruteforce_Ulp_Error_Double(test, correct2);
+                                err3 =
+                                    Bruteforce_Ulp_Error_Double(test, correct3);
+                                float err4 =
+                                    Bruteforce_Ulp_Error_Double(test, correct4);
+                                float err5 =
+                                    Bruteforce_Ulp_Error_Double(test, correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= f->double_ulps))
+                                        && (!(fabsf(err3) <= f->double_ulps))
+                                        && (!(fabsf(err4) <= f->double_ulps))
+                                        && (!(fabsf(err5) <= f->double_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+
+                                // retry per section 6.5.3.4
+                                if (IsDoubleResultSubnormal(correct2,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct3,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct4,
+                                                               f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct5,
+                                                               f->double_ulps))
+                                {
+                                    fail = fail && (test != 0.0f);
+                                    if (!fail) err = 0.0f;
+                                }
+                            }
+                        }
+                        else if (fail && IsDoubleSubnormal(s3[j]))
+                        {
+                            long double correct2 =
+                                f->dfunc.f_fff(s[j], s2[j], 0.0);
+                            long double correct3 =
+                                f->dfunc.f_fff(s[j], s2[j], -0.0);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= f->double_ulps))
+                                    && (!(fabsf(err3) <= f->double_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correct3,
+                                                           f->double_ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+
+                    if (fabsf(err) > maxError)
+                    {
+                        maxError = fabsf(err);
+                        maxErrorVal = s[j];
+                        maxErrorVal2 = s2[j];
+                        maxErrorVal3 = s3[j];
+                    }
+
+                    if (fail)
+                    {
+                        vlog_error("\nERROR: %sD%s: %f ulp error at {%.13la, "
+                                   "%.13la, %.13la}: *%.13la vs. %.13la\n",
+                                   f->name, sizeNames[k], err, s[j], s2[j],
+                                   s3[j], ((double *)gOut_Ref)[j], test);
+                        error = -1;
+                        goto exit;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        double *p = (double *)gIn;
+        double *p2 = (double *)gIn2;
+        double *p3 = (double *)gIn3;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+        {
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+            p2[j] = DoubleFromUInt32(genrand_int32(d));
+            p3[j] = DoubleFromUInt32(genrand_int32(d));
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
+            return error;
+        }
+
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary_float.cpp
similarity index 52%
rename from test_conformance/math_brute_force/ternary.cpp
rename to test_conformance/math_brute_force/ternary_float.cpp
index f8908909..3b3bde7c 100644
--- a/test_conformance/math_brute_force/ternary.cpp
+++ b/test_conformance/math_brute_force/ternary_float.cpp
@@ -114,99 +114,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* in1, __global double",
-                        sizeNames[vectorSize],
-                        "* in2,  __global double",
-                        sizeNames[vectorSize],
-                        "* in3 )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i], in3[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global double* in, __global double* in2, "
-        "__global double* in3)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 d0 = vload3( 0, in + 3 * i );\n"
-        "       double3 d1 = vload3( 0, in2 + 3 * i );\n"
-        "       double3 d2 = vload3( 0, in3 + 3 * i );\n"
-        "       d0 = ",
-        name,
-        "( d0, d1, d2 );\n"
-        "       vstore3( d0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 d0;\n"
-        "       double3 d1;\n"
-        "       double3 d2;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               d1 = (double3)( in2[3*i], NAN, NAN ); \n"
-        "               d2 = (double3)( in3[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               d2 = (double3)( in3[3*i], in3[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       d0 = ",
-        name,
-        "( d0, d1, d2 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = d0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = d0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
 typedef struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
@@ -225,15 +132,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
-}
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -316,7 +214,6 @@ static const float specialValuesFloat[] = {
 static const size_t specialValuesFloatCount =
     sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
-
 int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     uint64_t i;
@@ -1077,711 +974,3 @@ exit:
 
     return error;
 }
-
-// A table of more difficult cases to get right
-static const double specialValuesDouble[] = {
-    -NAN,
-    -INFINITY,
-    -DBL_MAX,
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
-    MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
-    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
-    -3.0,
-    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
-    -2.5,
-    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
-    -2.0,
-    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
-    -1.5,
-    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
-    -1.0,
-    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
-    -DBL_MIN,
-    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
-    -0.0,
-
-    +NAN,
-    +INFINITY,
-    +DBL_MAX,
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
-    MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
-    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    +3.0,
-    MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
-    +2.5,
-    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
-    +2.0,
-    MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
-    +1.5,
-    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
-    +1.0,
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
-    +DBL_MIN,
-    MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
-    +0.0,
-};
-
-static const size_t specialValuesDoubleCount =
-    sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
-
-
-int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
-                                         bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    float maxError = 0.0f;
-    int ftz = f->ftz || gForceFTZ;
-    double maxErrorVal = 0.0f;
-    double maxErrorVal2 = 0.0f;
-    double maxErrorVal3 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(double), bufferSize);
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    Force64BitFPUPrecision();
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        double *p2 = (double *)gIn2;
-        double *p3 = (double *)gIn3;
-        j = 0;
-        if (i == 0)
-        { // test edge cases
-            uint32_t x, y, z;
-            x = y = z = 0;
-            for (; j < bufferSize / sizeof(double); j++)
-            {
-                p[j] = specialValuesDouble[x];
-                p2[j] = specialValuesDouble[y];
-                p3[j] = specialValuesDouble[z];
-                if (++x >= specialValuesDoubleCount)
-                {
-                    x = 0;
-                    if (++y >= specialValuesDoubleCount)
-                    {
-                        y = 0;
-                        if (++z >= specialValuesDoubleCount) break;
-                    }
-                }
-            }
-            if (j == bufferSize / sizeof(double))
-                vlog_error("Test Error: not all special cases tested!\n");
-        }
-
-        for (; j < bufferSize / sizeof(double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-            p3[j] = DoubleFromUInt32(genrand_int32(d));
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
-                                        &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        double *r = (double *)gOut_Ref;
-        double *s = (double *)gIn;
-        double *s2 = (double *)gIn2;
-        double *s3 = (double *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-            r[j] = (double)f->dfunc.f_fff(s[j], s2[j], s3[j]);
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data
-        uint64_t *t = (uint64_t *)gOut_Ref;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-        {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-            {
-                uint64_t *q = (uint64_t *)(gOut[k]);
-
-                // If we aren't getting the correctly rounded result
-                if (t[j] != q[j])
-                {
-                    double test = ((double *)q)[j];
-                    long double correct = f->dfunc.f_fff(s[j], s2[j], s3[j]);
-                    float err = Bruteforce_Ulp_Error_Double(test, correct);
-                    int fail = !(fabsf(err) <= f->double_ulps);
-
-                    if (fail && ftz)
-                    {
-                        // retry per section 6.5.3.2
-                        if (IsDoubleSubnormal(correct))
-                        { // look at me,
-                            fail = fail && (test != 0.0f);
-                            if (!fail) err = 0.0f;
-                        }
-
-                        // retry per section 6.5.3.3
-                        if (fail && IsDoubleSubnormal(s[j]))
-                        { // look at me,
-                            long double correct2 =
-                                f->dfunc.f_fff(0.0, s2[j], s3[j]);
-                            long double correct3 =
-                                f->dfunc.f_fff(-0.0, s2[j], s3[j]);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct2);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            fail = fail
-                                && ((!(fabsf(err2) <= f->double_ulps))
-                                    && (!(fabsf(err3) <= f->double_ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2,
-                                                        f->double_ulps)
-                                || IsDoubleResultSubnormal(correct3,
-                                                           f->double_ulps))
-                            { // look at me now,
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-
-                            // try with first two args as zero
-                            if (IsDoubleSubnormal(s2[j]))
-                            { // its fun to have fun,
-                                correct2 = f->dfunc.f_fff(0.0, 0.0, s3[j]);
-                                correct3 = f->dfunc.f_fff(-0.0, 0.0, s3[j]);
-                                long double correct4 =
-                                    f->dfunc.f_fff(0.0, -0.0, s3[j]);
-                                long double correct5 =
-                                    f->dfunc.f_fff(-0.0, -0.0, s3[j]);
-                                err2 =
-                                    Bruteforce_Ulp_Error_Double(test, correct2);
-                                err3 =
-                                    Bruteforce_Ulp_Error_Double(test, correct3);
-                                float err4 =
-                                    Bruteforce_Ulp_Error_Double(test, correct4);
-                                float err5 =
-                                    Bruteforce_Ulp_Error_Double(test, correct5);
-                                fail = fail
-                                    && ((!(fabsf(err2) <= f->double_ulps))
-                                        && (!(fabsf(err3) <= f->double_ulps))
-                                        && (!(fabsf(err4) <= f->double_ulps))
-                                        && (!(fabsf(err5) <= f->double_ulps)));
-                                if (fabsf(err2) < fabsf(err)) err = err2;
-                                if (fabsf(err3) < fabsf(err)) err = err3;
-                                if (fabsf(err4) < fabsf(err)) err = err4;
-                                if (fabsf(err5) < fabsf(err)) err = err5;
-
-                                // retry per section 6.5.3.4
-                                if (IsDoubleResultSubnormal(correct2,
-                                                            f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct3,
-                                                               f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct4,
-                                                               f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct5,
-                                                               f->double_ulps))
-                                {
-                                    fail = fail && (test != 0.0f);
-                                    if (!fail) err = 0.0f;
-                                }
-
-                                if (IsDoubleSubnormal(s3[j]))
-                                { // but you have to know how!
-                                    correct2 = f->dfunc.f_fff(0.0, 0.0, 0.0f);
-                                    correct3 = f->dfunc.f_fff(-0.0, 0.0, 0.0f);
-                                    correct4 = f->dfunc.f_fff(0.0, -0.0, 0.0f);
-                                    correct5 = f->dfunc.f_fff(-0.0, -0.0, 0.0f);
-                                    long double correct6 =
-                                        f->dfunc.f_fff(0.0, 0.0, -0.0f);
-                                    long double correct7 =
-                                        f->dfunc.f_fff(-0.0, 0.0, -0.0f);
-                                    long double correct8 =
-                                        f->dfunc.f_fff(0.0, -0.0, -0.0f);
-                                    long double correct9 =
-                                        f->dfunc.f_fff(-0.0, -0.0, -0.0f);
-                                    err2 = Bruteforce_Ulp_Error_Double(
-                                        test, correct2);
-                                    err3 = Bruteforce_Ulp_Error_Double(
-                                        test, correct3);
-                                    err4 = Bruteforce_Ulp_Error_Double(
-                                        test, correct4);
-                                    err5 = Bruteforce_Ulp_Error_Double(
-                                        test, correct5);
-                                    float err6 = Bruteforce_Ulp_Error_Double(
-                                        test, correct6);
-                                    float err7 = Bruteforce_Ulp_Error_Double(
-                                        test, correct7);
-                                    float err8 = Bruteforce_Ulp_Error_Double(
-                                        test, correct8);
-                                    float err9 = Bruteforce_Ulp_Error_Double(
-                                        test, correct9);
-                                    fail = fail
-                                        && ((!(fabsf(err2) <= f->double_ulps))
-                                            && (!(fabsf(err3)
-                                                  <= f->double_ulps))
-                                            && (!(fabsf(err4)
-                                                  <= f->double_ulps))
-                                            && (!(fabsf(err5)
-                                                  <= f->double_ulps))
-                                            && (!(fabsf(err5)
-                                                  <= f->double_ulps))
-                                            && (!(fabsf(err6)
-                                                  <= f->double_ulps))
-                                            && (!(fabsf(err7)
-                                                  <= f->double_ulps))
-                                            && (!(fabsf(err8)
-                                                  <= f->double_ulps)));
-                                    if (fabsf(err2) < fabsf(err)) err = err2;
-                                    if (fabsf(err3) < fabsf(err)) err = err3;
-                                    if (fabsf(err4) < fabsf(err)) err = err4;
-                                    if (fabsf(err5) < fabsf(err)) err = err5;
-                                    if (fabsf(err6) < fabsf(err)) err = err6;
-                                    if (fabsf(err7) < fabsf(err)) err = err7;
-                                    if (fabsf(err8) < fabsf(err)) err = err8;
-                                    if (fabsf(err9) < fabsf(err)) err = err9;
-
-                                    // retry per section 6.5.3.4
-                                    if (IsDoubleResultSubnormal(correct2,
-                                                                f->double_ulps)
-                                        || IsDoubleResultSubnormal(
-                                            correct3, f->double_ulps)
-                                        || IsDoubleResultSubnormal(
-                                            correct4, f->double_ulps)
-                                        || IsDoubleResultSubnormal(
-                                            correct5, f->double_ulps)
-                                        || IsDoubleResultSubnormal(
-                                            correct6, f->double_ulps)
-                                        || IsDoubleResultSubnormal(
-                                            correct7, f->double_ulps)
-                                        || IsDoubleResultSubnormal(
-                                            correct8, f->double_ulps)
-                                        || IsDoubleResultSubnormal(
-                                            correct9, f->double_ulps))
-                                    {
-                                        fail = fail && (test != 0.0f);
-                                        if (!fail) err = 0.0f;
-                                    }
-                                }
-                            }
-                            else if (IsDoubleSubnormal(s3[j]))
-                            {
-                                correct2 = f->dfunc.f_fff(0.0, s2[j], 0.0);
-                                correct3 = f->dfunc.f_fff(-0.0, s2[j], 0.0);
-                                long double correct4 =
-                                    f->dfunc.f_fff(0.0, s2[j], -0.0);
-                                long double correct5 =
-                                    f->dfunc.f_fff(-0.0, s2[j], -0.0);
-                                err2 =
-                                    Bruteforce_Ulp_Error_Double(test, correct2);
-                                err3 =
-                                    Bruteforce_Ulp_Error_Double(test, correct3);
-                                float err4 =
-                                    Bruteforce_Ulp_Error_Double(test, correct4);
-                                float err5 =
-                                    Bruteforce_Ulp_Error_Double(test, correct5);
-                                fail = fail
-                                    && ((!(fabsf(err2) <= f->double_ulps))
-                                        && (!(fabsf(err3) <= f->double_ulps))
-                                        && (!(fabsf(err4) <= f->double_ulps))
-                                        && (!(fabsf(err5) <= f->double_ulps)));
-                                if (fabsf(err2) < fabsf(err)) err = err2;
-                                if (fabsf(err3) < fabsf(err)) err = err3;
-                                if (fabsf(err4) < fabsf(err)) err = err4;
-                                if (fabsf(err5) < fabsf(err)) err = err5;
-
-                                // retry per section 6.5.3.4
-                                if (IsDoubleResultSubnormal(correct2,
-                                                            f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct3,
-                                                               f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct4,
-                                                               f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct5,
-                                                               f->double_ulps))
-                                {
-                                    fail = fail && (test != 0.0f);
-                                    if (!fail) err = 0.0f;
-                                }
-                            }
-                        }
-                        else if (fail && IsDoubleSubnormal(s2[j]))
-                        {
-                            long double correct2 =
-                                f->dfunc.f_fff(s[j], 0.0, s3[j]);
-                            long double correct3 =
-                                f->dfunc.f_fff(s[j], -0.0, s3[j]);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct2);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            fail = fail
-                                && ((!(fabsf(err2) <= f->double_ulps))
-                                    && (!(fabsf(err3) <= f->double_ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2,
-                                                        f->double_ulps)
-                                || IsDoubleResultSubnormal(correct3,
-                                                           f->double_ulps))
-                            {
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-
-                            // try with second two args as zero
-                            if (IsDoubleSubnormal(s3[j]))
-                            {
-                                correct2 = f->dfunc.f_fff(s[j], 0.0, 0.0);
-                                correct3 = f->dfunc.f_fff(s[j], -0.0, 0.0);
-                                long double correct4 =
-                                    f->dfunc.f_fff(s[j], 0.0, -0.0);
-                                long double correct5 =
-                                    f->dfunc.f_fff(s[j], -0.0, -0.0);
-                                err2 =
-                                    Bruteforce_Ulp_Error_Double(test, correct2);
-                                err3 =
-                                    Bruteforce_Ulp_Error_Double(test, correct3);
-                                float err4 =
-                                    Bruteforce_Ulp_Error_Double(test, correct4);
-                                float err5 =
-                                    Bruteforce_Ulp_Error_Double(test, correct5);
-                                fail = fail
-                                    && ((!(fabsf(err2) <= f->double_ulps))
-                                        && (!(fabsf(err3) <= f->double_ulps))
-                                        && (!(fabsf(err4) <= f->double_ulps))
-                                        && (!(fabsf(err5) <= f->double_ulps)));
-                                if (fabsf(err2) < fabsf(err)) err = err2;
-                                if (fabsf(err3) < fabsf(err)) err = err3;
-                                if (fabsf(err4) < fabsf(err)) err = err4;
-                                if (fabsf(err5) < fabsf(err)) err = err5;
-
-                                // retry per section 6.5.3.4
-                                if (IsDoubleResultSubnormal(correct2,
-                                                            f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct3,
-                                                               f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct4,
-                                                               f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct5,
-                                                               f->double_ulps))
-                                {
-                                    fail = fail && (test != 0.0f);
-                                    if (!fail) err = 0.0f;
-                                }
-                            }
-                        }
-                        else if (fail && IsDoubleSubnormal(s3[j]))
-                        {
-                            long double correct2 =
-                                f->dfunc.f_fff(s[j], s2[j], 0.0);
-                            long double correct3 =
-                                f->dfunc.f_fff(s[j], s2[j], -0.0);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct2);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            fail = fail
-                                && ((!(fabsf(err2) <= f->double_ulps))
-                                    && (!(fabsf(err3) <= f->double_ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2,
-                                                        f->double_ulps)
-                                || IsDoubleResultSubnormal(correct3,
-                                                           f->double_ulps))
-                            {
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-
-                    if (fabsf(err) > maxError)
-                    {
-                        maxError = fabsf(err);
-                        maxErrorVal = s[j];
-                        maxErrorVal2 = s2[j];
-                        maxErrorVal3 = s3[j];
-                    }
-
-                    if (fail)
-                    {
-                        vlog_error("\nERROR: %sD%s: %f ulp error at {%.13la, "
-                                   "%.13la, %.13la}: *%.13la vs. %.13la\n",
-                                   f->name, sizeNames[k], err, s[j], s2[j],
-                                   s3[j], ((double *)gOut_Ref)[j], test);
-                        error = -1;
-                        goto exit;
-                    }
-                }
-            }
-        }
-
-        if (0 == (i & 0x0fffffff))
-        {
-            if (gVerboseBruteForce)
-            {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
-            }
-            else
-            {
-                vlog(".");
-            }
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        double *p2 = (double *)gIn2;
-        double *p3 = (double *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-        {
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-            p2[j] = DoubleFromUInt32(genrand_int32(d));
-            p3[j] = DoubleFromUInt32(genrand_int32(d));
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
-                                          bufferSize, gIn2, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
-            return error;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
-                                          bufferSize, gIn3, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (bufferSize + vectorSize - 1)
-                / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
-                                        &gInBuffer3)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
-             maxErrorVal3);
-    vlog("\n");
-
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}
diff --git a/test_conformance/math_brute_force/unary_double.cpp b/test_conformance/math_brute_force/unary_double.cpp
new file mode 100644
index 00000000..b97b1943
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_double.cpp
@@ -0,0 +1,662 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+#if defined(__APPLE__)
+#include <sys/time.h>
+#endif
+
+// Builds the one-input double kernel that applies `name` element-wise; vector
+// size 3 gets a vload3/vstore3 variant. Returns the MakeKernels result.
+static int BuildKernelDouble(const char *name, int vectorSize,
+                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                             bool relaxedMode)
+{
+    const char *scalarSource[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double",
+        sizeNames[vectorSize],
+        "* out, __global double",
+        sizeNames[vectorSize],
+        "* in )\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   out[i] = ",
+        name,
+        "( in[i] );\n"
+        "}\n"
+    };
+
+    const char *vec3Source[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 f0 = vload3( 0, in + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    const bool isVec3 = (sizeValues[vectorSize] == 3);
+    const char **source = isVec3 ? vec3Source : scalarSource;
+    const size_t sourceCount = isVec3
+        ? sizeof(vec3Source) / sizeof(vec3Source[0])
+        : sizeof(scalarSource) / sizeof(scalarSource[0]);
+
+    char kernelName[32];
+    snprintf(kernelName, sizeof(kernelName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+    return MakeKernels(source, (cl_uint)sourceCount, kernelName, kernel_count,
+                       k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // index of the first vector size to build
+    cl_uint kernel_count; // forwarded to MakeKernels; caller passes threadCount
+    cl_kernel **kernels; // kernels[i]: kernel array for vector size index i
+    cl_program *programs; // programs + i is handed to MakeKernels for index i
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+// ThreadPool job: builds the kernels for vector size index offset + job_id.
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{
+    const BuildKernelInfo *build = (const BuildKernelInfo *)p;
+    const cl_uint sizeIdx = build->offset + job_id;
+    return BuildKernelDouble(build->nameInCode, sizeIdx, build->kernel_count,
+        build->kernels[sizeIdx], &build->programs[sizeIdx], build->relaxedMode);
+}
+
+// Per-worker-thread state: buffers, queue and the worst error seen so far
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // this thread's sub-buffer of gInBuffer
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // sub-buffers of gOutBuffer[j], per size
+    float maxError; // largest |ulp error| seen by this thread. Init to 0.
+    double maxErrorValue; // input value that produced maxError.  Init to 0.
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo
+{
+    size_t subBufferSize; // Elements processed per job (per-thread sub-buffer)
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // job N starts at encoding N * step (subBufferSize * scale)
+    cl_uint scale; // stride between individual test values within a job
+    float ulps; // max allowed error in ulps (set from f->double_ulps)
+    int ftz; // non-zero if running in flush to zero mode
+
+    int isRangeLimited; // 1 if the function is only to be evaluated over a
+                        // range. NOTE(review): not read by TestDouble
+    float half_sin_cos_tan_limit; // NOTE(review): not read by TestDouble
+    bool relaxedMode; // True if test is running in relaxed mode, false
+                      // otherwise.
+} TestInfo;
+
+// ThreadPool job: runs the kernel for every vector size over one chunk of
+// inputs (DoubleFromUInt32(base + j * scale)) and checks each result against
+// func.f_f within job->ulps, with FTZ retries. Returns CL_SUCCESS or an error.
+static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_double);
+    cl_uint scale = job->scale;
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    dptr func = job->f->dfunc;
+    cl_uint j, k;
+    cl_int error;
+    int ftz = job->ftz;
+
+    Force64BitFPUPrecision();
+
+    // start the map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ulong *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Write the new values to the input array
+    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+        p[j] = DoubleFromUInt32(base + j * scale);
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            return error;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            return error;
+        }
+
+        // Poison the output so stale results cannot pass verification
+        uint32_t pattern = 0xffffdead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
+            return error;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
+    cl_double *s = (cl_double *)p;
+    for (j = 0; j < buffer_elements; j++) r[j] = (cl_double)func.f_f(s[j]);
+
+    // Map results for reading; only the last map blocks (in-order queue).
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ulong *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+    // Wait for the last buffer
+    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        return error;
+    }
+
+    // Verify data
+    cl_ulong *t = (cl_ulong *)r;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_ulong *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (t[j] != q[j])
+            {
+                cl_double test = ((cl_double *)q)[j];
+                long double correct = func.f_f(s[j]);
+                float err = Bruteforce_Ulp_Error_Double(test, correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail)
+                {
+                    if (ftz)
+                    {
+                        // retry per section 6.5.3.2
+                        if (IsDoubleResultSubnormal(correct, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (IsDoubleSubnormal(s[j]))
+                        {
+                            long double correct2 = func.f_f(0.0L);
+                            long double correct3 = func.f_f(-0.0L);
+                            float err2 =
+                                Bruteforce_Ulp_Error_Double(test, correct2);
+                            float err3 =
+                                Bruteforce_Ulp_Error_Double(test, correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correct2, ulps)
+                                || IsDoubleResultSubnormal(correct3, ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                }
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                }
+                if (fail)
+                {
+                    // Use this thread's slice (s, r), not gIn/gOut_Ref[j].
+                    vlog_error("\nERROR: %s%s: %f ulp error at %.13la "
+                               "(0x%16.16llx): *%.13la vs. %.13la\n",
+                               job->f->name, sizeNames[k], err, s[j],
+                               (unsigned long long)((cl_ulong *)s)[j], r[j],
+                               test);
+                    return -1;
+                }
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+    return CL_SUCCESS;
+}
+
+int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+#if defined(__APPLE__)
+    struct timeval time_val;
+    gettimeofday(&time_val, NULL);
+    double start_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
+    double end_time;
+#endif
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_double));
+    if (gWimpyMode)
+    {
+        test_info.subBufferSize = gWimpyBufferSize
+            / (sizeof(cl_double)
+               * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    }
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->double_ulps;
+    test_info.ftz = f->ftz || gForceFTZ;
+    test_info.relaxedMode = relaxedMode;
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = {
+            i * test_info.subBufferSize * sizeof(cl_double),
+            test_info.subBufferSize * sizeof(cl_double)
+        };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs,  f->nameInCode,         relaxedMode
+        };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+            }
+        }
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+
+#if defined(__APPLE__)
+    gettimeofday(&time_val, NULL);
+    end_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
+#endif
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        double *p = (double *)gIn;
+
+        if (strstr(f->name, "exp"))
+            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+                p[j] = (double)genrand_real1(d);
+        else if (strstr(f->name, "log"))
+            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+                p[j] = fabs(DoubleFromUInt32(genrand_int32(d)));
+        else
+            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
+                p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(test_info.k[j][0], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(test_info.programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (i = 0; i < PERF_LOOP_COUNT; i++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
+                                                    1, NULL, &localCount, NULL,
+                                                    0, NULL, NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (BUFFER_SIZE / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
+    }
+
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+
+#if defined(__APPLE__)
+    vlog("\t(%2.2f seconds)", end_time - start_time);
+#endif
+    vlog("\n");
+
+exit:
+    // Release
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary_float.cpp
similarity index 58%
rename from test_conformance/math_brute_force/unary.cpp
rename to test_conformance/math_brute_force/unary_float.cpp
index dc6d56c1..4c1bd7ab 100644
--- a/test_conformance/math_brute_force/unary.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -103,88 +103,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, int vectorSize,
-                             cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                             bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* in )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global double* in)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 f0 = vload3( 0, in + 3 * i );\n"
-        "       f0 = ",
-        name,
-        "( f0 );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 f0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
 typedef struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
@@ -204,16 +122,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->kernels[i], info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
-                             info->kernels[i], info->programs + i,
-                             info->relaxedMode);
-}
-
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
@@ -915,505 +823,3 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 
     return CL_SUCCESS;
 }
-
-
-static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
-{
-    const TestInfo *job = (const TestInfo *)data;
-    size_t buffer_elements = job->subBufferSize;
-    size_t buffer_size = buffer_elements * sizeof(cl_double);
-    cl_uint scale = job->scale;
-    cl_uint base = job_id * (cl_uint)job->step;
-    ThreadInfo *tinfo = job->tinfo + thread_id;
-    float ulps = job->ulps;
-    dptr func = job->f->dfunc;
-    cl_uint j, k;
-    cl_int error;
-    int ftz = job->ftz;
-
-    Force64BitFPUPrecision();
-
-    // start the map of the output arrays
-    cl_event e[VECTOR_SIZE_COUNT];
-    cl_ulong *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
-
-    // Write the new values to the input array
-    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
-    for (j = 0; j < buffer_elements; j++)
-        p[j] = DoubleFromUInt32(base + j * scale);
-
-    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
-                                      buffer_size, p, 0, NULL, NULL)))
-    {
-        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        return error;
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            return error;
-        }
-        if ((error = clReleaseEvent(e[j])))
-        {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            return error;
-        }
-
-        // Fill the result buffer with garbage, so that old results don't carry
-        // over
-        uint32_t pattern = 0xffffdead;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            return error;
-        }
-
-        // run the kernel
-        size_t vectorCount =
-            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
-        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
-                                                 // own copy of the cl_kernel
-        cl_program program = job->programs[j];
-
-        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
-                                    &tinfo->outBuf[j])))
-        {
-            LogBuildError(program);
-            return error;
-        }
-        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
-                                    &tinfo->inBuf)))
-        {
-            LogBuildError(program);
-            return error;
-        }
-
-        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
-                                            &vectorCount, NULL, 0, NULL, NULL)))
-        {
-            vlog_error("FAILED -- could not execute kernel\n");
-            return error;
-        }
-    }
-
-
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
-
-    if (gSkipCorrectnessTesting) return CL_SUCCESS;
-
-    // Calculate the correctly rounded reference result
-    cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
-    cl_double *s = (cl_double *)p;
-    for (j = 0; j < buffer_elements; j++) r[j] = (cl_double)func.f_f(s[j]);
-
-    // Read the data back -- no need to wait for the first N-1 buffers. This is
-    // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
-    {
-        out[j] = (cl_ulong *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
-            buffer_size, 0, NULL, NULL, &error);
-        if (error || NULL == out[j])
-        {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
-        }
-    }
-    // Wait for the last buffer
-    out[j] = (cl_ulong *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        return error;
-    }
-
-
-    // Verify data
-    cl_ulong *t = (cl_ulong *)r;
-    for (j = 0; j < buffer_elements; j++)
-    {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-        {
-            cl_ulong *q = out[k];
-
-            // If we aren't getting the correctly rounded result
-            if (t[j] != q[j])
-            {
-                cl_double test = ((cl_double *)q)[j];
-                long double correct = func.f_f(s[j]);
-                float err = Bruteforce_Ulp_Error_Double(test, correct);
-                int fail = !(fabsf(err) <= ulps);
-
-                if (fail)
-                {
-                    if (ftz)
-                    {
-                        // retry per section 6.5.3.2
-                        if (IsDoubleResultSubnormal(correct, ulps))
-                        {
-                            fail = fail && (test != 0.0f);
-                            if (!fail) err = 0.0f;
-                        }
-
-                        // retry per section 6.5.3.3
-                        if (IsDoubleSubnormal(s[j]))
-                        {
-                            long double correct2 = func.f_f(0.0L);
-                            long double correct3 = func.f_f(-0.0L);
-                            float err2 =
-                                Bruteforce_Ulp_Error_Double(test, correct2);
-                            float err3 =
-                                Bruteforce_Ulp_Error_Double(test, correct3);
-                            fail = fail
-                                && ((!(fabsf(err2) <= ulps))
-                                    && (!(fabsf(err3) <= ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correct2, ulps)
-                                || IsDoubleResultSubnormal(correct3, ulps))
-                            {
-                                fail = fail && (test != 0.0f);
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-                }
-                if (fabsf(err) > tinfo->maxError)
-                {
-                    tinfo->maxError = fabsf(err);
-                    tinfo->maxErrorValue = s[j];
-                }
-                if (fail)
-                {
-                    vlog_error("\nERROR: %s%s: %f ulp error at %.13la "
-                               "(0x%16.16llx): *%.13la vs. %.13la\n",
-                               job->f->name, sizeNames[k], err,
-                               ((cl_double *)gIn)[j], ((cl_ulong *)gIn)[j],
-                               ((cl_double *)gOut_Ref)[j], test);
-                    return -1;
-                }
-            }
-        }
-    }
-
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-    {
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
-        {
-            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
-                       j, error);
-            return error;
-        }
-    }
-
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
-
-
-    if (0 == (base & 0x0fffffff))
-    {
-        if (gVerboseBruteForce)
-        {
-            vlog("base:%14u step:%10u scale:%10zd buf_elements:%10u ulps:%5.3f "
-                 "ThreadCount:%2u\n",
-                 base, job->step, buffer_elements, job->scale, job->ulps,
-                 job->threadCount);
-        }
-        else
-        {
-            vlog(".");
-        }
-        fflush(stdout);
-    }
-
-    return CL_SUCCESS;
-}
-
-int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfo test_info;
-    cl_int error;
-    size_t i, j;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-#if defined(__APPLE__)
-    struct timeval time_val;
-    gettimeofday(&time_val, NULL);
-    double start_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
-    double end_time;
-#endif
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-    // Init test_info
-    memset(&test_info, 0, sizeof(test_info));
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_double));
-    if (gWimpyMode)
-    {
-        test_info.subBufferSize = gWimpyBufferSize
-            / (sizeof(cl_double)
-               * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    }
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->double_ulps;
-    test_info.ftz = f->ftz || gForceFTZ;
-    test_info.relaxedMode = relaxedMode;
-
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = {
-            i * test_info.subBufferSize * sizeof(cl_double),
-            test_info.subBufferSize * sizeof(cl_double)
-        };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            goto exit;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                goto exit;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            goto exit;
-        }
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = {
-            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
-            test_info.programs,  f->nameInCode,         relaxedMode
-        };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            goto exit;
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-            }
-        }
-
-        if (error) goto exit;
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-
-#if defined(__APPLE__)
-    gettimeofday(&time_val, NULL);
-    end_time = time_val.tv_sec + 1e-6 * time_val.tv_usec;
-#endif
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-
-        if (strstr(f->name, "exp"))
-            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
-                p[j] = (double)genrand_real1(d);
-        else if (strstr(f->name, "log"))
-            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
-                p[j] = fabs(DoubleFromUInt32(genrand_int32(d)));
-        else
-            for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
-                p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
-                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
-            if ((error = clSetKernelArg(test_info.k[j][0], 0,
-                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(test_info.k[j][0], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(test_info.programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (i = 0; i < PERF_LOOP_COUNT; i++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, test_info.k[j][0],
-                                                    1, NULL, &localCount, NULL,
-                                                    0, NULL, NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (BUFFER_SIZE / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
-
-#if defined(__APPLE__)
-    vlog("\t(%2.2f seconds)", end_time - start_time);
-#endif
-    vlog("\n");
-
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-
-    return error;
-}
diff --git a/test_conformance/math_brute_force/unary_two_results_double.cpp b/test_conformance/math_brute_force/unary_two_results_double.cpp
new file mode 100644
index 00000000..779c653a
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_two_results_double.cpp
@@ -0,0 +1,523 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+
+// Assembles and builds the OpenCL source of "math_kernel<N>", which stores
+// name(in[i], &out2[i]) into out[i] for double vectors of the requested size.
+// The 3-wide variant uses vload3/vstore3 plus a scalar path for the leftovers.
+// Returns whatever MakeKernel reports.
+static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
+                             cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global double",
+                        sizeNames[vectorSize],
+                        "* out2, __global double",
+                        sizeNames[vectorSize],
+                        "* in )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i], out2 + i );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global double* out2, __global double* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       double3 f0 = vload3( 0, in + 3 * i );\n"
+        "       double3 iout = NAN;\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( iout, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       double3 iout = NAN;\n"
+        "       double3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = iout.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = iout.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    const bool useVec3 = (sizeValues[vectorSize] == 3);
+    const char **kern = useVec3 ? c3 : c;
+    const size_t kernSize =
+        useVec3 ? sizeof(c3) / sizeof(c3[0]) : sizeof(c) / sizeof(c[0]);
+
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // kernel slots, indexed by vector size index
+    cl_program *programs; // program slots, indexed by vector size index
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{
+    const BuildKernelInfo *job = (const BuildKernelInfo *)p;
+    const cl_uint slot = job->offset + job_id; // vector size index to build
+    return BuildKernelDouble(job->nameInCode, slot, job->kernels + slot,
+                             job->programs + slot, job->relaxedMode);
+}
+
+// Brute-force tests a double function that returns one result and writes a
+// second through a pointer; returns 0, a CL error code, or -1 on ULP failure.
+int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError0 = 0.0f;
+    float maxError1 = 0.0f;
+    int ftz = f->ftz || gForceFTZ;
+    double maxErrorVal0 = 0.0f;
+    double maxErrorVal1 = 0.0f;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1);
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    Force64BitFPUPrecision();
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        double *p = (double *)gIn;
+        if (gWimpyMode)
+        {
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
+        }
+        else
+        {
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+                p[j] = DoubleFromUInt32((uint32_t)i + j);
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release the kernels and programs built above
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+
+            memset_pattern4(gOut2[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        double *r = (double *)gOut_Ref;
+        double *r2 = (double *)gOut_Ref2;
+        double *s = (double *)gIn;
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+        {
+            long double dd;
+            r[j] = (double)f->dfunc.f_fpf(s[j], &dd);
+            r2[j] = (double)dd;
+        }
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint64_t *t = (uint64_t *)gOut_Ref;
+        uint64_t *t2 = (uint64_t *)gOut_Ref2;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint64_t *q = (uint64_t *)(gOut[k]);
+                uint64_t *q2 = (uint64_t *)(gOut2[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j] || t2[j] != q2[j])
+                {
+                    double test = ((double *)q)[j];
+                    double test2 = ((double *)q2)[j];
+                    long double correct2;
+                    long double correct = f->dfunc.f_fpf(s[j], &correct2);
+                    float err = Bruteforce_Ulp_Error_Double(test, correct);
+                    float err2 = Bruteforce_Ulp_Error_Double(test2, correct2);
+                    int fail = !(fabsf(err) <= f->double_ulps
+                                 && fabsf(err2) <= f->double_ulps);
+                    if (ftz)
+                    {
+                        // retry per section 6.5.3.2
+                        if (IsDoubleResultSubnormal(correct, f->double_ulps))
+                        {
+                            if (IsDoubleResultSubnormal(correct2,
+                                                        f->double_ulps))
+                            {
+                                fail = fail && !(test == 0.0f && test2 == 0.0f);
+                                if (!fail)
+                                {
+                                    err = 0.0f;
+                                    err2 = 0.0f;
+                                }
+                            }
+                            else
+                            {
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && fabsf(err2) <= f->double_ulps);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                        else if (IsDoubleResultSubnormal(correct2,
+                                                         f->double_ulps))
+                        {
+                            fail = fail
+                                && !(test2 == 0.0f
+                                     && fabsf(err) <= f->double_ulps);
+                            if (!fail) err2 = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (IsDoubleSubnormal(s[j]))
+                        {
+                            long double correct2p, correct2n;
+                            long double correctp =
+                                f->dfunc.f_fpf(0.0, &correct2p);
+                            long double correctn =
+                                f->dfunc.f_fpf(-0.0, &correct2n);
+                            float errp =
+                                Bruteforce_Ulp_Error_Double(test, correctp);
+                            float err2p = // NOTE(review): test2 intended?
+                                Bruteforce_Ulp_Error_Double(test, correct2p);
+                            float errn =
+                                Bruteforce_Ulp_Error_Double(test, correctn);
+                            float err2n = // NOTE(review): test2 intended?
+                                Bruteforce_Ulp_Error_Double(test, correct2n);
+                            fail = fail
+                                && ((!(fabsf(errp) <= f->double_ulps))
+                                    && (!(fabsf(err2p) <= f->double_ulps))
+                                    && ((!(fabsf(errn) <= f->double_ulps))
+                                        && (!(fabsf(err2n)
+                                              <= f->double_ulps))));
+                            if (fabsf(errp) < fabsf(err)) err = errp;
+                            if (fabsf(errn) < fabsf(err)) err = errn;
+                            if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
+                            if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
+
+                            // retry per section 6.5.3.4
+                            if (IsDoubleResultSubnormal(correctp,
+                                                        f->double_ulps)
+                                || IsDoubleResultSubnormal(correctn,
+                                                           f->double_ulps))
+                            {
+                                if (IsDoubleResultSubnormal(correct2p,
+                                                            f->double_ulps)
+                                    || IsDoubleResultSubnormal(correct2n,
+                                                               f->double_ulps))
+                                {
+                                    fail = fail
+                                        && !(test == 0.0f && test2 == 0.0f);
+                                    if (!fail) err = err2 = 0.0f;
+                                }
+                                else
+                                {
+                                    fail = fail
+                                        && !(test == 0.0f
+                                             && fabsf(err2) <= f->double_ulps);
+                                    if (!fail) err = 0.0f;
+                                }
+                            }
+                            else if (IsDoubleResultSubnormal(correct2p,
+                                                             f->double_ulps)
+                                     || IsDoubleResultSubnormal(correct2n,
+                                                                f->double_ulps))
+                            {
+                                fail = fail
+                                    && !(test2 == 0.0f
+                                         && (fabsf(err) <= f->double_ulps));
+                                if (!fail) err2 = 0.0f;
+                            }
+                        }
+                    }
+                    if (fabsf(err) > maxError0)
+                    {
+                        maxError0 = fabsf(err);
+                        maxErrorVal0 = s[j];
+                    }
+                    if (fabsf(err2) > maxError1)
+                    {
+                        maxError1 = fabsf(err2);
+                        maxErrorVal1 = s[j];
+                    }
+                    if (fail)
+                    {
+                        vlog_error(
+                            "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: "
+                            "*{%.13la, %.13la} vs. {%.13la, %.13la}\n",
+                            f->name, sizeNames[k], err, err2,
+                            ((double *)gIn)[j], ((double *)gOut_Ref)[j],
+                            ((double *)gOut_Ref2)[j], test, test2);
+                        error = -1;
+                        goto exit;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10llu  bufferSize:%10zu \n",
+                     (uint32_t)i, (unsigned long long)step, bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        double *p = (double *)gIn;
+        for (j = 0; j < bufferSize / sizeof(double); j++)
+            p[j] = DoubleFromUInt32(genrand_int32(d));
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release the kernels and programs built above
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
+             maxErrorVal1);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results_float.cpp
similarity index 56%
rename from test_conformance/math_brute_force/unary_two_results.cpp
rename to test_conformance/math_brute_force/unary_two_results_float.cpp
index accebd3a..cda80b47 100644
--- a/test_conformance/math_brute_force/unary_two_results.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_float.cpp
@@ -105,93 +105,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global double",
-                        sizeNames[vectorSize],
-                        "* out2, __global double",
-                        sizeNames[vectorSize],
-                        "* in )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i], out2 + i );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global double* out2, __global double* in)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       double3 f0 = vload3( 0, in + 3 * i );\n"
-        "       double3 iout = NAN;\n"
-        "       f0 = ",
-        name,
-        "( f0, &iout );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "       vstore3( iout, 0, out2 + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       double3 iout = NAN;\n"
-        "       double3 f0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (double3)( in[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (double3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0, &iout );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               out2[3*i+1] = iout.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               out2[3*i] = iout.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
 typedef struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
@@ -210,15 +123,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
-}
-
 int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
 {
     uint64_t i;
@@ -752,400 +656,3 @@ exit:
 
     return error;
 }
-
-int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    float maxError0 = 0.0f;
-    float maxError1 = 0.0f;
-    int ftz = f->ftz || gForceFTZ;
-    double maxErrorVal0 = 0.0f;
-    double maxErrorVal1 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1);
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    Force64BitFPUPrecision();
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        if (gWimpyMode)
-        {
-            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
-                p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
-        }
-        else
-        {
-            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
-                p[j] = DoubleFromUInt32((uint32_t)i + j);
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-
-            memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
-                                          bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        double *r = (double *)gOut_Ref;
-        double *r2 = (double *)gOut_Ref2;
-        double *s = (double *)gIn;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
-        {
-            long double dd;
-            r[j] = (double)f->dfunc.f_fpf(s[j], &dd);
-            r2[j] = (double)dd;
-        }
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray2 failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data
-        uint64_t *t = (uint64_t *)gOut_Ref;
-        uint64_t *t2 = (uint64_t *)gOut_Ref2;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-        {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-            {
-                uint64_t *q = (uint64_t *)(gOut[k]);
-                uint64_t *q2 = (uint64_t *)(gOut2[k]);
-
-                // If we aren't getting the correctly rounded result
-                if (t[j] != q[j] || t2[j] != q2[j])
-                {
-                    double test = ((double *)q)[j];
-                    double test2 = ((double *)q2)[j];
-                    long double correct2;
-                    long double correct = f->dfunc.f_fpf(s[j], &correct2);
-                    float err = Bruteforce_Ulp_Error_Double(test, correct);
-                    float err2 = Bruteforce_Ulp_Error_Double(test2, correct2);
-                    int fail = !(fabsf(err) <= f->double_ulps
-                                 && fabsf(err2) <= f->double_ulps);
-                    if (ftz)
-                    {
-                        // retry per section 6.5.3.2
-                        if (IsDoubleResultSubnormal(correct, f->double_ulps))
-                        {
-                            if (IsDoubleResultSubnormal(correct2,
-                                                        f->double_ulps))
-                            {
-                                fail = fail && !(test == 0.0f && test2 == 0.0f);
-                                if (!fail)
-                                {
-                                    err = 0.0f;
-                                    err2 = 0.0f;
-                                }
-                            }
-                            else
-                            {
-                                fail = fail
-                                    && !(test == 0.0f
-                                         && fabsf(err2) <= f->double_ulps);
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                        else if (IsDoubleResultSubnormal(correct2,
-                                                         f->double_ulps))
-                        {
-                            fail = fail
-                                && !(test2 == 0.0f
-                                     && fabsf(err) <= f->double_ulps);
-                            if (!fail) err2 = 0.0f;
-                        }
-
-                        // retry per section 6.5.3.3
-                        if (IsDoubleSubnormal(s[j]))
-                        {
-                            long double correct2p, correct2n;
-                            long double correctp =
-                                f->dfunc.f_fpf(0.0, &correct2p);
-                            long double correctn =
-                                f->dfunc.f_fpf(-0.0, &correct2n);
-                            float errp =
-                                Bruteforce_Ulp_Error_Double(test, correctp);
-                            float err2p =
-                                Bruteforce_Ulp_Error_Double(test, correct2p);
-                            float errn =
-                                Bruteforce_Ulp_Error_Double(test, correctn);
-                            float err2n =
-                                Bruteforce_Ulp_Error_Double(test, correct2n);
-                            fail = fail
-                                && ((!(fabsf(errp) <= f->double_ulps))
-                                    && (!(fabsf(err2p) <= f->double_ulps))
-                                    && ((!(fabsf(errn) <= f->double_ulps))
-                                        && (!(fabsf(err2n)
-                                              <= f->double_ulps))));
-                            if (fabsf(errp) < fabsf(err)) err = errp;
-                            if (fabsf(errn) < fabsf(err)) err = errn;
-                            if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
-                            if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
-
-                            // retry per section 6.5.3.4
-                            if (IsDoubleResultSubnormal(correctp,
-                                                        f->double_ulps)
-                                || IsDoubleResultSubnormal(correctn,
-                                                           f->double_ulps))
-                            {
-                                if (IsDoubleResultSubnormal(correct2p,
-                                                            f->double_ulps)
-                                    || IsDoubleResultSubnormal(correct2n,
-                                                               f->double_ulps))
-                                {
-                                    fail = fail
-                                        && !(test == 0.0f && test2 == 0.0f);
-                                    if (!fail) err = err2 = 0.0f;
-                                }
-                                else
-                                {
-                                    fail = fail
-                                        && !(test == 0.0f
-                                             && fabsf(err2) <= f->double_ulps);
-                                    if (!fail) err = 0.0f;
-                                }
-                            }
-                            else if (IsDoubleResultSubnormal(correct2p,
-                                                             f->double_ulps)
-                                     || IsDoubleResultSubnormal(correct2n,
-                                                                f->double_ulps))
-                            {
-                                fail = fail
-                                    && !(test2 == 0.0f
-                                         && (fabsf(err) <= f->double_ulps));
-                                if (!fail) err2 = 0.0f;
-                            }
-                        }
-                    }
-                    if (fabsf(err) > maxError0)
-                    {
-                        maxError0 = fabsf(err);
-                        maxErrorVal0 = s[j];
-                    }
-                    if (fabsf(err2) > maxError1)
-                    {
-                        maxError1 = fabsf(err2);
-                        maxErrorVal1 = s[j];
-                    }
-                    if (fail)
-                    {
-                        vlog_error(
-                            "\nERROR: %sD%s: {%f, %f} ulp error at %.13la: "
-                            "*{%.13la, %.13la} vs. {%.13la, %.13la}\n",
-                            f->name, sizeNames[k], err, err2,
-                            ((double *)gIn)[j], ((double *)gOut_Ref)[j],
-                            ((double *)gOut_Ref2)[j], test, test2);
-                        error = -1;
-                        goto exit;
-                    }
-                }
-            }
-        }
-
-        if (0 == (i & 0x0fffffff))
-        {
-            if (gVerboseBruteForce)
-            {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
-            }
-            else
-            {
-                vlog(".");
-            }
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
-            p[j] = DoubleFromUInt32(genrand_int32(d));
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
-             maxErrorVal1);
-    vlog("\n");
-
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}
diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
similarity index 52%
rename from test_conformance/math_brute_force/unary_two_results_i.cpp
rename to test_conformance/math_brute_force/unary_two_results_i_double.cpp
index 2ac083d2..3fd616a4 100644
--- a/test_conformance/math_brute_force/unary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_double.cpp
@@ -21,91 +21,6 @@
 #include <limits.h>
 #include <string.h>
 
-static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
-                       cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global float",
-                        sizeNames[vectorSize],
-                        "* out, __global int",
-                        sizeNames[vectorSize],
-                        "* out2, __global float",
-                        sizeNames[vectorSize],
-                        "* in )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i], out2 + i );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global float* out, __global int* out2, __global float* in)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       float3 f0 = vload3( 0, in + 3 * i );\n"
-        "       int3 iout = INT_MIN;\n"
-        "       f0 = ",
-        name,
-        "( f0, &iout );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "       vstore3( iout, 0, out2 + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       int3 iout = INT_MIN;\n"
-        "       float3 f0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0, &iout );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               out2[3*i+1] = iout.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               out2[3*i] = iout.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
 static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                              cl_program *p, bool relaxedMode)
 {
@@ -202,15 +117,6 @@ typedef struct BuildKernelInfo
     bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
 } BuildKernelInfo;
 
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernels + i,
-                       info->programs + i, info->relaxedMode);
-}
-
 static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                    void *p)
 {
@@ -226,367 +132,6 @@ static cl_ulong abs_cl_long(cl_long i)
     return (i ^ mask) - mask;
 }
 
-int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    float maxError = 0.0f;
-    int64_t maxError2 = 0;
-    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-    float maxErrorVal = 0.0f;
-    float maxErrorVal2 = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
-    cl_ulong maxiError;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
-    float float_ulps;
-    if (gIsEmbedded)
-        float_ulps = f->float_embedded_ulps;
-    else
-        float_ulps = f->float_ulps;
-
-    maxiError = float_ulps == INFINITY ? CL_ULONG_MAX : 0;
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        if (gWimpyMode)
-        {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
-                p[j] = (uint32_t)i + j * scale;
-        }
-        else
-        {
-            for (j = 0; j < bufferSize / sizeof(float); j++)
-                p[j] = (uint32_t)i + j;
-        }
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-
-            memset_pattern4(gOut2[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
-                                          bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        float *r = (float *)gOut_Ref;
-        int *r2 = (int *)gOut_Ref2;
-        float *s = (float *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-            r[j] = (float)f->func.f_fpI(s[j], r2 + j);
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         bufferSize, gOut2[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray2 failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data
-        uint32_t *t = (uint32_t *)gOut_Ref;
-        int32_t *t2 = (int32_t *)gOut_Ref2;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-        {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-            {
-                uint32_t *q = (uint32_t *)(gOut[k]);
-                int32_t *q2 = (int32_t *)(gOut2[k]);
-
-                // If we aren't getting the correctly rounded result
-                if (t[j] != q[j] || t2[j] != q2[j])
-                {
-                    float test = ((float *)q)[j];
-                    int correct2 = INT_MIN;
-                    double correct = f->func.f_fpI(s[j], &correct2);
-                    float err = Ulp_Error(test, correct);
-                    cl_long iErr = (int64_t)q2[j] - (int64_t)correct2;
-                    int fail = !(fabsf(err) <= float_ulps
-                                 && abs_cl_long(iErr) <= maxiError);
-                    if (ftz)
-                    {
-                        // retry per section 6.5.3.2
-                        if (IsFloatResultSubnormal(correct, float_ulps))
-                        {
-                            fail = fail && !(test == 0.0f && iErr == 0);
-                            if (!fail) err = 0.0f;
-                        }
-
-                        // retry per section 6.5.3.3
-                        if (IsFloatSubnormal(s[j]))
-                        {
-                            int correct5, correct6;
-                            double correct3 = f->func.f_fpI(0.0, &correct5);
-                            double correct4 = f->func.f_fpI(-0.0, &correct6);
-                            float err2 = Ulp_Error(test, correct3);
-                            float err3 = Ulp_Error(test, correct4);
-                            cl_long iErr2 =
-                                (long long)q2[j] - (long long)correct5;
-                            cl_long iErr3 =
-                                (long long)q2[j] - (long long)correct6;
-
-                            // Did +0 work?
-                            if (fabsf(err2) <= float_ulps
-                                && abs_cl_long(iErr2) <= maxiError)
-                            {
-                                err = err2;
-                                iErr = iErr2;
-                                fail = 0;
-                            }
-                            // Did -0 work?
-                            else if (fabsf(err3) <= float_ulps
-                                     && abs_cl_long(iErr3) <= maxiError)
-                            {
-                                err = err3;
-                                iErr = iErr3;
-                                fail = 0;
-                            }
-
-                            // retry per section 6.5.3.4
-                            if (fail
-                                && (IsFloatResultSubnormal(correct2, float_ulps)
-                                    || IsFloatResultSubnormal(correct3,
-                                                              float_ulps)))
-                            {
-                                fail = fail
-                                    && !(test == 0.0f
-                                         && (abs_cl_long(iErr2) <= maxiError
-                                             || abs_cl_long(iErr3)
-                                                 <= maxiError));
-                                if (!fail)
-                                {
-                                    err = 0.0f;
-                                    iErr = 0;
-                                }
-                            }
-                        }
-                    }
-                    if (fabsf(err) > maxError)
-                    {
-                        maxError = fabsf(err);
-                        maxErrorVal = s[j];
-                    }
-                    if (llabs(iErr) > maxError2)
-                    {
-                        maxError2 = llabs(iErr);
-                        maxErrorVal2 = s[j];
-                    }
-
-                    if (fail)
-                    {
-                        vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: "
-                                   "*{%a, %d} vs. {%a, %d}\n",
-                                   f->name, sizeNames[k], err, (int)iErr,
-                                   ((float *)gIn)[j], ((float *)gOut_Ref)[j],
-                                   ((int *)gOut_Ref2)[j], test, q2[j]);
-                        error = -1;
-                        goto exit;
-                    }
-                }
-            }
-        }
-
-        if (0 == (i & 0x0fffffff))
-        {
-            if (gVerboseBruteForce)
-            {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
-            }
-            else
-            {
-                vlog(".");
-            }
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        for (j = 0; j < bufferSize / sizeof(float); j++)
-            p[j] = genrand_int32(d);
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
-                                        &gOutBuffer2[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(float));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
-                      f->name, sizeNames[j]);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-        vlog("\t{%8.2f, %lld} @ %a", maxError, maxError2, maxErrorVal);
-    vlog("\n");
-
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}
-
 int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
 {
     uint64_t i;
diff --git a/test_conformance/math_brute_force/unary_two_results_i_float.cpp b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
new file mode 100644
index 00000000..82bbb81b
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_two_results_i_float.cpp
@@ -0,0 +1,492 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <limits.h>
+#include <string.h>
+// Assembles the OpenCL source for one vector size and hands it to MakeKernel.
+static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
+                       cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global float",
+                        sizeNames[vectorSize],
+                        "* out, __global int",
+                        sizeNames[vectorSize],
+                        "* out2, __global float",
+                        sizeNames[vectorSize],
+                        "* in )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i], out2 + i );\n"
+                        "}\n" };
+    // c3: 3-wide variant using scalar pointers, vload3/vstore3 and a tail path.
+    const char *c3[] = {
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global float* out, __global int* out2, __global float* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       float3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 iout = INT_MIN;\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "       vstore3( iout, 0, out2 + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       int3 iout = INT_MIN;\n"
+        "       float3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (float3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (float3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0, &iout );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               out2[3*i+1] = iout.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               out2[3*i] = iout.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    // Use the generic source unless this index selects 3-element vectors.
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+    // Same "math_kernel" + size suffix as the kernel declared in the source.
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+    // The result of MakeKernel is returned to the caller unchanged.
+    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+}
+// Arguments shared by every BuildKernel_FloatFn job run through ThreadPool_Do.
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // kernel slots, indexed by offset + job_id
+    cl_program *programs; // program slots, indexed the same way
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+// ThreadPool_Do job: builds the kernel for vector-size index offset + job_id.
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
+{
+    const BuildKernelInfo *job = (const BuildKernelInfo *)p;
+    const cl_uint idx = job->offset + job_id;
+    return BuildKernel(job->nameInCode, idx, &job->kernels[idx],
+                       &job->programs[idx], job->relaxedMode);
+}
+// Magnitude of i as an unsigned value; well defined even for CL_LONG_MIN.
+static cl_ulong abs_cl_long(cl_long i)
+{
+    // Negate in unsigned arithmetic: no signed overflow, no signed shift.
+    return i < 0 ? (cl_ulong)0 - (cl_ulong)i : (cl_ulong)i;
+}
+// Brute-force test of a float function that also returns an int through a
+// pointer. Device results are checked against f->func.f_fpI. Returns 0 on
+// success, the error code of a failed call, or -1 on a bad result.
+int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError = 0.0f;
+    int64_t maxError2 = 0;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    float maxErrorVal = 0.0f;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(float), bufferSize);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
+    cl_ulong maxiError;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
+    float float_ulps;
+    if (gIsEmbedded)
+        float_ulps = f->float_embedded_ulps;
+    else
+        float_ulps = f->float_ulps;
+
+    maxiError = float_ulps == INFINITY ? CL_ULONG_MAX : 0;
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        uint32_t *p = (uint32_t *)gIn;
+        if (gWimpyMode)
+        {
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j * scale;
+        }
+        else
+        {
+            for (j = 0; j < bufferSize / sizeof(float); j++)
+                p[j] = (uint32_t)i + j;
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // kernels are built by now; release them
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+
+            memset_pattern4(gOut2[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], CL_FALSE, 0,
+                                          bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Flush so the device runs while the host computes the reference
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        float *r = (float *)gOut_Ref;
+        int *r2 = (int *)gOut_Ref2;
+        float *s = (float *)gIn;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            r[j] = (float)f->func.f_fpI(s[j], r2 + j);
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint32_t *t = (uint32_t *)gOut_Ref;
+        int32_t *t2 = (int32_t *)gOut_Ref2;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint32_t *q = (uint32_t *)(gOut[k]);
+                int32_t *q2 = (int32_t *)(gOut2[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j] || t2[j] != q2[j])
+                {
+                    float test = ((float *)q)[j];
+                    int correct2 = INT_MIN;
+                    double correct = f->func.f_fpI(s[j], &correct2);
+                    float err = Ulp_Error(test, correct);
+                    cl_long iErr = (int64_t)q2[j] - (int64_t)correct2;
+                    int fail = !(fabsf(err) <= float_ulps
+                                 && abs_cl_long(iErr) <= maxiError);
+                    if (ftz)
+                    {
+                        // retry per section 6.5.3.2
+                        if (IsFloatResultSubnormal(correct, float_ulps))
+                        {
+                            fail = fail && !(test == 0.0f && iErr == 0);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (IsFloatSubnormal(s[j]))
+                        {
+                            int correct5, correct6;
+                            double correct3 = f->func.f_fpI(0.0, &correct5);
+                            double correct4 = f->func.f_fpI(-0.0, &correct6);
+                            float err2 = Ulp_Error(test, correct3);
+                            float err3 = Ulp_Error(test, correct4);
+                            cl_long iErr2 =
+                                (long long)q2[j] - (long long)correct5;
+                            cl_long iErr3 =
+                                (long long)q2[j] - (long long)correct6;
+
+                            // Did +0 work?
+                            if (fabsf(err2) <= float_ulps
+                                && abs_cl_long(iErr2) <= maxiError)
+                            {
+                                err = err2;
+                                iErr = iErr2;
+                                fail = 0;
+                            }
+                            // Did -0 work?
+                            else if (fabsf(err3) <= float_ulps
+                                     && abs_cl_long(iErr3) <= maxiError)
+                            {
+                                err = err3;
+                                iErr = iErr3;
+                                fail = 0;
+                            }
+
+                            // retry per section 6.5.3.4
+                            if (fail
+                                && (IsFloatResultSubnormal(correct3, float_ulps)
+                                    || IsFloatResultSubnormal(correct4,
+                                                              float_ulps)))
+                            {
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (abs_cl_long(iErr2) <= maxiError
+                                             || abs_cl_long(iErr3)
+                                                 <= maxiError));
+                                if (!fail)
+                                {
+                                    err = 0.0f;
+                                    iErr = 0;
+                                }
+                            }
+                        }
+                    }
+                    if (fabsf(err) > maxError)
+                    {
+                        maxError = fabsf(err);
+                        maxErrorVal = s[j];
+                    }
+                    if (llabs(iErr) > maxError2)
+                    {
+                        maxError2 = llabs(iErr);
+                    }
+
+                    if (fail)
+                    {
+                        vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: "
+                                   "*{%a, %d} vs. {%a, %d}\n",
+                                   f->name, sizeNames[k], err, (int)iErr,
+                                   ((float *)gIn)[j], ((float *)gOut_Ref)[j],
+                                   ((int *)gOut_Ref2)[j], test, q2[j]);
+                        error = -1;
+                        goto exit;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zu \n", (uint32_t)i,
+                     (size_t)step, bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array
+        uint32_t *p = (uint32_t *)gIn;
+        for (j = 0; j < bufferSize / sizeof(float); j++)
+            p[j] = genrand_int32(d);
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // kernels are built by now; release them
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gOutBuffer2[j]),
+                                        &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(float));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sf%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+        vlog("\t{%8.2f, %lld} @ %a", maxError, (long long)maxError2,
+             maxErrorVal);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/unary_u_double.cpp b/test_conformance/math_brute_force/unary_u_double.cpp
new file mode 100644
index 00000000..d3b92186
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_u_double.cpp
@@ -0,0 +1,385 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <string.h>
+// Assembles the fp64 OpenCL source for one vector size and calls MakeKernel.
+static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
+                             cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global double",
+                        sizeNames[vectorSize],
+                        "* out, __global ulong",
+                        sizeNames[vectorSize],
+                        "* in )\n"
+                        "{\n"
+                        "   size_t i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+    // c3: 3-wide variant using scalar pointers, vload3/vstore3 and a tail path.
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global ulong* in                 )\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       ulong3 u0 = vload3( 0, in + 3 * i );\n"
+        "       double3 f0 = ",
+        name,
+        "( u0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       ulong3 u0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, "
+        "0xdeaddeaddeaddeadUL ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               u0 = (ulong3)( in[3*i], in[3*i+1], "
+        "0xdeaddeaddeaddeadUL ); \n"
+        "               break;\n"
+        "       }\n"
+        "       double3 f0 = ",
+        name,
+        "( u0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    // Use the generic source unless this index selects 3-element vectors.
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+    // Same "math_kernel" + size suffix as the kernel declared in the source.
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+    // The result of MakeKernel is returned to the caller unchanged.
+    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+}
+// Arguments shared by every BuildKernel_DoubleFn job run through ThreadPool_Do.
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // kernel slots, indexed by offset + job_id
+    cl_program *programs; // program slots, indexed the same way
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+// ThreadPool_Do job: builds the kernel for vector-size index offset + job_id.
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{
+    const BuildKernelInfo *job = (const BuildKernelInfo *)p;
+    const cl_uint idx = job->offset + job_id;
+    return BuildKernelDouble(job->nameInCode, idx, &job->kernels[idx],
+                             &job->programs[idx], job->relaxedMode);
+}
+// Packs two genrand_int32(d) draws into 64 bits; draw order is unspecified.
+static cl_ulong random64(MTdata d)
+{
+    return (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32);
+}
+// Brute-force test of a double function taking a ulong argument: random 64-bit
+// inputs are run on the device and compared with f->dfunc.f_u. Returns 0 on
+// success, the error code of a failed call, or -1 on a bad result.
+int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError = 0.0f;
+    int ftz = f->ftz || gForceFTZ;
+    double maxErrorVal = 0.0f;
+    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
+
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
+    Force64BitFPUPrecision();
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_ulong *p = (cl_ulong *)gIn;
+        for (j = 0; j < bufferSize / sizeof(cl_ulong); j++) p[j] = random64(d);
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // kernels are built by now; release them
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Flush so the device runs while the host computes the reference
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        double *r = (double *)gOut_Ref;
+        cl_ulong *s = (cl_ulong *)gIn;
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+            r[j] = (double)f->dfunc.f_u(s[j]);
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint64_t *t = (uint64_t *)gOut_Ref;
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint64_t *q = (uint64_t *)(gOut[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j])
+                {
+                    double test = ((double *)q)[j];
+                    long double correct = f->dfunc.f_u(s[j]);
+                    float err = Bruteforce_Ulp_Error_Double(test, correct);
+                    int fail = !(fabsf(err) <= f->double_ulps);
+
+                    // A flushed-to-zero result may still be acceptable
+                    if (fail)
+                    {
+                        if (ftz)
+                        {
+                            // retry per section 6.5.3.2
+                            if (IsDoubleResultSubnormal(correct,
+                                                        f->double_ulps))
+                            {
+                                fail = fail && (test != 0.0);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    if (fabsf(err) > maxError)
+                    {
+                        maxError = fabsf(err);
+                        maxErrorVal = s[j];
+                    }
+                    if (fail)
+                    {
+                        vlog_error("\n%s%sD: %f ulp error at 0x%16.16llx: "
+                                   "*%.13la vs. %.13la\n",
+                                   f->name, sizeNames[k], err,
+                                   (unsigned long long)((uint64_t *)gIn)[j],
+                                   ((double *)gOut_Ref)[j], test);
+                        error = -1;
+                        goto exit;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zu \n", (uint32_t)i,
+                     (size_t)step, bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (gMeasureTimes)
+    {
+        // Init input array with raw 64-bit integers, as the kernel reads ulong
+        cl_ulong *p = (cl_ulong *)gIn;
+
+        for (j = 0; j < bufferSize / sizeof(cl_ulong); j++) p[j] = random64(d);
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // kernels are built by now; release them
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = GetTime();
+                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                                    &localCount, NULL, 0, NULL,
+                                                    NULL)))
+                {
+                    vlog_error("FAILED -- could not execute kernel\n");
+                    goto exit;
+                }
+
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    goto exit;
+                }
+
+                uint64_t endTime = GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (bufferSize / sizeof(double));
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
+                      f->name, sizeNames[j]);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u_float.cpp
similarity index 54%
rename from test_conformance/math_brute_force/unary_u.cpp
rename to test_conformance/math_brute_force/unary_u_float.cpp
index 3b8f1f69..74b3b760 100644
--- a/test_conformance/math_brute_force/unary_u.cpp
+++ b/test_conformance/math_brute_force/unary_u_float.cpp
@@ -99,88 +99,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
 }
 
-static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
-                             cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global double",
-                        sizeNames[vectorSize],
-                        "* out, __global ulong",
-                        sizeNames[vectorSize],
-                        "* in )\n"
-                        "{\n"
-                        "   size_t i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global double* out, __global ulong* in                 )\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       ulong3 u0 = vload3( 0, in + 3 * i );\n"
-        "       double3 f0 = ",
-        name,
-        "( u0 );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       ulong3 u0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, "
-        "0xdeaddeaddeaddeadUL ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               u0 = (ulong3)( in[3*i], in[3*i+1], "
-        "0xdeaddeaddeaddeadUL ); \n"
-        "               break;\n"
-        "       }\n"
-        "       double3 f0 = ",
-        name,
-        "( u0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
 typedef struct BuildKernelInfo
 {
     cl_uint offset; // the first vector size to build
@@ -199,15 +117,6 @@ static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
                        info->programs + i, info->relaxedMode);
 }
 
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernels + i,
-                             info->programs + i, info->relaxedMode);
-}
-
 int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
 {
     uint64_t i;
@@ -514,267 +423,3 @@ exit:
 
     return error;
 }
-
-static cl_ulong random64(MTdata d)
-{
-    return (cl_ulong)genrand_int32(d) | ((cl_ulong)genrand_int32(d) << 32);
-}
-
-int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
-{
-    uint64_t i;
-    uint32_t j, k;
-    int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
-    float maxError = 0.0f;
-    int ftz = f->ftz || gForceFTZ;
-    double maxErrorVal = 0.0f;
-    size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
-    Force64BitFPUPrecision();
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                       f->nameInCode, relaxedMode };
-        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                                   &build_info)))
-            return error;
-    }
-
-    for (i = 0; i < (1ULL << 32); i += step)
-    {
-        // Init input array
-        cl_ulong *p = (cl_ulong *)gIn;
-        for (j = 0; j < bufferSize / sizeof(cl_ulong); j++) p[j] = random64(d);
-
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-        // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                goto exit;
-            }
-        }
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
-            {
-                vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
-            }
-        }
-
-        // Get that moving
-        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
-
-        // Calculate the correctly rounded reference result
-        double *r = (double *)gOut_Ref;
-        cl_ulong *s = (cl_ulong *)gIn;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
-            r[j] = (double)f->dfunc.f_u(s[j]);
-
-        // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
-            {
-                vlog_error("ReadArray failed %d\n", error);
-                goto exit;
-            }
-        }
-
-        if (gSkipCorrectnessTesting) break;
-
-        // Verify data
-        uint64_t *t = (uint64_t *)gOut_Ref;
-        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
-        {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-            {
-                uint64_t *q = (uint64_t *)(gOut[k]);
-
-                // If we aren't getting the correctly rounded result
-                if (t[j] != q[j])
-                {
-                    double test = ((double *)q)[j];
-                    long double correct = f->dfunc.f_u(s[j]);
-                    float err = Bruteforce_Ulp_Error_Double(test, correct);
-                    int fail = !(fabsf(err) <= f->double_ulps);
-
-                    // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
-                    if (fail)
-                    {
-                        if (ftz)
-                        {
-                            // retry per section 6.5.3.2
-                            if (IsDoubleResultSubnormal(correct,
-                                                        f->double_ulps))
-                            {
-                                fail = fail && (test != 0.0);
-                                if (!fail) err = 0.0f;
-                            }
-                        }
-                    }
-                    if (fabsf(err) > maxError)
-                    {
-                        maxError = fabsf(err);
-                        maxErrorVal = s[j];
-                    }
-                    if (fail)
-                    {
-                        vlog_error("\n%s%sD: %f ulp error at 0x%16.16llx: "
-                                   "*%.13la vs. %.13la\n",
-                                   f->name, sizeNames[k], err,
-                                   ((uint64_t *)gIn)[j],
-                                   ((double *)gOut_Ref)[j], test);
-                        error = -1;
-                        goto exit;
-                    }
-                }
-            }
-        }
-
-        if (0 == (i & 0x0fffffff))
-        {
-            if (gVerboseBruteForce)
-            {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
-            }
-            else
-            {
-                vlog(".");
-            }
-            fflush(stdout);
-        }
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (gMeasureTimes)
-    {
-        // Init input array
-        double *p = (double *)gIn;
-
-        for (j = 0; j < bufferSize / sizeof(double); j++) p[j] = random64(d);
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
-        {
-            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
-            return error;
-        }
-
-
-        // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
-            {
-                LogBuildError(programs[j]);
-                goto exit;
-            }
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                                    &localCount, NULL, 0, NULL,
-                                                    NULL)))
-                {
-                    vlog_error("FAILED -- could not execute kernel\n");
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
-                      f->name, sizeNames[j]);
-        }
-        for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
-    }
-
-    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
-    vlog("\n");
-
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-
-    return error;
-}