From 8ad1088af97687d249111046e7e96e49e3458b30 Mon Sep 17 00:00:00 2001
From: Marco Antognini <marco.antognini@arm.com>
Date: Wed, 10 Feb 2021 10:38:31 +0000
Subject: [PATCH] Reduce difference between files in math_brute_force (#1138)

* Reduce differences between files

This will help reduce code duplication is future commits.

Some code is moved around, some variables are renamed and some
statements are slightly altered to reduce differences between files in
math_brute_force, yet the semantics remains the same.

The differences were identified using n-way diffs. Many differences
remain however.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>

* Workaround clang-format limitation

Introduces some insignificant spaces to force clang-format to reduce the
indentation and reduce differences between files.

Signed-off-by: Marco Antognini <marco.antognini@arm.com>
---
 test_conformance/math_brute_force/binary.cpp  | 109 +++++++-------
 .../math_brute_force/binaryOperator.cpp       | 140 +++++++++---------
 .../math_brute_force/binary_i.cpp             |  50 +++----
 .../math_brute_force/binary_two_results_i.cpp |  35 +++--
 test_conformance/math_brute_force/i_unary.cpp |  55 ++++---
 .../math_brute_force/macro_binary.cpp         |  69 +++++----
 .../math_brute_force/macro_unary.cpp          |  34 +++--
 test_conformance/math_brute_force/mad.cpp     |  62 +++++---
 test_conformance/math_brute_force/ternary.cpp |  77 +++++-----
 test_conformance/math_brute_force/unary.cpp   |  52 +++----
 .../math_brute_force/unary_two_results.cpp    |  39 ++---
 .../math_brute_force/unary_two_results_i.cpp  |  47 +++---
 test_conformance/math_brute_force/unary_u.cpp | 137 ++++++++---------
 13 files changed, 483 insertions(+), 423 deletions(-)

diff --git a/test_conformance/math_brute_force/binary.cpp b/test_conformance/math_brute_force/binary.cpp
index 1784c725..e6b9cbbc 100644
--- a/test_conformance/math_brute_force/binary.cpp
+++ b/test_conformance/math_brute_force/binary.cpp
@@ -25,9 +25,6 @@ int TestFunc_Float_Float_Float_nextafter(const Func *f, MTdata,
 int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata,
                                             bool relaxedMode);
 
-const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
-const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
-
 extern const vtbl _binary = { "binary", TestFunc_Float_Float_Float,
                               TestFunc_Double_Double_Double };
 
@@ -36,6 +33,8 @@ extern const vtbl _binary_nextafter = {
     TestFunc_Double_Double_Double_nextafter
 };
 
+const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
+const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
 
 static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        cl_kernel *k, cl_program *p, bool relaxedMode)
@@ -76,7 +75,8 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       float3 f0, f1;\n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -163,7 +163,8 @@ static int BuildKernelDouble(const char *name, int vectorSize,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       double3 d0, d1;\n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -208,6 +209,35 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                        relaxedMode);
 }
 
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count;
+    cl_kernel **kernels;
+    cl_program *programs;
+    const char *nameInCode;
+    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
+} BuildKernelInfo;
+
+static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                  void *p)
+{
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
+    cl_uint i = info->offset + job_id;
+    return BuildKernel(info->nameInCode, i, info->kernel_count,
+                       info->kernels[i], info->programs + i, info->relaxedMode);
+}
+
+static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                   void *p)
+{
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
+    cl_uint i = info->offset + job_id;
+    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
+                             info->kernels[i], info->programs + i,
+                             info->relaxedMode);
+}
+
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -311,38 +341,9 @@ static const float specialValuesFloat[] = {
     +0.0f
 };
 
-static size_t specialValuesFloatCount =
+static const size_t specialValuesFloatCount =
     sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
-typedef struct BuildKernelInfo
-{
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode; // Whether to build with -cl-fast-relaxed-math.
-} BuildKernelInfo;
-
-static cl_int BuildKernel_FloatFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                  void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernel(info->nameInCode, i, info->kernel_count,
-                       info->kernels[i], info->programs + i, info->relaxedMode);
-}
-
-static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                   void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelDouble(info->nameInCode, i, info->kernel_count,
-                             info->kernels[i], info->programs + i,
-                             info->relaxedMode);
-}
-
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
@@ -426,11 +427,11 @@ static int TestFunc_Float_Float_Float_common(const Func *f, MTdata d,
     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
     test_info.ftz =
         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
-
+    test_info.relaxedMode = relaxedMode;
     test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
     test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
     test_info.isNextafter = isNextafter;
-    test_info.relaxedMode = relaxedMode;
+
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
@@ -543,12 +544,11 @@ static int TestFunc_Float_Float_Float_common(const Func *f, MTdata d,
             vlog("passed");
     }
 
-
     if (gMeasureTimes)
     {
         // Init input arrays
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
         for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = (genrand_int32(d) & ~0x40000000) | 0x20000000;
@@ -561,6 +561,7 @@ static int TestFunc_Float_Float_Float_common(const Func *f, MTdata d,
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
@@ -568,7 +569,6 @@ static int TestFunc_Float_Float_Float_common(const Func *f, MTdata d,
             return error;
         }
 
-
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
@@ -633,7 +633,6 @@ static int TestFunc_Float_Float_Float_common(const Func *f, MTdata d,
         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
-
 exit:
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
@@ -684,23 +683,21 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     int skipNanInf = job->skipNanInf;
     int isNextafter = job->isNextafter;
     cl_uint *t = 0;
-    float *r = 0, *s = 0, *s2 = 0;
+    cl_float *r = 0;
+    cl_float *s = 0;
+    cl_float *s2 = 0;
     cl_int copysign_test = 0;
     RoundingMode oldRoundMode;
     int skipVerification = 0;
 
     if (relaxedMode)
     {
+        func = job->f->rfunc;
         if (strcmp(name, "pow") == 0 && gFastRelaxedDerived)
         {
-            func = job->f->rfunc;
             ulps = INFINITY;
             skipVerification = 1;
         }
-        else
-        {
-            func = job->f->rfunc;
-        }
     }
 
     // start the map of the output arrays
@@ -744,7 +741,8 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         {
             fp[j] = specialValuesFloat[x];
             fp2[j] = specialValuesFloat[y];
-            if (++x >= specialValuesFloatCount)
+            ++x;
+            if (x >= specialValuesFloatCount)
             {
                 x = 0;
                 y++;
@@ -1203,13 +1201,11 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         fflush(stdout);
     }
 
-
 exit:
     if (overflow) free(overflow);
     return error;
 }
 
-
 // A table of more difficult cases to get right
 static const double specialValuesDouble[] = {
     -NAN,
@@ -1444,10 +1440,10 @@ static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
+
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
     }
 
-
     // Init the kernels
     {
         BuildKernelInfo build_info = {
@@ -1460,6 +1456,7 @@ static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
         error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
@@ -1500,6 +1497,7 @@ static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
@@ -1507,7 +1505,6 @@ static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
             return error;
         }
 
-
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
@@ -1573,7 +1570,6 @@ static int TestFunc_Double_Double_Double_common(const Func *f, MTdata d,
         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
-
 exit:
     // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
@@ -1622,7 +1618,9 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
 
     int isNextafter = job->isNextafter;
     cl_ulong *t;
-    cl_double *r, *s, *s2;
+    cl_double *r;
+    cl_double *s;
+    cl_double *s2;
 
     Force64BitFPUPrecision();
 
@@ -1970,6 +1968,7 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
         }
         fflush(stdout);
     }
+
 exit:
     return error;
 }
diff --git a/test_conformance/math_brute_force/binaryOperator.cpp b/test_conformance/math_brute_force/binaryOperator.cpp
index bd1a3143..0957c6af 100644
--- a/test_conformance/math_brute_force/binaryOperator.cpp
+++ b/test_conformance/math_brute_force/binaryOperator.cpp
@@ -44,10 +44,11 @@ static int BuildKernel(const char *name, const char *operator_symbol,
                         "* in2 )\n"
                         "{\n"
                         "   size_t i = get_global_id(0);\n"
-                        "   out[i] =  in1[i] ",
+                        "   out[i] = in1[i] ",
                         operator_symbol,
                         " in2[i];\n"
                         "}\n" };
+
     const char *c3[] = {
         "__kernel void ",
         name,
@@ -70,7 +71,8 @@ static int BuildKernel(const char *name, const char *operator_symbol,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       float3 f0, f1;\n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -137,8 +139,9 @@ static int BuildKernelDouble(const char *name, const char *operator_symbol,
                         operator_symbol,
                         " in2[i];\n"
                         "}\n" };
+
     const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
         "__kernel void ",
         name,
         "_kernel",
@@ -160,7 +163,8 @@ static int BuildKernelDouble(const char *name, const char *operator_symbol,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       double3 d0, d1;\n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -235,43 +239,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->programs + i, info->relaxedMode);
 }
 
-// Thread specific data for a worker thread
-typedef struct ThreadInfo
-{
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
-    float maxError; // max error value. Init to 0.
-    double
-        maxErrorValue; // position of the max error value (param 1).  Init to 0.
-    double maxErrorValue2; // position of the max error value (param 2).  Init
-                           // to 0.
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
-
-typedef struct TestInfo
-{
-    size_t subBufferSize; // Size of the sub-buffer in elements
-    const Func *f; // A pointer to the function info
-    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
-    cl_kernel
-        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
-                               // worker thread:  k[vector_size][thread_id]
-    ThreadInfo *
-        tinfo; // An array of thread specific information for each worker thread
-    cl_uint threadCount; // Number of worker threads
-    cl_uint jobCount; // Number of jobs
-    cl_uint step; // step between each chunk and the next.
-    cl_uint scale; // stride between individual test values
-    float ulps; // max_allowed ulps
-    int ftz; // non-zero if running in flush to zero mode
-    bool relaxedMode; // True if the test is being run in relaxed mode, false
-                      // otherwise.
-
-    // no special fields
-} TestInfo;
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -375,9 +342,46 @@ static const float specialValuesFloat[] = {
     +0.0f
 };
 
-static size_t specialValuesFloatCount =
+static const size_t specialValuesFloatCount =
     sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
+// Thread specific data for a worker thread
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // input buffer for the thread
+    cl_mem inBuf2; // input buffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    double maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdata d;
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
+    bool relaxedMode; // True if the test is being run in relaxed mode, false
+                      // otherwise.
+
+    // no special fields
+} TestInfo;
+
 static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
@@ -398,6 +402,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -405,7 +410,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
                * RoundUpToNextPowerOfTwo(test_info.threadCount));
     }
 
-    test_info.step = test_info.subBufferSize * test_info.scale;
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
     if (test_info.step / test_info.subBufferSize != test_info.scale)
     {
         // there was overflow
@@ -481,8 +486,8 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gInBuffer for region {%zd, %zd}\n",
                            region.origin, region.size);
                 goto exit;
             }
@@ -513,6 +518,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
         error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
@@ -536,7 +542,6 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
             vlog("passed");
     }
 
-
     if (gMeasureTimes)
     {
         // Init input arrays
@@ -554,6 +559,7 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
@@ -626,7 +632,6 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
-
 exit:
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
@@ -665,30 +670,31 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint base = job_id * (cl_uint)job->step;
     ThreadInfo *tinfo = job->tinfo + thread_id;
     fptr func = job->f->func;
+    int ftz = job->ftz;
     bool relaxedMode = job->relaxedMode;
     float ulps = getAllowedUlpError(job->f, relaxedMode);
-    if (relaxedMode)
-    {
-        func = job->f->rfunc;
-    }
-
-
-    int ftz = job->ftz;
     MTdata d = tinfo->d;
     cl_uint j, k;
     cl_int error;
     cl_uchar *overflow = (cl_uchar *)malloc(buffer_size);
     const char *name = job->f->name;
-    cl_uint *t;
-    cl_float *r, *s, *s2;
+    cl_uint *t = 0;
+    cl_float *r = 0;
+    cl_float *s = 0;
+    cl_float *s2 = 0;
     RoundingMode oldRoundMode;
 
+    if (relaxedMode)
+    {
+        func = job->f->rfunc;
+    }
+
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_uint *out[VECTOR_SIZE_COUNT];
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t *)clEnqueueMapBuffer(
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
             buffer_size, 0, NULL, e + j, &error);
         if (error || NULL == out[j])
@@ -711,7 +717,6 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         specialValuesFloatCount * specialValuesFloatCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
 
-
     if (job_id <= (cl_uint)indx)
     {
         // Insert special values
@@ -877,7 +882,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     // an in order queue.
     for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t *)clEnqueueMapBuffer(
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
@@ -889,9 +894,9 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
     // Wait for the last buffer
-    out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
+    out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                           0, NULL, NULL, &error);
     if (error || NULL == out[j])
     {
         vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
@@ -1136,6 +1141,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         }
         fflush(stdout);
     }
+
 exit:
     if (overflow) free(overflow);
     return error;
@@ -1267,6 +1273,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
     float maxError = 0.0f;
     double maxErrorVal = 0.0;
     double maxErrorVal2 = 0.0;
+
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     // Init test_info
@@ -1275,6 +1282,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -1373,7 +1381,6 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
     }
 
-
     // Init the kernels
     {
         BuildKernelInfo build_info = { gMinVectorSizeIndex,
@@ -1389,6 +1396,7 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
         error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
@@ -1412,7 +1420,6 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
             vlog("passed");
     }
 
-
     if (gMeasureTimes)
     {
         // Init input arrays
@@ -1503,7 +1510,6 @@ int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
-
 exit:
     // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
@@ -1551,7 +1557,9 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     cl_int error;
     const char *name = job->f->name;
     cl_ulong *t;
-    cl_double *r, *s, *s2;
+    cl_double *r;
+    cl_double *s;
+    cl_double *s2;
 
     Force64BitFPUPrecision();
 
diff --git a/test_conformance/math_brute_force/binary_i.cpp b/test_conformance/math_brute_force/binary_i.cpp
index a8535281..f931c5be 100644
--- a/test_conformance/math_brute_force/binary_i.cpp
+++ b/test_conformance/math_brute_force/binary_i.cpp
@@ -15,8 +15,8 @@
 //
 #include "Utility.h"
 
-#include <string.h>
 #include <limits.h>
+#include <string.h>
 #include "FunctionList.h"
 
 int TestFunc_Float_Float_Int(const Func *f, MTdata, bool relaxedMode);
@@ -228,7 +228,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->relaxedMode);
 }
 
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -331,9 +330,9 @@ static const float specialValuesFloat[] = {
     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
     +0.0f
 };
-static size_t specialValuesFloatCount =
-    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
+static const size_t specialValuesFloatCount =
+    sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
 static const int specialValuesInt[] = {
     0,           1,           2,          3,          126,        127,
@@ -484,8 +483,8 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gInBuffer for region {%zd, %zd}\n",
                            region.origin, region.size);
                 goto exit;
             }
@@ -497,6 +496,7 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             goto exit;
         }
+
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
     }
 
@@ -537,7 +537,6 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
             vlog("passed");
     }
 
-
     if (gMeasureTimes)
     {
         // Init input arrays
@@ -555,6 +554,7 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
@@ -627,7 +627,6 @@ int TestFunc_Float_Float_Int(const Func *f, MTdata d, bool relaxedMode)
         vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
-
 exit:
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
@@ -658,7 +657,6 @@ exit:
     return error;
 }
 
-
 static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
 {
     const TestInfo *job = (const TestInfo *)data;
@@ -666,23 +664,24 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     size_t buffer_size = buffer_elements * sizeof(cl_float);
     cl_uint base = job_id * (cl_uint)job->step;
     ThreadInfo *tinfo = job->tinfo + thread_id;
-    float ulps = job->ulps;
     fptr func = job->f->func;
     int ftz = job->ftz;
+    float ulps = job->ulps;
     MTdata d = tinfo->d;
     cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
-    cl_uint *t;
-    cl_float *r, *s;
-    cl_int *s2;
+    cl_uint *t = 0;
+    cl_float *r = 0;
+    cl_float *s = 0;
+    cl_int *s2 = 0;
 
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_uint *out[VECTOR_SIZE_COUNT];
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t *)clEnqueueMapBuffer(
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
             buffer_size, 0, NULL, e + j, &error);
         if (error || NULL == out[j])
@@ -700,9 +699,11 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
+
     int totalSpecialValueCount =
         specialValuesFloatCount * specialValuesIntCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
     if (job_id <= (cl_uint)indx)
     { // test edge cases
         float *fp = (float *)p;
@@ -716,7 +717,8 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         {
             fp[j] = specialValuesFloat[x];
             ip2[j] = specialValuesInt[y];
-            if (++x >= specialValuesFloatCount)
+            ++x;
+            if (x >= specialValuesFloatCount)
             {
                 x = 0;
                 y++;
@@ -820,7 +822,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     // an in order queue.
     for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t *)clEnqueueMapBuffer(
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
@@ -832,9 +834,9 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     }
 
     // Wait for the last buffer
-    out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                            CL_TRUE, CL_MAP_READ, 0,
-                                            buffer_size, 0, NULL, NULL, &error);
+    out[j] = (cl_uint *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
+                                           0, NULL, NULL, &error);
     if (error || NULL == out[j])
     {
         vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
@@ -1057,6 +1059,7 @@ static const double specialValuesDouble[] = {
     MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
     +0.0,
 };
+
 static size_t specialValuesDoubleCount =
     sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
 
@@ -1165,12 +1168,9 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
 
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with
-             * parent buffer */
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
                 &region, &error);
-            /* Qualcomm fix: end */
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
                 vlog_error("Error: Unable to create sub-buffer of gInBuffer "
@@ -1190,7 +1190,6 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
     }
 
-
     // Init the kernels
     {
         BuildKernelInfo build_info = {
@@ -1320,7 +1319,6 @@ int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
         vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
     vlog("\n");
 
-
 exit:
     // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
@@ -1367,7 +1365,8 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     cl_int error;
     const char *name = job->f->name;
     cl_ulong *t;
-    cl_double *r, *s;
+    cl_double *r;
+    cl_double *s;
     cl_int *s2;
 
     Force64BitFPUPrecision();
@@ -1398,6 +1397,7 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     int totalSpecialValueCount =
         specialValuesDoubleCount * specialValuesInt2Count;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
     if (job_id <= (cl_uint)indx)
     { // test edge cases
         cl_double *fp = (cl_double *)p;
diff --git a/test_conformance/math_brute_force/binary_two_results_i.cpp b/test_conformance/math_brute_force/binary_two_results_i.cpp
index a0aa9d25..2ecf1c28 100644
--- a/test_conformance/math_brute_force/binary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i.cpp
@@ -40,7 +40,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in1, __global float",
                         sizeNames[vectorSize],
-                        "* in2)\n"
+                        "* in2 )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
@@ -71,7 +71,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       float3 f0, f1;\n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
+        "       int3 i0 = 0xdeaddead;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -83,7 +85,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "               f1 = (float3)( in2[3*i], in2[3*i+1], NAN ); \n"
         "               break;\n"
         "       }\n"
-        "       int3 i0 = 0xdeaddead;\n"
         "       f0 = ",
         name,
         "( f0, f1, &i0 );\n"
@@ -132,12 +133,12 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* in1, __global double",
                         sizeNames[vectorSize],
-                        "* in2)\n"
+                        "* in2 )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
-                        "( in1[i], in2[i], out2 + i );\n"
+                        "( in1[i], in2[i], out2[i] );\n"
                         "}\n" };
 
     const char *c3[] = {
@@ -164,7 +165,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       double3 d0, d1;\n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
+        "       int3 i0 = 0xdeaddead;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -176,7 +179,6 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "               d1 = (double3)( in2[3*i], in2[3*i+1], NAN ); \n"
         "               break;\n"
         "       }\n"
-        "       int3 i0 = 0xdeaddead;\n"
         "       d0 = ",
         name,
         "( d0, d1, &i0 );\n"
@@ -309,20 +311,22 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
-    float float_ulps;
-    int64_t maxError2 = 0;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+    int64_t maxError2 = 0;
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
 
     cl_uint threadCount = GetThreadCount();
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
+    float float_ulps;
     if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
@@ -485,7 +489,7 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         {
             for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
-                uint32_t *q = (uint32_t *)gOut[k];
+                uint32_t *q = (uint32_t *)(gOut[k]);
                 int32_t *q2 = (int32_t *)gOut2[k];
 
                 // Check for exact match to correctly rounded result
@@ -695,9 +699,11 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     if (gMeasureTimes)
     {
         // Init input array
-        uint32_t *p = (uint32_t *)gIn;
+        cl_uint *p = (cl_uint *)gIn;
         for (j = 0; j < bufferSize / sizeof(float); j++)
+        {
             p[j] = genrand_int32(d);
+        }
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
@@ -823,9 +829,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
                                    gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                    &build_info)))
-        {
             return error;
-        }
     }
 
     for (i = 0; i < (1ULL << 32); i += step)
@@ -1185,7 +1189,6 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             {
                 vlog(".");
             }
-
             fflush(stdout);
         }
     }
@@ -1202,7 +1205,7 @@ int TestFunc_DoubleI_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     {
         // Init input array
         double *p = (double *)gIn;
-        for (j = 0; j < bufferSize / sizeof(double); j++)
+        for (j = 0; j < bufferSize / sizeof(cl_double); j++)
             p[j] = DoubleFromUInt32(genrand_int32(d));
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
diff --git a/test_conformance/math_brute_force/i_unary.cpp b/test_conformance/math_brute_force/i_unary.cpp
index 7f2f79a3..b736b24f 100644
--- a/test_conformance/math_brute_force/i_unary.cpp
+++ b/test_conformance/math_brute_force/i_unary.cpp
@@ -42,6 +42,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         name,
                         "( in[i] );\n"
                         "}\n" };
+
     const char *c3[] = {
         "__kernel void math_kernel",
         sizeNames[vectorSize],
@@ -87,7 +88,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "}\n"
     };
 
-
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
 
@@ -114,7 +114,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* out, __global double",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
@@ -177,7 +177,6 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
-
     char testName[32];
     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
              sizeNames[vectorSize]);
@@ -219,7 +218,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     int error;
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
-    int ftz = f->ftz || 0 == (gFloatCapabilities & CL_FP_DENORM) || gForceFTZ;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
     int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
@@ -234,27 +233,30 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     Force64BitFPUPrecision();
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
-        return error;
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
-        uint32_t *p = (uint32_t *)gIn;
+        cl_uint *p = (cl_uint *)gIn;
         if (gWimpyMode)
         {
             for (j = 0; j < bufferSize / sizeof(float); j++)
-                p[j] = (uint32_t)i + j * scale;
+                p[j] = (cl_uint)i + j * scale;
         }
         else
         {
             for (j = 0; j < bufferSize / sizeof(float); j++)
                 p[j] = (uint32_t)i + j;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
@@ -281,7 +283,8 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -396,8 +399,9 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -447,6 +451,7 @@ int TestFunc_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     }
 
     vlog("\n");
+
 exit:
     RestoreFPState(&oldMode);
     // Release
@@ -481,13 +486,13 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     Force64BitFPUPrecision();
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
     {
-        return error;
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
     }
 
     for (i = 0; i < (1ULL << 32); i += step)
@@ -504,6 +509,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             for (j = 0; j < bufferSize / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
@@ -529,8 +535,9 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
             if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
                                         &gOutBuffer[j])))
             {
@@ -616,6 +623,7 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             {
                 vlog(".");
             }
+
             fflush(stdout);
         }
     }
@@ -698,7 +706,6 @@ int TestFunc_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 
     vlog("\n");
 
-
 exit:
     RestoreFPState(&oldMode);
     // Release
diff --git a/test_conformance/math_brute_force/macro_binary.cpp b/test_conformance/math_brute_force/macro_binary.cpp
index b0b82149..0c37068e 100644
--- a/test_conformance/math_brute_force/macro_binary.cpp
+++ b/test_conformance/math_brute_force/macro_binary.cpp
@@ -64,7 +64,8 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       float3 f0, f1;\n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -92,7 +93,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
         "}\n"
     };
 
-
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
 
@@ -110,7 +110,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                        relaxedMode);
 }
 
-
 static int BuildKernelDouble(const char *name, int vectorSize,
                              cl_uint kernel_count, cl_kernel *k, cl_program *p,
                              bool relaxedMode)
@@ -153,7 +152,8 @@ static int BuildKernelDouble(const char *name, int vectorSize,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       double3 f0, f1;\n"
+        "       double3 f0;\n"
+        "       double3 f1;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -190,7 +190,6 @@ static int BuildKernelDouble(const char *name, int vectorSize,
         kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
-
     char testName[32];
     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
              sizeNames[vectorSize]);
@@ -228,7 +227,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->relaxedMode);
 }
 
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -379,6 +377,7 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -460,8 +459,8 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gInBuffer for region {%zd, %zd}\n",
                            region.origin, region.size);
                 goto exit;
             }
@@ -489,7 +488,6 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
     }
 
-
     // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
@@ -506,8 +504,8 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     if (gMeasureTimes)
     {
         // Init input arrays
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
         for (j = 0; j < BUFFER_SIZE / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
@@ -520,6 +518,7 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           BUFFER_SIZE, gIn2, 0, NULL, NULL)))
         {
@@ -531,8 +530,9 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(test_info.k[j][0], 0,
                                         sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
@@ -586,6 +586,7 @@ int TestMacro_Int_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                       f->name, sizeNames[j]);
         }
     }
+
     vlog("\n");
 
 exit:
@@ -631,8 +632,10 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
-    cl_int *t, *r;
-    cl_float *s, *s2;
+    cl_int *t = 0;
+    cl_int *r = 0;
+    cl_float *s = 0;
+    cl_float *s2 = 0;
 
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
@@ -657,6 +660,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
     j = 0;
+
     int totalSpecialValueCount =
         specialValuesFloatCount * specialValuesFloatCount;
     int indx = (totalSpecialValueCount - 1) / buffer_elements;
@@ -674,7 +678,8 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         {
             fp[j] = specialValuesFloat[x];
             fp2[j] = specialValuesFloat[y];
-            if (++x >= specialValuesFloatCount)
+            ++x;
+            if (x >= specialValuesFloatCount)
             {
                 x = 0;
                 y++;
@@ -690,7 +695,6 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         p2[j] = genrand_int32(d);
     }
 
-
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
                                       buffer_size, p, 0, NULL, NULL)))
     {
@@ -895,6 +899,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
             }
         }
     }
+
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
@@ -1044,7 +1049,6 @@ static const double specialValuesDouble[] = {
 static size_t specialValuesDoubleCount =
     sizeof(specialValuesDouble) / sizeof(specialValuesDouble[0]);
 
-
 static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *p);
 
 int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
@@ -1061,6 +1065,7 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -1136,12 +1141,9 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with
-             * parent buffer */
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
                 &region, &error);
-            /* Qualcomm fix: end */
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
                 vlog_error("Error: Unable to create sub-buffer of gInBuffer "
@@ -1161,7 +1163,6 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
     }
 
-
     // Init the kernels
     {
         BuildKernelInfo build_info = {
@@ -1174,6 +1175,7 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
         error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
@@ -1189,8 +1191,8 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     if (gMeasureTimes)
     {
         // Init input arrays
-        uint64_t *p = (uint64_t *)gIn;
-        uint64_t *p2 = (uint64_t *)gIn2;
+        cl_ulong *p = (cl_ulong *)gIn;
+        cl_ulong *p2 = (cl_ulong *)gIn2;
         for (j = 0; j < BUFFER_SIZE / sizeof(double); j++)
         {
             p[j] =
@@ -1216,8 +1218,9 @@ int TestMacro_Int_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(test_info.k[j][0], 0,
                                         sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
@@ -1319,8 +1322,10 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
-    cl_long *t, *r;
-    cl_double *s, *s2;
+    cl_long *t;
+    cl_long *r;
+    cl_double *s;
+    cl_double *s2;
 
     Force64BitFPUPrecision();
 
@@ -1378,7 +1383,6 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
         ((cl_ulong *)p2)[j] = genrand_int64(d);
     }
 
-
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
                                       buffer_size, p, 0, NULL, NULL)))
     {
@@ -1493,11 +1497,12 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     t = (cl_long *)r;
     for (j = 0; j < buffer_elements; j++)
     {
-        cl_long *q = (cl_long *)out[0];
+        cl_long *q = out[0];
 
         // If we aren't getting the correctly rounded result
         if (gMinVectorSizeIndex == 0 && t[j] != q[j])
         {
+            // If we aren't getting the correctly rounded result
             if (ftz)
             {
                 if (IsDoubleSubnormal(s[j]))
@@ -1528,7 +1533,7 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
                 }
             }
 
-            uint64_t err = t[j] - q[j];
+            cl_ulong err = t[j] - q[j];
             if (q[j] > t[j]) err = q[j] - t[j];
             vlog_error("\nERROR: %s: %lld ulp error at {%.13la, %.13la}: *%lld "
                        "vs. %lld  (index: %d)\n",
@@ -1575,7 +1580,7 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
                     }
                 }
 
-                uint64_t err = -t[j] - q[j];
+                cl_ulong err = -t[j] - q[j];
                 if (q[j] > -t[j]) err = q[j] + t[j];
                 vlog_error("\nERROR: %sD%s: %lld ulp error at {%.13la, "
                            "%.13la}: *%lld vs. %lld  (index: %d)\n",
diff --git a/test_conformance/math_brute_force/macro_unary.cpp b/test_conformance/math_brute_force/macro_unary.cpp
index bf08a170..ced72be8 100644
--- a/test_conformance/math_brute_force/macro_unary.cpp
+++ b/test_conformance/math_brute_force/macro_unary.cpp
@@ -34,13 +34,14 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                         sizeNames[vectorSize],
                         "* out, __global float",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i] );\n"
                         "}\n" };
+
     const char *c3[] = {
         "__kernel void math_kernel",
         sizeNames[vectorSize],
@@ -115,7 +116,7 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                         sizeNames[vectorSize],
                         "* out, __global double",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
@@ -178,7 +179,6 @@ static int BuildKernelDouble(const char *name, int vectorSize,
         kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
-
     char testName[32];
     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
              sizeNames[vectorSize]);
@@ -258,6 +258,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -279,6 +280,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
     test_info.f = f;
     test_info.ftz =
         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
+
     // cl_kernels aren't thread safe, so we make one for each vector size for
     // every thread
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
@@ -328,8 +330,8 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
-                           "for region {%zd, %zd}\n",
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer for region {%zd, %zd}\n",
                            region.origin, region.size);
                 goto exit;
             }
@@ -355,6 +357,7 @@ int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
         error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
@@ -501,7 +504,6 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-
     // Get that moving
     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
 
@@ -569,7 +571,6 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-
     // Get that moving
     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
@@ -594,6 +595,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
             return error;
         }
     }
+
     // Wait for the last buffer
     out[j] = (cl_int *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
                                           CL_TRUE, CL_MAP_READ, 0, buffer_size,
@@ -711,12 +713,14 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     size_t i, j;
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
     // Init test_info
     memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_double));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -782,12 +786,9 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
 
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with
-             * parent buffer */
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
                 &region, &error);
-            /* Qualcomm fix: end */
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
                 vlog_error("Error: Unable to create sub-buffer of gInBuffer "
@@ -817,6 +818,7 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
         error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
@@ -846,8 +848,9 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(test_info.k[j][0], 0,
                                         sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
@@ -900,6 +903,7 @@ int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
     vlog("\n");
 
 exit:
+    // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
@@ -936,9 +940,9 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint base = job_id * (cl_uint)job->step;
     ThreadInfo *tinfo = job->tinfo + thread_id;
     dptr dfunc = job->f->dfunc;
+    int ftz = job->ftz;
     cl_uint j, k;
     cl_int error;
-    int ftz = job->ftz;
     const char *name = job->f->name;
 
     Force64BitFPUPrecision();
@@ -1027,7 +1031,6 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-
     // Get that moving
     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
@@ -1052,6 +1055,7 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
             return error;
         }
     }
+
     // Wait for the last buffer
     out[j] = (cl_long *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
                                            CL_TRUE, CL_MAP_READ, 0, buffer_size,
@@ -1062,14 +1066,12 @@ static cl_int TestDouble(cl_uint job_id, cl_uint thread_id, void *data)
         return error;
     }
 
-
     // Verify data
     cl_long *t = (cl_long *)r;
     for (j = 0; j < buffer_elements; j++)
     {
         cl_long *q = out[0];
 
-
         // If we aren't getting the correctly rounded result
         if (gMinVectorSizeIndex == 0 && t[j] != q[j])
         {
diff --git a/test_conformance/math_brute_force/mad.cpp b/test_conformance/math_brute_force/mad.cpp
index fb144e4b..872caa0b 100644
--- a/test_conformance/math_brute_force/mad.cpp
+++ b/test_conformance/math_brute_force/mad.cpp
@@ -44,6 +44,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         name,
                         "( in1[i], in2[i], in3[i] );\n"
                         "}\n" };
+
     const char *c3[] = {
         "__kernel void math_kernel",
         sizeNames[vectorSize],
@@ -66,7 +67,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       float3 f0, f1, f2;\n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
+        "       float3 f2;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -133,6 +136,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         name,
                         "( in1[i], in2[i], in3[i] );\n"
                         "}\n" };
+
     const char *c3[] = {
         "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
         "__kernel void math_kernel",
@@ -156,7 +160,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       double3 d0, d1, d2;\n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
+        "       double3 d2;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -247,37 +253,42 @@ int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode)
     uint64_t step = getTestStep(sizeof(float), bufferSize);
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
-        return error;
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
-        uint32_t *p3 = (uint32_t *)gIn3;
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        cl_uint *p3 = (cl_uint *)gIn3;
         for (j = 0; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
             p2[j] = genrand_int32(d);
             p3[j] = genrand_int32(d);
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           bufferSize, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
                                           bufferSize, gIn3, 0, NULL, NULL)))
         {
@@ -379,15 +390,15 @@ int TestFunc_mad(const Func *f, MTdata d, bool relaxedMode)
         if (gWimpyMode)
             vlog("Wimp pass");
         else
-            vlog("pass");
+            vlog("passed");
     }
 
     if (gMeasureTimes)
     {
         // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
-        uint32_t *p3 = (uint32_t *)gIn3;
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        cl_uint *p3 = (cl_uint *)gIn3;
         for (j = 0; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
@@ -508,18 +519,18 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
     double maxErrorVal2 = 0.0f;
     double maxErrorVal3 = 0.0f;
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
     uint64_t step = getTestStep(sizeof(double), bufferSize);
 
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
     {
-        return error;
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
     }
 
     for (i = 0; i < (1ULL << 32); i += step)
@@ -534,18 +545,21 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
             p2[j] = DoubleFromUInt32(genrand_int32(d));
             p3[j] = DoubleFromUInt32(genrand_int32(d));
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           bufferSize, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
                                           bufferSize, gIn3, 0, NULL, NULL)))
         {
@@ -647,7 +661,7 @@ int TestFunc_mad_Double(const Func *f, MTdata d, bool relaxedMode)
         if (gWimpyMode)
             vlog("Wimp pass");
         else
-            vlog("pass");
+            vlog("passed");
     }
 
     if (gMeasureTimes)
diff --git a/test_conformance/math_brute_force/ternary.cpp b/test_conformance/math_brute_force/ternary.cpp
index b3eea0f4..1b03b209 100644
--- a/test_conformance/math_brute_force/ternary.cpp
+++ b/test_conformance/math_brute_force/ternary.cpp
@@ -52,7 +52,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
     const char *c3[] = {
         "__kernel void math_kernel",
         sizeNames[vectorSize],
-        "( __global float* out, __global float* in, __global float* in2 , "
+        "( __global float* out, __global float* in, __global float* in2, "
         "__global float* in3)\n"
         "{\n"
         "   size_t i = get_global_id(0);\n"
@@ -71,7 +71,9 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       float3 f0, f1, f2;\n"
+        "       float3 f0;\n"
+        "       float3 f1;\n"
+        "       float3 f2;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -143,7 +145,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
         "__kernel void math_kernel",
         sizeNames[vectorSize],
-        "( __global double* out, __global double* in, __global double* in2 , "
+        "( __global double* out, __global double* in, __global double* in2, "
         "__global double* in3)\n"
         "{\n"
         "   size_t i = get_global_id(0);\n"
@@ -162,7 +164,9 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "       size_t parity = i & 1;   // Figure out how many elements are "
         "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
         "buffer size \n"
-        "       double3 d0, d1, d2;\n"
+        "       double3 d0;\n"
+        "       double3 d1;\n"
+        "       double3 d2;\n"
         "       switch( parity )\n"
         "       {\n"
         "           case 1:\n"
@@ -235,7 +239,6 @@ static cl_int BuildKernel_DoubleFn(cl_uint job_id, cl_uint thread_id UNUSED,
                              info->programs + i, info->relaxedMode);
 }
 
-
 // A table of more difficult cases to get right
 static const float specialValuesFloat[] = {
     -NAN,
@@ -315,7 +318,7 @@ static const float specialValuesFloat[] = {
     +0.0f
 };
 
-static size_t specialValuesFloatCount =
+static const size_t specialValuesFloatCount =
     sizeof(specialValuesFloat) / sizeof(specialValuesFloat[0]);
 
 
@@ -324,6 +327,9 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     uint64_t i;
     uint32_t j, k;
     int error;
+
+    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
+
     cl_program programs[VECTOR_SIZE_COUNT];
     cl_kernel kernels[VECTOR_SIZE_COUNT];
     float maxError = 0.0f;
@@ -332,33 +338,34 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal2 = 0.0f;
     float maxErrorVal3 = 0.0f;
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-
     uint64_t step = getTestStep(sizeof(float), bufferSize);
-    int skipNanInf = (0 == strcmp("fma", f->nameInCode)) && !gInfNanSupport;
+
     cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
+
     float float_ulps;
-
-    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
-
     if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
         float_ulps = f->float_ulps;
 
+    int skipNanInf = (0 == strcmp("fma", f->nameInCode)) && !gInfNanSupport;
+
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
-        return error;
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
-        uint32_t *p3 = (uint32_t *)gIn3;
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        cl_uint *p3 = (cl_uint *)gIn3;
         j = 0;
         if (i == 0)
         { // test edge cases
@@ -393,18 +400,21 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             p2[j] = genrand_int32(d);
             p3[j] = genrand_int32(d);
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           bufferSize, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
                                           bufferSize, gIn3, 0, NULL, NULL)))
         {
@@ -493,7 +503,6 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                     (float)f->func.f_fma(s[j], s2[j], s3[j], CORRECTLY_ROUNDED);
         }
 
-
         // Read the data back
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
@@ -963,9 +972,9 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     if (gMeasureTimes)
     {
         // Init input array
-        uint32_t *p = (uint32_t *)gIn;
-        uint32_t *p2 = (uint32_t *)gIn2;
-        uint32_t *p3 = (uint32_t *)gIn3;
+        cl_uint *p = (cl_uint *)gIn;
+        cl_uint *p2 = (cl_uint *)gIn2;
+        cl_uint *p3 = (cl_uint *)gIn3;
         for (j = 0; j < bufferSize / sizeof(float); j++)
         {
             p[j] = genrand_int32(d);
@@ -1160,21 +1169,21 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
     double maxErrorVal = 0.0f;
     double maxErrorVal2 = 0.0f;
     double maxErrorVal3 = 0.0f;
-    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
-
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(double), bufferSize);
 
+    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
+
     Force64BitFPUPrecision();
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
     {
-        return error;
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
     }
 
     for (i = 0; i < (1ULL << 32); i += step)
@@ -1213,18 +1222,21 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
             p2[j] = DoubleFromUInt32(genrand_int32(d));
             p3[j] = DoubleFromUInt32(genrand_int32(d));
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
                                           bufferSize, gIn2, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
             return error;
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
                                           bufferSize, gIn3, 0, NULL, NULL)))
         {
@@ -1287,7 +1299,6 @@ int TestFunc_Double_Double_Double_Double(const Func *f, MTdata d,
             }
         }
 
-
         // Get that moving
         if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
diff --git a/test_conformance/math_brute_force/unary.cpp b/test_conformance/math_brute_force/unary.cpp
index ae3f54e8..5085b9b4 100644
--- a/test_conformance/math_brute_force/unary.cpp
+++ b/test_conformance/math_brute_force/unary.cpp
@@ -37,13 +37,14 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
                         sizeNames[vectorSize],
                         "* out, __global float",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i] );\n"
                         "}\n" };
+
     const char *c3[] = {
         "__kernel void math_kernel",
         sizeNames[vectorSize],
@@ -89,7 +90,6 @@ static int BuildKernel(const char *name, int vectorSize, cl_uint kernel_count,
         "}\n"
     };
 
-
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
 
@@ -118,7 +118,7 @@ static int BuildKernelDouble(const char *name, int vectorSize,
                         sizeNames[vectorSize],
                         "* out, __global double",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
@@ -181,7 +181,6 @@ static int BuildKernelDouble(const char *name, int vectorSize,
         kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
-
     char testName[32];
     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
              sizeNames[vectorSize]);
@@ -249,7 +248,7 @@ typedef struct TestInfo
     int isRangeLimited; // 1 if the function is only to be evaluated over a
                         // range
     float half_sin_cos_tan_limit;
-    bool relaxedMode; // True if test is to be run in relaxed mode, false
+    bool relaxedMode; // True if test is running in relaxed mode, false
                       // otherwise.
 } TestInfo;
 
@@ -269,10 +268,10 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     // Init test_info
     memset(&test_info, 0, sizeof(test_info));
     test_info.threadCount = GetThreadCount();
-
     test_info.subBufferSize = BUFFER_SIZE
         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
     test_info.scale = getTestScale(sizeof(cl_float));
+
     if (gWimpyMode)
     {
         test_info.subBufferSize = gWimpyBufferSize
@@ -345,8 +344,8 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                 &region, &error);
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
-                vlog_error("Error: Unable to create sub-buffer of gInBuffer "
-                           "for region {%zd, %zd}\n",
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gInBuffer for region {%zd, %zd}\n",
                            region.origin, region.size);
                 goto exit;
             }
@@ -390,6 +389,7 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting || skipTestingRelaxed)
     {
         error = ThreadPool_Do(TestFloat, test_info.jobCount, &test_info);
@@ -443,8 +443,9 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeof(cl_float) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(test_info.k[j][0], 0,
                                         sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
@@ -479,9 +480,9 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
                 }
 
                 uint64_t endTime = GetTime();
-                double current_time = SubtractTime(endTime, startTime);
-                sum += current_time;
-                if (current_time < bestTime) bestTime = current_time;
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
             }
 
             if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
@@ -497,6 +498,7 @@ int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
     vlog("\n");
 
 exit:
+    // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
@@ -553,7 +555,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint *out[VECTOR_SIZE_COUNT];
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t *)clEnqueueMapBuffer(
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
             buffer_size, 0, NULL, e + j, &error);
         if (error || NULL == out[j])
@@ -627,7 +629,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
             return error;
         }
 
-        // run the kernel
+        // Run the kernel
         size_t vectorCount =
             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
@@ -655,7 +657,6 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-
     // Get that moving
     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
 
@@ -670,7 +671,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
     // an in order queue.
     for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
     {
-        out[j] = (uint32_t *)clEnqueueMapBuffer(
+        out[j] = (cl_uint *)clEnqueueMapBuffer(
             tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
@@ -680,6 +681,7 @@ static cl_int TestFloat(cl_uint job_id, cl_uint thread_id, void *data)
             return error;
         }
     }
+
     // Wait for the last buffer
     out[j] = (uint32_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
                                             CL_TRUE, CL_MAP_READ, 0,
@@ -1246,12 +1248,9 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
 
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            /* Qualcomm fix: 9461 read-write flags must be compatible with
-             * parent buffer */
             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
                 &region, &error);
-            /* Qualcomm fix: end */
             if (error || NULL == test_info.tinfo[i].outBuf[j])
             {
                 vlog_error("Error: Unable to create sub-buffer of gInBuffer "
@@ -1281,6 +1280,7 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
             goto exit;
     }
 
+    // Run the kernels
     if (!gSkipCorrectnessTesting)
     {
         error = ThreadPool_Do(TestDouble, test_info.jobCount, &test_info);
@@ -1334,8 +1334,9 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            size_t vectorSize = sizeValues[j] * sizeof(cl_double);
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeof(cl_double) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
             if ((error = clSetKernelArg(test_info.k[j][0], 0,
                                         sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
@@ -1370,9 +1371,9 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
                 }
 
                 uint64_t endTime = GetTime();
-                double current_time = SubtractTime(endTime, startTime);
-                sum += current_time;
-                if (current_time < bestTime) bestTime = current_time;
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
             }
 
             if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
@@ -1393,6 +1394,7 @@ int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
     vlog("\n");
 
 exit:
+    // Release
     for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
     {
         clReleaseProgram(test_info.programs[i]);
diff --git a/test_conformance/math_brute_force/unary_two_results.cpp b/test_conformance/math_brute_force/unary_two_results.cpp
index 77d40b0d..a3be1d8d 100644
--- a/test_conformance/math_brute_force/unary_two_results.cpp
+++ b/test_conformance/math_brute_force/unary_two_results.cpp
@@ -36,7 +36,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* out2, __global float",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
@@ -93,6 +93,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "   }\n"
         "}\n"
     };
+
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
 
@@ -121,7 +122,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* out2, __global double",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
@@ -179,6 +180,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "   }\n"
         "}\n"
     };
+
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
 
@@ -242,17 +244,19 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
     cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
     int isFract = 0 == strcmp("fract", f->nameInCode);
     int skipNanInf = isFract && !gInfNanSupport;
-    float float_ulps = getAllowedUlpError(f, relaxedMode);
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
+    float float_ulps = getAllowedUlpError(f, relaxedMode);
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
-        return error;
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
@@ -282,6 +286,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
                 }
             }
         }
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
@@ -454,7 +459,6 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
                     if (relaxedMode || skipNanInf)
                     {
                         if (skipNanInf && overflow[j]) continue;
-
                         // Note: no double rounding here.  Reference functions
                         // calculate in single precision.
                         if (IsFloatInfinity(correct) || IsFloatNaN(correct)
@@ -670,6 +674,7 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
         uint32_t *p = (uint32_t *)gIn;
         for (j = 0; j < bufferSize / sizeof(float); j++)
             p[j] = genrand_int32(d);
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
@@ -706,7 +711,6 @@ int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
             double bestTime = INFINITY;
             for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
-
                 uint64_t startTime = GetTime();
                 if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
                                                     &localCount, NULL, 0, NULL,
@@ -775,13 +779,13 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
     Force64BitFPUPrecision();
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
     {
-        return error;
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
     }
 
     for (i = 0; i < (1ULL << 32); i += step)
@@ -1103,7 +1107,6 @@ int TestFunc_Double2_Double(const Func *f, MTdata d, bool relaxedMode)
             double bestTime = INFINITY;
             for (k = 0; k < PERF_LOOP_COUNT; k++)
             {
-
                 uint64_t startTime = GetTime();
                 if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
                                                     &localCount, NULL, 0, NULL,
diff --git a/test_conformance/math_brute_force/unary_two_results_i.cpp b/test_conformance/math_brute_force/unary_two_results_i.cpp
index f3c73434..6c56ed1f 100644
--- a/test_conformance/math_brute_force/unary_two_results_i.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i.cpp
@@ -37,13 +37,14 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* out2, __global float",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i], out2 + i );\n"
                         "}\n" };
+
     const char *c3[] = {
         "__kernel void math_kernel",
         sizeNames[vectorSize],
@@ -93,6 +94,7 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
         "   }\n"
         "}\n"
     };
+
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
 
@@ -121,13 +123,14 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* out2, __global double",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i], out2 + i );\n"
                         "}\n" };
+
     const char *c3[] = {
         "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
         "__kernel void math_kernel",
@@ -178,6 +181,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         "   }\n"
         "}\n"
     };
+
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
 
@@ -240,13 +244,13 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-    float float_ulps;
     uint64_t step = getTestStep(sizeof(float), bufferSize);
     int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(float)) + 1);
     cl_ulong maxiError;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
+    float float_ulps;
     if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
@@ -255,12 +259,14 @@ int TestFunc_FloatI_Float(const Func *f, MTdata d, bool relaxedMode)
     maxiError = float_ulps == INFINITY ? CL_ULONG_MAX : 0;
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
-        return error;
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
 
     for (i = 0; i < (1ULL << 32); i += step)
     {
@@ -600,22 +606,21 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
     double maxErrorVal2 = 0.0f;
     cl_ulong maxiError = f->double_ulps == INFINITY ? CL_ULONG_MAX : 0;
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-
-    uint64_t step = getTestStep(sizeof(double), bufferSize);
-    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(double)) + 1);
+    uint64_t step = getTestStep(sizeof(cl_double), bufferSize);
+    int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(cl_double)) + 1);
 
     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
 
     Force64BitFPUPrecision();
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
     {
-        return error;
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
     }
 
     for (i = 0; i < (1ULL << 32); i += step)
@@ -624,12 +629,12 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
         double *p = (double *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < bufferSize / sizeof(double); j++)
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j * scale);
         }
         else
         {
-            for (j = 0; j < bufferSize / sizeof(double); j++)
+            for (j = 0; j < bufferSize / sizeof(cl_double); j++)
                 p[j] = DoubleFromUInt32((uint32_t)i + j);
         }
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
@@ -928,7 +933,7 @@ int TestFunc_DoubleI_Double(const Func *f, MTdata d, bool relaxedMode)
             double clocksPerOp = bestTime * (double)gDeviceFrequency
                 * gComputeDevices * gSimdSize * 1e6
                 / (bufferSize / sizeof(double));
-            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sd%s",
+            vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element", "%sD%s",
                       f->name, sizeNames[j]);
         }
         for (; j < gMaxVectorSizeIndex; j++) vlog("\t     -- ");
diff --git a/test_conformance/math_brute_force/unary_u.cpp b/test_conformance/math_brute_force/unary_u.cpp
index 53f5db38..df6724ca 100644
--- a/test_conformance/math_brute_force/unary_u.cpp
+++ b/test_conformance/math_brute_force/unary_u.cpp
@@ -33,13 +33,14 @@ static int BuildKernel(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* out, __global uint",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
                         name,
                         "( in[i] );\n"
                         "}\n" };
+
     const char *c3[] = {
         "__kernel void math_kernel",
         sizeNames[vectorSize],
@@ -112,7 +113,7 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         sizeNames[vectorSize],
                         "* out, __global ulong",
                         sizeNames[vectorSize],
-                        "* in)\n"
+                        "* in )\n"
                         "{\n"
                         "   int i = get_global_id(0);\n"
                         "   out[i] = ",
@@ -120,51 +121,53 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
                         "( in[i] );\n"
                         "}\n" };
 
-    const char *c3[] = { "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
-                         "__kernel void math_kernel",
-                         sizeNames[vectorSize],
-                         "( __global double* out, __global ulong* in)\n"
-                         "{\n"
-                         "   size_t i = get_global_id(0);\n"
-                         "   if( i + 1 < get_global_size(0) )\n"
-                         "   {\n"
-                         "       ulong3 u0 = vload3( 0, in + 3 * i );\n"
-                         "       double3 f0 = ",
-                         name,
-                         "( u0 );\n"
-                         "       vstore3( f0, 0, out + 3*i );\n"
-                         "   }\n"
-                         "   else\n"
-                         "   {\n"
-                         "       size_t parity = i & 1;   // Figure out how "
-                         "many elements are left over after BUFFER_SIZE % "
-                         "(3*sizeof(float)). Assume power of two buffer size \n"
-                         "       ulong3 u0;\n"
-                         "       switch( parity )\n"
-                         "       {\n"
-                         "           case 1:\n"
-                         "               u0 = (ulong3)( in[3*i], "
-                         "0xdeaddeaddeaddeadUL, 0xdeaddeaddeaddeadUL ); \n"
-                         "               break;\n"
-                         "           case 0:\n"
-                         "               u0 = (ulong3)( in[3*i], in[3*i+1], "
-                         "0xdeaddeaddeaddeadUL ); \n"
-                         "               break;\n"
-                         "       }\n"
-                         "       double3 f0 = ",
-                         name,
-                         "( u0 );\n"
-                         "       switch( parity )\n"
-                         "       {\n"
-                         "           case 0:\n"
-                         "               out[3*i+1] = f0.y; \n"
-                         "               // fall through\n"
-                         "           case 1:\n"
-                         "               out[3*i] = f0.x; \n"
-                         "               break;\n"
-                         "       }\n"
-                         "   }\n"
-                         "}\n" };
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global double* out, __global ulong* in                 )\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       ulong3 u0 = vload3( 0, in + 3 * i );\n"
+        "       double3 f0 = ",
+        name,
+        "( u0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       ulong3 u0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               u0 = (ulong3)( in[3*i], 0xdeaddeaddeaddeadUL, "
+        "0xdeaddeaddeaddeadUL ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               u0 = (ulong3)( in[3*i], in[3*i+1], "
+        "0xdeaddeaddeaddeadUL ); \n"
+        "               break;\n"
+        "       }\n"
+        "       double3 f0 = ",
+        name,
+        "( u0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
 
     const char **kern = c;
     size_t kernSize = sizeof(c) / sizeof(c[0]);
@@ -175,7 +178,6 @@ static int BuildKernelDouble(const char *name, int vectorSize, cl_kernel *k,
         kernSize = sizeof(c3) / sizeof(c3[0]);
     }
 
-
     char testName[32];
     snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
              sizeNames[vectorSize]);
@@ -221,27 +223,28 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
     float maxErrorVal = 0.0f;
     size_t bufferSize = (gWimpyMode) ? gWimpyBufferSize : BUFFER_SIZE;
-
     uint64_t step = getTestStep(sizeof(float), bufferSize);
     int scale = (int)((1ULL << 32) / (16 * bufferSize / sizeof(double)) + 1);
     int isRangeLimited = 0;
-    float float_ulps;
     float half_sin_cos_tan_limit = 0;
 
     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
 
+    float float_ulps;
     if (gIsEmbedded)
         float_ulps = f->float_embedded_ulps;
     else
         float_ulps = f->float_ulps;
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_FloatFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
-        return error;
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_FloatFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
 
     if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
     {
@@ -317,7 +320,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
                      clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
                                             &localCount, NULL, 0, NULL, NULL)))
             {
-                vlog_error("FAILURE -- could not execute kernel\n");
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
@@ -419,7 +422,6 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
         }
     }
 
-
     if (!gSkipCorrectnessTesting)
     {
         if (gWimpyMode)
@@ -477,7 +479,7 @@ int TestFunc_Float_UInt(const Func *f, MTdata d, bool relaxedMode)
                                                     &localCount, NULL, 0, NULL,
                                                     NULL)))
                 {
-                    vlog_error("FAILURE -- could not execute kernel\n");
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }
 
@@ -540,13 +542,13 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
     Force64BitFPUPrecision();
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode, relaxedMode };
-    if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
     {
-        return error;
+        BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                       f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_DoubleFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
     }
 
     for (i = 0; i < (1ULL << 32); i += step)
@@ -599,7 +601,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
                      clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
                                             &localCount, NULL, 0, NULL, NULL)))
             {
-                vlog_error("FAILURE -- could not execute kernel\n");
+                vlog_error("FAILED -- could not execute kernel\n");
                 goto exit;
             }
         }
@@ -627,7 +629,6 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
 
         if (gSkipCorrectnessTesting) break;
 
-
         // Verify data
         uint64_t *t = (uint64_t *)gOut_Ref;
         for (j = 0; j < bufferSize / sizeof(cl_double); j++)
@@ -741,7 +742,7 @@ int TestFunc_Double_ULong(const Func *f, MTdata d, bool relaxedMode)
                                                     &localCount, NULL, 0, NULL,
                                                     NULL)))
                 {
-                    vlog_error("FAILURE -- could not execute kernel\n");
+                    vlog_error("FAILED -- could not execute kernel\n");
                     goto exit;
                 }